From 74838b75379a53678ffc5f59de86161d21e2c808 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 27 Jul 2012 20:55:27 -0400 Subject: swiotlb: add the late swiotlb initialization function with iotlb memory This enables the caller to initialize swiotlb with its own iotlb memory late in the bootup. See git commit eb605a5754d050a25a9f00d718fb173f24c486ef "swiotlb: add swiotlb_tbl_map_single library function" which will explain the full details of what it can be used for. CC: FUJITA Tomonori [v1: Fold in smatch warning] Signed-off-by: Konrad Rzeszutek Wilk --- lib/swiotlb.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) (limited to 'lib') diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 45bc1f83a5ad..f114bf6a8e13 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -170,7 +170,7 @@ void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) * Statically reserve bounce buffer space and initialize bounce buffer data * structures for the software IO TLB used to implement the DMA API. */ -void __init +static void __init swiotlb_init_with_default_size(size_t default_size, int verbose) { unsigned long bytes; @@ -206,8 +206,9 @@ swiotlb_init(int verbose) int swiotlb_late_init_with_default_size(size_t default_size) { - unsigned long i, bytes, req_nslabs = io_tlb_nslabs; + unsigned long bytes, req_nslabs = io_tlb_nslabs; unsigned int order; + int rc = 0; if (!io_tlb_nslabs) { io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); @@ -229,16 +230,32 @@ swiotlb_late_init_with_default_size(size_t default_size) order--; } - if (!io_tlb_start) - goto cleanup1; - + if (!io_tlb_start) { + io_tlb_nslabs = req_nslabs; + return -ENOMEM; + } if (order != get_order(bytes)) { printk(KERN_WARNING "Warning: only able to allocate %ld MB " "for software IO TLB\n", (PAGE_SIZE << order) >> 20); io_tlb_nslabs = SLABS_PER_PAGE << order; - bytes = io_tlb_nslabs << IO_TLB_SHIFT; } + rc = swiotlb_late_init_with_tbl(io_tlb_start, io_tlb_nslabs); + if (rc) + free_pages((unsigned long)io_tlb_start, order); + return rc; +} + +int +swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) +{ + unsigned long i, bytes; + + bytes = nslabs << IO_TLB_SHIFT; + + io_tlb_nslabs = nslabs; + io_tlb_start = tlb; io_tlb_end = io_tlb_start + bytes; + memset(io_tlb_start, 0, bytes); /* @@ -288,10 +305,8 @@ cleanup3: io_tlb_list = NULL; cleanup2: io_tlb_end = NULL; - free_pages((unsigned long)io_tlb_start, order); io_tlb_start = NULL; -cleanup1: - io_tlb_nslabs = req_nslabs; + io_tlb_nslabs = 0; return -ENOMEM; } -- cgit v1.2.3 From 9eca2eb9942ee0b6f2bc76485e16e334aee2c0bf Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 25 Aug 2012 22:47:57 +0000 Subject: netlink: add minlen validation for the new signed types Signed-off-by: Julian Anastasov Signed-off-by: David S. 
Miller --- lib/nlattr.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'lib') diff --git a/lib/nlattr.c b/lib/nlattr.c index 4226dfeb5178..18eca7809b08 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -22,6 +22,10 @@ static const u16 nla_attr_minlen[NLA_TYPE_MAX+1] = { [NLA_U64] = sizeof(u64), [NLA_MSECS] = sizeof(u64), [NLA_NESTED] = NLA_HDRLEN, + [NLA_S8] = sizeof(s8), + [NLA_S16] = sizeof(s16), + [NLA_S32] = sizeof(s32), + [NLA_S64] = sizeof(s64), }; static int validate_nla(const struct nlattr *nla, int maxtype, -- cgit v1.2.3 From 9785e10aedfa0fad5c1aac709dce5ada1b123783 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 8 Sep 2012 02:53:53 +0000 Subject: netlink: kill netlink_set_nonroot Replace netlink_set_nonroot by one new field `flags' in struct netlink_kernel_cfg that is passed to netlink_kernel_create. This patch also renames NL_NONROOT_* to NL_CFG_F_NONROOT_* since now the flags field in nl_table is generic (so we can add more flags if needed in the future). Also adjust all callers in the net-next tree to use these flags instead of netlink_set_nonroot. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/linux/netlink.h | 9 ++++----- lib/kobject_uevent.c | 2 +- net/core/rtnetlink.c | 2 +- net/netlink/af_netlink.c | 28 +++++++++++++--------------- net/netlink/genetlink.c | 3 +-- security/selinux/netlink.c | 2 +- 6 files changed, 21 insertions(+), 25 deletions(-) (limited to 'lib') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index df73cf4b0290..8719a4e235a5 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -176,12 +176,16 @@ struct netlink_skb_parms { extern void netlink_table_grab(void); extern void netlink_table_ungrab(void); +#define NL_CFG_F_NONROOT_RECV (1 << 0) +#define NL_CFG_F_NONROOT_SEND (1 << 1) + /* optional Netlink kernel configuration parameters */ struct netlink_kernel_cfg { unsigned int groups; void (*input)(struct sk_buff *skb); struct mutex *cb_mutex; void (*bind)(int group); + unsigned int flags; }; extern struct sock *netlink_kernel_create(struct net *net, int unit, @@ -260,11 +264,6 @@ extern int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control); - -#define NL_NONROOT_RECV 0x1 -#define NL_NONROOT_SEND 0x2 -extern void netlink_set_nonroot(int protocol, unsigned flag); - #endif /* __KERNEL__ */ #endif /* __LINUX_NETLINK_H */ diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 0401d2916d9f..c2e97787d01e 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -375,6 +375,7 @@ static int uevent_net_init(struct net *net) struct uevent_sock *ue_sk; struct netlink_kernel_cfg cfg = { .groups = 1, + .flags = NL_CFG_F_NONROOT_RECV, }; ue_sk = kzalloc(sizeof(*ue_sk), GFP_KERNEL); @@ -422,7 +423,6 @@ static struct pernet_operations uevent_net_ops = { static int __init kobject_uevent_init(void) { - netlink_set_nonroot(NETLINK_KOBJECT_UEVENT, NL_NONROOT_RECV); return register_pernet_subsys(&uevent_net_ops); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c64efcff8078..a71806eb9cc6 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2381,6 +2381,7 @@ static int __net_init rtnetlink_net_init(struct net *net) .groups = RTNLGRP_MAX, .input = rtnetlink_rcv, .cb_mutex = &rtnl_mutex, + .flags = NL_CFG_F_NONROOT_RECV, }; sk = netlink_kernel_create(net, NETLINK_ROUTE, THIS_MODULE, &cfg); @@ -2416,7 +2417,6 @@ void __init rtnetlink_init(void) if 
(register_pernet_subsys(&rtnetlink_net_ops)) panic("rtnetlink_init: cannot initialize rtnetlink\n"); - netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV); register_netdevice_notifier(&rtnetlink_dev_notifier); rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f530b1ca1773..b74540ce3c14 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -121,7 +121,7 @@ struct netlink_table { struct nl_pid_hash hash; struct hlist_head mc_list; struct listeners __rcu *listeners; - unsigned int nl_nonroot; + unsigned int flags; unsigned int groups; struct mutex *cb_mutex; struct module *module; @@ -536,6 +536,8 @@ static int netlink_release(struct socket *sock) if (--nl_table[sk->sk_protocol].registered == 0) { kfree(nl_table[sk->sk_protocol].listeners); nl_table[sk->sk_protocol].module = NULL; + nl_table[sk->sk_protocol].bind = NULL; + nl_table[sk->sk_protocol].flags = 0; nl_table[sk->sk_protocol].registered = 0; } } else if (nlk->subscriptions) { @@ -596,7 +598,7 @@ retry: static inline int netlink_capable(const struct socket *sock, unsigned int flag) { - return (nl_table[sock->sk->sk_protocol].nl_nonroot & flag) || + return (nl_table[sock->sk->sk_protocol].flags & flag) || capable(CAP_NET_ADMIN); } @@ -659,7 +661,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, /* Only superuser is allowed to listen multicasts */ if (nladdr->nl_groups) { - if (!netlink_capable(sock, NL_NONROOT_RECV)) + if (!netlink_capable(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); if (err) @@ -721,7 +723,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, return -EINVAL; /* Only superuser is allowed to send multicasts */ - if (nladdr->nl_groups && !netlink_capable(sock, NL_NONROOT_SEND)) + if (nladdr->nl_groups && !netlink_capable(sock, NL_CFG_F_NONROOT_SEND)) return -EPERM; if (!nlk->pid) @@ -1244,7 +1246,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, break; case NETLINK_ADD_MEMBERSHIP: case NETLINK_DROP_MEMBERSHIP: { - if (!netlink_capable(sock, NL_NONROOT_RECV)) + if (!netlink_capable(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); if (err) @@ -1376,7 +1378,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, dst_group = ffs(addr->nl_groups); err = -EPERM; if ((dst_group || dst_pid) && - !netlink_capable(sock, NL_NONROOT_SEND)) + !netlink_capable(sock, NL_CFG_F_NONROOT_SEND)) goto out; } else { dst_pid = nlk->dst_pid; @@ -1580,7 +1582,10 @@ netlink_kernel_create(struct net *net, int unit, rcu_assign_pointer(nl_table[unit].listeners, listeners); nl_table[unit].cb_mutex = cb_mutex; nl_table[unit].module = module; - nl_table[unit].bind = cfg ? 
cfg->bind : NULL; + if (cfg) { + nl_table[unit].bind = cfg->bind; + nl_table[unit].flags = cfg->flags; + } nl_table[unit].registered = 1; } else { kfree(listeners); @@ -1679,13 +1684,6 @@ void netlink_clear_multicast_users(struct sock *ksk, unsigned int group) netlink_table_ungrab(); } -void netlink_set_nonroot(int protocol, unsigned int flags) -{ - if ((unsigned int)protocol < MAX_LINKS) - nl_table[protocol].nl_nonroot = flags; -} -EXPORT_SYMBOL(netlink_set_nonroot); - struct nlmsghdr * __nlmsg_put(struct sk_buff *skb, u32 pid, u32 seq, int type, int len, int flags) { @@ -2150,7 +2148,7 @@ static void __init netlink_add_usersock_entry(void) rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners); nl_table[NETLINK_USERSOCK].module = THIS_MODULE; nl_table[NETLINK_USERSOCK].registered = 1; - nl_table[NETLINK_USERSOCK].nl_nonroot = NL_NONROOT_SEND; + nl_table[NETLINK_USERSOCK].flags = NL_CFG_F_NONROOT_SEND; netlink_table_ungrab(); } diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index fda497412fc3..c1b71aef9f71 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -918,6 +918,7 @@ static int __net_init genl_pernet_init(struct net *net) struct netlink_kernel_cfg cfg = { .input = genl_rcv, .cb_mutex = &genl_mutex, + .flags = NL_CFG_F_NONROOT_RECV, }; /* we'll bump the group number right afterwards */ @@ -955,8 +956,6 @@ static int __init genl_init(void) if (err < 0) goto problem; - netlink_set_nonroot(NETLINK_GENERIC, NL_NONROOT_RECV); - err = register_pernet_subsys(&genl_pernet_ops); if (err) goto problem; diff --git a/security/selinux/netlink.c b/security/selinux/netlink.c index 8a77725423e0..0d2cd11f3c22 100644 --- a/security/selinux/netlink.c +++ b/security/selinux/netlink.c @@ -113,13 +113,13 @@ static int __init selnl_init(void) { struct netlink_kernel_cfg cfg = { .groups = SELNLGRP_MAX, + .flags = NL_CFG_F_NONROOT_RECV, }; selnl = netlink_kernel_create(&init_net, NETLINK_SELINUX, THIS_MODULE, &cfg); if (selnl == NULL) panic("SELinux: Cannot create netlink socket."); - netlink_set_nonroot(NETLINK_SELINUX, NL_NONROOT_RECV); return 0; } -- cgit v1.2.3 From 9f00d9776bc5beb92e8bfc884a7e96ddc5589e2e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 8 Sep 2012 02:53:54 +0000 Subject: netlink: hide struct module parameter in netlink_kernel_create This patch defines netlink_kernel_create as a wrapper function of __netlink_kernel_create to hide the struct module *me parameter (which seems to be THIS_MODULE in all existing netlink subsystems). Suggested by David S. Miller. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. 
Miller --- crypto/crypto_user.c | 3 +-- drivers/connector/connector.c | 3 +-- drivers/infiniband/core/netlink.c | 2 +- drivers/scsi/scsi_netlink.c | 2 +- drivers/scsi/scsi_transport_iscsi.c | 3 +-- drivers/staging/gdm72xx/netlink_k.c | 2 +- include/linux/netlink.h | 13 ++++++++++--- kernel/audit.c | 3 +-- lib/kobject_uevent.c | 3 +-- net/bridge/netfilter/ebt_ulog.c | 3 +-- net/core/rtnetlink.c | 2 +- net/core/sock_diag.c | 3 +-- net/decnet/netfilter/dn_rtmsg.c | 3 +-- net/ipv4/fib_frontend.c | 2 +- net/ipv4/netfilter/ipt_ULOG.c | 3 +-- net/netfilter/nfnetlink.c | 2 +- net/netlink/af_netlink.c | 8 +++----- net/netlink/genetlink.c | 3 +-- net/xfrm/xfrm_user.c | 2 +- security/selinux/netlink.c | 3 +-- 20 files changed, 31 insertions(+), 37 deletions(-) (limited to 'lib') diff --git a/crypto/crypto_user.c b/crypto/crypto_user.c index ba2c611154af..165914e63ef2 100644 --- a/crypto/crypto_user.c +++ b/crypto/crypto_user.c @@ -500,8 +500,7 @@ static int __init crypto_user_init(void) .input = crypto_netlink_rcv, }; - crypto_nlsk = netlink_kernel_create(&init_net, NETLINK_CRYPTO, - THIS_MODULE, &cfg); + crypto_nlsk = netlink_kernel_create(&init_net, NETLINK_CRYPTO, &cfg); if (!crypto_nlsk) return -ENOMEM; diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c index 82fa4f0f91d6..965b7811e04f 100644 --- a/drivers/connector/connector.c +++ b/drivers/connector/connector.c @@ -264,8 +264,7 @@ static int __devinit cn_init(void) .input = dev->input, }; - dev->nls = netlink_kernel_create(&init_net, NETLINK_CONNECTOR, - THIS_MODULE, &cfg); + dev->nls = netlink_kernel_create(&init_net, NETLINK_CONNECTOR, &cfg); if (!dev->nls) return -EIO; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 3ae2bfd31015..fe10a949aef9 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -177,7 +177,7 @@ int __init ibnl_init(void) .input = ibnl_rcv, }; - nls = netlink_kernel_create(&init_net, NETLINK_RDMA, THIS_MODULE, &cfg); + nls = netlink_kernel_create(&init_net, NETLINK_RDMA, &cfg); if (!nls) { pr_warn("Failed to create netlink socket\n"); return -ENOMEM; diff --git a/drivers/scsi/scsi_netlink.c b/drivers/scsi/scsi_netlink.c index 8818dd681c19..3252bc9625ee 100644 --- a/drivers/scsi/scsi_netlink.c +++ b/drivers/scsi/scsi_netlink.c @@ -501,7 +501,7 @@ scsi_netlink_init(void) } scsi_nl_sock = netlink_kernel_create(&init_net, NETLINK_SCSITRANSPORT, - THIS_MODULE, &cfg); + &cfg); if (!scsi_nl_sock) { printk(KERN_ERR "%s: register of receive handler failed\n", __func__); diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index fa1dfaa83e32..519bd5303f3f 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2969,8 +2969,7 @@ static __init int iscsi_transport_init(void) if (err) goto unregister_conn_class; - nls = netlink_kernel_create(&init_net, NETLINK_ISCSI, - THIS_MODULE, &cfg); + nls = netlink_kernel_create(&init_net, NETLINK_ISCSI, &cfg); if (!nls) { err = -ENOBUFS; goto unregister_session_class; diff --git a/drivers/staging/gdm72xx/netlink_k.c b/drivers/staging/gdm72xx/netlink_k.c index 3abb31df8f28..2109cab0a14c 100644 --- a/drivers/staging/gdm72xx/netlink_k.c +++ b/drivers/staging/gdm72xx/netlink_k.c @@ -95,7 +95,7 @@ struct sock *netlink_init(int unit, void (*cb)(struct net_device *dev, u16 type, init_MUTEX(&netlink_mutex); #endif - sock = netlink_kernel_create(&init_net, unit, THIS_MODULE, &cfg); + sock = netlink_kernel_create(&init_net, unit, &cfg); 
if (sock) rcv_cb = cb; diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 8719a4e235a5..cd17dda5a987 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -153,6 +153,7 @@ struct nlattr { #include #include +#include #include struct net; @@ -188,9 +189,15 @@ struct netlink_kernel_cfg { unsigned int flags; }; -extern struct sock *netlink_kernel_create(struct net *net, int unit, - struct module *module, - struct netlink_kernel_cfg *cfg); +extern struct sock *__netlink_kernel_create(struct net *net, int unit, + struct module *module, + struct netlink_kernel_cfg *cfg); +static inline struct sock * +netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) +{ + return __netlink_kernel_create(net, unit, THIS_MODULE, cfg); +} + extern void netlink_kernel_release(struct sock *sk); extern int __netlink_change_ngroups(struct sock *sk, unsigned int groups); extern int netlink_change_ngroups(struct sock *sk, unsigned int groups); diff --git a/kernel/audit.c b/kernel/audit.c index ea3b7b6191c7..a24aafa850ae 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -971,8 +971,7 @@ static int __init audit_init(void) printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); - audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, - THIS_MODULE, &cfg); + audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, &cfg); if (!audit_sock) audit_panic("cannot initialize netlink socket"); else diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index c2e97787d01e..52e5abbc41db 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -382,8 +382,7 @@ static int uevent_net_init(struct net *net) if (!ue_sk) return -ENOMEM; - ue_sk->sk = netlink_kernel_create(net, NETLINK_KOBJECT_UEVENT, - THIS_MODULE, &cfg); + ue_sk->sk = netlink_kernel_create(net, NETLINK_KOBJECT_UEVENT, &cfg); if (!ue_sk->sk) { printk(KERN_ERR "kobject_uevent: unable to create netlink socket!\n"); diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index 19063473c71f..3476ec469740 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -298,8 +298,7 @@ static int __init ebt_ulog_init(void) spin_lock_init(&ulog_buffers[i].lock); } - ebtulognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, - THIS_MODULE, &cfg); + ebtulognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, &cfg); if (!ebtulognl) ret = -ENOMEM; else if ((ret = xt_register_target(&ebt_ulog_tg_reg)) != 0) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a71806eb9cc6..508c5df4a09c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2384,7 +2384,7 @@ static int __net_init rtnetlink_net_init(struct net *net) .flags = NL_CFG_F_NONROOT_RECV, }; - sk = netlink_kernel_create(net, NETLINK_ROUTE, THIS_MODULE, &cfg); + sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg); if (!sk) return -ENOMEM; net->rtnl = sk; diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 9d8755e4a7a5..602cd637182e 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -172,8 +172,7 @@ static int __net_init diag_net_init(struct net *net) .input = sock_diag_rcv, }; - net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, - THIS_MODULE, &cfg); + net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, &cfg); return net->diag_nlsk == NULL ? 
-ENOMEM : 0; } diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index 11db0ecf342f..dfe42012a044 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -130,8 +130,7 @@ static int __init dn_rtmsg_init(void) .input = dnrmg_receive_user_skb, }; - dnrmg = netlink_kernel_create(&init_net, - NETLINK_DNRTMSG, THIS_MODULE, &cfg); + dnrmg = netlink_kernel_create(&init_net, NETLINK_DNRTMSG, &cfg); if (dnrmg == NULL) { printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket"); return -ENOMEM; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 8a7cd795a96e..dc1f10ed1872 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -986,7 +986,7 @@ static int __net_init nl_fib_lookup_init(struct net *net) .input = nl_fib_input, }; - sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, THIS_MODULE, &cfg); + sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg); if (sk == NULL) return -EAFNOSUPPORT; net->ipv4.fibnl = sk; diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 1109f7f6c254..b5ef3cba2250 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -396,8 +396,7 @@ static int __init ulog_tg_init(void) for (i = 0; i < ULOG_MAXNLGROUPS; i++) setup_timer(&ulog_buffers[i].timer, ulog_timer, i); - nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, - THIS_MODULE, &cfg); + nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, &cfg); if (!nflognl) return -ENOMEM; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index a26503342e71..ffb92c03a358 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -241,7 +241,7 @@ static int __net_init nfnetlink_net_init(struct net *net) #endif }; - nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, THIS_MODULE, &cfg); + nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg); if (!nfnl) return -ENOMEM; net->nfnl_stash = nfnl; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index b74540ce3c14..4d348e97e131 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1526,9 +1526,8 @@ static void netlink_data_ready(struct sock *sk, int len) */ struct sock * -netlink_kernel_create(struct net *net, int unit, - struct module *module, - struct netlink_kernel_cfg *cfg) +__netlink_kernel_create(struct net *net, int unit, struct module *module, + struct netlink_kernel_cfg *cfg) { struct socket *sock; struct sock *sk; @@ -1603,8 +1602,7 @@ out_sock_release_nosk: sock_release(sock); return NULL; } -EXPORT_SYMBOL(netlink_kernel_create); - +EXPORT_SYMBOL(__netlink_kernel_create); void netlink_kernel_release(struct sock *sk) diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index c1b71aef9f71..19288b7d6135 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -922,8 +922,7 @@ static int __net_init genl_pernet_init(struct net *net) }; /* we'll bump the group number right afterwards */ - net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, - THIS_MODULE, &cfg); + net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, &cfg); if (!net->genl_sock && net_eq(net, &init_net)) panic("GENL: Cannot initialize generic netlink\n"); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index ab58034c42d6..354070adb5ef 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2963,7 +2963,7 @@ static int __net_init xfrm_user_net_init(struct net *net) .input = xfrm_netlink_rcv, }; - nlsk = netlink_kernel_create(net, 
NETLINK_XFRM, THIS_MODULE, &cfg); + nlsk = netlink_kernel_create(net, NETLINK_XFRM, &cfg); if (nlsk == NULL) return -ENOMEM; net->xfrm.nlsk_stash = nlsk; /* Don't set to NULL */ diff --git a/security/selinux/netlink.c b/security/selinux/netlink.c index 0d2cd11f3c22..14d810ead420 100644 --- a/security/selinux/netlink.c +++ b/security/selinux/netlink.c @@ -116,8 +116,7 @@ static int __init selnl_init(void) .flags = NL_CFG_F_NONROOT_RECV, }; - selnl = netlink_kernel_create(&init_net, NETLINK_SELINUX, - THIS_MODULE, &cfg); + selnl = netlink_kernel_create(&init_net, NETLINK_SELINUX, &cfg); if (selnl == NULL) panic("SELinux: Cannot create netlink socket."); return 0; -- cgit v1.2.3 From b53d657d84f530e5d83f34ff1b81ceedad3faa31 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 7 Sep 2012 14:31:45 +0200 Subject: usb/core: use bin2bcd() for bcdDevice in RH The kernel's version number is used as decimal in the bcdDevice field of the RH descriptor. For kernel version v3.12 we would see 3.0c in lsusb. I am not sure how important it is to stick with bcd values since this is this way since we started git history and nobody complained (however back then we reported only 2.6). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd.c | 6 +++--- include/linux/bcd.h | 17 +++++++++++++++-- lib/bcd.c | 8 ++++---- 3 files changed, 22 insertions(+), 9 deletions(-) (limited to 'lib') diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index bc84106ac057..6e4fd28bc242 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -22,6 +22,7 @@ * Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include #include #include #include @@ -123,9 +124,8 @@ static inline int is_root_hub(struct usb_device *udev) */ /*-------------------------------------------------------------------------*/ - -#define KERNEL_REL ((LINUX_VERSION_CODE >> 16) & 0x0ff) -#define KERNEL_VER ((LINUX_VERSION_CODE >> 8) & 0x0ff) +#define KERNEL_REL bin2bcd(((LINUX_VERSION_CODE >> 16) & 0x0ff)) +#define KERNEL_VER bin2bcd(((LINUX_VERSION_CODE >> 8) & 0x0ff)) /* usb 3.0 root hub device descriptor */ static const u8 usb3_rh_dev_descriptor[18] = { diff --git a/include/linux/bcd.h b/include/linux/bcd.h index 22ea563ba3eb..18fff11fb3ea 100644 --- a/include/linux/bcd.h +++ b/include/linux/bcd.h @@ -3,7 +3,20 @@ #include -unsigned bcd2bin(unsigned char val) __attribute_const__; -unsigned char bin2bcd(unsigned val) __attribute_const__; +#define bcd2bin(x) \ + (__builtin_constant_p((u8 )(x)) ? \ + const_bcd2bin(x) : \ + _bcd2bin(x)) + +#define bin2bcd(x) \ + (__builtin_constant_p((u8 )(x)) ? 
\ + const_bin2bcd(x) : \ + _bin2bcd(x)) + +#define const_bcd2bin(x) (((x) & 0x0f) + ((x) >> 4) * 10) +#define const_bin2bcd(x) ((((x) / 10) << 4) + (x) % 10) + +unsigned _bcd2bin(unsigned char val) __attribute_const__; +unsigned char _bin2bcd(unsigned val) __attribute_const__; #endif /* _BCD_H */ diff --git a/lib/bcd.c b/lib/bcd.c index 55efaf742346..40d304efe272 100644 --- a/lib/bcd.c +++ b/lib/bcd.c @@ -1,14 +1,14 @@ #include #include -unsigned bcd2bin(unsigned char val) +unsigned _bcd2bin(unsigned char val) { return (val & 0x0f) + (val >> 4) * 10; } -EXPORT_SYMBOL(bcd2bin); +EXPORT_SYMBOL(_bcd2bin); -unsigned char bin2bcd(unsigned val) +unsigned char _bin2bcd(unsigned val) { return ((val / 10) << 4) + val % 10; } -EXPORT_SYMBOL(bin2bcd); +EXPORT_SYMBOL(_bin2bcd); -- cgit v1.2.3 From 2e484610296b25f0a04b516bc144a00731d1d845 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 27 Sep 2012 12:45:28 +0200 Subject: scatterlist: add sg_nents Useful helper to know the number of entries in scatterlist. Signed-off-by: Maxim Levitsky Cc: Alex Dubov Acked-by: Tejun Heo Cc: Jens Axboe Signed-off-by: Jens Axboe --- include/linux/scatterlist.h | 1 + lib/scatterlist.c | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'lib') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 7b600da9a635..4bd6c06eb28e 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -201,6 +201,7 @@ static inline void *sg_virt(struct scatterlist *sg) return page_address(sg_page(sg)) + sg->offset; } +int sg_nents(struct scatterlist *sg); struct scatterlist *sg_next(struct scatterlist *); struct scatterlist *sg_last(struct scatterlist *s, unsigned int); void sg_init_table(struct scatterlist *, unsigned int); diff --git a/lib/scatterlist.c b/lib/scatterlist.c index fadae774a20c..1bf60efb5e02 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -38,6 +38,28 @@ struct scatterlist *sg_next(struct scatterlist *sg) } EXPORT_SYMBOL(sg_next); +/** + * sg_nents - return total count of entries in scatterlist + * @sg: The scatterlist + * + * Description: + * Allows to know how many entries are in sg, taking into acount + * chaining as well + * + **/ +int sg_nents(struct scatterlist *sg) +{ + int nents = 0; + while (sg) { + nents++; + sg = sg_next(sg); + } + + return nents; +} +EXPORT_SYMBOL(sg_nents); + + /** * sg_last - return the last scatterlist entry in a list * @sgl: First entry in the scatterlist -- cgit v1.2.3 From 232f1b51062553b7cf49f99719fbd1b8a8d80f29 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Fri, 28 Sep 2012 10:38:15 +0200 Subject: scatterlist: refactor the sg_nents Replace 'while' with 'for' as suggested by Tejun Heo Signed-off-by: Maxim Levitsky Signed-off-by: Jens Axboe --- lib/scatterlist.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 1bf60efb5e02..8cd2ced68f36 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -49,12 +49,9 @@ EXPORT_SYMBOL(sg_next); **/ int sg_nents(struct scatterlist *sg) { - int nents = 0; - while (sg) { + int nents; + for (nents = 0; sg; sg = sg_next(sg)) nents++; - sg = sg_next(sg); - } - return nents; } EXPORT_SYMBOL(sg_nents); -- cgit v1.2.3 From 759643ce39f13c416236e2609ccaed9792107b2f Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 1 Oct 2012 12:48:31 -0600 Subject: dma-debug: Remove local BUS_NOTIFY_UNBOUND_DRIVER define Remove local BUS_NOTIFY_UNBOUND_DRIVER define. 
This is not used since BUS_NOTIFY_UNBOUND_DRIVER is defined in include/linux/device.h Signed-off-by: Shuah Khan Signed-off-by: Joerg Roedel --- lib/dma-debug.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'lib') diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 66ce41489133..b9087bff008b 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -120,11 +120,6 @@ static const char *type2name[4] = { "single", "page", static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE", "DMA_FROM_DEVICE", "DMA_NONE" }; -/* little merge helper - remove it after the merge window */ -#ifndef BUS_NOTIFY_UNBOUND_DRIVER -#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005 -#endif - /* * The access to some variables in this macro is racy. We can't use atomic_t * here because all these variables are exported to debugfs. Some of them even -- cgit v1.2.3 From 8f243af42adef5f589b8e39656284ca9c9374e44 Mon Sep 17 00:00:00 2001 From: Joe Mario Date: Thu, 4 Oct 2012 17:12:15 -0700 Subject: sections: fix const sections for crc32 table Fix the const sections for the code generated by crc32 table. There's no ro version of the cacheline aligned section, so we cannot put in const data without a conflict Just don't make the crc tables const for now. [ak@linux.intel.com: some fixes and new description] [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Joe Mario Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/crc32.c | 9 ++++++--- lib/gen_crc32table.c | 6 +++--- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/crc32.c b/lib/crc32.c index 61774b8db4de..072fbd8234d5 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -188,11 +188,13 @@ u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len) #else u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) { - return crc32_le_generic(crc, p, len, crc32table_le, CRCPOLY_LE); + return crc32_le_generic(crc, p, len, + (const u32 (*)[256])crc32table_le, CRCPOLY_LE); } u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len) { - return crc32_le_generic(crc, p, len, crc32ctable_le, CRC32C_POLY_LE); + return crc32_le_generic(crc, p, len, + (const u32 (*)[256])crc32ctable_le, CRC32C_POLY_LE); } #endif EXPORT_SYMBOL(crc32_le); @@ -253,7 +255,8 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) #else u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) { - return crc32_be_generic(crc, p, len, crc32table_be, CRCPOLY_BE); + return crc32_be_generic(crc, p, len, + (const u32 (*)[256])crc32table_be, CRCPOLY_BE); } #endif EXPORT_SYMBOL(crc32_be); diff --git a/lib/gen_crc32table.c b/lib/gen_crc32table.c index 8f8d5439e2d9..71fcfcd96410 100644 --- a/lib/gen_crc32table.c +++ b/lib/gen_crc32table.c @@ -109,7 +109,7 @@ int main(int argc, char** argv) if (CRC_LE_BITS > 1) { crc32init_le(); - printf("static const u32 __cacheline_aligned " + printf("static u32 __cacheline_aligned " "crc32table_le[%d][%d] = {", LE_TABLE_ROWS, LE_TABLE_SIZE); output_table(crc32table_le, LE_TABLE_ROWS, @@ -119,7 +119,7 @@ int main(int argc, char** argv) if (CRC_BE_BITS > 1) { crc32init_be(); - printf("static const u32 __cacheline_aligned " + printf("static u32 __cacheline_aligned " "crc32table_be[%d][%d] = {", BE_TABLE_ROWS, BE_TABLE_SIZE); output_table(crc32table_be, LE_TABLE_ROWS, @@ -128,7 +128,7 @@ int main(int argc, char** argv) } if (CRC_LE_BITS > 1) { crc32cinit_le(); - printf("static const u32 __cacheline_aligned " + printf("static u32 __cacheline_aligned " 
"crc32ctable_le[%d][%d] = {", LE_TABLE_ROWS, LE_TABLE_SIZE); output_table(crc32ctable_le, LE_TABLE_ROWS, -- cgit v1.2.3 From e49317d415f5a44bad8377a208d61902d752303e Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Thu, 4 Oct 2012 17:12:27 -0700 Subject: lib: vsprintf: optimize division by 10 for small integers Shrink the reciprocal approximations used in put_dec_full4() based on the comments in put_dec_full9(). Signed-off-by: George Spelvin Cc: Denys Vlasenko Cc: Michal Nazarewicz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/vsprintf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 0e337541f005..67e74cbefa90 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -243,13 +243,14 @@ char *put_dec(char *buf, unsigned long long n) /* Second algorithm: valid only for 64-bit long longs */ +/* See comment in put_dec_full9 for choice of constants */ static noinline_for_stack char *put_dec_full4(char *buf, unsigned q) { unsigned r; - r = (q * 0xcccd) >> 19; + r = (q * 0xccd) >> 15; *buf++ = (q - 10 * r) + '0'; - q = (r * 0x199a) >> 16; + q = (r * 0xcd) >> 11; *buf++ = (r - 10 * q) + '0'; r = (q * 0xcd) >> 11; *buf++ = (q - 10 * r) + '0'; -- cgit v1.2.3 From 2359172a75986359ce9cf041a9aca6a32cdf8779 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Thu, 4 Oct 2012 17:12:29 -0700 Subject: lib: vsprintf: optimize division by 10000 The same multiply-by-inverse technique can be used to convert division by 10000 to a 32x32->64-bit multiply. Signed-off-by: George Spelvin Cc: Denys Vlasenko Cc: Michal Nazarewicz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/vsprintf.c | 60 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 33 insertions(+), 27 deletions(-) (limited to 'lib') diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 67e74cbefa90..8cb7635b2ce3 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -245,17 +245,32 @@ char *put_dec(char *buf, unsigned long long n) /* See comment in put_dec_full9 for choice of constants */ static noinline_for_stack -char *put_dec_full4(char *buf, unsigned q) +void put_dec_full4(char *buf, unsigned q) { unsigned r; r = (q * 0xccd) >> 15; - *buf++ = (q - 10 * r) + '0'; + buf[0] = (q - 10 * r) + '0'; q = (r * 0xcd) >> 11; - *buf++ = (r - 10 * q) + '0'; + buf[1] = (r - 10 * q) + '0'; r = (q * 0xcd) >> 11; - *buf++ = (q - 10 * r) + '0'; - *buf++ = r + '0'; - return buf; + buf[2] = (q - 10 * r) + '0'; + buf[3] = r + '0'; +} + +/* + * Call put_dec_full4 on x % 10000, return x / 10000. + * The approximation x/10000 == (x * 0x346DC5D7) >> 43 + * holds for all x < 1,128,869,999. The largest value this + * helper will ever be asked to convert is 1,125,520,955. + * (d1 in the put_dec code, assuming n is all-ones). + */ +static +unsigned put_dec_helper4(char *buf, unsigned x) +{ + uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43; + + put_dec_full4(buf, x - q * 10000); + return q; } /* Based on code by Douglas W. 
Jones found at @@ -277,28 +292,19 @@ char *put_dec(char *buf, unsigned long long n) d3 = (h >> 16); /* implicit "& 0xffff" */ q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff); + q = put_dec_helper4(buf, q); + + q += 7671 * d3 + 9496 * d2 + 6 * d1; + q = put_dec_helper4(buf+4, q); + + q += 4749 * d3 + 42 * d2; + q = put_dec_helper4(buf+8, q); - buf = put_dec_full4(buf, q % 10000); - q = q / 10000; - - d1 = q + 7671 * d3 + 9496 * d2 + 6 * d1; - buf = put_dec_full4(buf, d1 % 10000); - q = d1 / 10000; - - d2 = q + 4749 * d3 + 42 * d2; - buf = put_dec_full4(buf, d2 % 10000); - q = d2 / 10000; - - d3 = q + 281 * d3; - if (!d3) - goto done; - buf = put_dec_full4(buf, d3 % 10000); - q = d3 / 10000; - if (!q) - goto done; - buf = put_dec_full4(buf, q); - done: - while (buf[-1] == '0') + q += 281 * d3; + buf += 12; + if (q) + buf = put_dec_trunc8(buf, q); + else while (buf[-1] == '0') --buf; return buf; -- cgit v1.2.3 From cb239d0a97d573150d6106a92c0641da0d03f6a1 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Thu, 4 Oct 2012 17:12:30 -0700 Subject: lib: vsprintf: optimize put_dec_trunc8() If you're going to have a conditional branch after each 32x32->64-bit multiply, might as well shrink the code and make it a loop. This also avoids using the long multiply for small integers. (This leaves the comments in a confusing state, but that's a separate patch to make review easier.) Signed-off-by: George Spelvin Cc: Denys Vlasenko Cc: Michal Nazarewicz Cc: Rabin Vincent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/vsprintf.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) (limited to 'lib') diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 8cb7635b2ce3..c2236f14640f 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -174,22 +174,12 @@ char *put_dec_trunc8(char *buf, unsigned r) unsigned q; /* Copy of previous function's body with added early returns */ - q = (r * (uint64_t)0x1999999a) >> 32; - *buf++ = (r - 10 * q) + '0'; /* 2 */ - if (q == 0) - return buf; - r = (q * (uint64_t)0x1999999a) >> 32; - *buf++ = (q - 10 * r) + '0'; /* 3 */ - if (r == 0) - return buf; - q = (r * (uint64_t)0x1999999a) >> 32; - *buf++ = (r - 10 * q) + '0'; /* 4 */ - if (q == 0) - return buf; - r = (q * (uint64_t)0x1999999a) >> 32; - *buf++ = (q - 10 * r) + '0'; /* 5 */ - if (r == 0) - return buf; + while (r >= 10000) { + q = r + '0'; + r = (r * (uint64_t)0x1999999a) >> 32; + *buf++ = q - 10*r; + } + q = (r * 0x199a) >> 16; *buf++ = (r - 10 * q) + '0'; /* 6 */ if (q == 0) -- cgit v1.2.3 From f40005165f7f0bda6cc268bdbcaad98a8f26fb1a Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Thu, 4 Oct 2012 17:12:32 -0700 Subject: lib: vsprintf: fix broken comments Numbering the 8 potential digits 2 though 9 never did make a lot of sense. 
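As a side note, the fixed-point reciprocal constants that these comments document can be verified exhaustively in userspace; a minimal standalone sketch, not part of the patch, with the ranges taken directly from the comments above:

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint32_t r;

		/* (r * 0x199a) >> 16 == r / 10 for r <= 9999 */
		for (r = 0; r <= 9999; r++)
			assert(((r * 0x199a) >> 16) == r / 10);
		/* (q * 0xcd) >> 11 == q / 10 for q <= 999 */
		for (r = 0; r <= 999; r++)
			assert(((r * 0xcd) >> 11) == r / 10);
		/* (x * 0x346DC5D7) >> 43 == x / 10000 for x < 1,128,869,999
		 * (exhaustive check; takes a few seconds) */
		for (r = 0; r < 1128869999; r++)
			assert((uint32_t)(((uint64_t)r * 0x346DC5D7) >> 43) == r / 10000);
		return 0;
	}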
Signed-off-by: George Spelvin Cc: Denys Vlasenko Cc: Michal Nazarewicz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/vsprintf.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'lib') diff --git a/lib/vsprintf.c b/lib/vsprintf.c index c2236f14640f..852f89f590a6 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -180,19 +180,19 @@ char *put_dec_trunc8(char *buf, unsigned r) *buf++ = q - 10*r; } - q = (r * 0x199a) >> 16; - *buf++ = (r - 10 * q) + '0'; /* 6 */ + q = (r * 0x199a) >> 16; /* r <= 9999 */ + *buf++ = (r - 10 * q) + '0'; if (q == 0) return buf; - r = (q * 0xcd) >> 11; - *buf++ = (q - 10 * r) + '0'; /* 7 */ + r = (q * 0xcd) >> 11; /* q <= 999 */ + *buf++ = (q - 10 * r) + '0'; if (r == 0) return buf; - q = (r * 0xcd) >> 11; - *buf++ = (r - 10 * q) + '0'; /* 8 */ + q = (r * 0xcd) >> 11; /* r <= 99 */ + *buf++ = (r - 10 * q) + '0'; if (q == 0) return buf; - *buf++ = q + '0'; /* 9 */ + *buf++ = q + '0'; /* q <= 9 */ return buf; } -- cgit v1.2.3 From 7c59154e7548429ff80384803577176466d2ab9a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 4 Oct 2012 17:12:33 -0700 Subject: lib/vsprintf: update documentation to cover all of %p[Mm][FR] Acked-by: Andrei Emeltchenko Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/printk-formats.txt | 1 + lib/vsprintf.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt index 7561d7ed8e11..8ffb274367c7 100644 --- a/Documentation/printk-formats.txt +++ b/Documentation/printk-formats.txt @@ -69,6 +69,7 @@ MAC/FDDI addresses: %pMR 05:04:03:02:01:00 %pMF 00-01-02-03-04-05 %pm 000102030405 + %pmR 050403020100 For printing 6-byte MAC/FDDI addresses in hex notation. 
The 'M' and 'm' specifiers result in a printed address with ('M') or without ('m') byte diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 852f89f590a6..9287e2549936 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -987,7 +987,7 @@ int kptr_restrict __read_mostly; * - 'm' For a 6-byte MAC address, it prints the hex address without colons * - 'MF' For a 6-byte MAC FDDI address, it prints the address * with a dash-separated hex notation - * - '[mM]R For a 6-byte MAC address, Reverse order (Bluetooth) + * - '[mM]R' For a 6-byte MAC address, Reverse order (Bluetooth) * - 'I' [46] for IPv4/IPv6 addresses printed in the usual way * IPv4 uses dot-separated decimal without leading 0's (1.2.3.4) * IPv6 uses colon separated network-order 16 bit hex with leading 0's @@ -1338,7 +1338,10 @@ qualifier: * %pR output the address range in a struct resource with decoded flags * %pr output the address range in a struct resource with raw flags * %pM output a 6-byte MAC address with colons + * %pMR output a 6-byte MAC address with colons in reversed order + * %pMF output a 6-byte MAC address with dashes * %pm output a 6-byte MAC address without colons + * %pmR output a 6-byte MAC address without colons in reversed order * %pI4 print an IPv4 address without leading zeros * %pi4 print an IPv4 address with leading zeros * %pI6 print an IPv6 address with colons -- cgit v1.2.3 From 125c4c706b680c7831f0966ff873c1ad0354ec25 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Thu, 4 Oct 2012 17:13:15 -0700 Subject: idr: rename MAX_LEVEL to MAX_IDR_LEVEL To avoid name conflicts: drivers/video/riva/fbdev.c:281:9: sparse: preprocessor token MAX_LEVEL redefined While at it, also make the other names more consistent and add parentheses. [akpm@linux-foundation.org: repair fallout] [sfr@canb.auug.org.au: IB/mlx4: fix for MAX_ID_MASK to MAX_IDR_MASK name change] Signed-off-by: Fengguang Wu Cc: Bernd Petrovitsch Cc: walter harms Cc: Glauber Costa Signed-off-by: Stephen Rothwell Cc: Roland Dreier Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/i2c/i2c-core.c | 2 +- drivers/infiniband/core/cm.c | 2 +- drivers/infiniband/hw/mlx4/cm.c | 2 +- drivers/pps/pps.c | 2 +- drivers/thermal/thermal_sys.c | 2 +- fs/super.c | 2 +- include/linux/idr.h | 10 +++++----- lib/idr.c | 32 ++++++++++++++++---------------- 8 files changed, 27 insertions(+), 27 deletions(-) (limited to 'lib') diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index 2091ae8f539a..2421d95130d4 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -982,7 +982,7 @@ int i2c_add_numbered_adapter(struct i2c_adapter *adap) if (adap->nr == -1) /* -1 means dynamically assign bus id */ return i2c_add_adapter(adap); - if (adap->nr & ~MAX_ID_MASK) + if (adap->nr & ~MAX_IDR_MASK) return -EINVAL; retry: diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index d67999f6e34a..394fea2ba1bc 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -390,7 +390,7 @@ static int cm_alloc_id(struct cm_id_private *cm_id_priv) ret = idr_get_new_above(&cm.local_id_table, cm_id_priv, next_id, &id); if (!ret) - next_id = ((unsigned) id + 1) & MAX_ID_MASK; + next_id = ((unsigned) id + 1) & MAX_IDR_MASK; spin_unlock_irqrestore(&cm.lock, flags); } while( (ret == -EAGAIN) && idr_pre_get(&cm.local_id_table, GFP_KERNEL) ); diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c index e25e4dafb8a8..80079e5a2e30 100644 --- a/drivers/infiniband/hw/mlx4/cm.c +++ b/drivers/infiniband/hw/mlx4/cm.c 
@@ -225,7 +225,7 @@ id_map_alloc(struct ib_device *ibdev, int slave_id, u32 sl_cm_id) ret = idr_get_new_above(&sriov->pv_id_table, ent, next_id, &id); if (!ret) { - next_id = ((unsigned) id + 1) & MAX_ID_MASK; + next_id = ((unsigned) id + 1) & MAX_IDR_MASK; ent->pv_cm_id = (u32)id; sl_id_map_add(ibdev, ent); } diff --git a/drivers/pps/pps.c b/drivers/pps/pps.c index e771487132f7..2420d5af0583 100644 --- a/drivers/pps/pps.c +++ b/drivers/pps/pps.c @@ -306,7 +306,7 @@ int pps_register_cdev(struct pps_device *pps) if (err < 0) return err; - pps->id &= MAX_ID_MASK; + pps->id &= MAX_IDR_MASK; if (pps->id >= PPS_MAX_SOURCES) { pr_err("%s: too many PPS sources in the system\n", pps->info.name); diff --git a/drivers/thermal/thermal_sys.c b/drivers/thermal/thermal_sys.c index 67789b8345d2..efd81bb25e01 100644 --- a/drivers/thermal/thermal_sys.c +++ b/drivers/thermal/thermal_sys.c @@ -78,7 +78,7 @@ again: else if (unlikely(err)) return err; - *id = *id & MAX_ID_MASK; + *id = *id & MAX_IDR_MASK; return 0; } diff --git a/fs/super.c b/fs/super.c index 5fdf7ff32c4e..a3bc935069d9 100644 --- a/fs/super.c +++ b/fs/super.c @@ -865,7 +865,7 @@ int get_anon_bdev(dev_t *p) else if (error) return -EAGAIN; - if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) { + if ((dev & MAX_IDR_MASK) == (1 << MINORBITS)) { spin_lock(&unnamed_dev_lock); ida_remove(&unnamed_dev_ida, dev); if (unnamed_dev_start > dev) diff --git a/include/linux/idr.h b/include/linux/idr.h index 255491cf522e..87259a44c251 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -38,15 +38,15 @@ #define IDR_SIZE (1 << IDR_BITS) #define IDR_MASK ((1 << IDR_BITS)-1) -#define MAX_ID_SHIFT (sizeof(int)*8 - 1) -#define MAX_ID_BIT (1U << MAX_ID_SHIFT) -#define MAX_ID_MASK (MAX_ID_BIT - 1) +#define MAX_IDR_SHIFT (sizeof(int)*8 - 1) +#define MAX_IDR_BIT (1U << MAX_IDR_SHIFT) +#define MAX_IDR_MASK (MAX_IDR_BIT - 1) /* Leave the possibility of an incomplete final layer */ -#define MAX_LEVEL (MAX_ID_SHIFT + IDR_BITS - 1) / IDR_BITS +#define MAX_IDR_LEVEL ((MAX_IDR_SHIFT + IDR_BITS - 1) / IDR_BITS) /* Number of id_layer structs to leave in free list */ -#define IDR_FREE_MAX MAX_LEVEL + MAX_LEVEL +#define MAX_IDR_FREE (MAX_IDR_LEVEL * 2) struct idr_layer { unsigned long bitmap; /* A zero bit means "space here" */ diff --git a/lib/idr.c b/lib/idr.c index 4046e29c0a99..648239079dd2 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -20,7 +20,7 @@ * that id to this code and it returns your pointer. * You can release ids at any time. When all ids are released, most of - * the memory is returned (we keep IDR_FREE_MAX) in a local pool so we + * the memory is returned (we keep MAX_IDR_FREE) in a local pool so we * don't need to go to the memory "store" during an id allocate, just * so you don't need to be too concerned about locking and conflicts * with the slab allocator. 
@@ -122,7 +122,7 @@ static void idr_mark_full(struct idr_layer **pa, int id) */ int idr_pre_get(struct idr *idp, gfp_t gfp_mask) { - while (idp->id_free_cnt < IDR_FREE_MAX) { + while (idp->id_free_cnt < MAX_IDR_FREE) { struct idr_layer *new; new = kmem_cache_zalloc(idr_layer_cache, gfp_mask); if (new == NULL) @@ -179,7 +179,7 @@ static int sub_alloc(struct idr *idp, int *starting_id, struct idr_layer **pa) sh = IDR_BITS*l; id = ((id >> sh) ^ n ^ m) << sh; } - if ((id >= MAX_ID_BIT) || (id < 0)) + if ((id >= MAX_IDR_BIT) || (id < 0)) return IDR_NOMORE_SPACE; if (l == 0) break; @@ -223,7 +223,7 @@ build_up: * Add a new layer to the top of the tree if the requested * id is larger than the currently allocated space. */ - while ((layers < (MAX_LEVEL - 1)) && (id >= (1 << (layers*IDR_BITS)))) { + while ((layers < (MAX_IDR_LEVEL - 1)) && (id >= (1 << (layers*IDR_BITS)))) { layers++; if (!p->count) { /* special case: if the tree is currently empty, @@ -265,7 +265,7 @@ build_up: static int idr_get_new_above_int(struct idr *idp, void *ptr, int starting_id) { - struct idr_layer *pa[MAX_LEVEL]; + struct idr_layer *pa[MAX_IDR_LEVEL]; int id; id = idr_get_empty_slot(idp, starting_id, pa); @@ -357,7 +357,7 @@ static void idr_remove_warning(int id) static void sub_remove(struct idr *idp, int shift, int id) { struct idr_layer *p = idp->top; - struct idr_layer **pa[MAX_LEVEL]; + struct idr_layer **pa[MAX_IDR_LEVEL]; struct idr_layer ***paa = &pa[0]; struct idr_layer *to_free; int n; @@ -402,7 +402,7 @@ void idr_remove(struct idr *idp, int id) struct idr_layer *to_free; /* Mask off upper bits we don't use for the search. */ - id &= MAX_ID_MASK; + id &= MAX_IDR_MASK; sub_remove(idp, (idp->layers - 1) * IDR_BITS, id); if (idp->top && idp->top->count == 1 && (idp->layers > 1) && @@ -420,7 +420,7 @@ void idr_remove(struct idr *idp, int id) to_free->bitmap = to_free->count = 0; free_layer(to_free); } - while (idp->id_free_cnt >= IDR_FREE_MAX) { + while (idp->id_free_cnt >= MAX_IDR_FREE) { p = get_from_free_list(idp); /* * Note: we don't call the rcu callback here, since the only @@ -451,7 +451,7 @@ void idr_remove_all(struct idr *idp) int n, id, max; int bt_mask; struct idr_layer *p; - struct idr_layer *pa[MAX_LEVEL]; + struct idr_layer *pa[MAX_IDR_LEVEL]; struct idr_layer **paa = &pa[0]; n = idp->layers * IDR_BITS; @@ -517,7 +517,7 @@ void *idr_find(struct idr *idp, int id) n = (p->layer+1) * IDR_BITS; /* Mask off upper bits we don't use for the search. 
*/ - id &= MAX_ID_MASK; + id &= MAX_IDR_MASK; if (id >= (1 << n)) return NULL; @@ -555,7 +555,7 @@ int idr_for_each(struct idr *idp, { int n, id, max, error = 0; struct idr_layer *p; - struct idr_layer *pa[MAX_LEVEL]; + struct idr_layer *pa[MAX_IDR_LEVEL]; struct idr_layer **paa = &pa[0]; n = idp->layers * IDR_BITS; @@ -601,7 +601,7 @@ EXPORT_SYMBOL(idr_for_each); */ void *idr_get_next(struct idr *idp, int *nextidp) { - struct idr_layer *p, *pa[MAX_LEVEL]; + struct idr_layer *p, *pa[MAX_IDR_LEVEL]; struct idr_layer **paa = &pa[0]; int id = *nextidp; int n, max; @@ -659,7 +659,7 @@ void *idr_replace(struct idr *idp, void *ptr, int id) n = (p->layer+1) * IDR_BITS; - id &= MAX_ID_MASK; + id &= MAX_IDR_MASK; if (id >= (1 << n)) return ERR_PTR(-EINVAL); @@ -780,7 +780,7 @@ EXPORT_SYMBOL(ida_pre_get); */ int ida_get_new_above(struct ida *ida, int starting_id, int *p_id) { - struct idr_layer *pa[MAX_LEVEL]; + struct idr_layer *pa[MAX_IDR_LEVEL]; struct ida_bitmap *bitmap; unsigned long flags; int idr_id = starting_id / IDA_BITMAP_BITS; @@ -793,7 +793,7 @@ int ida_get_new_above(struct ida *ida, int starting_id, int *p_id) if (t < 0) return _idr_rc_to_errno(t); - if (t * IDA_BITMAP_BITS >= MAX_ID_BIT) + if (t * IDA_BITMAP_BITS >= MAX_IDR_BIT) return -ENOSPC; if (t != idr_id) @@ -827,7 +827,7 @@ int ida_get_new_above(struct ida *ida, int starting_id, int *p_id) } id = idr_id * IDA_BITMAP_BITS + t; - if (id >= MAX_ID_BIT) + if (id >= MAX_IDR_BIT) return -ENOSPC; __set_bit(t, bitmap->bitmap); -- cgit v1.2.3 From 77dd3b0bd17a0849b2f98b915ce3fc9247db1013 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 4 Oct 2012 17:13:16 -0700 Subject: lib/parser.c: avoid overflow in match_number() The result of converting an integer value to another signed integer type that's unable to represent the original value is implementation defined. (See notes in section 6.3.1.3 of the C standard.) In match_number(), the result of simple_strtol() (which returns type long) is assigned to a value of type int. Instead, handle the result of simple_strtol() in a well-defined way, and return -ERANGE if the result won't fit in the int variable used to hold the parsed result. No current callers pay attention to the particular error value returned, so this additional return code shouldn't do any harm. [akpm@linux-foundation.org: coding-style tweaks] Signed-off-by: Alex Elder Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/parser.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/parser.c b/lib/parser.c index c43410084838..52cfa69f73df 100644 --- a/lib/parser.c +++ b/lib/parser.c @@ -122,13 +122,14 @@ int match_token(char *s, const match_table_t table, substring_t args[]) * * Description: Given a &substring_t and a base, attempts to parse the substring * as a number in that base. On success, sets @result to the integer represented - * by the string and returns 0. Returns either -ENOMEM or -EINVAL on failure. + * by the string and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure. 
*/ static int match_number(substring_t *s, int *result, int base) { char *endp; char *buf; int ret; + long val; size_t len = s->to - s->from; buf = kmalloc(len + 1, GFP_KERNEL); @@ -136,10 +137,15 @@ static int match_number(substring_t *s, int *result, int base) return -ENOMEM; memcpy(buf, s->from, len); buf[len] = '\0'; - *result = simple_strtol(buf, &endp, base); + ret = 0; + val = simple_strtol(buf, &endp, base); if (endp == buf) ret = -EINVAL; + else if (val < (long)INT_MIN || val > (long)INT_MAX) + ret = -ERANGE; + else + *result = (int) val; kfree(buf); return ret; } -- cgit v1.2.3 From 8f1f66ed7e1bdb7c88bb0bc45ac78cd075430d78 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 4 Oct 2012 17:13:17 -0700 Subject: lib/Kconfig.debug: adjust hard-lockup related Kconfig options The main option should not appear in the resulting .config when the dependencies aren't met (i.e. use "depends on" rather than directly setting the default from the combined dependency values). The sub-options should depend on the main option rather than a more generic higher level one. Signed-off-by: Jan Beulich Acked-by: Don Zickus Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 35c4565ee8fa..7fba3a98967f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -196,12 +196,13 @@ config LOCKUP_DETECTOR thresholds can be controlled through the sysctl watchdog_thresh. config HARDLOCKUP_DETECTOR - def_bool LOCKUP_DETECTOR && PERF_EVENTS && HAVE_PERF_EVENTS_NMI && \ - !HAVE_NMI_WATCHDOG + def_bool y + depends on LOCKUP_DETECTOR && !HAVE_NMI_WATCHDOG + depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI config BOOTPARAM_HARDLOCKUP_PANIC bool "Panic (Reboot) On Hard Lockups" - depends on LOCKUP_DETECTOR + depends on HARDLOCKUP_DETECTOR help Say Y here to enable the kernel to panic on "hard lockups", which are bugs that cause the kernel to loop in kernel @@ -212,7 +213,7 @@ config BOOTPARAM_HARDLOCKUP_PANIC config BOOTPARAM_HARDLOCKUP_PANIC_VALUE int - depends on LOCKUP_DETECTOR + depends on HARDLOCKUP_DETECTOR range 0 1 default 0 if !BOOTPARAM_HARDLOCKUP_PANIC default 1 if BOOTPARAM_HARDLOCKUP_PANIC -- cgit v1.2.3 From e96875677fb2b7cb739c5d7769824dff7260d31d Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 4 Oct 2012 17:13:18 -0700 Subject: lib/gcd.c: prevent possible div by 0 Account for all properties when a and/or b are 0: gcd(0, 0) = 0 gcd(a, 0) = a gcd(0, b) = b Fixes no known problems in current kernels. Signed-off-by: Davidlohr Bueso Cc: Eric Dumazet Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/gcd.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'lib') diff --git a/lib/gcd.c b/lib/gcd.c index cce4f3cd14b3..3657f129d7b8 100644 --- a/lib/gcd.c +++ b/lib/gcd.c @@ -9,6 +9,9 @@ unsigned long gcd(unsigned long a, unsigned long b) if (a < b) swap(a, b); + + if (!b) + return a; while ((r = a % b) != 0) { a = b; b = r; -- cgit v1.2.3 From ca279cf1065fb689abea1dc7d8c11787729bb185 Mon Sep 17 00:00:00 2001 From: Benjamin Gaignard Date: Thu, 4 Oct 2012 17:13:20 -0700 Subject: genalloc: make it possible to use a custom allocation algorithm Premit use of another algorithm than the default first-fit one. For example a custom algorithm could be used to manage alignment requirements. 
As I can't predict all the possible requirements/needs for all allocation
use cases, I add a "free" field 'void *data' to pass any needed information
to the allocation function.  For example 'data' could be used to handle a
structure where you store the alignment, the expected memory bank, the
requester device, or any information that could influence the allocation
algorithm.

A usage example may look like this:

	struct my_pool_constraints {
		int align;
		int bank;
		...
	};

	unsigned long my_custom_algo(unsigned long *map, unsigned long size,
			unsigned long start, unsigned int nr, void *data)
	{
		struct my_pool_constraints *constraints = data;
		...
		deal with allocation constraints
		...
		return the index in bitmap where to perform the allocation
	}

	void create_my_pool()
	{
		struct my_pool_constraints c;
		struct gen_pool *pool = gen_pool_create(...);
		gen_pool_add(pool, ...);
		gen_pool_set_algo(pool, my_custom_algo, &c);
	}

Add a best-fit algorithm function: most of the time best-fit is slower
than first-fit, but memory fragmentation is lower.  The random buffer
allocation/free tests don't show any arithmetic relation between the
allocation time and fragmentation, but the best-fit algorithm is sometimes
able to perform the allocation when first-fit can't.

This new algorithm helps remove static allocations on ESRAM, a small but
fast on-chip RAM of a few KB, used for high-performance use cases like DMA
linked lists, graphic accelerators, encoders/decoders.  On the Ux500 (in
the ARM tree) we define 5 ESRAM banks of 128 KB each, and the use of static
allocations becomes unmaintainable:

	cd arch/arm/mach-ux500 && grep -r ESRAM .
	./include/mach/db8500-regs.h:/* Base address and bank offsets for ESRAM */
	./include/mach/db8500-regs.h:#define U8500_ESRAM_BASE 0x40000000
	./include/mach/db8500-regs.h:#define U8500_ESRAM_BANK_SIZE 0x00020000
	./include/mach/db8500-regs.h:#define U8500_ESRAM_BANK0 U8500_ESRAM_BASE
	./include/mach/db8500-regs.h:#define U8500_ESRAM_BANK1 (U8500_ESRAM_BASE + U8500_ESRAM_BANK_SIZE)
	./include/mach/db8500-regs.h:#define U8500_ESRAM_BANK2 (U8500_ESRAM_BANK1 + U8500_ESRAM_BANK_SIZE)
	./include/mach/db8500-regs.h:#define U8500_ESRAM_BANK3 (U8500_ESRAM_BANK2 + U8500_ESRAM_BANK_SIZE)
	./include/mach/db8500-regs.h:#define U8500_ESRAM_BANK4 (U8500_ESRAM_BANK3 + U8500_ESRAM_BANK_SIZE)
	./include/mach/db8500-regs.h:#define U8500_ESRAM_DMA_LCPA_OFFSET 0x10000
	./include/mach/db8500-regs.h:#define U8500_DMA_LCPA_BASE (U8500_ESRAM_BANK0 + U8500_ESRAM_DMA_LCPA_OFFSET)
	./include/mach/db8500-regs.h:#define U8500_DMA_LCLA_BASE U8500_ESRAM_BANK4

I want to use genalloc to do dynamic allocations, but I need to be able to
fine-tune the allocation algorithm.  In my case the best-fit algorithm gives
better results than first-fit, but it will not be true for every use case.
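For the common case of simply selecting the new built-in best-fit helper, no
custom callback is needed at all; an illustrative sketch (the pool parameters
and the esram_virt/esram_size names are placeholders, not from this patch):

	struct gen_pool *pool = gen_pool_create(3, -1);	/* 8-byte min alloc order, any node */

	gen_pool_add(pool, (unsigned long)esram_virt, esram_size, -1);
	gen_pool_set_algo(pool, gen_pool_best_fit, NULL);	/* best-fit ignores @data */

	unsigned long addr = gen_pool_alloc(pool, 256);	/* now allocated best-fit */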
Signed-off-by: Benjamin Gaignard Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genalloc.h | 27 +++++++++++++++ lib/genalloc.c | 88 +++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 111 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 5e98eeb2af3b..dd7c569aacad 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -29,6 +29,20 @@ #ifndef __GENALLOC_H__ #define __GENALLOC_H__ +/** + * Allocation callback function type definition + * @map: Pointer to bitmap + * @size: The bitmap size in bits + * @start: The bitnumber to start searching at + * @nr: The number of zeroed bits we're looking for + * @data: optional additional data used by @genpool_algo_t + */ +typedef unsigned long (*genpool_algo_t)(unsigned long *map, + unsigned long size, + unsigned long start, + unsigned int nr, + void *data); + /* * General purpose special memory pool descriptor. */ @@ -36,6 +50,9 @@ struct gen_pool { spinlock_t lock; struct list_head chunks; /* list of chunks in this pool */ int min_alloc_order; /* minimum allocation order */ + + genpool_algo_t algo; /* allocation function */ + void *data; }; /* @@ -78,4 +95,14 @@ extern void gen_pool_for_each_chunk(struct gen_pool *, void (*)(struct gen_pool *, struct gen_pool_chunk *, void *), void *); extern size_t gen_pool_avail(struct gen_pool *); extern size_t gen_pool_size(struct gen_pool *); + +extern void gen_pool_set_algo(struct gen_pool *pool, genpool_algo_t algo, + void *data); + +extern unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size, + unsigned long start, unsigned int nr, void *data); + +extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, + unsigned long start, unsigned int nr, void *data); + #endif /* __GENALLOC_H__ */ diff --git a/lib/genalloc.c b/lib/genalloc.c index 6bc04aab6ec7..ca208a92628c 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -152,6 +152,8 @@ struct gen_pool *gen_pool_create(int min_alloc_order, int nid) spin_lock_init(&pool->lock); INIT_LIST_HEAD(&pool->chunks); pool->min_alloc_order = min_alloc_order; + pool->algo = gen_pool_first_fit; + pool->data = NULL; } return pool; } @@ -255,8 +257,9 @@ EXPORT_SYMBOL(gen_pool_destroy); * @size: number of bytes to allocate from the pool * * Allocate the requested number of bytes from the specified pool. - * Uses a first-fit algorithm. Can not be used in NMI handler on - * architectures without NMI-safe cmpxchg implementation. + * Uses the pool allocation function (with first-fit algorithm by default). + * Can not be used in NMI handler on architectures without + * NMI-safe cmpxchg implementation. 
*/ unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size) { @@ -280,8 +283,8 @@ unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size) end_bit = (chunk->end_addr - chunk->start_addr) >> order; retry: - start_bit = bitmap_find_next_zero_area(chunk->bits, end_bit, - start_bit, nbits, 0); + start_bit = pool->algo(chunk->bits, end_bit, start_bit, nbits, + pool->data); if (start_bit >= end_bit) continue; remain = bitmap_set_ll(chunk->bits, start_bit, nbits); @@ -400,3 +403,80 @@ size_t gen_pool_size(struct gen_pool *pool) return size; } EXPORT_SYMBOL_GPL(gen_pool_size); + +/** + * gen_pool_set_algo - set the allocation algorithm + * @pool: pool to change allocation algorithm + * @algo: custom algorithm function + * @data: additional data used by @algo + * + * Call @algo for each memory allocation in the pool. + * If @algo is NULL use gen_pool_first_fit as default + * memory allocation function. + */ +void gen_pool_set_algo(struct gen_pool *pool, genpool_algo_t algo, void *data) +{ + rcu_read_lock(); + + pool->algo = algo; + if (!pool->algo) + pool->algo = gen_pool_first_fit; + + pool->data = data; + + rcu_read_unlock(); +} +EXPORT_SYMBOL(gen_pool_set_algo); + +/** + * gen_pool_first_fit - find the first available region + * of memory matching the size requirement (no alignment constraint) + * @map: The address to base the search on + * @size: The bitmap size in bits + * @start: The bitnumber to start searching at + * @nr: The number of zeroed bits we're looking for + * @data: additional data - unused + */ +unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size, + unsigned long start, unsigned int nr, void *data) +{ + return bitmap_find_next_zero_area(map, size, start, nr, 0); +} +EXPORT_SYMBOL(gen_pool_first_fit); + +/** + * gen_pool_best_fit - find the best fitting region of memory + * macthing the size requirement (no alignment constraint) + * @map: The address to base the search on + * @size: The bitmap size in bits + * @start: The bitnumber to start searching at + * @nr: The number of zeroed bits we're looking for + * @data: additional data - unused + * + * Iterate over the bitmap to find the smallest free region + * which we can allocate the memory. + */ +unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, + unsigned long start, unsigned int nr, void *data) +{ + unsigned long start_bit = size; + unsigned long len = size + 1; + unsigned long index; + + index = bitmap_find_next_zero_area(map, size, start, nr, 0); + + while (index < size) { + int next_bit = find_next_bit(map, size, index + nr); + if ((next_bit - index) < len) { + len = next_bit - index; + start_bit = index; + if (len == nr) + return start_bit; + } + index = bitmap_find_next_zero_area(map, size, + next_bit + 1, nr, 0); + } + + return start_bit; +} +EXPORT_SYMBOL(gen_pool_best_fit); -- cgit v1.2.3 From 214f766ea0909e743122966c4617b3a112e405d7 Mon Sep 17 00:00:00 2001 From: Vikram Mulukutla Date: Thu, 4 Oct 2012 17:13:22 -0700 Subject: lib/spinlock_debug: avoid livelock in do_raw_spin_lock() The logic in do_raw_spin_lock() attempts to acquire a spinlock by invoking arch_spin_trylock() in a loop with a delay between each attempt. Now consider the following situation in a 2 CPU system: 1. CPU-0 continually acquires and releases a spinlock in a tight loop; it stays in this loop until some condition X is satisfied. X can only be satisfied by another CPU. 2. CPU-1 tries to acquire the same spinlock, in an attempt to satisfy the aforementioned condition X. 
However, it never sees the unlocked value of the lock because the debug spinlock code uses trylock instead of just lock; it checks at all the wrong moments - whenever CPU-0 has locked the lock. Now in the absence of debug spinlocks, the architecture specific spinlock code can correctly allow CPU-1 to wait in a "queue" (e.g., ticket spinlocks), ensuring that it acquires the lock at some point. However, with the debug spinlock code, livelock can easily occur due to the use of try_lock, which obviously cannot put the CPU in that "queue". This queueing mechanism is implemented in both x86 and ARM spinlock code. Note that the situation mentioned above is not hypothetical. A real problem was encountered where CPU-0 was running hrtimer_cancel with interrupts disabled, and CPU-1 was attempting to run the hrtimer that CPU-0 was trying to cancel. Address this by actually attempting arch_spin_lock once it is suspected that there is a spinlock lockup. If we're in a situation that is described above, the arch_spin_lock should succeed; otherwise other timeout mechanisms (e.g., watchdog) should alert the system of a lockup. Therefore, if there is a genuine system problem and the spinlock can't be acquired, the end result (irrespective of this change being present) is the same. If there is a livelock caused by the debug code, this change will allow the lock to be acquired, depending on the implementation of the lower level arch specific spinlock code. [akpm@linux-foundation.org: tweak comment] Signed-off-by: Vikram Mulukutla Cc: Thomas Gleixner Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/spinlock_debug.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'lib') diff --git a/lib/spinlock_debug.c b/lib/spinlock_debug.c index eb10578ae055..0374a596cffa 100644 --- a/lib/spinlock_debug.c +++ b/lib/spinlock_debug.c @@ -107,23 +107,27 @@ static void __spin_lock_debug(raw_spinlock_t *lock) { u64 i; u64 loops = loops_per_jiffy * HZ; - int print_once = 1; - for (;;) { - for (i = 0; i < loops; i++) { - if (arch_spin_trylock(&lock->raw_lock)) - return; - __delay(1); - } - /* lockup suspected: */ - if (print_once) { - print_once = 0; - spin_dump(lock, "lockup suspected"); + for (i = 0; i < loops; i++) { + if (arch_spin_trylock(&lock->raw_lock)) + return; + __delay(1); + } + /* lockup suspected: */ + spin_dump(lock, "lockup suspected"); #ifdef CONFIG_SMP - trigger_all_cpu_backtrace(); + trigger_all_cpu_backtrace(); #endif - } - } + + /* + * The trylock above was causing a livelock. Give the lower level arch + * specific lock code a chance to acquire the lock. We have already + * printed a warning/backtrace at this point. The non-debug arch + * specific code might actually succeed in acquiring the lock. If it is + * not successful, the end-result is the same - there is no forward + * progress. 
+ */ + arch_spin_lock(&lock->raw_lock); } void do_raw_spin_lock(raw_spinlock_t *lock) -- cgit v1.2.3 From da99075c1d368315e1508b6143226c0d27b621e0 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 4 Oct 2012 17:13:24 -0700 Subject: lib/vsprintf.c: improve standard conformance of sscanf() Xen's pciback points out a couple of deficiencies with vsscanf()'s standard conformance: - Trailing character matching cannot be checked by the caller: With a format string of "(%x:%x.%x) %n" absence of the closing parenthesis cannot be checked, as input of "(00:00.0)" doesn't cause the %n to be evaluated (because of the code not skipping white space before the trailing %n). - The parameter corresponding to a trailing %n could get filled even if there was a matching error: With a format string of "(%x:%x.%x)%n", input of "(00:00.0]" would still fill the respective variable pointed to (and hence again make the mismatch non-detectable by the caller). This patch aims at fixing those, but leaves other non-conforming aspects of it untouched, among them these possibly relevant ones: - improper handling of the assignment suppression character '*' (blindly discarding all succeeding non-white space from the format and input strings), - not honoring conversion specifiers for %n, - not recognizing the C99 conversion specifier 't' (recognized by vsprintf()). Signed-off-by: Jan Beulich Cc: Konrad Rzeszutek Wilk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/vsprintf.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) (limited to 'lib') diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 9287e2549936..39c99fea7c03 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -2017,7 +2017,7 @@ int vsscanf(const char *buf, const char *fmt, va_list args) s16 field_width; bool is_sign; - while (*fmt && *str) { + while (*fmt) { /* skip any white space in format */ /* white space in format matchs any amount of * white space, including none, in the input. @@ -2042,6 +2042,8 @@ int vsscanf(const char *buf, const char *fmt, va_list args) * advance both strings to next white space */ if (*fmt == '*') { + if (!*str) + break; while (!isspace(*fmt) && *fmt != '%' && *fmt) fmt++; while (!isspace(*str) && *str) @@ -2070,7 +2072,17 @@ int vsscanf(const char *buf, const char *fmt, va_list args) } } - if (!*fmt || !*str) + if (!*fmt) + break; + + if (*fmt == 'n') { + /* return number of characters read so far */ + *va_arg(args, int *) = str - buf; + ++fmt; + continue; + } + + if (!*str) break; base = 10; @@ -2103,13 +2115,6 @@ int vsscanf(const char *buf, const char *fmt, va_list args) num++; } continue; - case 'n': - /* return number of characters read so far */ - { - int *i = (int *)va_arg(args, int*); - *i = str - buf; - } - continue; case 'o': base = 8; break; @@ -2210,16 +2215,6 @@ int vsscanf(const char *buf, const char *fmt, va_list args) str = next; } - /* - * Now we've come all the way through so either the input string or the - * format ended. In the former case, there can be a %n at the current - * position in the format that needs to be filled. 
- */
-	if (*fmt == '%' && *(fmt + 1) == 'n') {
-		int *p = (int *)va_arg(args, int *);
-		*p = str - buf;
-	}
-
 	return num;
 }
 EXPORT_SYMBOL(vsscanf);
-- 
cgit v1.2.3


From 17d7aac9a55b164874f3d4b7b4f5af87ab457b84 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Thu, 4 Oct 2012 17:13:27 -0700
Subject: lib/plist.c: make plist test announcements KERN_DEBUG

They show up in dmesg

[    4.041094] start plist test
[    4.045804] end plist test

without a lot of meaning so hide them behind debug loglevel.

Signed-off-by: Borislav Petkov
Cc: Lai Jiangshan
Acked-by: Steven Rostedt
Cc: Paul Gortmaker
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 lib/plist.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'lib')

diff --git a/lib/plist.c b/lib/plist.c
index 6ab0e521c48b..1ebc95f7a46f 100644
--- a/lib/plist.c
+++ b/lib/plist.c
@@ -175,7 +175,7 @@ static int __init plist_test(void)
 	int nr_expect = 0, i, loop;
 	unsigned int r = local_clock();
 
-	printk(KERN_INFO "start plist test\n");
+	pr_debug("start plist test\n");
 	plist_head_init(&test_head);
 	for (i = 0; i < ARRAY_SIZE(test_node); i++)
 		plist_node_init(test_node + i, 0);
@@ -203,7 +203,7 @@ static int __init plist_test(void)
 		plist_test_check(nr_expect);
 	}
 
-	printk(KERN_INFO "end plist test\n");
+	pr_debug("end plist test\n");
 	return 0;
 }
-- 
cgit v1.2.3


From 8290e2d2dcbf0d379d4b1379e17916515ee20a39 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Thu, 4 Oct 2012 17:13:28 -0700
Subject: scatterlist: atomic sg_mapping_iter() no longer needs disabled IRQs

SG mapping iterator w/ SG_MITER_ATOMIC set required IRQs to be disabled
because it originally used KM_BIO_SRC_IRQ to allow use from IRQ handlers.
kmap_atomic() has long been updated to handle stacking atomic mapping
requests on a per-cpu basis and only requires not sleeping while mapped.

Update sg_mapping_iter such that atomic iterators only require disabling
preemption instead of disabling IRQs.

While at it, convert the weird @ARG@ notations to @ARG in the comment of
sg_miter_start().

Signed-off-by: Tejun Heo
Cc: Maxim Levitsky
Cc: Alex Dubov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 lib/scatterlist.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'lib')

diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index fadae774a20c..e76d85cf3175 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -404,14 +404,13 @@ EXPORT_SYMBOL(sg_miter_start);
  * @miter: sg mapping iter to proceed
  *
  * Description:
- *   Proceeds @miter@ to the next mapping. @miter@ should have been
- *   started using sg_miter_start().  On successful return,
- *   @miter@->page, @miter@->addr and @miter@->length point to the
- *   current mapping.
+ *   Proceeds @miter to the next mapping. @miter should have been started
+ *   using sg_miter_start().  On successful return, @miter->page,
+ *   @miter->addr and @miter->length point to the current mapping.
  *
  * Context:
- *   IRQ disabled if SG_MITER_ATOMIC.  IRQ must stay disabled till
- *   @miter@ is stopped.  May sleep if !SG_MITER_ATOMIC.
+ *   Preemption disabled if SG_MITER_ATOMIC.  Preemption must stay disabled
+ *   till @miter is stopped.  May sleep if !SG_MITER_ATOMIC.
  *
  * Returns:
  *   true if @miter contains the next mapping.  false if end of sg
@@ -465,7 +464,8 @@
  *   resources (kmap) need to be released during iteration.
  *
  * Context:
- *   IRQ disabled if the SG_MITER_ATOMIC is set.  Don't care otherwise.
+ *   Preemption disabled if the SG_MITER_ATOMIC is set.  Don't care
+ *   otherwise.
 */
 void sg_miter_stop(struct sg_mapping_iter *miter)
 {
@@ -479,7 +479,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter)
 			flush_kernel_dcache_page(miter->page);
 
 		if (miter->__flags & SG_MITER_ATOMIC) {
-			WARN_ON(!irqs_disabled());
+			WARN_ON_ONCE(preemptible());
 			kunmap_atomic(miter->addr);
 		} else
 			kunmap(miter->page);
-- 
cgit v1.2.3


From 33e2a4227ddff7c18921ac175fae3ab0e3ff8a76 Mon Sep 17 00:00:00 2001
From: Hein Tibosch
Date: Thu, 4 Oct 2012 17:16:58 -0700
Subject: lib/decompress.c add __init to decompress_method and data

Fix the warning:

WARNING: vmlinux.o(.text+0x14cfd8): Section mismatch in reference
from the variable compressed_formats to the function .init.text:gunzip()
The function compressed_formats() references the function __init gunzip().
etc.

Within decompress.c, compressed_formats[] needs an __initdata annotation,
because some of its data members refer to functions which will be unloaded
after init.  Consequently, its user decompress_method() will get the
__init prefix.

Signed-off-by: Hein Tibosch
Cc: Albin Tonnerre
Cc: Phillip Lougher
Cc: "H. Peter Anvin"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 lib/decompress.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'lib')

diff --git a/lib/decompress.c b/lib/decompress.c
index 3d766b7f60ab..31a804277282 100644
--- a/lib/decompress.c
+++ b/lib/decompress.c
@@ -14,6 +14,7 @@
 
 #include
 #include
+#include
 
 #ifndef CONFIG_DECOMPRESS_GZIP
 # define gunzip NULL
@@ -31,11 +32,13 @@
 # define unlzo NULL
 #endif
 
-static const struct compress_format {
+struct compress_format {
 	unsigned char magic[2];
 	const char *name;
 	decompress_fn decompressor;
-} compressed_formats[] = {
+};
+
+static const struct compress_format compressed_formats[] __initdata = {
 	{ {037, 0213}, "gzip", gunzip },
 	{ {037, 0236}, "gzip", gunzip },
 	{ {0x42, 0x5a}, "bzip2", bunzip2 },
@@ -45,7 +48,7 @@ static const struct compress_format {
 	{ {0, 0}, NULL, NULL }
 };
 
-decompress_fn decompress_method(const unsigned char *inbuf, int len,
+decompress_fn __init decompress_method(const unsigned char *inbuf, int len,
 				const char **name)
 {
 	const struct compress_format *cf;
-- 
cgit v1.2.3


From aacf29bf1bf133f6219e6f8969d4ebc2ac76458f Mon Sep 17 00:00:00 2001
From: David Howells
Date: Thu, 13 Sep 2012 13:09:33 +0100
Subject: MPILIB: Provide count_leading/trailing_zeros() based on arch functions

Provide count_leading/trailing_zeros() macros based on extant arch bit
scanning functions rather than reimplementing from scratch in MPILIB.

Whilst we're at it, turn count_foo_zeros(n, x) into n = count_foo_zeros(x).

Also move the definition to asm-generic as other people may be interested
in using it.

Signed-off-by: David Howells
Cc: David S. Miller
Cc: Dmitry Kasatkin
Cc: Arnd Bergmann
Signed-off-by: Rusty Russell
---
 include/asm-generic/bitops/count_zeros.h |  57 +++++++++++++
 lib/mpi/longlong.h                       | 138 +------------------------------
 lib/mpi/mpi-bit.c                        |   2 +-
 lib/mpi/mpi-pow.c                        |   4 +-
 4 files changed, 62 insertions(+), 139 deletions(-)
 create mode 100644 include/asm-generic/bitops/count_zeros.h

(limited to 'lib')

diff --git a/include/asm-generic/bitops/count_zeros.h b/include/asm-generic/bitops/count_zeros.h
new file mode 100644
index 000000000000..97520d21fe62
--- /dev/null
+++ b/include/asm-generic/bitops/count_zeros.h
@@ -0,0 +1,57 @@
+/* Count leading and trailing zeros functions
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _ASM_GENERIC_BITOPS_COUNT_ZEROS_H_ +#define _ASM_GENERIC_BITOPS_COUNT_ZEROS_H_ + +#include + +/** + * count_leading_zeros - Count the number of zeros from the MSB back + * @x: The value + * + * Count the number of leading zeros from the MSB going towards the LSB in @x. + * + * If the MSB of @x is set, the result is 0. + * If only the LSB of @x is set, then the result is BITS_PER_LONG-1. + * If @x is 0 then the result is COUNT_LEADING_ZEROS_0. + */ +static inline int count_leading_zeros(unsigned long x) +{ + if (sizeof(x) == 4) + return BITS_PER_LONG - fls(x); + else + return BITS_PER_LONG - fls64(x); +} + +#define COUNT_LEADING_ZEROS_0 BITS_PER_LONG + +/** + * count_trailing_zeros - Count the number of zeros from the LSB forwards + * @x: The value + * + * Count the number of trailing zeros from the LSB going towards the MSB in @x. + * + * If the LSB of @x is set, the result is 0. + * If only the MSB of @x is set, then the result is BITS_PER_LONG-1. + * If @x is 0 then the result is COUNT_TRAILING_ZEROS_0. + */ +static inline int count_trailing_zeros(unsigned long x) +{ +#define COUNT_TRAILING_ZEROS_0 (-1) + + if (sizeof(x) == 4) + return ffs(x); + else + return (x != 0) ? __ffs(x) : COUNT_TRAILING_ZEROS_0; +} + +#endif /* _ASM_GENERIC_BITOPS_COUNT_ZEROS_H_ */ diff --git a/lib/mpi/longlong.h b/lib/mpi/longlong.h index 29f98624ef93..678ce4f1e124 100644 --- a/lib/mpi/longlong.h +++ b/lib/mpi/longlong.h @@ -19,6 +19,8 @@ * the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, * MA 02111-1307, USA. */ +#include + /* You have to define the following before including this file: * * UWtype -- An unsigned type, default type for operations (typically a "word") @@ -146,12 +148,6 @@ do { \ : "1" ((USItype)(n1)), \ "r" ((USItype)(n0)), \ "r" ((USItype)(d))) - -#define count_leading_zeros(count, x) \ - __asm__ ("clz %0,%1" \ - : "=r" ((USItype)(count)) \ - : "r" ((USItype)(x))) -#define COUNT_LEADING_ZEROS_0 32 #endif /* __a29k__ */ #if defined(__alpha) && W_TYPE_SIZE == 64 @@ -298,11 +294,6 @@ extern UDItype __udiv_qrnnd(); : "1" ((USItype)(nh)), \ "0" ((USItype)(nl)), \ "g" ((USItype)(d))) -#define count_leading_zeros(count, x) \ - __asm__ ("bsch/1 %1,%0" \ - : "=g" (count) \ - : "g" ((USItype)(x)), \ - "0" ((USItype)0)) #endif /*************************************** @@ -354,27 +345,6 @@ do { USItype __r; \ } while (0) extern USItype __udiv_qrnnd(); #endif /* LONGLONG_STANDALONE */ -#define count_leading_zeros(count, x) \ -do { \ - USItype __tmp; \ - __asm__ ( \ - "ldi 1,%0\n" \ - "extru,= %1,15,16,%%r0 ; Bits 31..16 zero?\n" \ - "extru,tr %1,15,16,%1 ; No. Shift down, skip add.\n" \ - "ldo 16(%0),%0 ; Yes. Perform add.\n" \ - "extru,= %1,23,8,%%r0 ; Bits 15..8 zero?\n" \ - "extru,tr %1,23,8,%1 ; No. Shift down, skip add.\n" \ - "ldo 8(%0),%0 ; Yes. Perform add.\n" \ - "extru,= %1,27,4,%%r0 ; Bits 7..4 zero?\n" \ - "extru,tr %1,27,4,%1 ; No. Shift down, skip add.\n" \ - "ldo 4(%0),%0 ; Yes. Perform add.\n" \ - "extru,= %1,29,2,%%r0 ; Bits 3..2 zero?\n" \ - "extru,tr %1,29,2,%1 ; No. Shift down, skip add.\n" \ - "ldo 2(%0),%0 ; Yes. Perform add.\n" \ - "extru %1,30,1,%1 ; Extract bit 1.\n" \ - "sub %0,%1,%0 ; Subtract it. 
" \ - : "=r" (count), "=r" (__tmp) : "1" (x)); \ -} while (0) #endif /* hppa */ /*************************************** @@ -457,15 +427,6 @@ do { \ : "0" ((USItype)(n0)), \ "1" ((USItype)(n1)), \ "rm" ((USItype)(d))) -#define count_leading_zeros(count, x) \ -do { \ - USItype __cbtmp; \ - __asm__ ("bsrl %1,%0" \ - : "=r" (__cbtmp) : "rm" ((USItype)(x))); \ - (count) = __cbtmp ^ 31; \ -} while (0) -#define count_trailing_zeros(count, x) \ - __asm__ ("bsfl %1,%0" : "=r" (count) : "rm" ((USItype)(x))) #ifndef UMUL_TIME #define UMUL_TIME 40 #endif @@ -536,15 +497,6 @@ do { \ "dI" ((USItype)(d))); \ (r) = __rq.__i.__l; (q) = __rq.__i.__h; \ } while (0) -#define count_leading_zeros(count, x) \ -do { \ - USItype __cbtmp; \ - __asm__ ("scanbit %1,%0" \ - : "=r" (__cbtmp) \ - : "r" ((USItype)(x))); \ - (count) = __cbtmp ^ 31; \ -} while (0) -#define COUNT_LEADING_ZEROS_0 (-32) /* sic */ #if defined(__i960mx) /* what is the proper symbol to test??? */ #define rshift_rhlc(r, h, l, c) \ do { \ @@ -603,11 +555,6 @@ do { \ : "0" ((USItype)(n0)), \ "1" ((USItype)(n1)), \ "dmi" ((USItype)(d))) -#define count_leading_zeros(count, x) \ - __asm__ ("bfffo %1{%b2:%b2},%0" \ - : "=d" ((USItype)(count)) \ - : "od" ((USItype)(x)), "n" (0)) -#define COUNT_LEADING_ZEROS_0 32 #else /* not mc68020 */ #define umul_ppmm(xh, xl, a, b) \ do { USItype __umul_tmp1, __umul_tmp2; \ @@ -664,15 +611,6 @@ do { USItype __umul_tmp1, __umul_tmp2; \ "rJ" ((USItype)(bh)), \ "rJ" ((USItype)(al)), \ "rJ" ((USItype)(bl))) -#define count_leading_zeros(count, x) \ -do { \ - USItype __cbtmp; \ - __asm__ ("ff1 %0,%1" \ - : "=r" (__cbtmp) \ - : "r" ((USItype)(x))); \ - (count) = __cbtmp ^ 31; \ -} while (0) -#define COUNT_LEADING_ZEROS_0 63 /* sic */ #if defined(__m88110__) #define umul_ppmm(wh, wl, u, v) \ do { \ @@ -779,12 +717,6 @@ do { \ : "0" (__xx.__ll), \ "g" ((USItype)(d))); \ (r) = __xx.__i.__l; (q) = __xx.__i.__h; }) -#define count_trailing_zeros(count, x) \ -do { \ - __asm__("ffsd %2,%0" \ - : "=r"((USItype) (count)) \ - : "0"((USItype) 0), "r"((USItype) (x))); \ - } while (0) #endif /* __ns32000__ */ /*************************************** @@ -855,11 +787,6 @@ do { \ "rI" ((USItype)(al)), \ "r" ((USItype)(bl))); \ } while (0) -#define count_leading_zeros(count, x) \ - __asm__ ("{cntlz|cntlzw} %0,%1" \ - : "=r" ((USItype)(count)) \ - : "r" ((USItype)(x))) -#define COUNT_LEADING_ZEROS_0 32 #if defined(_ARCH_PPC) #define umul_ppmm(ph, pl, m0, m1) \ do { \ @@ -1001,19 +928,6 @@ do { \ } while (0) #define UMUL_TIME 20 #define UDIV_TIME 200 -#define count_leading_zeros(count, x) \ -do { \ - if ((x) >= 0x10000) \ - __asm__ ("clz %0,%1" \ - : "=r" ((USItype)(count)) \ - : "r" ((USItype)(x) >> 16)); \ - else { \ - __asm__ ("clz %0,%1" \ - : "=r" ((USItype)(count)) \ - : "r" ((USItype)(x))); \ - (count) += 16; \ - } \ -} while (0) #endif /* RT/ROMP */ /*************************************** @@ -1142,13 +1056,6 @@ do { \ "rI" ((USItype)(d)) \ : "%g1" __AND_CLOBBER_CC) #define UDIV_TIME 37 -#define count_leading_zeros(count, x) \ - __asm__ ("scan %1,0,%0" \ - : "=r" ((USItype)(x)) \ - : "r" ((USItype)(count))) -/* Early sparclites return 63 for an argument of 0, but they warn that future - implementations might change this. Therefore, leave COUNT_LEADING_ZEROS_0 - undefined. */ #endif /* __sparclite__ */ #endif /* __sparc_v8__ */ /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. 
*/ @@ -1454,47 +1361,6 @@ do { \ #define udiv_qrnnd __udiv_qrnnd_c #endif -#undef count_leading_zeros -#if !defined(count_leading_zeros) - extern -#ifdef __STDC__ - const -#endif - unsigned char __clz_tab[]; -#define count_leading_zeros(count, x) \ -do { \ - UWtype __xr = (x); \ - UWtype __a; \ - \ - if (W_TYPE_SIZE <= 32) { \ - __a = __xr < ((UWtype) 1 << 2*__BITS4) \ - ? (__xr < ((UWtype) 1 << __BITS4) ? 0 : __BITS4) \ - : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 : 3*__BITS4); \ - } \ - else { \ - for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \ - if (((__xr >> __a) & 0xff) != 0) \ - break; \ - } \ - \ - (count) = W_TYPE_SIZE - (__clz_tab[__xr >> __a] + __a); \ -} while (0) - /* This version gives a well-defined value for zero. */ -#define COUNT_LEADING_ZEROS_0 W_TYPE_SIZE -#endif - -#if !defined(count_trailing_zeros) -/* Define count_trailing_zeros using count_leading_zeros. The latter might be - defined in asm, but if it is not, the C version above is good enough. */ -#define count_trailing_zeros(count, x) \ -do { \ - UWtype __ctz_x = (x); \ - UWtype __ctz_c; \ - count_leading_zeros(__ctz_c, __ctz_x & -__ctz_x); \ - (count) = W_TYPE_SIZE - 1 - __ctz_c; \ -} while (0) -#endif - #ifndef UDIV_NEEDS_NORMALIZATION #define UDIV_NEEDS_NORMALIZATION 0 #endif diff --git a/lib/mpi/mpi-bit.c b/lib/mpi/mpi-bit.c index 568724804f29..503537e08436 100644 --- a/lib/mpi/mpi-bit.c +++ b/lib/mpi/mpi-bit.c @@ -45,7 +45,7 @@ unsigned mpi_get_nbits(MPI a) if (a->nlimbs) { mpi_limb_t alimb = a->d[a->nlimbs - 1]; if (alimb) - count_leading_zeros(n, alimb); + n = count_leading_zeros(alimb); else n = BITS_PER_MPI_LIMB; n = BITS_PER_MPI_LIMB - n + (a->nlimbs - 1) * BITS_PER_MPI_LIMB; diff --git a/lib/mpi/mpi-pow.c b/lib/mpi/mpi-pow.c index 67f3e79af914..5464c8744ea9 100644 --- a/lib/mpi/mpi-pow.c +++ b/lib/mpi/mpi-pow.c @@ -77,7 +77,7 @@ int mpi_powm(MPI res, MPI base, MPI exp, MPI mod) mp = mp_marker = mpi_alloc_limb_space(msize); if (!mp) goto enomem; - count_leading_zeros(mod_shift_cnt, mod->d[msize - 1]); + mod_shift_cnt = count_leading_zeros(mod->d[msize - 1]); if (mod_shift_cnt) mpihelp_lshift(mp, mod->d, msize, mod_shift_cnt); else @@ -169,7 +169,7 @@ int mpi_powm(MPI res, MPI base, MPI exp, MPI mod) i = esize - 1; e = ep[i]; - count_leading_zeros(c, e); + c = count_leading_zeros(e); e = (e << c) << 1; /* shift the exp bits to the left, lose msb */ c = BITS_PER_MPI_LIMB - 1 - c; -- cgit v1.2.3 From 12f008b6dc5ff1c822fdb2198d20e3dbdc92f3f5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 Sep 2012 23:25:22 +0100 Subject: MPILIB: Reinstate mpi_cmp[_ui]() and export for RSA signature verification Reinstate and export mpi_cmp() and mpi_cmp_ui() from the MPI library for use by RSA signature verification as per RFC3447 section 5.2.2 step 1. Signed-off-by: David Howells Signed-off-by: Rusty Russell --- lib/mpi/Makefile | 1 + lib/mpi/mpi-cmp.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 lib/mpi/mpi-cmp.c (limited to 'lib') diff --git a/lib/mpi/Makefile b/lib/mpi/Makefile index 45ca90a8639c..019a68c90144 100644 --- a/lib/mpi/Makefile +++ b/lib/mpi/Makefile @@ -14,6 +14,7 @@ mpi-y = \ generic_mpih-add1.o \ mpicoder.o \ mpi-bit.o \ + mpi-cmp.o \ mpih-cmp.o \ mpih-div.o \ mpih-mul.o \ diff --git a/lib/mpi/mpi-cmp.c b/lib/mpi/mpi-cmp.c new file mode 100644 index 000000000000..1871e7b61ca0 --- /dev/null +++ b/lib/mpi/mpi-cmp.c @@ -0,0 +1,70 @@ +/* mpi-cmp.c - MPI functions + * Copyright (C) 1998, 1999 Free Software Foundation, Inc. 
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "mpi-internal.h"
+
+int mpi_cmp_ui(MPI u, unsigned long v)
+{
+	mpi_limb_t limb = v;
+
+	mpi_normalize(u);
+	if (!u->nlimbs && !limb)
+		return 0;
+	if (u->sign)
+		return -1;
+	if (u->nlimbs > 1)
+		return 1;
+
+	if (u->d[0] == limb)
+		return 0;
+	else if (u->d[0] > limb)
+		return 1;
+	else
+		return -1;
+}
+EXPORT_SYMBOL_GPL(mpi_cmp_ui);
+
+int mpi_cmp(MPI u, MPI v)
+{
+	mpi_size_t usize, vsize;
+	int cmp;
+
+	mpi_normalize(u);
+	mpi_normalize(v);
+	usize = u->nlimbs;
+	vsize = v->nlimbs;
+	if (!u->sign && v->sign)
+		return 1;
+	if (u->sign && !v->sign)
+		return -1;
+	if (usize != vsize && !u->sign && !v->sign)
+		return usize - vsize;
+	if (usize != vsize && u->sign && v->sign)
+		return vsize + usize;
+	if (!usize)
+		return 0;
+	cmp = mpihelp_cmp(u->d, v->d, usize);
+	if (!cmp)
+		return 0;
+	if ((cmp < 0 ? 1 : 0) == (u->sign ? 1 : 0))
+		return 1;
+	return -1;
+}
+EXPORT_SYMBOL_GPL(mpi_cmp);
-- 
cgit v1.2.3


From a77ad6ea0b0bb1f9d1f52ed494bd72a5fdde208e Mon Sep 17 00:00:00 2001
From: David Howells
Date: Fri, 21 Sep 2012 23:30:46 +0100
Subject: X.509: Implement simple static OID registry

Implement a simple static OID registry that allows the mapping of an
encoded OID to an enum value for ease of use.

The OID registry index enum appears in the linux/oid_registry.h header
file.  A script generates the registry from lines in the header file that
look like:

	OID_foo,	/* 1.2.3.4 */

The actual OID is taken to be represented by the numbers with interpolated
dots in the comment.  All other lines in the header are ignored.

The registry is queried by calling:

	enum OID look_up_OID(const void *data, size_t datasize);

This returns a number from the registry enum representing the OID if found
or OID__NR if not.

Signed-off-by: David Howells
Signed-off-by: Rusty Russell
---
 include/linux/oid_registry.h |  90 +++++++++++++++++++
 lib/.gitignore               |   2 +-
 lib/Kconfig                  |   5 ++
 lib/Makefile                 |  16 ++++
 lib/build_OID_registry      | 209 +++++++++++++++++++++++++++++++++++++++++++
 lib/oid_registry.c           |  89 ++++++++++++++++++
 6 files changed, 410 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/oid_registry.h
 create mode 100755 lib/build_OID_registry
 create mode 100644 lib/oid_registry.c

(limited to 'lib')

diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h
new file mode 100644
index 000000000000..5928546bf00a
--- /dev/null
+++ b/include/linux/oid_registry.h
@@ -0,0 +1,90 @@
+/* ASN.1 Object identifier (OID) registry
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _LINUX_OID_REGISTRY_H +#define _LINUX_OID_REGISTRY_H + +#include + +/* + * OIDs are turned into these values if possible, or OID__NR if not held here. + * + * NOTE! Do not mess with the format of each line as this is read by + * build_OID_registry.pl to generate the data for look_up_OID(). + */ +enum OID { + OID_id_dsa_with_sha1, /* 1.2.840.10030.4.3 */ + OID_id_dsa, /* 1.2.840.10040.4.1 */ + OID_id_ecdsa_with_sha1, /* 1.2.840.10045.4.1 */ + OID_id_ecPublicKey, /* 1.2.840.10045.2.1 */ + + /* PKCS#1 {iso(1) member-body(2) us(840) rsadsi(113549) pkcs(1) pkcs-1(1)} */ + OID_rsaEncryption, /* 1.2.840.113549.1.1.1 */ + OID_md2WithRSAEncryption, /* 1.2.840.113549.1.1.2 */ + OID_md3WithRSAEncryption, /* 1.2.840.113549.1.1.3 */ + OID_md4WithRSAEncryption, /* 1.2.840.113549.1.1.4 */ + OID_sha1WithRSAEncryption, /* 1.2.840.113549.1.1.5 */ + OID_sha256WithRSAEncryption, /* 1.2.840.113549.1.1.11 */ + OID_sha384WithRSAEncryption, /* 1.2.840.113549.1.1.12 */ + OID_sha512WithRSAEncryption, /* 1.2.840.113549.1.1.13 */ + OID_sha224WithRSAEncryption, /* 1.2.840.113549.1.1.14 */ + /* PKCS#7 {iso(1) member-body(2) us(840) rsadsi(113549) pkcs(1) pkcs-7(7)} */ + OID_data, /* 1.2.840.113549.1.7.1 */ + OID_signed_data, /* 1.2.840.113549.1.7.2 */ + /* PKCS#9 {iso(1) member-body(2) us(840) rsadsi(113549) pkcs(1) pkcs-9(9)} */ + OID_email_address, /* 1.2.840.113549.1.9.1 */ + OID_content_type, /* 1.2.840.113549.1.9.3 */ + OID_messageDigest, /* 1.2.840.113549.1.9.4 */ + OID_signingTime, /* 1.2.840.113549.1.9.5 */ + OID_smimeCapabilites, /* 1.2.840.113549.1.9.15 */ + OID_smimeAuthenticatedAttrs, /* 1.2.840.113549.1.9.16.2.11 */ + + /* {iso(1) member-body(2) us(840) rsadsi(113549) digestAlgorithm(2)} */ + OID_md2, /* 1.2.840.113549.2.2 */ + OID_md4, /* 1.2.840.113549.2.4 */ + OID_md5, /* 1.2.840.113549.2.5 */ + + OID_certAuthInfoAccess, /* 1.3.6.1.5.5.7.1.1 */ + OID_msOutlookExpress, /* 1.3.6.1.4.1.311.16.4 */ + OID_sha1, /* 1.3.14.3.2.26 */ + + /* Distinguished Name attribute IDs [RFC 2256] */ + OID_commonName, /* 2.5.4.3 */ + OID_surname, /* 2.5.4.4 */ + OID_countryName, /* 2.5.4.6 */ + OID_locality, /* 2.5.4.7 */ + OID_stateOrProvinceName, /* 2.5.4.8 */ + OID_organizationName, /* 2.5.4.10 */ + OID_organizationUnitName, /* 2.5.4.11 */ + OID_title, /* 2.5.4.12 */ + OID_description, /* 2.5.4.13 */ + OID_name, /* 2.5.4.41 */ + OID_givenName, /* 2.5.4.42 */ + OID_initials, /* 2.5.4.43 */ + OID_generationalQualifier, /* 2.5.4.44 */ + + /* Certificate extension IDs */ + OID_subjectKeyIdentifier, /* 2.5.29.14 */ + OID_keyUsage, /* 2.5.29.15 */ + OID_subjectAltName, /* 2.5.29.17 */ + OID_issuerAltName, /* 2.5.29.18 */ + OID_basicConstraints, /* 2.5.29.19 */ + OID_crlDistributionPoints, /* 2.5.29.31 */ + OID_certPolicies, /* 2.5.29.32 */ + OID_authorityKeyIdentifier, /* 2.5.29.35 */ + OID_extKeyUsage, /* 2.5.29.37 */ + + OID__NR +}; + +extern enum OID look_up_OID(const void *data, size_t datasize); + +#endif /* _LINUX_OID_REGISTRY_H */ diff --git a/lib/.gitignore b/lib/.gitignore index 3bef1ea94c99..09aae85418ab 100644 --- a/lib/.gitignore +++ b/lib/.gitignore @@ -3,4 +3,4 @@ # gen_crc32table crc32table.h - +oid_registry_data.c diff --git a/lib/Kconfig b/lib/Kconfig index bb94c1ba616a..4b31a46fb307 100644 
--- a/lib/Kconfig +++ b/lib/Kconfig @@ -396,4 +396,9 @@ config SIGNATURE config LIBFDT bool +config OID_REGISTRY + tristate + help + Enable fast lookup object identifier registry. + endmenu diff --git a/lib/Makefile b/lib/Makefile index 42d283edc4d3..b0428960939f 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -150,3 +150,19 @@ quiet_cmd_crc32 = GEN $@ $(obj)/crc32table.h: $(obj)/gen_crc32table $(call cmd,crc32) + +# +# Build a fast OID lookip registry from include/linux/oid_registry.h +# +obj-$(CONFIG_OID_REGISTRY) += oid_registry.o + +$(obj)/oid_registry.c: $(obj)/oid_registry_data.c + +$(obj)/oid_registry_data.c: $(srctree)/include/linux/oid_registry.h \ + $(src)/build_OID_registry + $(call cmd,build_OID_registry) + +quiet_cmd_build_OID_registry = GEN $@ + cmd_build_OID_registry = perl $(srctree)/$(src)/build_OID_registry $< $@ + +clean-files += oid_registry_data.c diff --git a/lib/build_OID_registry b/lib/build_OID_registry new file mode 100755 index 000000000000..dfbdaab81bc8 --- /dev/null +++ b/lib/build_OID_registry @@ -0,0 +1,209 @@ +#!/usr/bin/perl -w +# +# Build a static ASN.1 Object Identified (OID) registry +# +# Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. +# Written by David Howells (dhowells@redhat.com) +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public Licence +# as published by the Free Software Foundation; either version +# 2 of the Licence, or (at your option) any later version. +# + +use strict; + +my @names = (); +my @oids = (); + +if ($#ARGV != 1) { + print STDERR "Format: ", $0, " \n"; + exit(2); +} + +# +# Open the file to read from +# +open IN_FILE, "<$ARGV[0]" || die; +while () { + chomp; + if (m!\s+OID_([a-zA-z][a-zA-Z0-9_]+),\s+/[*]\s+([012][.0-9]*)\s+[*]/!) { + push @names, $1; + push @oids, $2; + } +} +close IN_FILE || die; + +# +# Open the files to write into +# +open C_FILE, ">$ARGV[1]" or die; +print C_FILE "/*\n"; +print C_FILE " * Automatically generated by ", $0, ". Do not edit\n"; +print C_FILE " */\n"; + +# +# Split the data up into separate lists and also determine the lengths of the +# encoded data arrays. +# +my @indices = (); +my @lengths = (); +my $total_length = 0; + +print "Compiling ", $#names + 1, " OIDs\n"; + +for (my $i = 0; $i <= $#names; $i++) { + my $name = $names[$i]; + my $oid = $oids[$i]; + + my @components = split(/[.]/, $oid); + + # Determine the encoded length of this OID + my $size = $#components; + for (my $loop = 2; $loop <= $#components; $loop++) { + my $c = $components[$loop]; + + # We will base128 encode the number + my $tmp = ($c == 0) ? 
0 : int(log($c)/log(2)); + $tmp = int($tmp / 7); + $size += $tmp; + } + push @lengths, $size; + push @indices, $total_length; + $total_length += $size; +} + +# +# Emit the look-up-by-OID index table +# +print C_FILE "\n"; +if ($total_length <= 255) { + print C_FILE "static const unsigned char oid_index[OID__NR + 1] = {\n"; +} else { + print C_FILE "static const unsigned short oid_index[OID__NR + 1] = {\n"; +} +for (my $i = 0; $i <= $#names; $i++) { + print C_FILE "\t[OID_", $names[$i], "] = ", $indices[$i], ",\n" +} +print C_FILE "\t[OID__NR] = ", $total_length, "\n"; +print C_FILE "};\n"; + +# +# Encode the OIDs +# +my @encoded_oids = (); + +for (my $i = 0; $i <= $#names; $i++) { + my @octets = (); + + my @components = split(/[.]/, $oids[$i]); + + push @octets, $components[0] * 40 + $components[1]; + + for (my $loop = 2; $loop <= $#components; $loop++) { + my $c = $components[$loop]; + + # Base128 encode the number + my $tmp = ($c == 0) ? 0 : int(log($c)/log(2)); + $tmp = int($tmp / 7); + + for (; $tmp > 0; $tmp--) { + push @octets, (($c >> $tmp * 7) & 0x7f) | 0x80; + } + push @octets, $c & 0x7f; + } + + push @encoded_oids, \@octets; +} + +# +# Create a hash value for each OID +# +my @hash_values = (); +for (my $i = 0; $i <= $#names; $i++) { + my @octets = @{$encoded_oids[$i]}; + + my $hash = $#octets; + foreach (@octets) { + $hash += $_ * 33; + } + + $hash = ($hash >> 24) ^ ($hash >> 16) ^ ($hash >> 8) ^ ($hash); + + push @hash_values, $hash & 0xff; +} + +# +# Emit the OID data +# +print C_FILE "\n"; +print C_FILE "static const unsigned char oid_data[", $total_length, "] = {\n"; +for (my $i = 0; $i <= $#names; $i++) { + my @octets = @{$encoded_oids[$i]}; + print C_FILE "\t"; + print C_FILE $_, ", " foreach (@octets); + print C_FILE "\t// ", $names[$i]; + print C_FILE "\n"; +} +print C_FILE "};\n"; + +# +# Build the search index table (ordered by length then hash then content) +# +my @index_table = ( 0 .. 
$#names ); + +@index_table = sort { + my @octets_a = @{$encoded_oids[$a]}; + my @octets_b = @{$encoded_oids[$b]}; + + return $hash_values[$a] <=> $hash_values[$b] + if ($hash_values[$a] != $hash_values[$b]); + return $#octets_a <=> $#octets_b + if ($#octets_a != $#octets_b); + for (my $i = $#octets_a; $i >= 0; $i--) { + return $octets_a[$i] <=> $octets_b[$i] + if ($octets_a[$i] != $octets_b[$i]); + } + return 0; + +} @index_table; + +# +# Emit the search index and hash value table +# +print C_FILE "\n"; +print C_FILE "static const struct {\n"; +print C_FILE "\tunsigned char hash;\n"; +if ($#names <= 255) { + print C_FILE "\tenum OID oid : 8;\n"; +} else { + print C_FILE "\tenum OID oid : 16;\n"; +} +print C_FILE "} oid_search_table[OID__NR] = {\n"; +for (my $i = 0; $i <= $#names; $i++) { + my @octets = @{$encoded_oids[$index_table[$i]]}; + printf(C_FILE "\t[%3u] = { %3u, OID_%-35s }, // ", + $i, + $hash_values[$index_table[$i]], + $names[$index_table[$i]]); + printf C_FILE "%02x", $_ foreach (@octets); + print C_FILE "\n"; +} +print C_FILE "};\n"; + +# +# Emit the OID debugging name table +# +#print C_FILE "\n"; +#print C_FILE "const char *const oid_name_table[OID__NR + 1] = {\n"; +# +#for (my $i = 0; $i <= $#names; $i++) { +# print C_FILE "\t\"", $names[$i], "\",\n" +#} +#print C_FILE "\t\"Unknown-OID\"\n"; +#print C_FILE "};\n"; + +# +# Polish off +# +close C_FILE or die; diff --git a/lib/oid_registry.c b/lib/oid_registry.c new file mode 100644 index 000000000000..33cfd171f627 --- /dev/null +++ b/lib/oid_registry.c @@ -0,0 +1,89 @@ +/* ASN.1 Object identifier (OID) registry + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include "oid_registry_data.c" + +/** + * look_up_OID - Find an OID registration for the specified data + * @data: Binary representation of the OID + * @datasize: Size of the binary representation + */ +enum OID look_up_OID(const void *data, size_t datasize) +{ + const unsigned char *octets = data; + enum OID oid; + unsigned char xhash; + unsigned i, j, k, hash; + size_t len; + + /* Hash the OID data */ + hash = datasize - 1; + + for (i = 0; i < datasize; i++) + hash += octets[i] * 33; + hash = (hash >> 24) ^ (hash >> 16) ^ (hash >> 8) ^ hash; + hash &= 0xff; + + /* Binary search the OID registry. OIDs are stored in ascending order + * of hash value then ascending order of size and then in ascending + * order of reverse value. + */ + i = 0; + k = OID__NR; + while (i < k) { + j = (i + k) / 2; + + xhash = oid_search_table[j].hash; + if (xhash > hash) { + k = j; + continue; + } + if (xhash < hash) { + i = j + 1; + continue; + } + + oid = oid_search_table[j].oid; + len = oid_index[oid + 1] - oid_index[oid]; + if (len > datasize) { + k = j; + continue; + } + if (len < datasize) { + i = j + 1; + continue; + } + + /* Variation is most likely to be at the tail end of the + * OID, so do the comparison in reverse. 
+ */ + while (len > 0) { + unsigned char a = oid_data[oid_index[oid] + --len]; + unsigned char b = octets[len]; + if (a > b) { + k = j; + goto next; + } + if (a < b) { + i = j + 1; + goto next; + } + } + return oid; + next: + ; + } + + return OID__NR; +} +EXPORT_SYMBOL_GPL(look_up_OID); -- cgit v1.2.3 From 4f73175d0375a7c1b3ae625e76acee8b39741f28 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 Sep 2012 23:30:51 +0100 Subject: X.509: Add utility functions to render OIDs as strings Add a pair of utility functions to render OIDs as strings. The first takes an encoded OID and turns it into a "a.b.c.d" form string: int sprint_oid(const void *data, size_t datasize, char *buffer, size_t bufsize); The second takes an OID enum index and calls the first on the data held therein: int sprint_OID(enum OID oid, char *buffer, size_t bufsize); Signed-off-by: David Howells Signed-off-by: Rusty Russell --- include/linux/oid_registry.h | 2 ++ lib/oid_registry.c | 81 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) (limited to 'lib') diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h index 5928546bf00a..6926db724258 100644 --- a/include/linux/oid_registry.h +++ b/include/linux/oid_registry.h @@ -86,5 +86,7 @@ enum OID { }; extern enum OID look_up_OID(const void *data, size_t datasize); +extern int sprint_oid(const void *, size_t, char *, size_t); +extern int sprint_OID(enum OID, char *, size_t); #endif /* _LINUX_OID_REGISTRY_H */ diff --git a/lib/oid_registry.c b/lib/oid_registry.c index 33cfd171f627..d8de11f45908 100644 --- a/lib/oid_registry.c +++ b/lib/oid_registry.c @@ -11,6 +11,9 @@ #include #include +#include +#include +#include #include "oid_registry_data.c" /** @@ -87,3 +90,81 @@ enum OID look_up_OID(const void *data, size_t datasize) return OID__NR; } EXPORT_SYMBOL_GPL(look_up_OID); + +/* + * sprint_OID - Print an Object Identifier into a buffer + * @data: The encoded OID to print + * @datasize: The size of the encoded OID + * @buffer: The buffer to render into + * @bufsize: The size of the buffer + * + * The OID is rendered into the buffer in "a.b.c.d" format and the number of + * bytes is returned. -EBADMSG is returned if the data could not be intepreted + * and -ENOBUFS if the buffer was too small. + */ +int sprint_oid(const void *data, size_t datasize, char *buffer, size_t bufsize) +{ + const unsigned char *v = data, *end = v + datasize; + unsigned long num; + unsigned char n; + size_t ret; + int count; + + if (v >= end) + return -EBADMSG; + + n = *v++; + ret = count = snprintf(buffer, bufsize, "%u.%u", n / 40, n % 40); + buffer += count; + bufsize -= count; + if (bufsize == 0) + return -ENOBUFS; + + while (v < end) { + num = 0; + n = *v++; + if (!(n & 0x80)) { + num = n; + } else { + num = n & 0x7f; + do { + if (v >= end) + return -EBADMSG; + n = *v++; + num <<= 7; + num |= n & 0x7f; + } while (n & 0x80); + } + ret += count = snprintf(buffer, bufsize, ".%lu", num); + buffer += count; + bufsize -= count; + if (bufsize == 0) + return -ENOBUFS; + } + + return ret; +} +EXPORT_SYMBOL_GPL(sprint_oid); + +/** + * sprint_OID - Print an Object Identifier into a buffer + * @oid: The OID to print + * @buffer: The buffer to render into + * @bufsize: The size of the buffer + * + * The OID is rendered into the buffer in "a.b.c.d" format and the number of + * bytes is returned. 
+ */ +int sprint_OID(enum OID oid, char *buffer, size_t bufsize) +{ + int ret; + + BUG_ON(oid >= OID__NR); + + ret = sprint_oid(oid_data + oid_index[oid], + oid_index[oid + 1] - oid_index[oid], + buffer, bufsize); + BUG_ON(ret == -EBADMSG); + return ret; +} +EXPORT_SYMBOL_GPL(sprint_OID); -- cgit v1.2.3 From 42d5ec27f873c654a68f7f865dcd7737513e9508 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 24 Sep 2012 17:11:16 +0100 Subject: X.509: Add an ASN.1 decoder Add an ASN.1 BER/DER/CER decoder. This uses the bytecode from the ASN.1 compiler in the previous patch to inform it as to what to expect to find in the encoded byte stream. The output from the compiler also tells it what functions to call on what tags, thus allowing the caller to retrieve information. The decoder is called as follows: int asn1_decoder(const struct asn1_decoder *decoder, void *context, const unsigned char *data, size_t datalen); The decoder argument points to the bytecode from the ASN.1 compiler. context is the caller's context and is passed to the action functions. data and datalen define the byte stream to be decoded. Note that the decoder is currently limited to datalen being less than 64K. This reduces the amount of stack space used by the decoder because ASN.1 is a nested construct. Similarly, the decoder is limited to a maximum of 10 levels of constructed data outside of a leaf node also in an effort to keep stack usage down. These restrictions can be raised if necessary. Signed-off-by: David Howells Signed-off-by: Rusty Russell --- include/linux/asn1_decoder.h | 24 +++ lib/Makefile | 2 + lib/asn1_decoder.c | 477 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 503 insertions(+) create mode 100644 include/linux/asn1_decoder.h create mode 100644 lib/asn1_decoder.c (limited to 'lib') diff --git a/include/linux/asn1_decoder.h b/include/linux/asn1_decoder.h new file mode 100644 index 000000000000..fa2ff5bc0483 --- /dev/null +++ b/include/linux/asn1_decoder.h @@ -0,0 +1,24 @@ +/* ASN.1 decoder + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _LINUX_ASN1_DECODER_H +#define _LINUX_ASN1_DECODER_H + +#include + +struct asn1_decoder; + +extern int asn1_ber_decoder(const struct asn1_decoder *decoder, + void *context, + const unsigned char *data, + size_t datalen); + +#endif /* _LINUX_ASN1_DECODER_H */ diff --git a/lib/Makefile b/lib/Makefile index b0428960939f..ca856b69a21d 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -140,6 +140,8 @@ $(foreach file, $(libfdt_files), \ $(eval CFLAGS_$(file) = -I$(src)/../scripts/dtc/libfdt)) lib-$(CONFIG_LIBFDT) += $(libfdt_files) +obj-$(CONFIG_ASN1) += asn1_decoder.o + hostprogs-y := gen_crc32table clean-files := crc32table.h diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c new file mode 100644 index 000000000000..2e4196ddf06f --- /dev/null +++ b/lib/asn1_decoder.c @@ -0,0 +1,477 @@ +/* Decoder for ASN.1 BER/DER/CER encoded bytestream + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +static const unsigned char asn1_op_lengths[ASN1_OP__NR] = { + /* OPC TAG JMP ACT */ + [ASN1_OP_MATCH] = 1 + 1, + [ASN1_OP_MATCH_OR_SKIP] = 1 + 1, + [ASN1_OP_MATCH_ACT] = 1 + 1 + 1, + [ASN1_OP_MATCH_ACT_OR_SKIP] = 1 + 1 + 1, + [ASN1_OP_MATCH_JUMP] = 1 + 1 + 1, + [ASN1_OP_MATCH_JUMP_OR_SKIP] = 1 + 1 + 1, + [ASN1_OP_MATCH_ANY] = 1, + [ASN1_OP_MATCH_ANY_ACT] = 1 + 1, + [ASN1_OP_COND_MATCH_OR_SKIP] = 1 + 1, + [ASN1_OP_COND_MATCH_ACT_OR_SKIP] = 1 + 1 + 1, + [ASN1_OP_COND_MATCH_JUMP_OR_SKIP] = 1 + 1 + 1, + [ASN1_OP_COND_MATCH_ANY] = 1, + [ASN1_OP_COND_MATCH_ANY_ACT] = 1 + 1, + [ASN1_OP_COND_FAIL] = 1, + [ASN1_OP_COMPLETE] = 1, + [ASN1_OP_ACT] = 1 + 1, + [ASN1_OP_RETURN] = 1, + [ASN1_OP_END_SEQ] = 1, + [ASN1_OP_END_SEQ_OF] = 1 + 1, + [ASN1_OP_END_SET] = 1, + [ASN1_OP_END_SET_OF] = 1 + 1, + [ASN1_OP_END_SEQ_ACT] = 1 + 1, + [ASN1_OP_END_SEQ_OF_ACT] = 1 + 1 + 1, + [ASN1_OP_END_SET_ACT] = 1 + 1, + [ASN1_OP_END_SET_OF_ACT] = 1 + 1 + 1, +}; + +/* + * Find the length of an indefinite length object + */ +static int asn1_find_indefinite_length(const unsigned char *data, size_t datalen, + const char **_errmsg, size_t *_err_dp) +{ + unsigned char tag, tmp; + size_t dp = 0, len, n; + int indef_level = 1; + +next_tag: + if (unlikely(datalen - dp < 2)) { + if (datalen == dp) + goto missing_eoc; + goto data_overrun_error; + } + + /* Extract a tag from the data */ + tag = data[dp++]; + if (tag == 0) { + /* It appears to be an EOC. */ + if (data[dp++] != 0) + goto invalid_eoc; + if (--indef_level <= 0) + return dp; + goto next_tag; + } + + if (unlikely((tag & 0x1f) == 0x1f)) { + do { + if (unlikely(datalen - dp < 2)) + goto data_overrun_error; + tmp = data[dp++]; + } while (tmp & 0x80); + } + + /* Extract the length */ + len = data[dp++]; + if (len < 0x7f) { + dp += len; + goto next_tag; + } + + if (unlikely(len == 0x80)) { + /* Indefinite length */ + if (unlikely((tag & ASN1_CONS_BIT) == ASN1_PRIM << 5)) + goto indefinite_len_primitive; + indef_level++; + goto next_tag; + } + + n = len - 0x80; + if (unlikely(n > sizeof(size_t) - 1)) + goto length_too_long; + if (unlikely(n > datalen - dp)) + goto data_overrun_error; + for (len = 0; n > 0; n--) { + len <<= 8; + len |= data[dp++]; + } + dp += len; + goto next_tag; + +length_too_long: + *_errmsg = "Unsupported length"; + goto error; +indefinite_len_primitive: + *_errmsg = "Indefinite len primitive not permitted"; + goto error; +invalid_eoc: + *_errmsg = "Invalid length EOC"; + goto error; +data_overrun_error: + *_errmsg = "Data overrun error"; + goto error; +missing_eoc: + *_errmsg = "Missing EOC in indefinite len cons"; +error: + *_err_dp = dp; + return -1; +} + +/** + * asn1_ber_decoder - Decoder BER/DER/CER ASN.1 according to pattern + * @decoder: The decoder definition (produced by asn1_compiler) + * @context: The caller's context (to be passed to the action functions) + * @data: The encoded data + * @datasize: The size of the encoded data + * + * Decode BER/DER/CER encoded ASN.1 data according to a bytecode pattern + * produced by asn1_compiler. Action functions are called on marked tags to + * allow the caller to retrieve significant data. 
+ * + * LIMITATIONS: + * + * To keep down the amount of stack used by this function, the following limits + * have been imposed: + * + * (1) This won't handle datalen > 65535 without increasing the size of the + * cons stack elements and length_too_long checking. + * + * (2) The stack of constructed types is 10 deep. If the depth of non-leaf + * constructed types exceeds this, the decode will fail. + * + * (3) The SET type (not the SET OF type) isn't really supported as tracking + * what members of the set have been seen is a pain. + */ +int asn1_ber_decoder(const struct asn1_decoder *decoder, + void *context, + const unsigned char *data, + size_t datalen) +{ + const unsigned char *machine = decoder->machine; + const asn1_action_t *actions = decoder->actions; + size_t machlen = decoder->machlen; + enum asn1_opcode op; + unsigned char tag = 0, csp = 0, jsp = 0, optag = 0, hdr = 0; + const char *errmsg; + size_t pc = 0, dp = 0, tdp = 0, len = 0; + int ret; + + unsigned char flags = 0; +#define FLAG_INDEFINITE_LENGTH 0x01 +#define FLAG_MATCHED 0x02 +#define FLAG_CONS 0x20 /* Corresponds to CONS bit in the opcode tag + * - ie. whether or not we are going to parse + * a compound type. + */ + +#define NR_CONS_STACK 10 + unsigned short cons_dp_stack[NR_CONS_STACK]; + unsigned short cons_datalen_stack[NR_CONS_STACK]; + unsigned char cons_hdrlen_stack[NR_CONS_STACK]; +#define NR_JUMP_STACK 10 + unsigned char jump_stack[NR_JUMP_STACK]; + + if (datalen > 65535) + return -EMSGSIZE; + +next_op: + pr_debug("next_op: pc=\e[32m%zu\e[m/%zu dp=\e[33m%zu\e[m/%zu C=%d J=%d\n", + pc, machlen, dp, datalen, csp, jsp); + if (unlikely(pc >= machlen)) + goto machine_overrun_error; + op = machine[pc]; + if (unlikely(pc + asn1_op_lengths[op] > machlen)) + goto machine_overrun_error; + + /* If this command is meant to match a tag, then do that before + * evaluating the command. + */ + if (op <= ASN1_OP__MATCHES_TAG) { + unsigned char tmp; + + /* Skip conditional matches if possible */ + if ((op & ASN1_OP_MATCH__COND && + flags & FLAG_MATCHED) || + dp == datalen) { + pc += asn1_op_lengths[op]; + goto next_op; + } + + flags = 0; + hdr = 2; + + /* Extract a tag from the data */ + if (unlikely(dp >= datalen - 1)) + goto data_overrun_error; + tag = data[dp++]; + if (unlikely((tag & 0x1f) == 0x1f)) + goto long_tag_not_supported; + + if (op & ASN1_OP_MATCH__ANY) { + pr_debug("- any %02x\n", tag); + } else { + /* Extract the tag from the machine + * - Either CONS or PRIM are permitted in the data if + * CONS is not set in the op stream, otherwise CONS + * is mandatory. + */ + optag = machine[pc + 1]; + flags |= optag & FLAG_CONS; + + /* Determine whether the tag matched */ + tmp = optag ^ tag; + tmp &= ~(optag & ASN1_CONS_BIT); + pr_debug("- match? %02x %02x %02x\n", tag, optag, tmp); + if (tmp != 0) { + /* All odd-numbered tags are MATCH_OR_SKIP. 
*/ + if (op & ASN1_OP_MATCH__SKIP) { + pc += asn1_op_lengths[op]; + dp--; + goto next_op; + } + goto tag_mismatch; + } + } + flags |= FLAG_MATCHED; + + len = data[dp++]; + if (len > 0x7f) { + if (unlikely(len == 0x80)) { + /* Indefinite length */ + if (unlikely(!(tag & ASN1_CONS_BIT))) + goto indefinite_len_primitive; + flags |= FLAG_INDEFINITE_LENGTH; + if (unlikely(2 > datalen - dp)) + goto data_overrun_error; + } else { + int n = len - 0x80; + if (unlikely(n > 2)) + goto length_too_long; + if (unlikely(dp >= datalen - n)) + goto data_overrun_error; + hdr += n; + for (len = 0; n > 0; n--) { + len <<= 8; + len |= data[dp++]; + } + if (unlikely(len > datalen - dp)) + goto data_overrun_error; + } + } + + if (flags & FLAG_CONS) { + /* For expected compound forms, we stack the positions + * of the start and end of the data. + */ + if (unlikely(csp >= NR_CONS_STACK)) + goto cons_stack_overflow; + cons_dp_stack[csp] = dp; + cons_hdrlen_stack[csp] = hdr; + if (!(flags & FLAG_INDEFINITE_LENGTH)) { + cons_datalen_stack[csp] = datalen; + datalen = dp + len; + } else { + cons_datalen_stack[csp] = 0; + } + csp++; + } + + pr_debug("- TAG: %02x %zu%s\n", + tag, len, flags & FLAG_CONS ? " CONS" : ""); + tdp = dp; + } + + /* Decide how to handle the operation */ + switch (op) { + case ASN1_OP_MATCH_ANY_ACT: + case ASN1_OP_COND_MATCH_ANY_ACT: + ret = actions[machine[pc + 1]](context, hdr, tag, data + dp, len); + if (ret < 0) + return ret; + goto skip_data; + + case ASN1_OP_MATCH_ACT: + case ASN1_OP_MATCH_ACT_OR_SKIP: + case ASN1_OP_COND_MATCH_ACT_OR_SKIP: + ret = actions[machine[pc + 2]](context, hdr, tag, data + dp, len); + if (ret < 0) + return ret; + goto skip_data; + + case ASN1_OP_MATCH: + case ASN1_OP_MATCH_OR_SKIP: + case ASN1_OP_MATCH_ANY: + case ASN1_OP_COND_MATCH_OR_SKIP: + case ASN1_OP_COND_MATCH_ANY: + skip_data: + if (!(flags & FLAG_CONS)) { + if (flags & FLAG_INDEFINITE_LENGTH) { + len = asn1_find_indefinite_length( + data + dp, datalen - dp, &errmsg, &dp); + if (len < 0) + goto error; + } + pr_debug("- LEAF: %zu\n", len); + dp += len; + } + pc += asn1_op_lengths[op]; + goto next_op; + + case ASN1_OP_MATCH_JUMP: + case ASN1_OP_MATCH_JUMP_OR_SKIP: + case ASN1_OP_COND_MATCH_JUMP_OR_SKIP: + pr_debug("- MATCH_JUMP\n"); + if (unlikely(jsp == NR_JUMP_STACK)) + goto jump_stack_overflow; + jump_stack[jsp++] = pc + asn1_op_lengths[op]; + pc = machine[pc + 2]; + goto next_op; + + case ASN1_OP_COND_FAIL: + if (unlikely(!(flags & FLAG_MATCHED))) + goto tag_mismatch; + pc += asn1_op_lengths[op]; + goto next_op; + + case ASN1_OP_COMPLETE: + if (unlikely(jsp != 0 || csp != 0)) { + pr_err("ASN.1 decoder error: Stacks not empty at completion (%u, %u)\n", + jsp, csp); + return -EBADMSG; + } + return 0; + + case ASN1_OP_END_SET: + case ASN1_OP_END_SET_ACT: + if (unlikely(!(flags & FLAG_MATCHED))) + goto tag_mismatch; + case ASN1_OP_END_SEQ: + case ASN1_OP_END_SET_OF: + case ASN1_OP_END_SEQ_OF: + case ASN1_OP_END_SEQ_ACT: + case ASN1_OP_END_SET_OF_ACT: + case ASN1_OP_END_SEQ_OF_ACT: + if (unlikely(csp <= 0)) + goto cons_stack_underflow; + csp--; + tdp = cons_dp_stack[csp]; + hdr = cons_hdrlen_stack[csp]; + len = datalen; + datalen = cons_datalen_stack[csp]; + pr_debug("- end cons t=%zu dp=%zu l=%zu/%zu\n", + tdp, dp, len, datalen); + if (datalen == 0) { + /* Indefinite length - check for the EOC. 
*/ + datalen = len; + if (unlikely(datalen - dp < 2)) + goto data_overrun_error; + if (data[dp++] != 0) { + if (op & ASN1_OP_END__OF) { + dp--; + csp++; + pc = machine[pc + 1]; + pr_debug("- continue\n"); + goto next_op; + } + goto missing_eoc; + } + if (data[dp++] != 0) + goto invalid_eoc; + len = dp - tdp - 2; + } else { + if (dp < len && (op & ASN1_OP_END__OF)) { + datalen = len; + csp++; + pc = machine[pc + 1]; + pr_debug("- continue\n"); + goto next_op; + } + if (dp != len) + goto cons_length_error; + len -= tdp; + pr_debug("- cons len l=%zu d=%zu\n", len, dp - tdp); + } + + if (op & ASN1_OP_END__ACT) { + unsigned char act; + if (op & ASN1_OP_END__OF) + act = machine[pc + 2]; + else + act = machine[pc + 1]; + ret = actions[act](context, hdr, 0, data + tdp, len); + } + pc += asn1_op_lengths[op]; + goto next_op; + + case ASN1_OP_ACT: + ret = actions[machine[pc + 1]](context, hdr, tag, data + tdp, len); + pc += asn1_op_lengths[op]; + goto next_op; + + case ASN1_OP_RETURN: + if (unlikely(jsp <= 0)) + goto jump_stack_underflow; + pc = jump_stack[--jsp]; + goto next_op; + + default: + break; + } + + /* Shouldn't reach here */ + pr_err("ASN.1 decoder error: Found reserved opcode (%u)\n", op); + return -EBADMSG; + +data_overrun_error: + errmsg = "Data overrun error"; + goto error; +machine_overrun_error: + errmsg = "Machine overrun error"; + goto error; +jump_stack_underflow: + errmsg = "Jump stack underflow"; + goto error; +jump_stack_overflow: + errmsg = "Jump stack overflow"; + goto error; +cons_stack_underflow: + errmsg = "Cons stack underflow"; + goto error; +cons_stack_overflow: + errmsg = "Cons stack overflow"; + goto error; +cons_length_error: + errmsg = "Cons length error"; + goto error; +missing_eoc: + errmsg = "Missing EOC in indefinite len cons"; + goto error; +invalid_eoc: + errmsg = "Invalid length EOC"; + goto error; +length_too_long: + errmsg = "Unsupported length"; + goto error; +indefinite_len_primitive: + errmsg = "Indefinite len primitive not permitted"; + goto error; +tag_mismatch: + errmsg = "Unexpected tag"; + goto error; +long_tag_not_supported: + errmsg = "Long tag not supported"; +error: + pr_debug("\nASN1: %s [m=%zu d=%zu ot=%02x t=%02x l=%zu]\n", + errmsg, pc, dp, optag, tag, len); + return -EBADMSG; +} +EXPORT_SYMBOL_GPL(asn1_ber_decoder); -- cgit v1.2.3 From e1045992949160b56309b730b8bdc428f2f8b69e Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 24 Sep 2012 17:11:27 +0100 Subject: MPILIB: Provide a function to read raw data into an MPI Provide a function to read raw data of a predetermined size into an MPI rather than expecting the size to be encoded within the data. The data is assumed to represent an unsigned integer, and the resulting MPI will be positive. The function looks like this: MPI mpi_read_raw_data(const void *, size_t); This is useful for reading ASN.1 integer primitives where the length is encoded in the ASN.1 metadata. 
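For illustration, here is a minimal sketch of how the new helper could be driven from an ASN.1 action callback. The callback name and its wiring are assumptions made for this example, not part of the patch; the parameter order follows the action calls visible in asn1_ber_decoder() above:

	/*
	 * Hypothetical action callback (illustrative only). The decoder
	 * hands over the content octets of a matched INTEGER primitive;
	 * their length came from the ASN.1 metadata, so the raw bytes can
	 * be passed straight to mpi_read_raw_data().
	 */
	static int example_store_integer(void *context, size_t hdrlen,
					 unsigned char tag,
					 const void *value, size_t vlen)
	{
		MPI *mpi = context;

		*mpi = mpi_read_raw_data(value, vlen);
		return *mpi ? 0 : -ENOMEM;
	}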
Signed-off-by: David Howells Signed-off-by: Rusty Russell --- include/linux/mpi.h | 1 + lib/mpi/mpicoder.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) (limited to 'lib') diff --git a/include/linux/mpi.h b/include/linux/mpi.h index d02cca6cc8ce..5af1b81def49 100644 --- a/include/linux/mpi.h +++ b/include/linux/mpi.h @@ -76,6 +76,7 @@ void mpi_swap(MPI a, MPI b); /*-- mpicoder.c --*/ MPI do_encode_md(const void *sha_buffer, unsigned nbits); +MPI mpi_read_raw_data(const void *xbuffer, size_t nbytes); MPI mpi_read_from_buffer(const void *buffer, unsigned *ret_nread); int mpi_fromstr(MPI val, const char *str); u32 mpi_get_keyid(MPI a, u32 *keyid); diff --git a/lib/mpi/mpicoder.c b/lib/mpi/mpicoder.c index f0fa65995800..3962b7f7fe3f 100644 --- a/lib/mpi/mpicoder.c +++ b/lib/mpi/mpicoder.c @@ -18,10 +18,65 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ +#include <linux/bitops.h> +#include <linux/count_zeros.h> #include "mpi-internal.h" #define MAX_EXTERN_MPI_BITS 16384 +/** + * mpi_read_raw_data - Read a raw byte stream as a positive integer + * @xbuffer: The data to read + * @nbytes: The amount of data to read + */ +MPI mpi_read_raw_data(const void *xbuffer, size_t nbytes) +{ + const uint8_t *buffer = xbuffer; + int i, j; + unsigned nbits, nlimbs; + mpi_limb_t a; + MPI val = NULL; + + while (nbytes > 0 && buffer[0] == 0) { + buffer++; + nbytes--; + } + + nbits = nbytes * 8; + if (nbits > MAX_EXTERN_MPI_BITS) { + pr_info("MPI: mpi too large (%u bits)\n", nbits); + return NULL; + } + if (nbytes > 0) + nbits -= count_leading_zeros(buffer[0]); + else + nbits = 0; + + nlimbs = (nbytes + BYTES_PER_MPI_LIMB - 1) / BYTES_PER_MPI_LIMB; + val = mpi_alloc(nlimbs); + if (!val) + return NULL; + val->nbits = nbits; + val->sign = 0; + val->nlimbs = nlimbs; + + if (nbytes > 0) { + i = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB; + i %= BYTES_PER_MPI_LIMB; + for (j = nlimbs; j > 0; j--) { + a = 0; + for (; i < BYTES_PER_MPI_LIMB; i++) { + a <<= 8; + a |= *buffer++; + } + i = 0; + val->d[j - 1] = a; + } + } + return val; +} +EXPORT_SYMBOL_GPL(mpi_read_raw_data); + MPI mpi_read_from_buffer(const void *xbuffer, unsigned *ret_nread) { const uint8_t *buffer = xbuffer; -- cgit v1.2.3 From b69ec42b1b194cc88f04b3fbcda8d3f93182d6c3 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 8 Oct 2012 16:28:11 -0700 Subject: Kconfig: clean up the long arch list for the DEBUG_KMEMLEAK config option Introduce HAVE_DEBUG_KMEMLEAK config option and select it in corresponding architecture Kconfig files. DEBUG_KMEMLEAK now only depends on HAVE_DEBUG_KMEMLEAK. Signed-off-by: Catalin Marinas Cc: Russell King Cc: Michal Simek Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H.
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 1 + arch/arm64/Kconfig | 1 + arch/microblaze/Kconfig | 1 + arch/mips/Kconfig | 1 + arch/powerpc/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/sh/Kconfig | 1 + arch/sparc/Kconfig | 1 + arch/tile/Kconfig | 1 + arch/x86/Kconfig | 1 + lib/Kconfig.debug | 8 ++++---- 11 files changed, 14 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 5f5439672932..2867a7742306 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -25,6 +25,7 @@ config ARM select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL) select ARCH_BINFMT_ELF_RANDOMIZE_PIE select HAVE_GENERIC_DMA_COHERENT + select HAVE_DEBUG_KMEMLEAK select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZO select HAVE_KERNEL_LZMA diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e61acae0d891..5dc9273781d6 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -10,6 +10,7 @@ config ARM64 select GENERIC_TIME_VSYSCALL select HARDIRQS_SW_RESEND select HAVE_ARCH_TRACEHOOK + select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG select HAVE_DMA_ATTRS select HAVE_GENERIC_DMA_COHERENT diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 6133bed2b855..53fd94ab60f0 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -16,6 +16,7 @@ config MICROBLAZE select OF select OF_EARLY_FLATTREE select ARCH_WANT_IPC_PARSE_VERSION + select HAVE_DEBUG_KMEMLEAK select IRQ_DOMAIN select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_PROBE diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index faf65286574e..335115e5bdd9 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -17,6 +17,7 @@ config MIPS select HAVE_FUNCTION_GRAPH_TRACER select HAVE_KPROBES select HAVE_KRETPROBES + select HAVE_DEBUG_KMEMLEAK select ARCH_BINFMT_ELF_RANDOMIZE_PIE select RTC_LIB if !MACH_LOONGSON select GENERIC_ATOMIC64 if !64BIT diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 4ce0be32d153..6a798a70a6d1 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -113,6 +113,7 @@ config PPC select HAVE_DMA_API_DEBUG select USE_GENERIC_SMP_HELPERS if SMP select HAVE_OPROFILE + select HAVE_DEBUG_KMEMLEAK select HAVE_SYSCALL_WRAPPERS if PPC64 select GENERIC_ATOMIC64 if PPC32 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index baba37cfcf84..8c6d7986f6d2 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -80,6 +80,7 @@ config S390 select HAVE_IRQ_WORK select HAVE_PERF_EVENTS select ARCH_HAVE_NMI_SAFE_CMPXCHG + select HAVE_DEBUG_KMEMLEAK select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index f0c85e424777..cfbf3e3c982b 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -16,6 +16,7 @@ config SUPERH select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) select PERF_USE_VMALLOC + select HAVE_DEBUG_KMEMLEAK select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index e66481015d3b..274d6cf0ada2 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -60,6 +60,7 @@ config SPARC64 select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_SYSCALL_TRACEPOINTS + select HAVE_DEBUG_KMEMLEAK select RTC_DRV_CMOS select RTC_DRV_BQ4802 select RTC_DRV_SUN4V diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index c9a3c1fe7297..9a0d77d3ba14 100644 --- a/arch/tile/Kconfig +++ 
b/arch/tile/Kconfig @@ -9,6 +9,7 @@ config TILE select GENERIC_FIND_FIRST_BIT select USE_GENERIC_SMP_HELPERS select CC_OPTIMIZE_FOR_SIZE + select HAVE_DEBUG_KMEMLEAK select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fd5d7c2c2daa..3fea1848d955 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -66,6 +66,7 @@ config X86 select HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_DEBUG_KMEMLEAK select ANON_INODES select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 select HAVE_CMPXCHG_LOCAL if !M386 diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7fba3a98967f..736db3990506 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -450,12 +450,12 @@ config SLUB_STATS out which slabs are relevant to a particular load. Try running: slabinfo -DA +config HAVE_DEBUG_KMEMLEAK + bool + config DEBUG_KMEMLEAK bool "Kernel memory leak detector" - depends on DEBUG_KERNEL && EXPERIMENTAL && \ - (X86 || ARM || PPC || MIPS || S390 || SPARC64 || SUPERH || \ - MICROBLAZE || TILE || ARM64) - + depends on DEBUG_KERNEL && EXPERIMENTAL && HAVE_DEBUG_KMEMLEAK select DEBUG_FS select STACKTRACE if STACKTRACE_SUPPORT select KALLSYMS -- cgit v1.2.3 From 9b2a60c484715e2d2f07d8192fd9f18435cbc77c Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 8 Oct 2012 16:28:13 -0700 Subject: Kconfig: clean up the long arch list for the DEBUG_BUGVERBOSE config option Introduce HAVE_DEBUG_BUGVERBOSE config option and select it in corresponding architecture Kconfig files. Architectures that already select GENERIC_BUG don't need to select HAVE_DEBUG_BUGVERBOSE. Signed-off-by: Catalin Marinas Acked-by: Geert Uytterhoeven Cc: David Howells Cc: Hirokazu Takata Cc: Paul Mundt Cc: "David S. 
Miller" Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 1 + arch/frv/Kconfig | 1 + arch/m32r/Kconfig | 1 + arch/m68k/Kconfig | 1 + arch/sh/Kconfig | 1 + arch/sparc/Kconfig | 1 + arch/tile/Kconfig | 1 + lib/Kconfig.debug | 8 ++++---- 8 files changed, 11 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 5dc9273781d6..a30856058742 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -10,6 +10,7 @@ config ARM64 select GENERIC_TIME_VSYSCALL select HARDIRQS_SW_RESEND select HAVE_ARCH_TRACEHOOK + select HAVE_DEBUG_BUGVERBOSE select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG select HAVE_DMA_ATTRS diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index cc5709d18350..9d262645f667 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -8,6 +8,7 @@ config FRV select HAVE_UID16 select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_SHOW + select HAVE_DEBUG_BUGVERBOSE select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_CPU_DEVICES select ARCH_WANT_IPC_PARSE_VERSION diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 49498bbb9616..e875fc3ce9cb 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -8,6 +8,7 @@ config M32R select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA select ARCH_WANT_IPC_PARSE_VERSION + select HAVE_DEBUG_BUGVERBOSE select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 3e2b2f66db60..dae1e7e16a37 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -3,6 +3,7 @@ config M68K default y select HAVE_IDE select HAVE_AOUT if MMU + select HAVE_DEBUG_BUGVERBOSE select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_SHOW select GENERIC_ATOMIC64 diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index cfbf3e3c982b..3b3e27a3ff2c 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -13,6 +13,7 @@ config SUPERH select HAVE_DMA_ATTRS select HAVE_IRQ_WORK select HAVE_PERF_EVENTS + select HAVE_DEBUG_BUGVERBOSE select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) select PERF_USE_VMALLOC diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 274d6cf0ada2..700a01adec3a 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -32,6 +32,7 @@ config SPARC select GENERIC_PCI_IOMAP select HAVE_NMI_WATCHDOG if SPARC64 select HAVE_BPF_JIT + select HAVE_DEBUG_BUGVERBOSE select GENERIC_SMP_IDLE_THREAD select GENERIC_CMOS_UPDATE select GENERIC_CLOCKEVENTS diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 9a0d77d3ba14..df69d4296b4b 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -14,6 +14,7 @@ config TILE select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP select GENERIC_IRQ_SHOW + select HAVE_DEBUG_BUGVERBOSE select HAVE_SYSCALL_WRAPPERS if TILEGX select SYS_HYPERVISOR select ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 736db3990506..b7281e4d1473 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -751,12 +751,12 @@ config DEBUG_HIGHMEM This options enables addition error checking for high memory systems. Disable for production systems. 
+config HAVE_DEBUG_BUGVERBOSE + bool + config DEBUG_BUGVERBOSE bool "Verbose BUG() reporting (adds 70K)" if DEBUG_KERNEL && EXPERT - depends on BUG - depends on ARM || AVR32 || M32R || M68K || SPARC32 || SPARC64 || \ - FRV || SUPERH || GENERIC_BUG || BLACKFIN || MN10300 || \ - TILE || ARM64 + depends on BUG && (GENERIC_BUG || HAVE_DEBUG_BUGVERBOSE) default y help Say Y here to make BUG() panics output the file name and line number -- cgit v1.2.3 From 4c199a93a2d36b277a9fd209a0f2793f8460a215 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:32 -0700 Subject: rbtree: empty nodes have no color Empty nodes have no color. We can make use of this property to simplify the code emitted by the RB_EMPTY_NODE and RB_CLEAR_NODE macros. Also, we can get rid of the rb_init_node function which had been introduced by commit 88d19cf37952 ("timers: Add rb_init_node() to allow for stack allocated rb nodes") to avoid some issue with the empty node's color not being initialized. I'm not sure what the RB_EMPTY_NODE checks in rb_prev() / rb_next() are doing there, though. axboe introduced them in commit 10fd48f2376d ("rbtree: fixed reversed RB_EMPTY_NODE and rb_next/prev"). The way I see it, the 'empty node' abstraction is only used by rbtree users to flag nodes that they haven't inserted in any rbtree, so asking the predecessor or successor of such nodes doesn't make any sense. One final rb_init_node() caller was recently added in sysctl code to implement faster sysctl name lookups. This code doesn't make use of RB_EMPTY_NODE at all, and from what I could see it only called rb_init_node() under the mistaken assumption that such initialization was required before node insertion. [sfr@canb.auug.org.au: fix net/ceph/osd_client.c build] Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. 
Biederman" Cc: John Stultz Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 4 +--- include/linux/rbtree.h | 15 +++++---------- include/linux/timerqueue.h | 2 +- lib/rbtree.c | 4 ++-- net/ceph/osd_client.c | 1 - 5 files changed, 9 insertions(+), 17 deletions(-) (limited to 'lib') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index dcd56f84db7e..fddc50729632 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -168,10 +168,8 @@ static void init_header(struct ctl_table_header *head, head->node = node; if (node) { struct ctl_table *entry; - for (entry = table; entry->procname; entry++, node++) { - rb_init_node(&node->node); + for (entry = table; entry->procname; entry++, node++) node->header = head; - } } } diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index e6a807720ded..2049087c43b7 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -67,17 +67,12 @@ static inline void rb_set_color(struct rb_node *rb, int color) #define RB_ROOT (struct rb_root) { NULL, } #define rb_entry(ptr, type, member) container_of(ptr, type, member) -#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) -#define RB_EMPTY_NODE(node) (rb_parent(node) == node) -#define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) +#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) + +/* 'empty' nodes are nodes that are known not to be inserted in an rbree */ +#define RB_EMPTY_NODE(node) ((node)->rb_parent_color == (unsigned long)(node)) +#define RB_CLEAR_NODE(node) ((node)->rb_parent_color = (unsigned long)(node)) -static inline void rb_init_node(struct rb_node *rb) -{ - rb->rb_parent_color = 0; - rb->rb_right = NULL; - rb->rb_left = NULL; - RB_CLEAR_NODE(rb); -} extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h index 5088727478fd..a520fd70a59f 100644 --- a/include/linux/timerqueue.h +++ b/include/linux/timerqueue.h @@ -39,7 +39,7 @@ struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) static inline void timerqueue_init(struct timerqueue_node *node) { - rb_init_node(&node->node); + RB_CLEAR_NODE(&node->node); } static inline void timerqueue_init_head(struct timerqueue_head *head) diff --git a/lib/rbtree.c b/lib/rbtree.c index d4175565dc2c..fe43c8c5f527 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -387,7 +387,7 @@ struct rb_node *rb_next(const struct rb_node *node) { struct rb_node *parent; - if (rb_parent(node) == node) + if (RB_EMPTY_NODE(node)) return NULL; /* If we have a right-hand child, go down and then left as far @@ -416,7 +416,7 @@ struct rb_node *rb_prev(const struct rb_node *node) { struct rb_node *parent; - if (rb_parent(node) == node) + if (RB_EMPTY_NODE(node)) return NULL; /* If we have a left-hand child, go down and then right as far diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ccbdfbba9e53..c1d756cc7448 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -221,7 +221,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, kref_init(&req->r_kref); init_completion(&req->r_completion); init_completion(&req->r_safe_completion); - rb_init_node(&req->r_node); INIT_LIST_HEAD(&req->r_unsafe_item); INIT_LIST_HEAD(&req->r_linger_item); INIT_LIST_HEAD(&req->r_linger_osd); -- cgit v1.2.3 From bf7ad8eeab995710c766df49c9c69a8592ca0216 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse 
Date: Mon, 8 Oct 2012 16:30:37 -0700 Subject: rbtree: move some implementation details from rbtree.h to rbtree.c rbtree users must use the documented APIs to manipulate the tree structure. Low-level helpers to manipulate node colors and parenthood are not part of that API, so move them to lib/rbtree.c [dwmw2@infradead.org: fix jffs2 build issue due to renamed __rb_parent_color field] Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Signed-off-by: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jffs2/readinode.c | 13 ++++++++----- include/linux/rbtree.h | 34 +++++++++------------------------- lib/rbtree.c | 20 +++++++++++++++++++- 3 files changed, 36 insertions(+), 31 deletions(-) (limited to 'lib') diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 1ea349fff68b..ae81b01e6fd7 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -394,8 +394,11 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c, } /* Trivial function to remove the last node in the tree. Which by definition - has no right-hand -- so can be removed just by making its only child (if - any) take its place under its parent. */ + has no right-hand child — so can be removed just by making its left-hand + child (if any) take its place under its parent. Since this is only done + when we're consuming the whole tree, there's no need to use rb_erase() + and let it worry about adjusting colours and balancing the tree. That + would just be a waste of time. */ static void eat_last(struct rb_root *root, struct rb_node *node) { struct rb_node *parent = rb_parent(node); @@ -412,12 +415,12 @@ static void eat_last(struct rb_root *root, struct rb_node *node) link = &parent->rb_right; *link = node->rb_left; - /* Colour doesn't matter now. Only the parent pointer. */ if (node->rb_left) - node->rb_left->rb_parent_color = node->rb_parent_color; + node->rb_left->__rb_parent_color = node->__rb_parent_color; } -/* We put this in reverse order, so we can just use eat_last */ +/* We put the version tree in reverse order, so we can use the same eat_last() + function that we use to consume the tmpnode tree (tn_root). 
*/ static void ver_insert(struct rb_root *ver_root, struct jffs2_tmp_dnode_info *tn) { struct rb_node **link = &ver_root->rb_node; diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 2049087c43b7..bf836a2c6a32 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -32,37 +32,19 @@ #include #include -struct rb_node -{ - unsigned long rb_parent_color; -#define RB_RED 0 -#define RB_BLACK 1 +struct rb_node { + unsigned long __rb_parent_color; struct rb_node *rb_right; struct rb_node *rb_left; } __attribute__((aligned(sizeof(long)))); /* The alignment might seem pointless, but allegedly CRIS needs it */ -struct rb_root -{ +struct rb_root { struct rb_node *rb_node; }; -#define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~3)) -#define rb_color(r) ((r)->rb_parent_color & 1) -#define rb_is_red(r) (!rb_color(r)) -#define rb_is_black(r) rb_color(r) -#define rb_set_red(r) do { (r)->rb_parent_color &= ~1; } while (0) -#define rb_set_black(r) do { (r)->rb_parent_color |= 1; } while (0) - -static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) -{ - rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p; -} -static inline void rb_set_color(struct rb_node *rb, int color) -{ - rb->rb_parent_color = (rb->rb_parent_color & ~1) | color; -} +#define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define RB_ROOT (struct rb_root) { NULL, } #define rb_entry(ptr, type, member) container_of(ptr, type, member) @@ -70,8 +52,10 @@ static inline void rb_set_color(struct rb_node *rb, int color) #define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) /* 'empty' nodes are nodes that are known not to be inserted in an rbree */ -#define RB_EMPTY_NODE(node) ((node)->rb_parent_color == (unsigned long)(node)) -#define RB_CLEAR_NODE(node) ((node)->rb_parent_color = (unsigned long)(node)) +#define RB_EMPTY_NODE(node) \ + ((node)->__rb_parent_color == (unsigned long)(node)) +#define RB_CLEAR_NODE(node) \ + ((node)->__rb_parent_color = (unsigned long)(node)) extern void rb_insert_color(struct rb_node *, struct rb_root *); @@ -98,7 +82,7 @@ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, struct rb_node ** rb_link) { - node->rb_parent_color = (unsigned long )parent; + node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; *rb_link = node; diff --git a/lib/rbtree.c b/lib/rbtree.c index fe43c8c5f527..ccada9abe6f5 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -23,6 +23,24 @@ #include #include +#define RB_RED 0 +#define RB_BLACK 1 + +#define rb_color(r) ((r)->__rb_parent_color & 1) +#define rb_is_red(r) (!rb_color(r)) +#define rb_is_black(r) rb_color(r) +#define rb_set_red(r) do { (r)->__rb_parent_color &= ~1; } while (0) +#define rb_set_black(r) do { (r)->__rb_parent_color |= 1; } while (0) + +static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) +{ + rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; +} +static inline void rb_set_color(struct rb_node *rb, int color) +{ + rb->__rb_parent_color = (rb->__rb_parent_color & ~1) | color; +} + static void __rb_rotate_left(struct rb_node *node, struct rb_root *root) { struct rb_node *right = node->rb_right; @@ -255,7 +273,7 @@ void rb_erase(struct rb_node *node, struct rb_root *root) rb_set_parent(old->rb_right, node); } - node->rb_parent_color = old->rb_parent_color; + node->__rb_parent_color = old->__rb_parent_color; node->rb_left = old->rb_left; 
rb_set_parent(old->rb_left, node); -- cgit v1.2.3 From 910a742d4ba863848c7283d69c21bfa779d3b9a8 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:39 -0700 Subject: rbtree: performance and correctness test This small module helps measure the performance of rbtree insert and erase. Additionally, we run a few correctness tests to check that the rbtrees have all desired properties: - contains the right number of nodes in the order desired, - never two consecutive red nodes on any path, - all paths to leaf nodes have the same number of black nodes, - root node is black [akpm@linux-foundation.org: fix printk warning: sparc64 cycles_t is unsigned long] Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 7 +++ lib/Makefile | 2 + lib/rbtree_test.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 144 insertions(+) create mode 100644 lib/rbtree_test.c (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b7281e4d1473..a4e5d93b0f41 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1282,6 +1282,13 @@ config LATENCYTOP source mm/Kconfig.debug source kernel/trace/Kconfig +config RBTREE_TEST + tristate "Red-Black tree test" + depends on m && DEBUG_KERNEL + help + A benchmark measuring the performance of the rbtree library. + Also includes rbtree invariant checks. + config PROVIDE_OHCI1394_DMA_INIT bool "Remote debugging over FireWire early on boot" depends on PCI && X86 diff --git a/lib/Makefile b/lib/Makefile index 42d283edc4d3..f49445d26b65 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -140,6 +140,8 @@ $(foreach file, $(libfdt_files), \ $(eval CFLAGS_$(file) = -I$(src)/../scripts/dtc/libfdt)) lib-$(CONFIG_LIBFDT) += $(libfdt_files) +obj-$(CONFIG_RBTREE_TEST) += rbtree_test.o + hostprogs-y := gen_crc32table clean-files := crc32table.h diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c new file mode 100644 index 000000000000..19dfca9ff7d7 --- /dev/null +++ b/lib/rbtree_test.c @@ -0,0 +1,135 @@ +#include <linux/module.h> +#include <linux/rbtree.h> +#include <linux/random.h> +#include <asm/timex.h> + +#define NODES 100 +#define PERF_LOOPS 100000 +#define CHECK_LOOPS 100 + +struct test_node { + struct rb_node rb; + u32 key; +}; + +static struct rb_root root = RB_ROOT; +static struct test_node nodes[NODES]; + +static struct rnd_state rnd; + +static void insert(struct test_node *node, struct rb_root *root) +{ + struct rb_node **new = &root->rb_node, *parent = NULL; + + while (*new) { + parent = *new; + if (node->key < rb_entry(parent, struct test_node, rb)->key) + new = &parent->rb_left; + else + new = &parent->rb_right; + } + + rb_link_node(&node->rb, parent, new); + rb_insert_color(&node->rb, root); +} + +static inline void erase(struct test_node *node, struct rb_root *root) +{ + rb_erase(&node->rb, root); +} + +static void init(void) +{ + int i; + for (i = 0; i < NODES; i++) + nodes[i].key = prandom32(&rnd); +} + +static bool is_red(struct rb_node *rb) +{ + return !(rb->__rb_parent_color & 1); +} + +static int black_path_count(struct rb_node *rb) +{ + int count; + for (count = 0; rb; rb = rb_parent(rb)) + count += !is_red(rb); + return count; +} + +static void check(int nr_nodes) +{ + struct rb_node *rb; + int count = 0; + int blacks; + u32 prev_key = 0; + + for (rb = rb_first(&root); rb; rb = rb_next(rb)) { + struct test_node *node = rb_entry(rb, struct test_node, rb);
WARN_ON_ONCE(node->key < prev_key); + WARN_ON_ONCE(is_red(rb) && + (!rb_parent(rb) || is_red(rb_parent(rb)))); + if (!count) + blacks = black_path_count(rb); + else + WARN_ON_ONCE((!rb->rb_left || !rb->rb_right) && + blacks != black_path_count(rb)); + prev_key = node->key; + count++; + } + WARN_ON_ONCE(count != nr_nodes); +} + +static int rbtree_test_init(void) +{ + int i, j; + cycles_t time1, time2, time; + + printk(KERN_ALERT "rbtree testing"); + + prandom32_seed(&rnd, 3141592653589793238); + init(); + + time1 = get_cycles(); + + for (i = 0; i < PERF_LOOPS; i++) { + for (j = 0; j < NODES; j++) + insert(nodes + j, &root); + for (j = 0; j < NODES; j++) + erase(nodes + j, &root); + } + + time2 = get_cycles(); + time = time2 - time1; + + time = div_u64(time, PERF_LOOPS); + printk(" -> %llu cycles\n", (unsigned long long)time); + + for (i = 0; i < CHECK_LOOPS; i++) { + init(); + for (j = 0; j < NODES; j++) { + check(j); + insert(nodes + j, &root); + } + for (j = 0; j < NODES; j++) { + check(NODES - j); + erase(nodes + j, &root); + } + check(0); + } + + return -EAGAIN; /* Fail will directly unload the module */ +} + +static void rbtree_test_exit(void) +{ + printk(KERN_ALERT "test exit\n"); +} + +module_init(rbtree_test_init) +module_exit(rbtree_test_exit) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Michel Lespinasse"); +MODULE_DESCRIPTION("Red Black Tree test"); -- cgit v1.2.3 From 1f0528653e41ec230c60f5738820e8a544731399 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:42 -0700 Subject: rbtree: break out of rb_insert_color loop after tree rotation It is a well known property of rbtrees that insertion never requires more than two tree rotations. In our implementation, after one loop iteration identified one or two necessary tree rotations, we would iterate and look for more. However at that point the node's parent would always be black, which would cause us to exit the loop. We can make the code flow more obvious by just adding a break statement after the tree rotations, where we know we are done. Additionally, in the cases where two tree rotations are necessary, we don't have to update the 'node' pointer as it wouldn't be used until the next loop iteration, which we now avoid due to this break statement. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/lib/rbtree.c b/lib/rbtree.c index ccada9abe6f5..12abb8abf442 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -109,18 +109,15 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) } } - if (parent->rb_right == node) - { - register struct rb_node *tmp; + if (parent->rb_right == node) { __rb_rotate_left(parent, root); - tmp = parent; parent = node; - node = tmp; } rb_set_black(parent); rb_set_red(gparent); __rb_rotate_right(gparent, root); + break; } else { { register struct rb_node *uncle = gparent->rb_left; @@ -134,18 +131,15 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) } } - if (parent->rb_left == node) - { - register struct rb_node *tmp; + if (parent->rb_left == node) { __rb_rotate_right(parent, root); - tmp = parent; parent = node; - node = tmp; } rb_set_black(parent); rb_set_red(gparent); __rb_rotate_left(gparent, root); + break; } } -- cgit v1.2.3 From 6d58452dc066db61acdff7b84671db1b11a3de1c Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:44 -0700 Subject: rbtree: adjust root color in rb_insert_color() only when necessary The root node of an rbtree must always be black. However, rb_insert_color() only needs to maintain this invariant when it has been broken - that is, when it exits the loop due to the current (red) node being the root. In all other cases (exiting after tree rotations, or exiting due to an existing black parent) the invariant is already satisfied, so there is no need to adjust the root node color. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/rbtree.c b/lib/rbtree.c index 12abb8abf442..d0be5fcaafe8 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -91,8 +91,21 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) { struct rb_node *parent, *gparent; - while ((parent = rb_parent(node)) && rb_is_red(parent)) - { + while (true) { + /* + * Loop invariant: node is red + * + * If there is a black parent, we are done. + * Otherwise, take some corrective action as we don't + * want a red root or two consecutive red nodes. + */ + parent = rb_parent(node); + if (!parent) { + rb_set_black(node); + break; + } else if (rb_is_black(parent)) + break; + gparent = rb_parent(parent); if (parent == gparent->rb_left) @@ -142,8 +155,6 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) break; } } - - rb_set_black(root->rb_node); } EXPORT_SYMBOL(rb_insert_color); -- cgit v1.2.3 From 5bc9188aa207dafd47eab57df7c4fe5b3d3f636a Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:47 -0700 Subject: rbtree: low level optimizations in rb_insert_color() - Use the newly introduced rb_set_parent_color() function to flip the color of nodes whose parent is already known. - Optimize rb_parent() when the node is known to be red - there is no need to mask out the color in that case. - Flipping gparent's color to red requires us to fetch its rb_parent_color field, so we can reuse it as the parent value for the next loop iteration. 
- Do not use __rb_rotate_left() and __rb_rotate_right() to handle tree rotations: we already have pointers to all relevant nodes, and know their colors (either because we want to adjust it, or because we've tested it, or we can deduce it as black due to the node proximity to a known red node). So we can generate more efficient code by making use of the node pointers we already have, and setting both the parent and color attributes for nodes all at once. Also in Case 2, some node attributes don't have to be set because we know another tree rotation (Case 3) will always follow and override them. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 166 ++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 131 insertions(+), 35 deletions(-) (limited to 'lib') diff --git a/lib/rbtree.c b/lib/rbtree.c index d0be5fcaafe8..41cf19b2fe51 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -23,6 +23,25 @@ #include #include +/* + * red-black trees properties: http://en.wikipedia.org/wiki/Rbtree + * + * 1) A node is either red or black + * 2) The root is black + * 3) All leaves (NULL) are black + * 4) Both children of every red node are black + * 5) Every simple path from root to leaves contains the same number + * of black nodes. + * + * 4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two + * consecutive red nodes in a path and every red node is therefore followed by + * a black. So if B is the number of black nodes on every simple path (as per + * 5), then the longest possible path due to 4 is 2B. + * + * We shall indicate color with case, where black nodes are uppercase and red + * nodes will be lowercase. + */ + #define RB_RED 0 #define RB_BLACK 1 @@ -41,6 +60,17 @@ static inline void rb_set_color(struct rb_node *rb, int color) rb->__rb_parent_color = (rb->__rb_parent_color & ~1) | color; } +static inline void rb_set_parent_color(struct rb_node *rb, + struct rb_node *p, int color) +{ + rb->__rb_parent_color = (unsigned long)p | color; +} + +static inline struct rb_node *rb_red_parent(struct rb_node *red) +{ + return (struct rb_node *)red->__rb_parent_color; +} + static void __rb_rotate_left(struct rb_node *node, struct rb_root *root) { struct rb_node *right = node->rb_right; @@ -87,9 +117,30 @@ static void __rb_rotate_right(struct rb_node *node, struct rb_root *root) rb_set_parent(node, left); } +/* + * Helper function for rotations: + * - old's parent and color get assigned to new + * - old gets assigned new as a parent and 'color' as a color. + */ +static inline void +__rb_rotate_set_parents(struct rb_node *old, struct rb_node *new, + struct rb_root *root, int color) +{ + struct rb_node *parent = rb_parent(old); + new->__rb_parent_color = old->__rb_parent_color; + rb_set_parent_color(old, new, color); + if (parent) { + if (parent->rb_left == old) + parent->rb_left = new; + else + parent->rb_right = new; + } else + root->rb_node = new; +} + void rb_insert_color(struct rb_node *node, struct rb_root *root) { - struct rb_node *parent, *gparent; + struct rb_node *parent = rb_red_parent(node), *gparent, *tmp; while (true) { /* @@ -99,59 +150,104 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) * Otherwise, take some corrective action as we don't * want a red root or two consecutive red nodes. 
*/ - parent = rb_parent(node); if (!parent) { - rb_set_black(node); + rb_set_parent_color(node, NULL, RB_BLACK); break; } else if (rb_is_black(parent)) break; - gparent = rb_parent(parent); - - if (parent == gparent->rb_left) - { - { - register struct rb_node *uncle = gparent->rb_right; - if (uncle && rb_is_red(uncle)) - { - rb_set_black(uncle); - rb_set_black(parent); - rb_set_red(gparent); - node = gparent; - continue; - } + gparent = rb_red_parent(parent); + + if (parent == gparent->rb_left) { + tmp = gparent->rb_right; + if (tmp && rb_is_red(tmp)) { + /* + * Case 1 - color flips + * + * G g + * / \ / \ + * p u --> P U + * / / + * n N + * + * However, since g's parent might be red, and + * 4) does not allow this, we need to recurse + * at g. + */ + rb_set_parent_color(tmp, gparent, RB_BLACK); + rb_set_parent_color(parent, gparent, RB_BLACK); + node = gparent; + parent = rb_parent(node); + rb_set_parent_color(node, parent, RB_RED); + continue; } if (parent->rb_right == node) { - __rb_rotate_left(parent, root); + /* + * Case 2 - left rotate at parent + * + * G G + * / \ / \ + * p U --> n U + * \ / + * n p + * + * This still leaves us in violation of 4), the + * continuation into Case 3 will fix that. + */ + parent->rb_right = tmp = node->rb_left; + node->rb_left = parent; + if (tmp) + rb_set_parent_color(tmp, parent, + RB_BLACK); + rb_set_parent_color(parent, node, RB_RED); parent = node; } - rb_set_black(parent); - rb_set_red(gparent); - __rb_rotate_right(gparent, root); + /* + * Case 3 - right rotate at gparent + * + * G P + * / \ / \ + * p U --> n g + * / \ + * n U + */ + gparent->rb_left = tmp = parent->rb_right; + parent->rb_right = gparent; + if (tmp) + rb_set_parent_color(tmp, gparent, RB_BLACK); + __rb_rotate_set_parents(gparent, parent, root, RB_RED); break; } else { - { - register struct rb_node *uncle = gparent->rb_left; - if (uncle && rb_is_red(uncle)) - { - rb_set_black(uncle); - rb_set_black(parent); - rb_set_red(gparent); - node = gparent; - continue; - } + tmp = gparent->rb_left; + if (tmp && rb_is_red(tmp)) { + /* Case 1 - color flips */ + rb_set_parent_color(tmp, gparent, RB_BLACK); + rb_set_parent_color(parent, gparent, RB_BLACK); + node = gparent; + parent = rb_parent(node); + rb_set_parent_color(node, parent, RB_RED); + continue; } if (parent->rb_left == node) { - __rb_rotate_right(parent, root); + /* Case 2 - right rotate at parent */ + parent->rb_left = tmp = node->rb_right; + node->rb_right = parent; + if (tmp) + rb_set_parent_color(tmp, parent, + RB_BLACK); + rb_set_parent_color(parent, node, RB_RED); parent = node; } - rb_set_black(parent); - rb_set_red(gparent); - __rb_rotate_left(gparent, root); + /* Case 3 - left rotate at gparent */ + gparent->rb_right = tmp = parent->rb_left; + parent->rb_left = gparent; + if (tmp) + rb_set_parent_color(tmp, gparent, RB_BLACK); + __rb_rotate_set_parents(gparent, parent, root, RB_RED); break; } } -- cgit v1.2.3 From d6ff1273928ebf15466a85b7e1810cd00e72998b Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:50 -0700 Subject: rbtree: adjust node color in __rb_erase_color() only when necessary In __rb_erase_color(), we were always setting a node to black after exiting the main loop. And in one case, after fixing up the tree to satisfy all rbtree invariants, we were setting the current node to root just to guarantee a loop exit, at which point the root would be set to black. However this is not necessary, as the root of an rbtree is already known to be black. 
The only case where the color flip is required is when we exit the loop due to the current node being red, and it's easiest to just do the flip at that point instead of doing it after the loop. [adrian.hunter@intel.com: perf tools: fix build for another rbtree.c change] Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Signed-off-by: Adrian Hunter Cc: Alexander Shishkin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 28 +++++++++++++++++----------- tools/perf/util/include/linux/rbtree.h | 1 + 2 files changed, 18 insertions(+), 11 deletions(-) (limited to 'lib') diff --git a/lib/rbtree.c b/lib/rbtree.c index 41cf19b2fe51..baf7c835c57c 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -259,10 +259,22 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, { struct rb_node *other; - while ((!node || rb_is_black(node)) && node != root->rb_node) - { - if (parent->rb_left == node) - { + while (true) { + /* + * Loop invariant: all leaf paths going through node have a + * black node count that is 1 lower than other leaf paths. + * + * If node is red, we can flip it to black to adjust. + * If node is the root, all leaf paths go through it. + * Otherwise, we need to adjust the tree through color flips + * and tree rotations as per one of the 4 cases below. + */ + if (node && rb_is_red(node)) { + rb_set_black(node); + break; + } else if (!parent) { + break; + } else if (parent->rb_left == node) { other = parent->rb_right; if (rb_is_red(other)) { @@ -291,12 +303,9 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, rb_set_black(parent); rb_set_black(other->rb_right); __rb_rotate_left(parent, root); - node = root->rb_node; break; } - } - else - { + } else { other = parent->rb_left; if (rb_is_red(other)) { @@ -325,13 +334,10 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, rb_set_black(parent); rb_set_black(other->rb_left); __rb_rotate_right(parent, root); - node = root->rb_node; break; } } } - if (node) - rb_set_black(node); } void rb_erase(struct rb_node *node, struct rb_root *root) diff --git a/tools/perf/util/include/linux/rbtree.h b/tools/perf/util/include/linux/rbtree.h index 2a030c5af3aa..9bcdc844b330 100644 --- a/tools/perf/util/include/linux/rbtree.h +++ b/tools/perf/util/include/linux/rbtree.h @@ -1,2 +1,3 @@ #include +#include #include "../../../../include/linux/rbtree.h" -- cgit v1.2.3 From e125d1471a4f8f1bf7ea9a83deb8d23cb40bd712 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:54 -0700 Subject: rbtree: optimize case selection logic in __rb_erase_color() In __rb_erase_color(), we have to select one of 3 cases depending on the color on the 'other' node children. If both children are black, we flip a few node colors and iterate. Otherwise, we do either one or two tree rotations, depending on the color of the 'other' child opposite to 'node', and then we are done. The corresponding logic had duplicate checks for the color of the 'other' child opposite to 'node'. It was checking it first to determine if both children are black, and then to determine how many tree rotations are required. Rearrange the logic to avoid that extra check. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 68 +++++++++++++++++++++++++++--------------------------------- 1 file changed, 30 insertions(+), 38 deletions(-) (limited to 'lib') diff --git a/lib/rbtree.c b/lib/rbtree.c index baf7c835c57c..eb823a31099c 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -283,28 +283,24 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, __rb_rotate_left(parent, root); other = parent->rb_right; } - if ((!other->rb_left || rb_is_black(other->rb_left)) && - (!other->rb_right || rb_is_black(other->rb_right))) - { - rb_set_red(other); - node = parent; - parent = rb_parent(node); - } - else - { - if (!other->rb_right || rb_is_black(other->rb_right)) - { - rb_set_black(other->rb_left); + if (!other->rb_right || rb_is_black(other->rb_right)) { + if (!other->rb_left || + rb_is_black(other->rb_left)) { rb_set_red(other); - __rb_rotate_right(other, root); - other = parent->rb_right; + node = parent; + parent = rb_parent(node); + continue; } - rb_set_color(other, rb_color(parent)); - rb_set_black(parent); - rb_set_black(other->rb_right); - __rb_rotate_left(parent, root); - break; + rb_set_black(other->rb_left); + rb_set_red(other); + __rb_rotate_right(other, root); + other = parent->rb_right; } + rb_set_color(other, rb_color(parent)); + rb_set_black(parent); + rb_set_black(other->rb_right); + __rb_rotate_left(parent, root); + break; } else { other = parent->rb_left; if (rb_is_red(other)) @@ -314,28 +310,24 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, __rb_rotate_right(parent, root); other = parent->rb_left; } - if ((!other->rb_left || rb_is_black(other->rb_left)) && - (!other->rb_right || rb_is_black(other->rb_right))) - { - rb_set_red(other); - node = parent; - parent = rb_parent(node); - } - else - { - if (!other->rb_left || rb_is_black(other->rb_left)) - { - rb_set_black(other->rb_right); + if (!other->rb_left || rb_is_black(other->rb_left)) { + if (!other->rb_right || + rb_is_black(other->rb_right)) { rb_set_red(other); - __rb_rotate_left(other, root); - other = parent->rb_left; + node = parent; + parent = rb_parent(node); + continue; } - rb_set_color(other, rb_color(parent)); - rb_set_black(parent); - rb_set_black(other->rb_left); - __rb_rotate_right(parent, root); - break; + rb_set_black(other->rb_right); + rb_set_red(other); + __rb_rotate_left(other, root); + other = parent->rb_left; } + rb_set_color(other, rb_color(parent)); + rb_set_black(parent); + rb_set_black(other->rb_left); + __rb_rotate_right(parent, root); + break; } } } -- cgit v1.2.3 From 6280d2356fd8ad0936a63c10dc1e6accf48d0c61 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:57 -0700 Subject: rbtree: low level optimizations in __rb_erase_color() In __rb_erase_color(), we often already have pointers to the nodes being rotated and/or know what their colors must be, so we can generate more efficient code than the generic __rb_rotate_left() and __rb_rotate_right() functions. Also when the current node is red or when flipping the sibling's color, the parent is already known so we can use the more efficient rb_set_parent_color() function to set the desired color. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 208 +++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 115 insertions(+), 93 deletions(-) (limited to 'lib') diff --git a/lib/rbtree.c b/lib/rbtree.c index eb823a31099c..a38e473d8fe7 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -39,7 +39,8 @@ * 5), then the longest possible path due to 4 is 2B. * * We shall indicate color with case, where black nodes are uppercase and red - * nodes will be lowercase. + * nodes will be lowercase. Unknown color nodes shall be drawn as red within + * parentheses and have some accompanying text comment. */ #define RB_RED 0 @@ -48,17 +49,11 @@ #define rb_color(r) ((r)->__rb_parent_color & 1) #define rb_is_red(r) (!rb_color(r)) #define rb_is_black(r) rb_color(r) -#define rb_set_red(r) do { (r)->__rb_parent_color &= ~1; } while (0) -#define rb_set_black(r) do { (r)->__rb_parent_color |= 1; } while (0) static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; } -static inline void rb_set_color(struct rb_node *rb, int color) -{ - rb->__rb_parent_color = (rb->__rb_parent_color & ~1) | color; -} static inline void rb_set_parent_color(struct rb_node *rb, struct rb_node *p, int color) @@ -71,52 +66,6 @@ static inline struct rb_node *rb_red_parent(struct rb_node *red) return (struct rb_node *)red->__rb_parent_color; } -static void __rb_rotate_left(struct rb_node *node, struct rb_root *root) -{ - struct rb_node *right = node->rb_right; - struct rb_node *parent = rb_parent(node); - - if ((node->rb_right = right->rb_left)) - rb_set_parent(right->rb_left, node); - right->rb_left = node; - - rb_set_parent(right, parent); - - if (parent) - { - if (node == parent->rb_left) - parent->rb_left = right; - else - parent->rb_right = right; - } - else - root->rb_node = right; - rb_set_parent(node, right); -} - -static void __rb_rotate_right(struct rb_node *node, struct rb_root *root) -{ - struct rb_node *left = node->rb_left; - struct rb_node *parent = rb_parent(node); - - if ((node->rb_left = left->rb_right)) - rb_set_parent(left->rb_right, node); - left->rb_right = node; - - rb_set_parent(left, parent); - - if (parent) - { - if (node == parent->rb_right) - parent->rb_right = left; - else - parent->rb_left = left; - } - else - root->rb_node = left; - rb_set_parent(node, left); -} - /* * Helper function for rotations: * - old's parent and color get assigned to new @@ -257,7 +206,7 @@ EXPORT_SYMBOL(rb_insert_color); static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, struct rb_root *root) { - struct rb_node *other; + struct rb_node *sibling, *tmp1, *tmp2; while (true) { /* @@ -270,63 +219,136 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, * and tree rotations as per one of the 4 cases below. 
*/ if (node && rb_is_red(node)) { - rb_set_black(node); + rb_set_parent_color(node, parent, RB_BLACK); break; } else if (!parent) { break; } else if (parent->rb_left == node) { - other = parent->rb_right; - if (rb_is_red(other)) - { - rb_set_black(other); - rb_set_red(parent); - __rb_rotate_left(parent, root); - other = parent->rb_right; + sibling = parent->rb_right; + if (rb_is_red(sibling)) { + /* + * Case 1 - left rotate at parent + * + * P S + * / \ / \ + * N s --> p Sr + * / \ / \ + * Sl Sr N Sl + */ + parent->rb_right = tmp1 = sibling->rb_left; + sibling->rb_left = parent; + rb_set_parent_color(tmp1, parent, RB_BLACK); + __rb_rotate_set_parents(parent, sibling, root, + RB_RED); + sibling = tmp1; } - if (!other->rb_right || rb_is_black(other->rb_right)) { - if (!other->rb_left || - rb_is_black(other->rb_left)) { - rb_set_red(other); + tmp1 = sibling->rb_right; + if (!tmp1 || rb_is_black(tmp1)) { + tmp2 = sibling->rb_left; + if (!tmp2 || rb_is_black(tmp2)) { + /* + * Case 2 - sibling color flip + * (p could be either color here) + * + * (p) (p) + * / \ / \ + * N S --> N s + * / \ / \ + * Sl Sr Sl Sr + * + * This leaves us violating 5), so + * recurse at p. If p is red, the + * recursion will just flip it to black + * and exit. If coming from Case 1, + * p is known to be red. + */ + rb_set_parent_color(sibling, parent, + RB_RED); node = parent; parent = rb_parent(node); continue; } - rb_set_black(other->rb_left); - rb_set_red(other); - __rb_rotate_right(other, root); - other = parent->rb_right; + /* + * Case 3 - right rotate at sibling + * (p could be either color here) + * + * (p) (p) + * / \ / \ + * N S --> N Sl + * / \ \ + * sl Sr s + * \ + * Sr + */ + sibling->rb_left = tmp1 = tmp2->rb_right; + tmp2->rb_right = sibling; + parent->rb_right = tmp2; + if (tmp1) + rb_set_parent_color(tmp1, sibling, + RB_BLACK); + tmp1 = sibling; + sibling = tmp2; } - rb_set_color(other, rb_color(parent)); - rb_set_black(parent); - rb_set_black(other->rb_right); - __rb_rotate_left(parent, root); + /* + * Case 4 - left rotate at parent + color flips + * (p and sl could be either color here. 
+ * After rotation, p becomes black, s acquires + * p's color, and sl keeps its color) + * + * (p) (s) + * / \ / \ + * N S --> P Sr + * / \ / \ + * (sl) sr N (sl) + */ + parent->rb_right = tmp2 = sibling->rb_left; + sibling->rb_left = parent; + rb_set_parent_color(tmp1, sibling, RB_BLACK); + if (tmp2) + rb_set_parent(tmp2, parent); + __rb_rotate_set_parents(parent, sibling, root, + RB_BLACK); break; } else { - other = parent->rb_left; - if (rb_is_red(other)) - { - rb_set_black(other); - rb_set_red(parent); - __rb_rotate_right(parent, root); - other = parent->rb_left; + sibling = parent->rb_left; + if (rb_is_red(sibling)) { + /* Case 1 - right rotate at parent */ + parent->rb_left = tmp1 = sibling->rb_right; + sibling->rb_right = parent; + rb_set_parent_color(tmp1, parent, RB_BLACK); + __rb_rotate_set_parents(parent, sibling, root, + RB_RED); + sibling = tmp1; } - if (!other->rb_left || rb_is_black(other->rb_left)) { - if (!other->rb_right || - rb_is_black(other->rb_right)) { - rb_set_red(other); + tmp1 = sibling->rb_left; + if (!tmp1 || rb_is_black(tmp1)) { + tmp2 = sibling->rb_right; + if (!tmp2 || rb_is_black(tmp2)) { + /* Case 2 - sibling color flip */ + rb_set_parent_color(sibling, parent, + RB_RED); node = parent; parent = rb_parent(node); continue; } - rb_set_black(other->rb_right); - rb_set_red(other); - __rb_rotate_left(other, root); - other = parent->rb_left; + /* Case 3 - right rotate at sibling */ + sibling->rb_right = tmp1 = tmp2->rb_left; + tmp2->rb_left = sibling; + parent->rb_left = tmp2; + if (tmp1) + rb_set_parent_color(tmp1, sibling, + RB_BLACK); + tmp1 = sibling; + sibling = tmp2; } - rb_set_color(other, rb_color(parent)); - rb_set_black(parent); - rb_set_black(other->rb_left); - __rb_rotate_right(parent, root); + /* Case 4 - left rotate at parent + color flips */ + parent->rb_left = tmp2 = sibling->rb_right; + sibling->rb_right = parent; + rb_set_parent_color(tmp1, sibling, RB_BLACK); + if (tmp2) + rb_set_parent(tmp2, parent); + __rb_rotate_set_parents(parent, sibling, root, + RB_BLACK); break; } } -- cgit v1.2.3 From 7ce6ff9e5de99e7b72019c7de82fb438fe1dc5a0 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:01 -0700 Subject: rbtree: coding style adjustments Set comment and indentation style to be consistent with linux coding style and the rest of the file, as suggested by Peter Zijlstra Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) (limited to 'lib') diff --git a/lib/rbtree.c b/lib/rbtree.c index a38e473d8fe7..08926709b4f9 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -363,8 +363,7 @@ void rb_erase(struct rb_node *node, struct rb_root *root) child = node->rb_right; else if (!node->rb_right) child = node->rb_left; - else - { + else { struct rb_node *old = node, *left; node = node->rb_right; @@ -406,17 +405,15 @@ void rb_erase(struct rb_node *node, struct rb_root *root) if (child) rb_set_parent(child, parent); - if (parent) - { + if (parent) { if (parent->rb_left == node) parent->rb_left = child; else parent->rb_right = child; - } - else + } else root->rb_node = child; - color: +color: if (color == RB_BLACK) __rb_erase_color(child, parent, root); } @@ -529,8 +526,10 @@ struct rb_node *rb_next(const struct rb_node *node) if (RB_EMPTY_NODE(node)) return NULL; - /* If we have a right-hand child, go down and then left as far - as we can. */ + /* + * If we have a right-hand child, go down and then left as far + * as we can. + */ if (node->rb_right) { node = node->rb_right; while (node->rb_left) @@ -538,12 +537,13 @@ struct rb_node *rb_next(const struct rb_node *node) return (struct rb_node *)node; } - /* No right-hand children. Everything down and left is - smaller than us, so any 'next' node must be in the general - direction of our parent. Go up the tree; any time the - ancestor is a right-hand child of its parent, keep going - up. First time it's a left-hand child of its parent, said - parent is our 'next' node. */ + /* + * No right-hand children. Everything down and left is smaller than us, + * so any 'next' node must be in the general direction of our parent. + * Go up the tree; any time the ancestor is a right-hand child of its + * parent, keep going up. First time it's a left-hand child of its + * parent, said parent is our 'next' node. + */ while ((parent = rb_parent(node)) && node == parent->rb_right) node = parent; @@ -558,8 +558,10 @@ struct rb_node *rb_prev(const struct rb_node *node) if (RB_EMPTY_NODE(node)) return NULL; - /* If we have a left-hand child, go down and then right as far - as we can. */ + /* + * If we have a left-hand child, go down and then right as far + * as we can. + */ if (node->rb_left) { node = node->rb_left; while (node->rb_right) @@ -567,8 +569,10 @@ struct rb_node *rb_prev(const struct rb_node *node) return (struct rb_node *)node; } - /* No left-hand children. Go up till we find an ancestor which - is a right-hand child of its parent */ + /* + * No left-hand children. Go up till we find an ancestor which + * is a right-hand child of its parent. + */ while ((parent = rb_parent(node)) && node == parent->rb_left) node = parent; -- cgit v1.2.3 From 59633abf34e2f44b8e772a2c12a92132aa7c2220 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:02 -0700 Subject: rbtree: optimize fetching of sibling node When looking to fetch a node's sibling, we went through a sequence of: - check if node is the parent's left child - if it is, then fetch the parent's right child This can be replaced with: - fetch the parent's right child as an assumed sibling - check that node is NOT the fetched child This avoids fetching the parent's left child when node is actually that child. Saves a bit on code size, though it doesn't seem to make a large difference in speed. 
Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: David Woodhouse Acked-by: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'lib') diff --git a/lib/rbtree.c b/lib/rbtree.c index 08926709b4f9..61cdd0e3e538 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -107,8 +107,8 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) gparent = rb_red_parent(parent); - if (parent == gparent->rb_left) { - tmp = gparent->rb_right; + tmp = gparent->rb_right; + if (parent != tmp) { /* parent == gparent->rb_left */ if (tmp && rb_is_red(tmp)) { /* * Case 1 - color flips @@ -131,7 +131,8 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) continue; } - if (parent->rb_right == node) { + tmp = parent->rb_right; + if (node == tmp) { /* * Case 2 - left rotate at parent * @@ -151,6 +152,7 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) RB_BLACK); rb_set_parent_color(parent, node, RB_RED); parent = node; + tmp = node->rb_right; } /* @@ -162,7 +164,7 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) * / \ * n U */ - gparent->rb_left = tmp = parent->rb_right; + gparent->rb_left = tmp; /* == parent->rb_right */ parent->rb_right = gparent; if (tmp) rb_set_parent_color(tmp, gparent, RB_BLACK); @@ -180,7 +182,8 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) continue; } - if (parent->rb_left == node) { + tmp = parent->rb_left; + if (node == tmp) { /* Case 2 - right rotate at parent */ parent->rb_left = tmp = node->rb_right; node->rb_right = parent; @@ -189,10 +192,11 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) RB_BLACK); rb_set_parent_color(parent, node, RB_RED); parent = node; + tmp = node->rb_left; } /* Case 3 - left rotate at gparent */ - gparent->rb_right = tmp = parent->rb_left; + gparent->rb_right = tmp; /* == parent->rb_left */ parent->rb_left = gparent; if (tmp) rb_set_parent_color(tmp, gparent, RB_BLACK); @@ -223,8 +227,9 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, break; } else if (!parent) { break; - } else if (parent->rb_left == node) { - sibling = parent->rb_right; + } + sibling = parent->rb_right; + if (node != sibling) { /* node == parent->rb_left */ if (rb_is_red(sibling)) { /* * Case 1 - left rotate at parent -- cgit v1.2.3 From 28d7530928d01638678f63c3c70113540b0e6abe Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:04 -0700 Subject: rbtree test: fix sparse warning about 64-bit constant Just a small fix to make sparse happy. 
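For context (an illustrative snippet, not taken from the patch): the constant 3141592653589793238 needs 62 bits, so on a 32-bit target it cannot have type long, and sparse warns about the unsuffixed constant's implicit type. Spelling out the suffix makes the intended 64-bit type explicit:

/* Without a suffix, sparse flags the constant as too big for its
 * apparent type on 32-bit builds; with ULL the type is unambiguous. */
unsigned long long seed = 3141592653589793238ULL;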
Signed-off-by: Michel Lespinasse Reported-by: Fengguang Wu Acked-by: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index 19dfca9ff7d7..fd09465d82ca 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -88,7 +88,7 @@ static int rbtree_test_init(void) printk(KERN_ALERT "rbtree testing"); - prandom32_seed(&rnd, 3141592653589793238); + prandom32_seed(&rnd, 3141592653589793238ULL); init(); time1 = get_cycles(); -- cgit v1.2.3 From 7abc704ae399fcb9c51ca200b0456f8a975a8011 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:07 -0700 Subject: rbtree: add __rb_change_child() helper function Add __rb_change_child() as an inline helper function to replace code that would otherwise be duplicated 4 times in the source. No changes to binary size or speed. Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 46 +++++++++++++++++----------------------------- 1 file changed, 17 insertions(+), 29 deletions(-) (limited to 'lib') diff --git a/lib/rbtree.c b/lib/rbtree.c index 61cdd0e3e538..de89a614b1ba 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -66,6 +66,19 @@ static inline struct rb_node *rb_red_parent(struct rb_node *red) return (struct rb_node *)red->__rb_parent_color; } +static inline void +__rb_change_child(struct rb_node *old, struct rb_node *new, + struct rb_node *parent, struct rb_root *root) +{ + if (parent) { + if (parent->rb_left == old) + parent->rb_left = new; + else + parent->rb_right = new; + } else + root->rb_node = new; +} + /* * Helper function for rotations: * - old's parent and color get assigned to new @@ -78,13 +91,7 @@ __rb_rotate_set_parents(struct rb_node *old, struct rb_node *new, struct rb_node *parent = rb_parent(old); new->__rb_parent_color = old->__rb_parent_color; rb_set_parent_color(old, new, color); - if (parent) { - if (parent->rb_left == old) - parent->rb_left = new; - else - parent->rb_right = new; - } else - root->rb_node = new; + __rb_change_child(old, new, parent, root); } void rb_insert_color(struct rb_node *node, struct rb_root *root) @@ -375,13 +382,7 @@ void rb_erase(struct rb_node *node, struct rb_root *root) while ((left = node->rb_left) != NULL) node = left; - if (rb_parent(old)) { - if (rb_parent(old)->rb_left == old) - rb_parent(old)->rb_left = node; - else - rb_parent(old)->rb_right = node; - } else - root->rb_node = node; + __rb_change_child(old, node, rb_parent(old), root); child = node->rb_right; parent = rb_parent(node); @@ -410,13 +411,7 @@ void rb_erase(struct rb_node *node, struct rb_root *root) if (child) rb_set_parent(child, parent); - if (parent) { - if (parent->rb_left == node) - parent->rb_left = child; - else - parent->rb_right = child; - } else - root->rb_node = child; + __rb_change_child(node, child, parent, root); color: if (color == RB_BLACK) @@ -591,14 +586,7 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_node *parent = rb_parent(victim); /* Set the surrounding nodes to point to the replacement */ - if (parent) { - if (victim == parent->rb_left) - parent->rb_left = new; - else - parent->rb_right = new; - } else { - root->rb_node = new; - } + __rb_change_child(victim, new, parent, root); if (victim->rb_left) 
rb_set_parent(victim->rb_left, new); if (victim->rb_right) -- cgit v1.2.3 From 60670b8034d6e2ba860af79c9379b7788d09db73 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:10 -0700 Subject: rbtree: place easiest case first in rb_erase() In rb_erase, move the easy case (node to erase has no more than 1 child) first. I feel the code reads more easily that way. Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) (limited to 'lib') diff --git a/lib/rbtree.c b/lib/rbtree.c index de89a614b1ba..bde1b5c5fb33 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -368,17 +368,28 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, void rb_erase(struct rb_node *node, struct rb_root *root) { - struct rb_node *child, *parent; + struct rb_node *child = node->rb_right, *tmp = node->rb_left; + struct rb_node *parent; int color; - if (!node->rb_left) - child = node->rb_right; - else if (!node->rb_right) - child = node->rb_left; - else { + if (!tmp) { + case1: + /* Case 1: node to erase has no more than 1 child (easy!) */ + + parent = rb_parent(node); + color = rb_color(node); + + if (child) + rb_set_parent(child, parent); + __rb_change_child(node, child, parent, root); + } else if (!child) { + /* Still case 1, but this time the child is node->rb_left */ + child = tmp; + goto case1; + } else { struct rb_node *old = node, *left; - node = node->rb_right; + node = child; while ((left = node->rb_left) != NULL) node = left; @@ -402,18 +413,8 @@ void rb_erase(struct rb_node *node, struct rb_root *root) node->__rb_parent_color = old->__rb_parent_color; node->rb_left = old->rb_left; rb_set_parent(old->rb_left, node); - - goto color; } - parent = rb_parent(node); - color = rb_color(node); - - if (child) - rb_set_parent(child, parent); - __rb_change_child(node, child, parent, root); - -color: + if (color == RB_BLACK) __rb_erase_color(child, parent, root); } -- cgit v1.2.3 From 46b6135a7402ac23c5b25f2bd79b03bab8f98278 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:11 -0700 Subject: rbtree: handle 1-child recoloring in rb_erase() instead of rb_erase_color() An interesting observation for rb_erase() is that when a node has exactly one child, the node must be black and the child must be red. A useful consequence is that removing such a node can be done by simply replacing it with its child and making the child black, which we can do efficiently in rb_erase(). __rb_erase_color() then only needs to handle the no-children case and can be modified accordingly.
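In isolation, the argument and the resulting fast path look like this (a standalone sketch with toy types, not the kernel code; the root-pointer update is omitted for brevity):

enum { RED, BLACK };

struct toy_node {
        struct toy_node *left, *right, *parent;
        int color;
};

/*
 * Erase a node known to have exactly one child. Property 5) forces the
 * lone child to be red: a black child would give its side of the tree one
 * more black node than the NULL side. Property 4) then forces the node
 * itself to be black, since a red node cannot have a red child. Splicing
 * the child in and painting it black therefore restores all invariants
 * locally, with no rebalancing pass.
 */
static void erase_one_child(struct toy_node *node, struct toy_node *child)
{
        struct toy_node *parent = node->parent;

        child->parent = parent;
        if (parent) {
                if (parent->left == node)
                        parent->left = child;
                else
                        parent->right = child;
        }
        child->color = BLACK;
}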
Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 105 +++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 62 insertions(+), 43 deletions(-) (limited to 'lib') diff --git a/lib/rbtree.c b/lib/rbtree.c index bde1b5c5fb33..80b092538fa9 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -2,7 +2,8 @@ Red Black Trees (C) 1999 Andrea Arcangeli (C) 2002 David Woodhouse - + (C) 2012 Michel Lespinasse + This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or @@ -50,6 +51,11 @@ #define rb_is_red(r) (!rb_color(r)) #define rb_is_black(r) rb_color(r) +static inline void rb_set_black(struct rb_node *rb) +{ + rb->__rb_parent_color |= RB_BLACK; +} + static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; @@ -214,27 +220,18 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) } EXPORT_SYMBOL(rb_insert_color); -static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, - struct rb_root *root) +static void __rb_erase_color(struct rb_node *parent, struct rb_root *root) { - struct rb_node *sibling, *tmp1, *tmp2; + struct rb_node *node = NULL, *sibling, *tmp1, *tmp2; while (true) { /* - * Loop invariant: all leaf paths going through node have a - * black node count that is 1 lower than other leaf paths. - * - * If node is red, we can flip it to black to adjust. - * If node is the root, all leaf paths go through it. - * Otherwise, we need to adjust the tree through color flips - * and tree rotations as per one of the 4 cases below. + * Loop invariants: + * - node is black (or NULL on first iteration) + * - node is not the root (parent is not NULL) + * - All leaf paths going through parent and node have a + * black node count that is 1 lower than other leaf paths. */ - if (node && rb_is_red(node)) { - rb_set_parent_color(node, parent, RB_BLACK); - break; - } else if (!parent) { - break; - } sibling = parent->rb_right; if (node != sibling) { /* node == parent->rb_left */ if (rb_is_red(sibling)) { @@ -268,17 +265,22 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, * / \ / \ * Sl Sr Sl Sr * - * This leaves us violating 5), so - * recurse at p. If p is red, the - * recursion will just flip it to black - * and exit. If coming from Case 1, - * p is known to be red. + * This leaves us violating 5) which + * can be fixed by flipping p to black + * if it was red, or by recursing at p. + * p is red when coming from Case 1. 
*/ rb_set_parent_color(sibling, parent, RB_RED); - node = parent; - parent = rb_parent(node); - continue; + if (rb_is_red(parent)) + rb_set_black(parent); + else { + node = parent; + parent = rb_parent(node); + if (parent) + continue; + } + break; } /* * Case 3 - right rotate at sibling @@ -339,9 +341,15 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, /* Case 2 - sibling color flip */ rb_set_parent_color(sibling, parent, RB_RED); - node = parent; - parent = rb_parent(node); - continue; + if (rb_is_red(parent)) + rb_set_black(parent); + else { + node = parent; + parent = rb_parent(node); + if (parent) + continue; + } + break; } /* Case 3 - right rotate at sibling */ sibling->rb_right = tmp1 = tmp2->rb_left; @@ -369,23 +377,31 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, void rb_erase(struct rb_node *node, struct rb_root *root) { struct rb_node *child = node->rb_right, *tmp = node->rb_left; - struct rb_node *parent; - int color; + struct rb_node *parent, *rebalance; if (!tmp) { - case1: - /* Case 1: node to erase has no more than 1 child (easy!) */ + /* + * Case 1: node to erase has no more than 1 child (easy!) + * + * Note that if there is one child it must be red due to 5) + * and node must be black due to 4). We adjust colors locally + * so as to bypass __rb_erase_color() later on. + */ parent = rb_parent(node); - color = rb_color(node); - - if (child) - rb_set_parent(child, parent); __rb_change_child(node, child, parent, root); + if (child) { + rb_set_parent_color(child, parent, RB_BLACK); + rebalance = NULL; + } else { + rebalance = rb_is_black(node) ? parent : NULL; + } } else if (!child) { /* Still case 1, but this time the child is node->rb_left */ - child = tmp; - goto case1; + parent = rb_parent(node); + __rb_change_child(node, tmp, parent, root); + rb_set_parent_color(tmp, parent, RB_BLACK); + rebalance = NULL; } else { struct rb_node *old = node, *left; @@ -397,26 +413,29 @@ void rb_erase(struct rb_node *node, struct rb_root *root) child = node->rb_right; parent = rb_parent(node); - color = rb_color(node); if (parent == old) { parent = node; } else { - if (child) - rb_set_parent(child, parent); parent->rb_left = child; node->rb_right = old->rb_right; rb_set_parent(old->rb_right, node); } + if (child) { + rb_set_parent_color(child, parent, RB_BLACK); + rebalance = NULL; + } else { + rebalance = rb_is_black(node) ? parent : NULL; + } node->__rb_parent_color = old->__rb_parent_color; node->rb_left = old->rb_left; rb_set_parent(old->rb_left, node); } - if (color == RB_BLACK) - __rb_erase_color(child, parent, root); + if (rebalance) + __rb_erase_color(rebalance, root); } EXPORT_SYMBOL(rb_erase); -- cgit v1.2.3 From 4f035ad67f4633c233cb3642711d49b4efc9c82d Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:13 -0700 Subject: rbtree: low level optimizations in rb_erase() Various minor optimizations in rb_erase(): - Avoid multiple loading of node->__rb_parent_color when computing parent and color information (possibly not in close sequence, as there might be further branches in the algorithm) - In the 1-child subcase of case 1, copy the __rb_parent_color field from the erased node to the child instead of recomputing it from the desired parent and color - When searching for the erased node's successor, differentiate between cases 2 and 3 based on whether any left links were followed. This avoids a condition later down. 
- In case 3, keep a pointer to the erased node's right child so we don't have to refetch it later to adjust its parent. - In the no-child subcase of cases 2 and 3, place the rebalance assignment last so that the compiler can remove the following if(rebalance) test. Also, added some comments to illustrate cases 2 and 3. Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 98 +++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 64 insertions(+), 34 deletions(-) (limited to 'lib') diff --git a/lib/rbtree.c b/lib/rbtree.c index 80b092538fa9..938061ecbe61 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -47,9 +47,14 @@ #define RB_RED 0 #define RB_BLACK 1 -#define rb_color(r) ((r)->__rb_parent_color & 1) -#define rb_is_red(r) (!rb_color(r)) -#define rb_is_black(r) rb_color(r) +#define __rb_parent(pc) ((struct rb_node *)(pc & ~3)) + +#define __rb_color(pc) ((pc) & 1) +#define __rb_is_black(pc) __rb_color(pc) +#define __rb_is_red(pc) (!__rb_color(pc)) +#define rb_color(rb) __rb_color((rb)->__rb_parent_color) +#define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color) +#define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color) static inline void rb_set_black(struct rb_node *rb) { @@ -378,6 +383,7 @@ void rb_erase(struct rb_node *node, struct rb_root *root) { struct rb_node *child = node->rb_right, *tmp = node->rb_left; struct rb_node *parent, *rebalance; + unsigned long pc; if (!tmp) { /* @@ -387,51 +393,75 @@ void rb_erase(struct rb_node *node, struct rb_root *root) * and node must be black due to 4). We adjust colors locally * so as to bypass __rb_erase_color() later on. */ - - parent = rb_parent(node); + pc = node->__rb_parent_color; + parent = __rb_parent(pc); __rb_change_child(node, child, parent, root); if (child) { - rb_set_parent_color(child, parent, RB_BLACK); + child->__rb_parent_color = pc; rebalance = NULL; - } else { - rebalance = rb_is_black(node) ? parent : NULL; - } + } else + rebalance = __rb_is_black(pc) ?
parent : NULL; } else if (!child) { /* Still case 1, but this time the child is node->rb_left */ - parent = rb_parent(node); + tmp->__rb_parent_color = pc = node->__rb_parent_color; + parent = __rb_parent(pc); __rb_change_child(node, tmp, parent, root); - rb_set_parent_color(tmp, parent, RB_BLACK); rebalance = NULL; } else { - struct rb_node *old = node, *left; - - node = child; - while ((left = node->rb_left) != NULL) - node = left; - - __rb_change_child(old, node, rb_parent(old), root); - - child = node->rb_right; - parent = rb_parent(node); - - if (parent == old) { - parent = node; + struct rb_node *successor = child, *child2; + tmp = child->rb_left; + if (!tmp) { + /* + * Case 2: node's successor is its right child + * + * (n) (s) + * / \ / \ + * (x) (s) -> (x) (c) + * \ + * (c) + */ + parent = child; + child2 = child->rb_right; } else { - parent->rb_left = child; - - node->rb_right = old->rb_right; - rb_set_parent(old->rb_right, node); + /* + * Case 3: node's successor is leftmost under + * node's right child subtree + * + * (n) (s) + * / \ / \ + * (x) (y) -> (x) (y) + * / / + * (p) (p) + * / / + * (s) (c) + * \ + * (c) + */ + do { + parent = successor; + successor = tmp; + tmp = tmp->rb_left; + } while (tmp); + parent->rb_left = child2 = successor->rb_right; + successor->rb_right = child; + rb_set_parent(child, successor); } - if (child) { - rb_set_parent_color(child, parent, RB_BLACK); + successor->rb_left = tmp = node->rb_left; + rb_set_parent(tmp, successor); + + pc = node->__rb_parent_color; + tmp = __rb_parent(pc); + __rb_change_child(node, successor, tmp, root); + if (child2) { + successor->__rb_parent_color = pc; + rb_set_parent_color(child2, parent, RB_BLACK); rebalance = NULL; } else { - rebalance = rb_is_black(node) ? parent : NULL; + unsigned long pc2 = successor->__rb_parent_color; + successor->__rb_parent_color = pc; + rebalance = __rb_is_black(pc2) ? parent : NULL; } - node->__rb_parent_color = old->__rb_parent_color; - node->rb_left = old->rb_left; - rb_set_parent(old->rb_left, node); } if (rebalance) -- cgit v1.2.3 From dadf93534f125b9eda486b471446a8456a603d27 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:15 -0700 Subject: rbtree: augmented rbtree test Small test to measure the performance of augmented rbtrees. 
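As an aside on what this augmentation buys: since each node caches the maximum val over its subtree, a hypothetical helper in the style of this test module (illustrative, not part of the patch) could find the node with the largest val in O(log n) by descending toward whichever child subtree carries the cached maximum:

static struct test_node *find_max_val(struct rb_root *root)
{
        struct rb_node *rb = root->rb_node;

        if (!rb)
                return NULL;
        for (;;) {
                struct test_node *node = rb_entry(rb, struct test_node, rb);

                /* node->augmented is the max val over node's subtree; if
                 * the node's own val matches, this node achieves it. */
                if (node->val == node->augmented)
                        return node;
                /* Otherwise the maximum lives in whichever child subtree
                 * carries the same cached value. */
                if (rb->rb_left && rb_entry(rb->rb_left, struct test_node,
                                            rb)->augmented == node->augmented)
                        rb = rb->rb_left;
                else
                        rb = rb->rb_right;
        }
}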
Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree_test.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 101 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index fd09465d82ca..66e41d4bfc39 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -10,6 +10,10 @@ struct test_node { struct rb_node rb; u32 key; + + /* following fields used for testing augmented rbtree functionality */ + u32 val; + u32 augmented; }; static struct rb_root root = RB_ROOT; @@ -20,10 +24,11 @@ static struct rnd_state rnd; static void insert(struct test_node *node, struct rb_root *root) { struct rb_node **new = &root->rb_node, *parent = NULL; + u32 key = node->key; while (*new) { parent = *new; - if (node->key < rb_entry(parent, struct test_node, rb)->key) + if (key < rb_entry(parent, struct test_node, rb)->key) new = &parent->rb_left; else new = &parent->rb_right; @@ -38,11 +43,62 @@ static inline void erase(struct test_node *node, struct rb_root *root) rb_erase(&node->rb, root); } +static inline u32 augment_recompute(struct test_node *node) +{ + u32 max = node->val, child_augmented; + if (node->rb.rb_left) { + child_augmented = rb_entry(node->rb.rb_left, struct test_node, + rb)->augmented; + if (max < child_augmented) + max = child_augmented; + } + if (node->rb.rb_right) { + child_augmented = rb_entry(node->rb.rb_right, struct test_node, + rb)->augmented; + if (max < child_augmented) + max = child_augmented; + } + return max; +} + +static void augment_callback(struct rb_node *rb, void *unused) +{ + struct test_node *node = rb_entry(rb, struct test_node, rb); + node->augmented = augment_recompute(node); +} + +static void insert_augmented(struct test_node *node, struct rb_root *root) +{ + struct rb_node **new = &root->rb_node, *parent = NULL; + u32 key = node->key; + + while (*new) { + parent = *new; + if (key < rb_entry(parent, struct test_node, rb)->key) + new = &parent->rb_left; + else + new = &parent->rb_right; + } + + rb_link_node(&node->rb, parent, new); + rb_insert_color(&node->rb, root); + rb_augment_insert(&node->rb, augment_callback, NULL); +} + +static void erase_augmented(struct test_node *node, struct rb_root *root) +{ + struct rb_node *deepest = rb_augment_erase_begin(&node->rb); + rb_erase(&node->rb, root); + rb_augment_erase_end(deepest, augment_callback, NULL); +} + static void init(void) { int i; - for (i = 0; i < NODES; i++) + for (i = 0; i < NODES; i++) { nodes[i].key = prandom32(&rnd); + nodes[i].val = prandom32(&rnd); + } } static bool is_red(struct rb_node *rb) @@ -81,6 +137,17 @@ static void check(int nr_nodes) WARN_ON_ONCE(count != nr_nodes); } +static void check_augmented(int nr_nodes) +{ + struct rb_node *rb; + + check(nr_nodes); + for (rb = rb_first(&root); rb; rb = rb_next(rb)) { + struct test_node *node = rb_entry(rb, struct test_node, rb); + WARN_ON_ONCE(node->augmented != augment_recompute(node)); + } +} + static int rbtree_test_init(void) { int i, j; @@ -119,6 +186,38 @@ static int rbtree_test_init(void) check(0); } + printk(KERN_ALERT "augmented rbtree testing"); + + init(); + + time1 = get_cycles(); + + for (i = 0; i < PERF_LOOPS; i++) { + for (j = 0; j < NODES; j++) + insert_augmented(nodes + j, &root); + for (j = 0; j < NODES; j++) + erase_augmented(nodes + j, &root); + } + + time2 = get_cycles(); + time = time2 - time1; + + time = div_u64(time, PERF_LOOPS); + 
printk(" -> %llu cycles\n", (unsigned long long)time); + + for (i = 0; i < CHECK_LOOPS; i++) { + init(); + for (j = 0; j < NODES; j++) { + check_augmented(j); + insert_augmented(nodes + j, &root); + } + for (j = 0; j < NODES; j++) { + check_augmented(NODES - j); + erase_augmented(nodes + j, &root); + } + check_augmented(0); + } + return -EAGAIN; /* Fail will directly unload the module */ } -- cgit v1.2.3 From 14b94af0b251a2c80885b60538166fb7d04a642e Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:17 -0700 Subject: rbtree: faster augmented rbtree manipulation Introduce new augmented rbtree APIs that allow minimal recalculation of augmented node information. A new callback is added to the rbtree insertion and erase rebalancing functions, to be called on each tree rotations. Such rotations preserve the subtree's root augmented value, but require recalculation of the one child that was previously located at the subtree root. In the insertion case, the handcoded search phase must be updated to maintain the augmented information on insertion, and then the rbtree coloring/rebalancing algorithms keep it up to date. In the erase case, things are more complicated since it is library code that manipulates the rbtree in order to remove internal nodes. This requires a couple additional callbacks to copy a subtree's augmented value when a new root is stitched in, and to recompute augmented values down the ancestry path when a node is removed from the tree. In order to preserve maximum speed for the non-augmented case, we provide two versions of each tree manipulation function. rb_insert_augmented() is the augmented equivalent of rb_insert_color(), and rb_erase_augmented() is the augmented equivalent of rb_erase(). Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/rbtree.txt | 190 +++++++++++++++++++++++++++++++++++++++-------- include/linux/rbtree.h | 19 +++++ lib/rbtree.c | 83 +++++++++++++++++++-- lib/rbtree_test.c | 58 +++++++++++---- 4 files changed, 296 insertions(+), 54 deletions(-) (limited to 'lib') diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt index 8d32d85a5234..0a0b6dce3e08 100644 --- a/Documentation/rbtree.txt +++ b/Documentation/rbtree.txt @@ -193,24 +193,42 @@ Example: Support for Augmented rbtrees ----------------------------- -Augmented rbtree is an rbtree with "some" additional data stored in each node. -This data can be used to augment some new functionality to rbtree. -Augmented rbtree is an optional feature built on top of basic rbtree -infrastructure. An rbtree user who wants this feature will have to call the -augmentation functions with the user provided augmentation callback -when inserting and erasing nodes. +Augmented rbtree is an rbtree with "some" additional data stored in +each node, where the additional data for node N must be a function of +the contents of all nodes in the subtree rooted at N. This data can +be used to augment some new functionality to rbtree. Augmented rbtree +is an optional feature built on top of basic rbtree infrastructure. +An rbtree user who wants this feature will have to call the augmentation +functions with the user provided augmentation callback when inserting +and erasing nodes. -On insertion, the user must call rb_augment_insert() once the new node is in -place. 
This will cause the augmentation function callback to be called for -each node between the new node and the root which has been affected by the -insertion. +On insertion, the user must update the augmented information on the path +leading to the inserted node, then call rb_link_node() as usual and +rb_augment_inserted() instead of the usual rb_insert_color() call. +If rb_augment_inserted() rebalances the rbtree, it will callback into +a user provided function to update the augmented information on the +affected subtrees. -When erasing a node, the user must call rb_augment_erase_begin() first to -retrieve the deepest node on the rebalance path. Then, after erasing the -original node, the user must call rb_augment_erase_end() with the deepest -node found earlier. This will cause the augmentation function to be called -for each affected node between the deepest node and the root. +When erasing a node, the user must call rb_erase_augmented() instead of +rb_erase(). rb_erase_augmented() calls back into user provided functions +to updated the augmented information on affected subtrees. +In both cases, the callbacks are provided through struct rb_augment_callbacks. +3 callbacks must be defined: + +- A propagation callback, which updates the augmented value for a given + node and its ancestors, up to a given stop point (or NULL to update + all the way to the root). + +- A copy callback, which copies the augmented value for a given subtree + to a newly assigned subtree root. + +- A tree rotation callback, which copies the augmented value for a given + subtree to a newly assigned subtree root AND recomputes the augmented + information for the former subtree root. + + +Sample usage: Interval tree is an example of augmented rb tree. Reference - "Introduction to Algorithms" by Cormen, Leiserson, Rivest and Stein. @@ -230,26 +248,132 @@ and its immediate children. And this will be used in O(log n) lookup for lowest match (lowest start address among all possible matches) with something like: -find_lowest_match(lo, hi, node) +struct interval_tree_node * +interval_tree_first_match(struct rb_root *root, + unsigned long start, unsigned long last) { - lowest_match = NULL; - while (node) { - if (max_hi(node->left) > lo) { - // Lowest overlap if any must be on left side - node = node->left; - } else if (overlap(lo, hi, node)) { - lowest_match = node; - break; - } else if (lo > node->lo) { - // Lowest overlap if any must be on right side - node = node->right; - } else { - break; + struct interval_tree_node *node; + + if (!root->rb_node) + return NULL; + node = rb_entry(root->rb_node, struct interval_tree_node, rb); + + while (true) { + if (node->rb.rb_left) { + struct interval_tree_node *left = + rb_entry(node->rb.rb_left, + struct interval_tree_node, rb); + if (left->__subtree_last >= start) { + /* + * Some nodes in left subtree satisfy Cond2. + * Iterate to find the leftmost such node N. + * If it also satisfies Cond1, that's the match + * we are looking for. Otherwise, there is no + * matching interval as nodes to the right of N + * can't satisfy Cond1 either. 
+ */ + node = left; + continue; + } } + if (node->start <= last) { /* Cond1 */ + if (node->last >= start) /* Cond2 */ + return node; /* node is leftmost match */ + if (node->rb.rb_right) { + node = rb_entry(node->rb.rb_right, + struct interval_tree_node, rb); + if (node->__subtree_last >= start) + continue; + } + } + return NULL; /* No match */ + } +} + +Insertion/removal are defined using the following augmented callbacks: + +static inline unsigned long +compute_subtree_last(struct interval_tree_node *node) +{ + unsigned long max = node->last, subtree_last; + if (node->rb.rb_left) { + subtree_last = rb_entry(node->rb.rb_left, + struct interval_tree_node, rb)->__subtree_last; + if (max < subtree_last) + max = subtree_last; + } + if (node->rb.rb_right) { + subtree_last = rb_entry(node->rb.rb_right, + struct interval_tree_node, rb)->__subtree_last; + if (max < subtree_last) + max = subtree_last; + } + return max; +} + +static void augment_propagate(struct rb_node *rb, struct rb_node *stop) +{ + while (rb != stop) { + struct interval_tree_node *node = + rb_entry(rb, struct interval_tree_node, rb); + unsigned long subtree_last = compute_subtree_last(node); + if (node->__subtree_last == subtree_last) + break; + node->__subtree_last = subtree_last; + rb = rb_parent(&node->rb); + } +} + +static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new) +{ + struct interval_tree_node *old = + rb_entry(rb_old, struct interval_tree_node, rb); + struct interval_tree_node *new = + rb_entry(rb_new, struct interval_tree_node, rb); + + new->__subtree_last = old->__subtree_last; +} + +static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new) +{ + struct interval_tree_node *old = + rb_entry(rb_old, struct interval_tree_node, rb); + struct interval_tree_node *new = + rb_entry(rb_new, struct interval_tree_node, rb); + + new->__subtree_last = old->__subtree_last; + old->__subtree_last = compute_subtree_last(old); +} + +static const struct rb_augment_callbacks augment_callbacks = { + augment_propagate, augment_copy, augment_rotate +}; + +void interval_tree_insert(struct interval_tree_node *node, + struct rb_root *root) +{ + struct rb_node **link = &root->rb_node, *rb_parent = NULL; + unsigned long start = node->start, last = node->last; + struct interval_tree_node *parent; + + while (*link) { + rb_parent = *link; + parent = rb_entry(rb_parent, struct interval_tree_node, rb); + if (parent->__subtree_last < last) + parent->__subtree_last = last; + if (start < parent->start) + link = &parent->rb.rb_left; + else + link = &parent->rb.rb_right; } - return lowest_match; + + node->__subtree_last = last; + rb_link_node(&node->rb, rb_parent, link); + rb_insert_augmented(&node->rb, root, &augment_callbacks); } -Finding exact match will be to first find lowest match and then to follow -successor nodes looking for exact match, until the start of a node is beyond -the hi value we are looking for. 
+void interval_tree_remove(struct interval_tree_node *node, + struct rb_root *root) +{ + rb_erase_augmented(&node->rb, root, &augment_callbacks); +} diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index bf836a2c6a32..c902eb9d6506 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -61,6 +61,25 @@ struct rb_root { extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); + +struct rb_augment_callbacks { + void (*propagate)(struct rb_node *node, struct rb_node *stop); + void (*copy)(struct rb_node *old, struct rb_node *new); + void (*rotate)(struct rb_node *old, struct rb_node *new); +}; + +extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); +extern void rb_erase_augmented(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment); +static inline void +rb_insert_augmented(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment) +{ + __rb_insert_augmented(node, root, augment->rotate); +} + + typedef void (*rb_augment_f)(struct rb_node *node, void *data); extern void rb_augment_insert(struct rb_node *node, diff --git a/lib/rbtree.c b/lib/rbtree.c index 938061ecbe61..a37ee7954b8f 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -105,7 +105,9 @@ __rb_rotate_set_parents(struct rb_node *old, struct rb_node *new, __rb_change_child(old, new, parent, root); } -void rb_insert_color(struct rb_node *node, struct rb_root *root) +static __always_inline void +__rb_insert(struct rb_node *node, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { struct rb_node *parent = rb_red_parent(node), *gparent, *tmp; @@ -169,6 +171,7 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) rb_set_parent_color(tmp, parent, RB_BLACK); rb_set_parent_color(parent, node, RB_RED); + augment_rotate(parent, node); parent = node; tmp = node->rb_right; } @@ -187,6 +190,7 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) if (tmp) rb_set_parent_color(tmp, gparent, RB_BLACK); __rb_rotate_set_parents(gparent, parent, root, RB_RED); + augment_rotate(gparent, parent); break; } else { tmp = gparent->rb_left; @@ -209,6 +213,7 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) rb_set_parent_color(tmp, parent, RB_BLACK); rb_set_parent_color(parent, node, RB_RED); + augment_rotate(parent, node); parent = node; tmp = node->rb_left; } @@ -219,13 +224,15 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) if (tmp) rb_set_parent_color(tmp, gparent, RB_BLACK); __rb_rotate_set_parents(gparent, parent, root, RB_RED); + augment_rotate(gparent, parent); break; } } } -EXPORT_SYMBOL(rb_insert_color); -static void __rb_erase_color(struct rb_node *parent, struct rb_root *root) +static __always_inline void +__rb_erase_color(struct rb_node *parent, struct rb_root *root, + const struct rb_augment_callbacks *augment) { struct rb_node *node = NULL, *sibling, *tmp1, *tmp2; @@ -254,6 +261,7 @@ static void __rb_erase_color(struct rb_node *parent, struct rb_root *root) rb_set_parent_color(tmp1, parent, RB_BLACK); __rb_rotate_set_parents(parent, sibling, root, RB_RED); + augment->rotate(parent, sibling); sibling = tmp1; } tmp1 = sibling->rb_right; @@ -305,6 +313,7 @@ static void __rb_erase_color(struct rb_node *parent, struct rb_root *root) if (tmp1) rb_set_parent_color(tmp1, sibling, RB_BLACK); + 
augment->rotate(sibling, tmp2); tmp1 = sibling; sibling = tmp2; } @@ -327,6 +336,7 @@ static void __rb_erase_color(struct rb_node *parent, struct rb_root *root) rb_set_parent(tmp2, parent); __rb_rotate_set_parents(parent, sibling, root, RB_BLACK); + augment->rotate(parent, sibling); break; } else { sibling = parent->rb_left; @@ -337,6 +347,7 @@ static void __rb_erase_color(struct rb_node *parent, struct rb_root *root) rb_set_parent_color(tmp1, parent, RB_BLACK); __rb_rotate_set_parents(parent, sibling, root, RB_RED); + augment->rotate(parent, sibling); sibling = tmp1; } tmp1 = sibling->rb_left; @@ -363,6 +374,7 @@ static void __rb_erase_color(struct rb_node *parent, struct rb_root *root) if (tmp1) rb_set_parent_color(tmp1, sibling, RB_BLACK); + augment->rotate(sibling, tmp2); tmp1 = sibling; sibling = tmp2; } @@ -374,12 +386,15 @@ static void __rb_erase_color(struct rb_node *parent, struct rb_root *root) rb_set_parent(tmp2, parent); __rb_rotate_set_parents(parent, sibling, root, RB_BLACK); + augment->rotate(parent, sibling); break; } } } -void rb_erase(struct rb_node *node, struct rb_root *root) +static __always_inline void +__rb_erase(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment) { struct rb_node *child = node->rb_right, *tmp = node->rb_left; struct rb_node *parent, *rebalance; @@ -401,12 +416,14 @@ void rb_erase(struct rb_node *node, struct rb_root *root) rebalance = NULL; } else rebalance = __rb_is_black(pc) ? parent : NULL; + tmp = parent; } else if (!child) { /* Still case 1, but this time the child is node->rb_left */ tmp->__rb_parent_color = pc = node->__rb_parent_color; parent = __rb_parent(pc); __rb_change_child(node, tmp, parent, root); rebalance = NULL; + tmp = parent; } else { struct rb_node *successor = child, *child2; tmp = child->rb_left; @@ -420,8 +437,9 @@ void rb_erase(struct rb_node *node, struct rb_root *root) * \ * (c) */ - parent = child; - child2 = child->rb_right; + parent = successor; + child2 = successor->rb_right; + augment->copy(node, successor); } else { /* * Case 3: node's successor is leftmost under @@ -445,6 +463,8 @@ void rb_erase(struct rb_node *node, struct rb_root *root) parent->rb_left = child2 = successor->rb_right; successor->rb_right = child; rb_set_parent(child, successor); + augment->copy(node, successor); + augment->propagate(parent, successor); } successor->rb_left = tmp = node->rb_left; @@ -462,13 +482,62 @@ void rb_erase(struct rb_node *node, struct rb_root *root) successor->__rb_parent_color = pc; rebalance = __rb_is_black(pc2) ? parent : NULL; } + tmp = successor; } + augment->propagate(tmp, NULL); if (rebalance) - __rb_erase_color(rebalance, root); + __rb_erase_color(rebalance, root, augment); +} + +/* + * Non-augmented rbtree manipulation functions. + * + * We use dummy augmented callbacks here, and have the compiler optimize them + * out of the rb_insert_color() and rb_erase() function definitions. 
+ */ + +static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {} +static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {} +static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {} + +static const struct rb_augment_callbacks dummy_callbacks = { + dummy_propagate, dummy_copy, dummy_rotate +}; + +void rb_insert_color(struct rb_node *node, struct rb_root *root) +{ + __rb_insert(node, root, dummy_rotate); +} +EXPORT_SYMBOL(rb_insert_color); + +void rb_erase(struct rb_node *node, struct rb_root *root) +{ + __rb_erase(node, root, &dummy_callbacks); } EXPORT_SYMBOL(rb_erase); +/* + * Augmented rbtree manipulation functions. + * + * This instantiates the same __always_inline functions as in the non-augmented + * case, but this time with user-defined callbacks. + */ + +void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) +{ + __rb_insert(node, root, augment_rotate); +} +EXPORT_SYMBOL(__rb_insert_augmented); + +void rb_erase_augmented(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment) +{ + __rb_erase(node, root, augment); +} +EXPORT_SYMBOL(rb_erase_augmented); + static void rb_augment_path(struct rb_node *node, rb_augment_f func, void *data) { struct rb_node *parent; diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index 66e41d4bfc39..e28345df09bf 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -61,35 +61,65 @@ static inline u32 augment_recompute(struct test_node *node) return max; } -static void augment_callback(struct rb_node *rb, void *unused) +static void augment_propagate(struct rb_node *rb, struct rb_node *stop) { - struct test_node *node = rb_entry(rb, struct test_node, rb); - node->augmented = augment_recompute(node); + while (rb != stop) { + struct test_node *node = rb_entry(rb, struct test_node, rb); + u32 augmented = augment_recompute(node); + if (node->augmented == augmented) + break; + node->augmented = augmented; + rb = rb_parent(&node->rb); + } +} + +static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new) +{ + struct test_node *old = rb_entry(rb_old, struct test_node, rb); + struct test_node *new = rb_entry(rb_new, struct test_node, rb); + new->augmented = old->augmented; } +static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new) +{ + struct test_node *old = rb_entry(rb_old, struct test_node, rb); + struct test_node *new = rb_entry(rb_new, struct test_node, rb); + + /* Rotation doesn't change subtree's augmented value */ + new->augmented = old->augmented; + old->augmented = augment_recompute(old); +} + +static const struct rb_augment_callbacks augment_callbacks = { + augment_propagate, augment_copy, augment_rotate +}; + static void insert_augmented(struct test_node *node, struct rb_root *root) { - struct rb_node **new = &root->rb_node, *parent = NULL; + struct rb_node **new = &root->rb_node, *rb_parent = NULL; u32 key = node->key; + u32 val = node->val; + struct test_node *parent; while (*new) { - parent = *new; - if (key < rb_entry(parent, struct test_node, rb)->key) - new = &parent->rb_left; + rb_parent = *new; + parent = rb_entry(rb_parent, struct test_node, rb); + if (parent->augmented < val) + parent->augmented = val; + if (key < parent->key) + new = &parent->rb.rb_left; else - new = &parent->rb_right; + new = &parent->rb.rb_right; } - rb_link_node(&node->rb, parent, new); - rb_insert_color(&node->rb, root); - rb_augment_insert(&node->rb, 
augment_callback, NULL); + node->augmented = val; + rb_link_node(&node->rb, rb_parent, new); + rb_insert_augmented(&node->rb, root, &augment_callbacks); } static void erase_augmented(struct test_node *node, struct rb_root *root) { - struct rb_node *deepest = rb_augment_erase_begin(&node->rb); - rb_erase(&node->rb, root); - rb_augment_erase_end(deepest, augment_callback, NULL); + rb_erase_augmented(&node->rb, root, &augment_callbacks); } static void init(void) -- cgit v1.2.3 From 9d9e6f9703bbd642f3f2f807e6aaa642a4cbcec9 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:20 -0700 Subject: rbtree: remove prior augmented rbtree implementation convert arch/x86/mm/pat_rbtree.c to the proposed augmented rbtree api and remove the old augmented rbtree implementation. Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pat_rbtree.c | 65 +++++++++++++++++++++++++++++++------------- include/linux/rbtree.h | 8 ------ lib/rbtree.c | 71 ------------------------------------------------ 3 files changed, 46 insertions(+), 98 deletions(-) (limited to 'lib') diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 8acaddd0fb21..7e1515bd4770 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -54,29 +54,57 @@ static u64 get_subtree_max_end(struct rb_node *node) return ret; } -/* Update 'subtree_max_end' for a node, based on node and its children */ -static void memtype_rb_augment_cb(struct rb_node *node, void *__unused) +static u64 compute_subtree_max_end(struct memtype *data) { - struct memtype *data; - u64 max_end, child_max_end; - - if (!node) - return; - - data = container_of(node, struct memtype, rb); - max_end = data->end; + u64 max_end = data->end, child_max_end; - child_max_end = get_subtree_max_end(node->rb_right); + child_max_end = get_subtree_max_end(data->rb.rb_right); if (child_max_end > max_end) max_end = child_max_end; - child_max_end = get_subtree_max_end(node->rb_left); + child_max_end = get_subtree_max_end(data->rb.rb_left); if (child_max_end > max_end) max_end = child_max_end; - data->subtree_max_end = max_end; + return max_end; +} + +/* Update 'subtree_max_end' for node and its parents */ +static void memtype_rb_propagate_cb(struct rb_node *node, struct rb_node *stop) +{ + while (node != stop) { + struct memtype *data = container_of(node, struct memtype, rb); + u64 subtree_max_end = compute_subtree_max_end(data); + if (data->subtree_max_end == subtree_max_end) + break; + data->subtree_max_end = subtree_max_end; + node = rb_parent(&data->rb); + } +} + +static void memtype_rb_copy_cb(struct rb_node *old, struct rb_node *new) +{ + struct memtype *old_data = container_of(old, struct memtype, rb); + struct memtype *new_data = container_of(new, struct memtype, rb); + + new_data->subtree_max_end = old_data->subtree_max_end; } +/* Update 'subtree_max_end' after tree rotation. 
old and new are the + * former and current subtree roots */ +static void memtype_rb_rotate_cb(struct rb_node *old, struct rb_node *new) +{ + struct memtype *old_data = container_of(old, struct memtype, rb); + struct memtype *new_data = container_of(new, struct memtype, rb); + + new_data->subtree_max_end = old_data->subtree_max_end; + old_data->subtree_max_end = compute_subtree_max_end(old_data); +} + +static const struct rb_augment_callbacks memtype_rb_augment_cb = { + memtype_rb_propagate_cb, memtype_rb_copy_cb, memtype_rb_rotate_cb +}; + /* Find the first (lowest start addr) overlapping range from rb tree */ static struct memtype *memtype_rb_lowest_match(struct rb_root *root, u64 start, u64 end) @@ -179,15 +207,17 @@ static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata) struct memtype *data = container_of(*node, struct memtype, rb); parent = *node; + if (data->subtree_max_end < newdata->end) + data->subtree_max_end = newdata->end; if (newdata->start <= data->start) node = &((*node)->rb_left); else if (newdata->start > data->start) node = &((*node)->rb_right); } + newdata->subtree_max_end = newdata->end; rb_link_node(&newdata->rb, parent, node); - rb_insert_color(&newdata->rb, root); - rb_augment_insert(&newdata->rb, memtype_rb_augment_cb, NULL); + rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb); } int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type) @@ -209,16 +239,13 @@ int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type) struct memtype *rbt_memtype_erase(u64 start, u64 end) { - struct rb_node *deepest; struct memtype *data; data = memtype_rb_exact_match(&memtype_rbroot, start, end); if (!data) goto out; - deepest = rb_augment_erase_begin(&data->rb); - rb_erase(&data->rb, &memtype_rbroot); - rb_augment_erase_end(deepest, memtype_rb_augment_cb, NULL); + rb_erase_augmented(&data->rb, &memtype_rbroot, &memtype_rb_augment_cb); out: return data; } diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index c902eb9d6506..4ace31b33380 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -80,14 +80,6 @@ rb_insert_augmented(struct rb_node *node, struct rb_root *root, } -typedef void (*rb_augment_f)(struct rb_node *node, void *data); - -extern void rb_augment_insert(struct rb_node *node, - rb_augment_f func, void *data); -extern struct rb_node *rb_augment_erase_begin(struct rb_node *node); -extern void rb_augment_erase_end(struct rb_node *node, - rb_augment_f func, void *data); - /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); extern struct rb_node *rb_prev(const struct rb_node *); diff --git a/lib/rbtree.c b/lib/rbtree.c index a37ee7954b8f..c0088ca345f9 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -538,77 +538,6 @@ void rb_erase_augmented(struct rb_node *node, struct rb_root *root, } EXPORT_SYMBOL(rb_erase_augmented); -static void rb_augment_path(struct rb_node *node, rb_augment_f func, void *data) -{ - struct rb_node *parent; - -up: - func(node, data); - parent = rb_parent(node); - if (!parent) - return; - - if (node == parent->rb_left && parent->rb_right) - func(parent->rb_right, data); - else if (parent->rb_left) - func(parent->rb_left, data); - - node = parent; - goto up; -} - -/* - * after inserting @node into the tree, update the tree to account for - * both the new entry and any damage done by rebalance - */ -void rb_augment_insert(struct rb_node *node, rb_augment_f func, void *data) -{ - if (node->rb_left) - node = 
node->rb_left; - else if (node->rb_right) - node = node->rb_right; - - rb_augment_path(node, func, data); -} -EXPORT_SYMBOL(rb_augment_insert); - -/* - * before removing the node, find the deepest node on the rebalance path - * that will still be there after @node gets removed - */ -struct rb_node *rb_augment_erase_begin(struct rb_node *node) -{ - struct rb_node *deepest; - - if (!node->rb_right && !node->rb_left) - deepest = rb_parent(node); - else if (!node->rb_right) - deepest = node->rb_left; - else if (!node->rb_left) - deepest = node->rb_right; - else { - deepest = rb_next(node); - if (deepest->rb_right) - deepest = deepest->rb_right; - else if (rb_parent(deepest) != node) - deepest = rb_parent(deepest); - } - - return deepest; -} -EXPORT_SYMBOL(rb_augment_erase_begin); - -/* - * after removal, update the tree to account for the removed entry - * and any rebalance damage. - */ -void rb_augment_erase_end(struct rb_node *node, rb_augment_f func, void *data) -{ - if (node) - rb_augment_path(node, func, data); -} -EXPORT_SYMBOL(rb_augment_erase_end); - /* * This function returns the first node (in sort order) of the tree. */ -- cgit v1.2.3 From 3908836aa77e3621aaf2101f2920e01d7c8460d6 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:21 -0700 Subject: rbtree: add RB_DECLARE_CALLBACKS() macro As proposed by Peter Zijlstra, this makes it easier to define the augmented rbtree callbacks. Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pat_rbtree.c | 37 ++----------------------------------- include/linux/rbtree.h | 30 ++++++++++++++++++++++++++++++ lib/rbtree_test.c | 34 ++-------------------------------- 3 files changed, 34 insertions(+), 67 deletions(-) (limited to 'lib') diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 7e1515bd4770..4d116959075d 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -69,41 +69,8 @@ static u64 compute_subtree_max_end(struct memtype *data) return max_end; } -/* Update 'subtree_max_end' for node and its parents */ -static void memtype_rb_propagate_cb(struct rb_node *node, struct rb_node *stop) -{ - while (node != stop) { - struct memtype *data = container_of(node, struct memtype, rb); - u64 subtree_max_end = compute_subtree_max_end(data); - if (data->subtree_max_end == subtree_max_end) - break; - data->subtree_max_end = subtree_max_end; - node = rb_parent(&data->rb); - } -} - -static void memtype_rb_copy_cb(struct rb_node *old, struct rb_node *new) -{ - struct memtype *old_data = container_of(old, struct memtype, rb); - struct memtype *new_data = container_of(new, struct memtype, rb); - - new_data->subtree_max_end = old_data->subtree_max_end; -} - -/* Update 'subtree_max_end' after tree rotation. 
old and new are the - * former and current subtree roots */ -static void memtype_rb_rotate_cb(struct rb_node *old, struct rb_node *new) -{ - struct memtype *old_data = container_of(old, struct memtype, rb); - struct memtype *new_data = container_of(new, struct memtype, rb); - - new_data->subtree_max_end = old_data->subtree_max_end; - old_data->subtree_max_end = compute_subtree_max_end(old_data); -} - -static const struct rb_augment_callbacks memtype_rb_augment_cb = { - memtype_rb_propagate_cb, memtype_rb_copy_cb, memtype_rb_rotate_cb -}; +RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb, + u64, subtree_max_end, compute_subtree_max_end) /* Find the first (lowest start addr) overlapping range from rb tree */ static struct memtype *memtype_rb_lowest_match(struct rb_root *root, diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 4ace31b33380..8d1e83b1c87b 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -79,6 +79,36 @@ rb_insert_augmented(struct rb_node *node, struct rb_root *root, __rb_insert_augmented(node, root, augment->rotate); } +#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ + rbtype, rbaugmented, rbcompute) \ +static void rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \ +{ \ + while (rb != stop) { \ + rbstruct *node = rb_entry(rb, rbstruct, rbfield); \ + rbtype augmented = rbcompute(node); \ + if (node->rbaugmented == augmented) \ + break; \ + node->rbaugmented = augmented; \ + rb = rb_parent(&node->rbfield); \ + } \ +} \ +static void rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ +{ \ + rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ + rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ + new->rbaugmented = old->rbaugmented; \ +} \ +static void rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ +{ \ + rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ + rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ + new->rbaugmented = old->rbaugmented; \ + old->rbaugmented = rbcompute(old); \ +} \ +rbstatic const struct rb_augment_callbacks rbname = { \ + rbname ## _propagate, rbname ## _copy, rbname ## _rotate \ +}; + /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index e28345df09bf..b20e99969b0f 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -61,38 +61,8 @@ static inline u32 augment_recompute(struct test_node *node) return max; } -static void augment_propagate(struct rb_node *rb, struct rb_node *stop) -{ - while (rb != stop) { - struct test_node *node = rb_entry(rb, struct test_node, rb); - u32 augmented = augment_recompute(node); - if (node->augmented == augmented) - break; - node->augmented = augmented; - rb = rb_parent(&node->rb); - } -} - -static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new) -{ - struct test_node *old = rb_entry(rb_old, struct test_node, rb); - struct test_node *new = rb_entry(rb_new, struct test_node, rb); - new->augmented = old->augmented; -} - -static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new) -{ - struct test_node *old = rb_entry(rb_old, struct test_node, rb); - struct test_node *new = rb_entry(rb_new, struct test_node, rb); - - /* Rotation doesn't change subtree's augmented value */ - new->augmented = old->augmented; - old->augmented = augment_recompute(old); -} - -static const struct rb_augment_callbacks augment_callbacks = { - augment_propagate, 
augment_copy, augment_rotate -}; +RB_DECLARE_CALLBACKS(static, augment_callbacks, struct test_node, rb, + u32, augmented, augment_recompute) static void insert_augmented(struct test_node *node, struct rb_root *root) { -- cgit v1.2.3 From fff3fd8a1210a165252cd7cd01206da7a90d3a06 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:23 -0700 Subject: rbtree: add prio tree and interval tree tests Patch 1 implements support for interval trees, on top of the augmented rbtree API. It also adds synthetic tests to compare the performance of interval trees vs prio trees. The short answer is that interval trees are slightly faster (~25%) on insert/erase, and much faster (~2.4 - 3x) on search. It is debatable how realistic the synthetic test is, and I have not made such measurements yet, but my impression is that interval trees would still come out faster. Patch 2 uses a preprocessor template to make the interval tree generic, and uses it as a replacement for the vma prio_tree. Patch 3 takes the other prio_tree user, kmemleak, and converts it to use a basic rbtree. We don't actually need the augmented rbtree support here because the intervals are always non-overlapping. Patch 4 removes the now-unused prio tree library. Patch 5 proposes an additional optimization to rb_erase_augmented, now providing it as an inline function so that the augmented callbacks can be inlined. This provides an additional 5-10% performance improvement for the interval tree insert/erase benchmark. There is a maintenance cost as it exposes augmented rbtree users to some of the rbtree library internals; however I think this cost shouldn't be too high as I expect the augmented rbtree will always have far fewer users than the base rbtree. I should probably add a quick summary of why I think it makes sense to replace prio trees with augmented-rbtree-based interval trees now. One of the drivers is that we need augmented rbtrees for Rik's vma gap finding code, and once you have them, it just makes sense to use them for interval trees as well, as this is the simpler and better-known algorithm. prio trees, in comparison, seem *too* clever: they impose an additional 'heap' constraint on the tree, which they use to guarantee a faster worst-case complexity of O(k+log N) for stabbing queries in a well-balanced prio tree, vs O(k*log N) for interval trees (where k=number of matches, N=number of intervals). Now this sounds great, but in practice prio trees don't realize this theoretical benefit. First, the additional constraint makes them harder to update, so the kernel implementation has to simplify things by balancing them like a radix tree, which is not always ideal. Second, the fact that there are both index and heap properties makes both tree manipulation and search more complex, which results in a higher multiplicative time constant. As it turns out, the simple interval tree algorithm ends up running faster than the more clever prio tree. This patch: Add two test modules: - prio_tree_test measures the performance of lib/prio_tree.c, both for insertion/removal and for stabbing searches - interval_tree_test measures the performance of a library of equivalent functionality, built using the augmented rbtree support. In order to support the second test module, lib/interval_tree.c is introduced.
It is kept separate from the interval_tree_test main file for two reasons: first we don't want to provide an unfair advantage over prio_tree_test by having everything in a single compilation unit, and second there is the possibility that the interval tree functionality could get some non-test users in the kernel over time. Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Hillf Danton Cc: Peter Zijlstra Cc: Catalin Marinas Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/interval_tree.h | 27 +++++++ lib/Kconfig.debug | 12 ++++ lib/Makefile | 4 ++ lib/interval_tree.c | 159 ++++++++++++++++++++++++++++++++++++++++++ lib/interval_tree_test_main.c | 105 ++++++++++++++++++++++++++++ lib/prio_tree.c | 4 ++ lib/prio_tree_test.c | 106 ++++++++++++++++++++++++++++ 7 files changed, 417 insertions(+) create mode 100644 include/linux/interval_tree.h create mode 100644 lib/interval_tree.c create mode 100644 lib/interval_tree_test_main.c create mode 100644 lib/prio_tree_test.c (limited to 'lib') diff --git a/include/linux/interval_tree.h b/include/linux/interval_tree.h new file mode 100644 index 000000000000..724556aa3c95 --- /dev/null +++ b/include/linux/interval_tree.h @@ -0,0 +1,27 @@ +#ifndef _LINUX_INTERVAL_TREE_H +#define _LINUX_INTERVAL_TREE_H + +#include <linux/rbtree.h> + +struct interval_tree_node { + struct rb_node rb; + unsigned long start; /* Start of interval */ + unsigned long last; /* Last location _in_ interval */ + unsigned long __subtree_last; +}; + +extern void +interval_tree_insert(struct interval_tree_node *node, struct rb_root *root); + +extern void +interval_tree_remove(struct interval_tree_node *node, struct rb_root *root); + +extern struct interval_tree_node * +interval_tree_iter_first(struct rb_root *root, + unsigned long start, unsigned long last); + +extern struct interval_tree_node * +interval_tree_iter_next(struct interval_tree_node *node, + unsigned long start, unsigned long last); + +#endif /* _LINUX_INTERVAL_TREE_H */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a4e5d93b0f41..ee9f030b6951 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1289,6 +1289,18 @@ config RBTREE_TEST A benchmark measuring the performance of the rbtree library. Also includes rbtree invariant checks.
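As a quick illustration of the API declared in include/linux/interval_tree.h above, a caller could do something like the following (a minimal sketch, not part of the patch; the static tree, the two sample nodes and overlap_count() are invented here). Note that the endpoints are inclusive, so a point query simply passes start == last:

	#include <linux/interval_tree.h>

	static struct rb_root itree = RB_ROOT;
	static struct interval_tree_node a = { .start = 10, .last = 19 };
	static struct interval_tree_node b = { .start = 15, .last = 30 };

	/* Count the stored intervals intersecting [start;last]. */
	static unsigned long overlap_count(unsigned long start, unsigned long last)
	{
		struct interval_tree_node *node;
		unsigned long hits = 0;

		for (node = interval_tree_iter_first(&itree, start, last);
		     node; node = interval_tree_iter_next(node, start, last))
			hits++;
		return hits;
	}

After interval_tree_insert(&a, &itree) and interval_tree_insert(&b, &itree), overlap_count(18, 18) would return 2, since both [10;19] and [15;30] contain 18.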
+config PRIO_TREE_TEST + tristate "Prio tree test" + depends on m && DEBUG_KERNEL + help + A benchmark measuring the performance of the prio tree library + +config INTERVAL_TREE_TEST + tristate "Interval tree test" + depends on m && DEBUG_KERNEL + help + A benchmark measuring the performance of the interval tree library + config PROVIDE_OHCI1394_DMA_INIT bool "Remote debugging over FireWire early on boot" depends on PCI && X86 diff --git a/lib/Makefile b/lib/Makefile index f49445d26b65..26f578bf616a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -141,6 +141,10 @@ $(foreach file, $(libfdt_files), \ lib-$(CONFIG_LIBFDT) += $(libfdt_files) obj-$(CONFIG_RBTREE_TEST) += rbtree_test.o +obj-$(CONFIG_PRIO_TREE_TEST) += prio_tree_test.o +obj-$(CONFIG_INTERVAL_TREE_TEST) += interval_tree_test.o + +interval_tree_test-objs := interval_tree_test_main.o interval_tree.o hostprogs-y := gen_crc32table clean-files := crc32table.h diff --git a/lib/interval_tree.c b/lib/interval_tree.c new file mode 100644 index 000000000000..6fd540b1e499 --- /dev/null +++ b/lib/interval_tree.c @@ -0,0 +1,159 @@ +#include +#include + +/* Callbacks for augmented rbtree insert and remove */ + +static inline unsigned long +compute_subtree_last(struct interval_tree_node *node) +{ + unsigned long max = node->last, subtree_last; + if (node->rb.rb_left) { + subtree_last = rb_entry(node->rb.rb_left, + struct interval_tree_node, rb)->__subtree_last; + if (max < subtree_last) + max = subtree_last; + } + if (node->rb.rb_right) { + subtree_last = rb_entry(node->rb.rb_right, + struct interval_tree_node, rb)->__subtree_last; + if (max < subtree_last) + max = subtree_last; + } + return max; +} + +RB_DECLARE_CALLBACKS(static, augment_callbacks, struct interval_tree_node, rb, + unsigned long, __subtree_last, compute_subtree_last) + +/* Insert / remove interval nodes from the tree */ + +void interval_tree_insert(struct interval_tree_node *node, + struct rb_root *root) +{ + struct rb_node **link = &root->rb_node, *rb_parent = NULL; + unsigned long start = node->start, last = node->last; + struct interval_tree_node *parent; + + while (*link) { + rb_parent = *link; + parent = rb_entry(rb_parent, struct interval_tree_node, rb); + if (parent->__subtree_last < last) + parent->__subtree_last = last; + if (start < parent->start) + link = &parent->rb.rb_left; + else + link = &parent->rb.rb_right; + } + + node->__subtree_last = last; + rb_link_node(&node->rb, rb_parent, link); + rb_insert_augmented(&node->rb, root, &augment_callbacks); +} + +void interval_tree_remove(struct interval_tree_node *node, + struct rb_root *root) +{ + rb_erase_augmented(&node->rb, root, &augment_callbacks); +} + +/* + * Iterate over intervals intersecting [start;last] + * + * Note that a node's interval intersects [start;last] iff: + * Cond1: node->start <= last + * and + * Cond2: start <= node->last + */ + +static struct interval_tree_node * +subtree_search(struct interval_tree_node *node, + unsigned long start, unsigned long last) +{ + while (true) { + /* + * Loop invariant: start <= node->__subtree_last + * (Cond2 is satisfied by one of the subtree nodes) + */ + if (node->rb.rb_left) { + struct interval_tree_node *left = + rb_entry(node->rb.rb_left, + struct interval_tree_node, rb); + if (start <= left->__subtree_last) { + /* + * Some nodes in left subtree satisfy Cond2. + * Iterate to find the leftmost such node N. + * If it also satisfies Cond1, that's the match + * we are looking for. 
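+ * (Concretely: if start == 7 and the left child's
+ * __subtree_last is 9, some interval below the left child
+ * has last >= 7, so the leftmost possible match must be
+ * sought there before considering the current node.)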
Otherwise, there is no + * matching interval as nodes to the right of N + * can't satisfy Cond1 either. + */ + node = left; + continue; + } + } + if (node->start <= last) { /* Cond1 */ + if (start <= node->last) /* Cond2 */ + return node; /* node is leftmost match */ + if (node->rb.rb_right) { + node = rb_entry(node->rb.rb_right, + struct interval_tree_node, rb); + if (start <= node->__subtree_last) + continue; + } + } + return NULL; /* No match */ + } +} + +struct interval_tree_node * +interval_tree_iter_first(struct rb_root *root, + unsigned long start, unsigned long last) +{ + struct interval_tree_node *node; + + if (!root->rb_node) + return NULL; + node = rb_entry(root->rb_node, struct interval_tree_node, rb); + if (node->__subtree_last < start) + return NULL; + return subtree_search(node, start, last); +} + +struct interval_tree_node * +interval_tree_iter_next(struct interval_tree_node *node, + unsigned long start, unsigned long last) +{ + struct rb_node *rb = node->rb.rb_right, *prev; + + while (true) { + /* + * Loop invariants: + * Cond1: node->start <= last + * rb == node->rb.rb_right + * + * First, search right subtree if suitable + */ + if (rb) { + struct interval_tree_node *right = + rb_entry(rb, struct interval_tree_node, rb); + if (start <= right->__subtree_last) + return subtree_search(right, start, last); + } + + /* Move up the tree until we come from a node's left child */ + do { + rb = rb_parent(&node->rb); + if (!rb) + return NULL; + prev = &node->rb; + node = rb_entry(rb, struct interval_tree_node, rb); + rb = node->rb.rb_right; + } while (prev == rb); + + /* Check if the node intersects [start;last] */ + if (last < node->start) /* !Cond1 */ + return NULL; + else if (start <= node->last) /* Cond2 */ + return node; + } +} diff --git a/lib/interval_tree_test_main.c b/lib/interval_tree_test_main.c new file mode 100644 index 000000000000..b25903987f7a --- /dev/null +++ b/lib/interval_tree_test_main.c @@ -0,0 +1,105 @@ +#include +#include +#include +#include + +#define NODES 100 +#define PERF_LOOPS 100000 +#define SEARCHES 100 +#define SEARCH_LOOPS 10000 + +static struct rb_root root = RB_ROOT; +static struct interval_tree_node nodes[NODES]; +static u32 queries[SEARCHES]; + +static struct rnd_state rnd; + +static inline unsigned long +search(unsigned long query, struct rb_root *root) +{ + struct interval_tree_node *node; + unsigned long results = 0; + + for (node = interval_tree_iter_first(root, query, query); node; + node = interval_tree_iter_next(node, query, query)) + results++; + return results; +} + +static void init(void) +{ + int i; + for (i = 0; i < NODES; i++) { + u32 a = prandom32(&rnd), b = prandom32(&rnd); + if (a <= b) { + nodes[i].start = a; + nodes[i].last = b; + } else { + nodes[i].start = b; + nodes[i].last = a; + } + } + for (i = 0; i < SEARCHES; i++) + queries[i] = prandom32(&rnd); +} + +static int interval_tree_test_init(void) +{ + int i, j; + unsigned long results; + cycles_t time1, time2, time; + + printk(KERN_ALERT "interval tree insert/remove"); + + prandom32_seed(&rnd, 3141592653589793238ULL); + init(); + + time1 = get_cycles(); + + for (i = 0; i < PERF_LOOPS; i++) { + for (j = 0; j < NODES; j++) + interval_tree_insert(nodes + j, &root); + for (j = 0; j < NODES; j++) + interval_tree_remove(nodes + j, &root); + } + + time2 = get_cycles(); + time = time2 - time1; + + time = div_u64(time, PERF_LOOPS); + printk(" -> %llu cycles\n", (unsigned long long)time); + + printk(KERN_ALERT "interval tree search"); + + for (j = 0; j < NODES; j++) + 
interval_tree_insert(nodes + j, &root); + + time1 = get_cycles(); + + results = 0; + for (i = 0; i < SEARCH_LOOPS; i++) + for (j = 0; j < SEARCHES; j++) + results += search(queries[j], &root); + + time2 = get_cycles(); + time = time2 - time1; + + time = div_u64(time, SEARCH_LOOPS); + results = div_u64(results, SEARCH_LOOPS); + printk(" -> %llu cycles (%lu results)\n", + (unsigned long long)time, results); + + return -EAGAIN; /* Fail will directly unload the module */ +} + +static void interval_tree_test_exit(void) +{ + printk(KERN_ALERT "test exit\n"); +} + +module_init(interval_tree_test_init) +module_exit(interval_tree_test_exit) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Michel Lespinasse"); +MODULE_DESCRIPTION("Interval Tree test"); diff --git a/lib/prio_tree.c b/lib/prio_tree.c index 8d443af03b4c..4e0d2edff2b4 100644 --- a/lib/prio_tree.c +++ b/lib/prio_tree.c @@ -14,6 +14,7 @@ #include #include #include +#include /* * A clever mix of heap and radix trees forms a radix priority search tree (PST) @@ -241,6 +242,7 @@ struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root, BUG(); return NULL; } +EXPORT_SYMBOL(prio_tree_insert); /* * Remove a prio_tree_node @node from a radix priority search tree @root. The @@ -290,6 +292,7 @@ void prio_tree_remove(struct prio_tree_root *root, struct prio_tree_node *node) while (cur != node) cur = prio_tree_replace(root, cur->parent, cur); } +EXPORT_SYMBOL(prio_tree_remove); static void iter_walk_down(struct prio_tree_iter *iter) { @@ -464,3 +467,4 @@ repeat: goto repeat; } +EXPORT_SYMBOL(prio_tree_next); diff --git a/lib/prio_tree_test.c b/lib/prio_tree_test.c new file mode 100644 index 000000000000..c26084ddc6a4 --- /dev/null +++ b/lib/prio_tree_test.c @@ -0,0 +1,106 @@ +#include +#include +#include +#include + +#define NODES 100 +#define PERF_LOOPS 100000 +#define SEARCHES 100 +#define SEARCH_LOOPS 10000 + +static struct prio_tree_root root; +static struct prio_tree_node nodes[NODES]; +static u32 queries[SEARCHES]; + +static struct rnd_state rnd; + +static inline unsigned long +search(unsigned long query, struct prio_tree_root *root) +{ + struct prio_tree_iter iter; + unsigned long results = 0; + + prio_tree_iter_init(&iter, root, query, query); + while (prio_tree_next(&iter)) + results++; + return results; +} + +static void init(void) +{ + int i; + for (i = 0; i < NODES; i++) { + u32 a = prandom32(&rnd), b = prandom32(&rnd); + if (a <= b) { + nodes[i].start = a; + nodes[i].last = b; + } else { + nodes[i].start = b; + nodes[i].last = a; + } + } + for (i = 0; i < SEARCHES; i++) + queries[i] = prandom32(&rnd); +} + +static int prio_tree_test_init(void) +{ + int i, j; + unsigned long results; + cycles_t time1, time2, time; + + printk(KERN_ALERT "prio tree insert/remove"); + + prandom32_seed(&rnd, 3141592653589793238ULL); + INIT_PRIO_TREE_ROOT(&root); + init(); + + time1 = get_cycles(); + + for (i = 0; i < PERF_LOOPS; i++) { + for (j = 0; j < NODES; j++) + prio_tree_insert(&root, nodes + j); + for (j = 0; j < NODES; j++) + prio_tree_remove(&root, nodes + j); + } + + time2 = get_cycles(); + time = time2 - time1; + + time = div_u64(time, PERF_LOOPS); + printk(" -> %llu cycles\n", (unsigned long long)time); + + printk(KERN_ALERT "prio tree search"); + + for (j = 0; j < NODES; j++) + prio_tree_insert(&root, nodes + j); + + time1 = get_cycles(); + + results = 0; + for (i = 0; i < SEARCH_LOOPS; i++) + for (j = 0; j < SEARCHES; j++) + results += search(queries[j], &root); + + time2 = get_cycles(); + time = time2 - time1; + + time = div_u64(time, 
SEARCH_LOOPS); + results = div_u64(results, SEARCH_LOOPS); + printk(" -> %llu cycles (%lu results)\n", + (unsigned long long)time, results); + + return -EAGAIN; /* Fail will directly unload the module */ +} + +static void prio_tree_test_exit(void) +{ + printk(KERN_ALERT "test exit\n"); +} + +module_init(prio_tree_test_init) +module_exit(prio_tree_test_exit) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Michel Lespinasse"); +MODULE_DESCRIPTION("Prio Tree test"); -- cgit v1.2.3 From 6b2dbba8b6ac4df26f72eda1e5ea7bab9f950e08 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:25 -0700 Subject: mm: replace vma prio_tree with an interval tree Implement an interval tree as a replacement for the VMA prio_tree. The algorithms are similar to lib/interval_tree.c; however that code can't be directly reused as the interval endpoints are not explicitly stored in the VMA. So instead, the common algorithm is moved into a template and the details (node type, how to get interval endpoints from the node, etc) are filled in using the C preprocessor. Once the interval tree functions are available, using them as a replacement to the VMA prio tree is a relatively simple, mechanical job. Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Hillf Danton Cc: Peter Zijlstra Cc: Catalin Marinas Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/fault-armv.c | 3 +- arch/arm/mm/flush.c | 3 +- arch/parisc/kernel/cache.c | 3 +- arch/x86/mm/hugetlbpage.c | 3 +- fs/hugetlbfs/inode.c | 9 +- fs/inode.c | 2 +- include/linux/fs.h | 6 +- include/linux/interval_tree_tmpl.h | 215 +++++++++++++++++++++++++++++++++++++ include/linux/mm.h | 30 +++--- include/linux/mm_types.h | 14 +-- kernel/events/uprobes.c | 3 +- kernel/fork.c | 2 +- lib/interval_tree.c | 166 ++-------------------------- lib/prio_tree.c | 19 +--- mm/Makefile | 4 +- mm/filemap_xip.c | 3 +- mm/fremap.c | 2 +- mm/hugetlb.c | 3 +- mm/interval_tree.c | 61 +++++++++++ mm/memory-failure.c | 3 +- mm/memory.c | 9 +- mm/mmap.c | 22 ++-- mm/nommu.c | 12 +-- mm/prio_tree.c | 208 ----------------------------------- mm/rmap.c | 18 ++-- 25 files changed, 357 insertions(+), 466 deletions(-) create mode 100644 include/linux/interval_tree_tmpl.h create mode 100644 mm/interval_tree.c delete mode 100644 mm/prio_tree.c (limited to 'lib') diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index 7599e2625c7d..2a5907b5c8d2 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -134,7 +134,6 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *mpnt; - struct prio_tree_iter iter; unsigned long offset; pgoff_t pgoff; int aliases = 0; @@ -147,7 +146,7 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma, * cache coherency. */ flush_dcache_mmap_lock(mapping); - vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { /* * If this VMA is not in our MM, we can ignore it. 
* Note that we intentionally mask out the VMA diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 40ca11ed6e5f..1c8f7f564175 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -196,7 +196,6 @@ static void __flush_dcache_aliases(struct address_space *mapping, struct page *p { struct mm_struct *mm = current->active_mm; struct vm_area_struct *mpnt; - struct prio_tree_iter iter; pgoff_t pgoff; /* @@ -208,7 +207,7 @@ static void __flush_dcache_aliases(struct address_space *mapping, struct page *p pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); flush_dcache_mmap_lock(mapping); - vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { unsigned long offset; /* diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 9d181890a7e3..48e16dc20102 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -276,7 +276,6 @@ void flush_dcache_page(struct page *page) { struct address_space *mapping = page_mapping(page); struct vm_area_struct *mpnt; - struct prio_tree_iter iter; unsigned long offset; unsigned long addr, old_addr = 0; pgoff_t pgoff; @@ -299,7 +298,7 @@ void flush_dcache_page(struct page *page) * to flush one address here for them all to become coherent */ flush_dcache_mmap_lock(mapping); - vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT; addr = mpnt->vm_start + offset; diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index b91e48512425..937bff5cdaa7 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -71,7 +71,6 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - struct prio_tree_iter iter; struct vm_area_struct *svma; unsigned long saddr; pte_t *spte = NULL; @@ -81,7 +80,7 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) return (pte_t *)pmd_alloc(mm, pud, addr); mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { + vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 0a0ab8e21b19..c5bc355d8243 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -397,17 +397,16 @@ static void hugetlbfs_evict_inode(struct inode *inode) } static inline void -hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff) +hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff) { struct vm_area_struct *vma; - struct prio_tree_iter iter; - vma_prio_tree_foreach(vma, &iter, root, pgoff, ULONG_MAX) { + vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) { unsigned long v_offset; /* * Can the expression below overflow on 32-bit arches? - * No, because the prio_tree returns us only those vmas + * No, because the interval tree returns us only those vmas * which overlap the truncated area starting at pgoff, * and no vma on a 32-bit arch can span beyond the 4GB. 
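 * (In other words: the offset of pgoff within such a vma is
 * bounded by the vma's page count, and that count shifted by
 * PAGE_SHIFT is just the vma's size in bytes, which an
 * unsigned long can hold.)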
*/ @@ -432,7 +431,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) i_size_write(inode, offset); mutex_lock(&mapping->i_mmap_mutex); - if (!prio_tree_empty(&mapping->i_mmap)) + if (!RB_EMPTY_ROOT(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); mutex_unlock(&mapping->i_mmap_mutex); truncate_hugepages(inode, offset); diff --git a/fs/inode.c b/fs/inode.c index ac8d904b3f16..b03c71957246 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -348,7 +348,7 @@ void address_space_init_once(struct address_space *mapping) mutex_init(&mapping->i_mmap_mutex); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); - INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); + mapping->i_mmap = RB_ROOT; INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); } EXPORT_SYMBOL(address_space_init_once); diff --git a/include/linux/fs.h b/include/linux/fs.h index 5a8a273d5b2f..c617ed024df8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -401,7 +401,7 @@ struct inodes_stat_t { #include #include #include -#include +#include #include #include #include @@ -669,7 +669,7 @@ struct address_space { struct radix_tree_root page_tree; /* radix tree of all pages */ spinlock_t tree_lock; /* and lock protecting it */ unsigned int i_mmap_writable;/* count VM_SHARED mappings */ - struct prio_tree_root i_mmap; /* tree of private and shared mappings */ + struct rb_root i_mmap; /* tree of private and shared mappings */ struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ struct mutex i_mmap_mutex; /* protect tree, count, list */ /* Protected by tree_lock together with the radix tree */ @@ -741,7 +741,7 @@ int mapping_tagged(struct address_space *mapping, int tag); */ static inline int mapping_mapped(struct address_space *mapping) { - return !prio_tree_empty(&mapping->i_mmap) || + return !RB_EMPTY_ROOT(&mapping->i_mmap) || !list_empty(&mapping->i_mmap_nonlinear); } diff --git a/include/linux/interval_tree_tmpl.h b/include/linux/interval_tree_tmpl.h new file mode 100644 index 000000000000..c65deda31413 --- /dev/null +++ b/include/linux/interval_tree_tmpl.h @@ -0,0 +1,215 @@ +/* + Interval Trees + (C) 2012 Michel Lespinasse + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + include/linux/interval_tree_tmpl.h +*/ + +/* + * Template for implementing interval trees + * + * ITSTRUCT: struct type of the interval tree nodes + * ITRB: name of struct rb_node field within ITSTRUCT + * ITTYPE: type of the interval endpoints + * ITSUBTREE: name of ITTYPE field within ITSTRUCT holding last-in-subtree + * ITSTART(n): start endpoint of ITSTRUCT node n + * ITLAST(n): last endpoint of ITSTRUCT node n + * ITSTATIC: 'static' or empty + * ITPREFIX: prefix to use for the inline tree definitions + */ + +/* IT(name) -> ITPREFIX_name */ +#define _ITNAME(prefix, name) prefix ## _ ## name +#define ITNAME(prefix, name) _ITNAME(prefix, name) +#define IT(name) ITNAME(ITPREFIX, name) + +/* Callbacks for augmented rbtree insert and remove */ + +static inline ITTYPE IT(compute_subtree_last)(ITSTRUCT *node) +{ + ITTYPE max = ITLAST(node), subtree_last; + if (node->ITRB.rb_left) { + subtree_last = rb_entry(node->ITRB.rb_left, + ITSTRUCT, ITRB)->ITSUBTREE; + if (max < subtree_last) + max = subtree_last; + } + if (node->ITRB.rb_right) { + subtree_last = rb_entry(node->ITRB.rb_right, + ITSTRUCT, ITRB)->ITSUBTREE; + if (max < subtree_last) + max = subtree_last; + } + return max; +} + +static void IT(augment_propagate)(struct rb_node *rb, struct rb_node *stop) +{ + while (rb != stop) { + ITSTRUCT *node = rb_entry(rb, ITSTRUCT, ITRB); + ITTYPE subtree_last = IT(compute_subtree_last)(node); + if (node->ITSUBTREE == subtree_last) + break; + node->ITSUBTREE = subtree_last; + rb = rb_parent(&node->ITRB); + } +} + +static void IT(augment_copy)(struct rb_node *rb_old, struct rb_node *rb_new) +{ + ITSTRUCT *old = rb_entry(rb_old, ITSTRUCT, ITRB); + ITSTRUCT *new = rb_entry(rb_new, ITSTRUCT, ITRB); + + new->ITSUBTREE = old->ITSUBTREE; +} + +static void IT(augment_rotate)(struct rb_node *rb_old, struct rb_node *rb_new) +{ + ITSTRUCT *old = rb_entry(rb_old, ITSTRUCT, ITRB); + ITSTRUCT *new = rb_entry(rb_new, ITSTRUCT, ITRB); + + new->ITSUBTREE = old->ITSUBTREE; + old->ITSUBTREE = IT(compute_subtree_last)(old); +} + +static const struct rb_augment_callbacks IT(augment_callbacks) = { + IT(augment_propagate), IT(augment_copy), IT(augment_rotate) +}; + +/* Insert / remove interval nodes from the tree */ + +ITSTATIC void IT(insert)(ITSTRUCT *node, struct rb_root *root) +{ + struct rb_node **link = &root->rb_node, *rb_parent = NULL; + ITTYPE start = ITSTART(node), last = ITLAST(node); + ITSTRUCT *parent; + + while (*link) { + rb_parent = *link; + parent = rb_entry(rb_parent, ITSTRUCT, ITRB); + if (parent->ITSUBTREE < last) + parent->ITSUBTREE = last; + if (start < ITSTART(parent)) + link = &parent->ITRB.rb_left; + else + link = &parent->ITRB.rb_right; + } + + node->ITSUBTREE = last; + rb_link_node(&node->ITRB, rb_parent, link); + rb_insert_augmented(&node->ITRB, root, &IT(augment_callbacks)); +} + +ITSTATIC void IT(remove)(ITSTRUCT *node, struct rb_root *root) +{ + rb_erase_augmented(&node->ITRB, root, &IT(augment_callbacks)); +} + +/* + * Iterate over intervals intersecting [start;last] + * + * Note that a node's interval intersects [start;last] iff: + * Cond1: ITSTART(node) <= last + * and + * Cond2: start <= ITLAST(node) + */ + +static ITSTRUCT *IT(subtree_search)(ITSTRUCT *node, ITTYPE start, ITTYPE last) +{ + while (true) { + /* + * Loop invariant: start <= node->ITSUBTREE + * (Cond2 is satisfied by
one of the subtree nodes) + */ + if (node->ITRB.rb_left) { + ITSTRUCT *left = rb_entry(node->ITRB.rb_left, + ITSTRUCT, ITRB); + if (start <= left->ITSUBTREE) { + /* + * Some nodes in left subtree satisfy Cond2. + * Iterate to find the leftmost such node N. + * If it also satisfies Cond1, that's the match + * we are looking for. Otherwise, there is no + * matching interval as nodes to the right of N + * can't satisfy Cond1 either. + */ + node = left; + continue; + } + } + if (ITSTART(node) <= last) { /* Cond1 */ + if (start <= ITLAST(node)) /* Cond2 */ + return node; /* node is leftmost match */ + if (node->ITRB.rb_right) { + node = rb_entry(node->ITRB.rb_right, + ITSTRUCT, ITRB); + if (start <= node->ITSUBTREE) + continue; + } + } + return NULL; /* No match */ + } +} + +ITSTATIC ITSTRUCT *IT(iter_first)(struct rb_root *root, + ITTYPE start, ITTYPE last) +{ + ITSTRUCT *node; + + if (!root->rb_node) + return NULL; + node = rb_entry(root->rb_node, ITSTRUCT, ITRB); + if (node->ITSUBTREE < start) + return NULL; + return IT(subtree_search)(node, start, last); +} + +ITSTATIC ITSTRUCT *IT(iter_next)(ITSTRUCT *node, ITTYPE start, ITTYPE last) +{ + struct rb_node *rb = node->ITRB.rb_right, *prev; + + while (true) { + /* + * Loop invariants: + * Cond1: ITSTART(node) <= last + * rb == node->ITRB.rb_right + * + * First, search right subtree if suitable + */ + if (rb) { + ITSTRUCT *right = rb_entry(rb, ITSTRUCT, ITRB); + if (start <= right->ITSUBTREE) + return IT(subtree_search)(right, start, last); + } + + /* Move up the tree until we come from a node's left child */ + do { + rb = rb_parent(&node->ITRB); + if (!rb) + return NULL; + prev = &node->ITRB; + node = rb_entry(rb, ITSTRUCT, ITRB); + rb = node->ITRB.rb_right; + } while (prev == rb); + + /* Check if the node intersects [start;last] */ + if (last < ITSTART(node)) /* !Cond1 */ + return NULL; + else if (start <= ITLAST(node)) /* Cond2 */ + return node; + } +} diff --git a/include/linux/mm.h b/include/linux/mm.h index 5ddb11b2b4bb..0f671ef09eba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -1355,22 +1354,27 @@ extern void zone_pcp_reset(struct zone *zone); extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); -/* prio_tree.c */ -void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); -void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *); -void vma_prio_tree_remove(struct vm_area_struct *, struct prio_tree_root *); -struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, - struct prio_tree_iter *iter); - -#define vma_prio_tree_foreach(vma, iter, root, begin, end) \ - for (prio_tree_iter_init(iter, root, begin, end), vma = NULL; \ - (vma = vma_prio_tree_next(vma, iter)); ) +/* interval_tree.c */ +void vma_interval_tree_add(struct vm_area_struct *vma, + struct vm_area_struct *old, + struct address_space *mapping); +void vma_interval_tree_insert(struct vm_area_struct *node, + struct rb_root *root); +void vma_interval_tree_remove(struct vm_area_struct *node, + struct rb_root *root); +struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root *root, + unsigned long start, unsigned long last); +struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, + unsigned long start, unsigned long last); + +#define vma_interval_tree_foreach(vma, root, start, last) \ + for (vma = vma_interval_tree_iter_first(root, start, 
last); \ + vma; vma = vma_interval_tree_iter_next(vma, start, last)) static inline void vma_nonlinear_insert(struct vm_area_struct *vma, struct list_head *list) { - vma->shared.vm_set.parent = NULL; - list_add_tail(&vma->shared.vm_set.list, list); + list_add_tail(&vma->shared.nonlinear, list); } /* mmap.c */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a57a43f5ca7c..31f8a3af7d94 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -240,18 +239,15 @@ struct vm_area_struct { /* * For areas with an address space and backing store, - * linkage into the address_space->i_mmap prio tree, or - * linkage to the list of like vmas hanging off its node, or + * linkage into the address_space->i_mmap interval tree, or * linkage of vma in the address_space->i_mmap_nonlinear list. */ union { struct { - struct list_head list; - void *parent; /* aligns with prio_tree_node parent */ - struct vm_area_struct *head; - } vm_set; - - struct raw_prio_tree_node prio_tree_node; + struct rb_node rb; + unsigned long rb_subtree_last; + } linear; + struct list_head nonlinear; } shared; /* diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 912ef48d28ab..1d9c0a985960 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -735,7 +735,6 @@ static struct map_info * build_map_info(struct address_space *mapping, loff_t offset, bool is_register) { unsigned long pgoff = offset >> PAGE_SHIFT; - struct prio_tree_iter iter; struct vm_area_struct *vma; struct map_info *curr = NULL; struct map_info *prev = NULL; @@ -744,7 +743,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) again: mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; diff --git a/kernel/fork.c b/kernel/fork.c index 972762e01024..90dace52715e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -423,7 +423,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mapping->i_mmap_writable++; flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ - vma_prio_tree_add(tmp, mpnt); + vma_interval_tree_add(tmp, mpnt, mapping); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } diff --git a/lib/interval_tree.c b/lib/interval_tree.c index 6fd540b1e499..77a793e0644b 100644 --- a/lib/interval_tree.c +++ b/lib/interval_tree.c @@ -1,159 +1,13 @@ #include #include -/* Callbacks for augmented rbtree insert and remove */ - -static inline unsigned long -compute_subtree_last(struct interval_tree_node *node) -{ - unsigned long max = node->last, subtree_last; - if (node->rb.rb_left) { - subtree_last = rb_entry(node->rb.rb_left, - struct interval_tree_node, rb)->__subtree_last; - if (max < subtree_last) - max = subtree_last; - } - if (node->rb.rb_right) { - subtree_last = rb_entry(node->rb.rb_right, - struct interval_tree_node, rb)->__subtree_last; - if (max < subtree_last) - max = subtree_last; - } - return max; -} - -RB_DECLARE_CALLBACKS(static, augment_callbacks, struct interval_tree_node, rb, - unsigned long, __subtree_last, compute_subtree_last) - -/* Insert / remove interval nodes from the tree */ - -void interval_tree_insert(struct interval_tree_node *node, - struct rb_root *root) -{ - struct rb_node **link = &root->rb_node, *rb_parent = NULL; - unsigned long start 
= node->start, last = node->last; - struct interval_tree_node *parent; - - while (*link) { - rb_parent = *link; - parent = rb_entry(rb_parent, struct interval_tree_node, rb); - if (parent->__subtree_last < last) - parent->__subtree_last = last; - if (start < parent->start) - link = &parent->rb.rb_left; - else - link = &parent->rb.rb_right; - } - - node->__subtree_last = last; - rb_link_node(&node->rb, rb_parent, link); - rb_insert_augmented(&node->rb, root, &augment_callbacks); -} - -void interval_tree_remove(struct interval_tree_node *node, - struct rb_root *root) -{ - rb_erase_augmented(&node->rb, root, &augment_callbacks); -} - -/* - * Iterate over intervals intersecting [start;last] - * - * Note that a node's interval intersects [start;last] iff: - * Cond1: node->start <= last - * and - * Cond2: start <= node->last - */ - -static struct interval_tree_node * -subtree_search(struct interval_tree_node *node, - unsigned long start, unsigned long last) -{ - while (true) { - /* - * Loop invariant: start <= node->__subtree_last - * (Cond2 is satisfied by one of the subtree nodes) - */ - if (node->rb.rb_left) { - struct interval_tree_node *left = - rb_entry(node->rb.rb_left, - struct interval_tree_node, rb); - if (start <= left->__subtree_last) { - /* - * Some nodes in left subtree satisfy Cond2. - * Iterate to find the leftmost such node N. - * If it also satisfies Cond1, that's the match - * we are looking for. Otherwise, there is no - * matching interval as nodes to the right of N - * can't satisfy Cond1 either. - */ - node = left; - continue; - } - } - if (node->start <= last) { /* Cond1 */ - if (start <= node->last) /* Cond2 */ - return node; /* node is leftmost match */ - if (node->rb.rb_right) { - node = rb_entry(node->rb.rb_right, - struct interval_tree_node, rb); - if (start <= node->__subtree_last) - continue; - } - } - return NULL; /* No match */ - } -} - -struct interval_tree_node * -interval_tree_iter_first(struct rb_root *root, - unsigned long start, unsigned long last) -{ - struct interval_tree_node *node; - - if (!root->rb_node) - return NULL; - node = rb_entry(root->rb_node, struct interval_tree_node, rb); - if (node->__subtree_last < start) - return NULL; - return subtree_search(node, start, last); -} - -struct interval_tree_node * -interval_tree_iter_next(struct interval_tree_node *node, - unsigned long start, unsigned long last) -{ - struct rb_node *rb = node->rb.rb_right, *prev; - - while (true) { - /* - * Loop invariants: - * Cond1: node->start <= last - * rb == node->rb.rb_right - * - * First, search right subtree if suitable - */ - if (rb) { - struct interval_tree_node *right = - rb_entry(rb, struct interval_tree_node, rb); - if (start <= right->__subtree_last) - return subtree_search(right, start, last); - } - - /* Move up the tree until we come from a node's left child */ - do { - rb = rb_parent(&node->rb); - if (!rb) - return NULL; - prev = &node->rb; - node = rb_entry(rb, struct interval_tree_node, rb); - rb = node->rb.rb_right; - } while (prev == rb); - - /* Check if the node intersects [start;last] */ - if (last < node->start) /* !Cond1 */ - return NULL; - else if (start <= node->last) /* Cond2 */ - return node; - } -} +#define ITSTRUCT struct interval_tree_node +#define ITRB rb +#define ITTYPE unsigned long +#define ITSUBTREE __subtree_last +#define ITSTART(n) ((n)->start) +#define ITLAST(n) ((n)->last) +#define ITSTATIC +#define ITPREFIX interval_tree + +#include diff --git a/lib/prio_tree.c b/lib/prio_tree.c index 4e0d2edff2b4..bba37148c15e 100644 --- 
a/lib/prio_tree.c +++ b/lib/prio_tree.c @@ -44,27 +44,12 @@ * The following macros are used for implementing prio_tree for i_mmap */ -#define RADIX_INDEX(vma) ((vma)->vm_pgoff) -#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT) -/* avoid overflow */ -#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1)) - - static void get_index(const struct prio_tree_root *root, const struct prio_tree_node *node, unsigned long *radix, unsigned long *heap) { - if (root->raw) { - struct vm_area_struct *vma = prio_tree_entry( - node, struct vm_area_struct, shared.prio_tree_node); - - *radix = RADIX_INDEX(vma); - *heap = HEAP_INDEX(vma); - } - else { - *radix = node->start; - *heap = node->last; - } + *radix = node->start; + *heap = node->last; } static unsigned long index_bits_to_maxindex[BITS_PER_LONG]; diff --git a/mm/Makefile b/mm/Makefile index 92753e2d82da..6b025f80af34 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -14,9 +14,9 @@ endif obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ - prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ + util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ - compaction.o $(mmu-y) + compaction.o interval_tree.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 91750227a191..a52daee11d3f 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -167,7 +167,6 @@ __xip_unmap (struct address_space * mapping, { struct vm_area_struct *vma; struct mm_struct *mm; - struct prio_tree_iter iter; unsigned long address; pte_t *pte; pte_t pteval; @@ -184,7 +183,7 @@ __xip_unmap (struct address_space * mapping, retry: mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { mm = vma->vm_mm; address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); diff --git a/mm/fremap.c b/mm/fremap.c index 3d731a498788..3899a86851ce 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -214,7 +214,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); vma->vm_flags |= VM_NONLINEAR; - vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f1bb534254f6..c9b40e3a9936 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2474,7 +2474,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; struct address_space *mapping; - struct prio_tree_iter iter; pgoff_t pgoff; /* @@ -2491,7 +2490,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * __unmap_hugepage_range() is called as the lock is already held */ mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { /* Do not unmap the current VMA */ if (iter_vma == vma) continue; diff --git a/mm/interval_tree.c b/mm/interval_tree.c new file mode 100644 index 000000000000..7dc565660e56 --- /dev/null +++ b/mm/interval_tree.c @@ -0,0 +1,61 @@ +/* + * mm/interval_tree.c - interval tree for mapping->i_mmap + * 
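+ * (The tree is indexed in units of file pages: the ITSTART/ITLAST
+ * macros below map a vma to the inclusive pgoff range
+ * [vm_pgoff, vm_pgoff + ((vm_end - vm_start) >> PAGE_SHIFT) - 1].)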
+ * Copyright (C) 2012, Michel Lespinasse + * + * This file is released under the GPL v2. + */ + +#include +#include + +#define ITSTRUCT struct vm_area_struct +#define ITRB shared.linear.rb +#define ITTYPE unsigned long +#define ITSUBTREE shared.linear.rb_subtree_last +#define ITSTART(n) ((n)->vm_pgoff) +#define ITLAST(n) ((n)->vm_pgoff + \ + (((n)->vm_end - (n)->vm_start) >> PAGE_SHIFT) - 1) +#define ITSTATIC +#define ITPREFIX vma_interval_tree + +#include + +/* Insert old immediately after vma in the interval tree */ +void vma_interval_tree_add(struct vm_area_struct *vma, + struct vm_area_struct *old, + struct address_space *mapping) +{ + struct rb_node **link; + struct vm_area_struct *parent; + unsigned long last; + + if (unlikely(vma->vm_flags & VM_NONLINEAR)) { + list_add(&vma->shared.nonlinear, &old->shared.nonlinear); + return; + } + + last = ITLAST(vma); + + if (!old->shared.linear.rb.rb_right) { + parent = old; + link = &old->shared.linear.rb.rb_right; + } else { + parent = rb_entry(old->shared.linear.rb.rb_right, + struct vm_area_struct, shared.linear.rb); + if (parent->shared.linear.rb_subtree_last < last) + parent->shared.linear.rb_subtree_last = last; + while (parent->shared.linear.rb.rb_left) { + parent = rb_entry(parent->shared.linear.rb.rb_left, + struct vm_area_struct, shared.linear.rb); + if (parent->shared.linear.rb_subtree_last < last) + parent->shared.linear.rb_subtree_last = last; + } + link = &parent->shared.linear.rb.rb_left; + } + + vma->shared.linear.rb_subtree_last = last; + rb_link_node(&vma->shared.linear.rb, &parent->shared.linear.rb, link); + rb_insert_augmented(&vma->shared.linear.rb, &mapping->i_mmap, + &vma_interval_tree_augment_callbacks); +} diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a6e2141a6610..c38a6257d082 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -431,7 +431,6 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, { struct vm_area_struct *vma; struct task_struct *tsk; - struct prio_tree_iter iter; struct address_space *mapping = page->mapping; mutex_lock(&mapping->i_mmap_mutex); @@ -442,7 +441,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, if (!task_early_kill(tsk)) continue; - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { /* * Send early kill signal to tasks where a vma covers diff --git a/mm/memory.c b/mm/memory.c index e09c04813186..d205e4381a34 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2801,14 +2801,13 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma, zap_page_range_single(vma, start_addr, end_addr - start_addr, details); } -static inline void unmap_mapping_range_tree(struct prio_tree_root *root, +static inline void unmap_mapping_range_tree(struct rb_root *root, struct zap_details *details) { struct vm_area_struct *vma; - struct prio_tree_iter iter; pgoff_t vba, vea, zba, zea; - vma_prio_tree_foreach(vma, &iter, root, + vma_interval_tree_foreach(vma, root, details->first_index, details->last_index) { vba = vma->vm_pgoff; @@ -2839,7 +2838,7 @@ static inline void unmap_mapping_range_list(struct list_head *head, * across *all* the pages in each nonlinear VMA, not just the pages * whose virtual address lies outside the file truncation point. 
*/ - list_for_each_entry(vma, head, shared.vm_set.list) { + list_for_each_entry(vma, head, shared.nonlinear) { details->nonlinear_vma = vma; unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); } @@ -2883,7 +2882,7 @@ void unmap_mapping_range(struct address_space *mapping, mutex_lock(&mapping->i_mmap_mutex); - if (unlikely(!prio_tree_empty(&mapping->i_mmap))) + if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); diff --git a/mm/mmap.c b/mm/mmap.c index e3c365ff1b6a..5ac533f88e99 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -199,14 +199,14 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, flush_dcache_mmap_lock(mapping); if (unlikely(vma->vm_flags & VM_NONLINEAR)) - list_del_init(&vma->shared.vm_set.list); + list_del_init(&vma->shared.nonlinear); else - vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } /* - * Unlink a file-based vm structure from its prio_tree, to hide + * Unlink a file-based vm structure from its interval tree, to hide * vma from rmap and vmtruncate before freeing its page tables. */ void unlink_file_vma(struct vm_area_struct *vma) @@ -411,7 +411,7 @@ static void __vma_link_file(struct vm_area_struct *vma) if (unlikely(vma->vm_flags & VM_NONLINEAR)) vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); else - vma_prio_tree_insert(vma, &mapping->i_mmap); + vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } } @@ -449,7 +449,7 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, /* * Helper for vma_adjust() in the split_vma insert case: insert a vma into the - * mm's list and rbtree. It has already been inserted into the prio_tree. + * mm's list and rbtree. It has already been inserted into the interval tree. */ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { @@ -491,7 +491,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, struct vm_area_struct *next = vma->vm_next; struct vm_area_struct *importer = NULL; struct address_space *mapping = NULL; - struct prio_tree_root *root = NULL; + struct rb_root *root = NULL; struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; long adjust_next = 0; @@ -554,7 +554,7 @@ again: remove_next = 1 + (end > next->vm_end); mutex_lock(&mapping->i_mmap_mutex); if (insert) { /* - * Put into prio_tree now, so instantiated pages + * Put into interval tree now, so instantiated pages * are visible to arm/parisc __flush_dcache_page * throughout; but we cannot insert into address * space until vma start or end is updated. 
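The vma_interval_tree_foreach conversions throughout this patch all reduce to the same reader-side idiom; as a sketch (with mapping and pgoff standing in for the values each call site already has):

	struct vm_area_struct *vma;

	mutex_lock(&mapping->i_mmap_mutex);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
		/* vma maps the file page at index pgoff */
	}
	mutex_unlock(&mapping->i_mmap_mutex);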
@@ -582,9 +582,9 @@ again: remove_next = 1 + (end > next->vm_end); if (root) { flush_dcache_mmap_lock(mapping); - vma_prio_tree_remove(vma, root); + vma_interval_tree_remove(vma, root); if (adjust_next) - vma_prio_tree_remove(next, root); + vma_interval_tree_remove(next, root); } vma->vm_start = start; @@ -597,8 +597,8 @@ again: remove_next = 1 + (end > next->vm_end); if (root) { if (adjust_next) - vma_prio_tree_insert(next, root); - vma_prio_tree_insert(vma, root); + vma_interval_tree_insert(next, root); + vma_interval_tree_insert(vma, root); flush_dcache_mmap_unlock(mapping); } diff --git a/mm/nommu.c b/mm/nommu.c index 12e84e69dd06..45131b41bcdb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -698,7 +698,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); - vma_prio_tree_insert(vma, &mapping->i_mmap); + vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } @@ -764,7 +764,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); - vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } @@ -2044,7 +2044,6 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, size_t newsize) { struct vm_area_struct *vma; - struct prio_tree_iter iter; struct vm_region *region; pgoff_t low, high; size_t r_size, r_top; @@ -2056,8 +2055,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, mutex_lock(&inode->i_mapping->i_mmap_mutex); /* search for VMAs that fall within the dead zone */ - vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, - low, high) { + vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { /* found one - only interested if it's shared out of the page * cache */ if (vma->vm_flags & VM_SHARED) { @@ -2073,8 +2071,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, * we don't check for any regions that start beyond the EOF as there * shouldn't be any */ - vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, - 0, ULONG_MAX) { + vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, + 0, ULONG_MAX) { if (!(vma->vm_flags & VM_SHARED)) continue; diff --git a/mm/prio_tree.c b/mm/prio_tree.c deleted file mode 100644 index 799dcfd7cd8c..000000000000 --- a/mm/prio_tree.c +++ /dev/null @@ -1,208 +0,0 @@ -/* - * mm/prio_tree.c - priority search tree for mapping->i_mmap - * - * Copyright (C) 2004, Rajesh Venkatasubramanian - * - * This file is released under the GPL v2. - * - * Based on the radix priority search tree proposed by Edward M. McCreight - * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985 - * - * 02Feb2004 Initial version - */ - -#include -#include -#include - -/* - * See lib/prio_tree.c for details on the general radix priority search tree - * code. - */ - -/* - * The following #defines are mirrored from lib/prio_tree.c. They're only used - * for debugging, and should be removed (along with the debugging code using - * them) when switching also VMAs to the regular prio_tree code. 
- */ - -#define RADIX_INDEX(vma) ((vma)->vm_pgoff) -#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT) -/* avoid overflow */ -#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1)) - -/* - * Radix priority search tree for address_space->i_mmap - * - * For each vma that map a unique set of file pages i.e., unique [radix_index, - * heap_index] value, we have a corresponding priority search tree node. If - * multiple vmas have identical [radix_index, heap_index] value, then one of - * them is used as a tree node and others are stored in a vm_set list. The tree - * node points to the first vma (head) of the list using vm_set.head. - * - * prio_tree_root - * | - * A vm_set.head - * / \ / - * L R -> H-I-J-K-M-N-O-P-Q-S - * ^ ^ <-- vm_set.list --> - * tree nodes - * - * We need some way to identify whether a vma is a tree node, head of a vm_set - * list, or just a member of a vm_set list. We cannot use vm_flags to store - * such information. The reason is, in the above figure, it is possible that - * vm_flags' of R and H are covered by the different mmap_sems. When R is - * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold - * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now. - * That's why some trick involving shared.vm_set.parent is used for identifying - * tree nodes and list head nodes. - * - * vma radix priority search tree node rules: - * - * vma->shared.vm_set.parent != NULL ==> a tree node - * vma->shared.vm_set.head != NULL ==> list of others mapping same range - * vma->shared.vm_set.head == NULL ==> no others map the same range - * - * vma->shared.vm_set.parent == NULL - * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range - * vma->shared.vm_set.head == NULL ==> a list node - */ - -/* - * Add a new vma known to map the same set of pages as the old vma: - * useful for fork's dup_mmap as well as vma_prio_tree_insert below. - * Note that it just happens to work correctly on i_mmap_nonlinear too. 
- */ -void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old) -{ - /* Leave these BUG_ONs till prio_tree patch stabilizes */ - BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old)); - BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old)); - - vma->shared.vm_set.head = NULL; - vma->shared.vm_set.parent = NULL; - - if (!old->shared.vm_set.parent) - list_add(&vma->shared.vm_set.list, - &old->shared.vm_set.list); - else if (old->shared.vm_set.head) - list_add_tail(&vma->shared.vm_set.list, - &old->shared.vm_set.head->shared.vm_set.list); - else { - INIT_LIST_HEAD(&vma->shared.vm_set.list); - vma->shared.vm_set.head = old; - old->shared.vm_set.head = vma; - } -} - -void vma_prio_tree_insert(struct vm_area_struct *vma, - struct prio_tree_root *root) -{ - struct prio_tree_node *ptr; - struct vm_area_struct *old; - - vma->shared.vm_set.head = NULL; - - ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node); - if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) { - old = prio_tree_entry(ptr, struct vm_area_struct, - shared.prio_tree_node); - vma_prio_tree_add(vma, old); - } -} - -void vma_prio_tree_remove(struct vm_area_struct *vma, - struct prio_tree_root *root) -{ - struct vm_area_struct *node, *head, *new_head; - - if (!vma->shared.vm_set.head) { - if (!vma->shared.vm_set.parent) - list_del_init(&vma->shared.vm_set.list); - else - raw_prio_tree_remove(root, &vma->shared.prio_tree_node); - } else { - /* Leave this BUG_ON till prio_tree patch stabilizes */ - BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma); - if (vma->shared.vm_set.parent) { - head = vma->shared.vm_set.head; - if (!list_empty(&head->shared.vm_set.list)) { - new_head = list_entry( - head->shared.vm_set.list.next, - struct vm_area_struct, - shared.vm_set.list); - list_del_init(&head->shared.vm_set.list); - } else - new_head = NULL; - - raw_prio_tree_replace(root, &vma->shared.prio_tree_node, - &head->shared.prio_tree_node); - head->shared.vm_set.head = new_head; - if (new_head) - new_head->shared.vm_set.head = head; - - } else { - node = vma->shared.vm_set.head; - if (!list_empty(&vma->shared.vm_set.list)) { - new_head = list_entry( - vma->shared.vm_set.list.next, - struct vm_area_struct, - shared.vm_set.list); - list_del_init(&vma->shared.vm_set.list); - node->shared.vm_set.head = new_head; - new_head->shared.vm_set.head = node; - } else - node->shared.vm_set.head = NULL; - } - } -} - -/* - * Helper function to enumerate vmas that map a given file page or a set of - * contiguous file pages. The function returns vmas that at least map a single - * page in the given range of contiguous file pages. 
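Callers do not use the enumeration helper below directly; they drive it through the vma_prio_tree_foreach() iterator macro built on top of it. A sketch of the typical lookup loop (editorial; it mirrors the mm/rmap.c call sites converted later in this log, with mapping and pgoff assumed to be in scope):

	struct vm_area_struct *vma;
	struct prio_tree_iter iter;

	/* Editorial sketch: visit every vma mapping file page pgoff. */
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		/* vma maps at least file page pgoff */
	}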
- */ -struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, - struct prio_tree_iter *iter) -{ - struct prio_tree_node *ptr; - struct vm_area_struct *next; - - if (!vma) { - /* - * First call is with NULL vma - */ - ptr = prio_tree_next(iter); - if (ptr) { - next = prio_tree_entry(ptr, struct vm_area_struct, - shared.prio_tree_node); - prefetch(next->shared.vm_set.head); - return next; - } else - return NULL; - } - - if (vma->shared.vm_set.parent) { - if (vma->shared.vm_set.head) { - next = vma->shared.vm_set.head; - prefetch(next->shared.vm_set.list.next); - return next; - } - } else { - next = list_entry(vma->shared.vm_set.list.next, - struct vm_area_struct, shared.vm_set.list); - if (!next->shared.vm_set.head) { - prefetch(next->shared.vm_set.list.next); - return next; - } - } - - ptr = prio_tree_next(iter); - if (ptr) { - next = prio_tree_entry(ptr, struct vm_area_struct, - shared.prio_tree_node); - prefetch(next->shared.vm_set.head); - return next; - } else - return NULL; -} diff --git a/mm/rmap.c b/mm/rmap.c index 0f3b7cda2a24..7b5b51d25fc5 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -820,7 +820,6 @@ static int page_referenced_file(struct page *page, struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int referenced = 0; /* @@ -846,7 +845,7 @@ static int page_referenced_file(struct page *page, */ mapcount = page_mapcount(page); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; @@ -945,13 +944,12 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = 0; BUG_ON(PageAnon(page)); mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (vma->vm_flags & VM_SHARED) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) @@ -1547,7 +1545,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = SWAP_AGAIN; unsigned long cursor; unsigned long max_nl_cursor = 0; @@ -1555,7 +1552,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) unsigned int mapcount; mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; @@ -1576,7 +1573,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) goto out; list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.vm_set.list) { + shared.nonlinear) { cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) max_nl_cursor = cursor; @@ -1608,7 +1605,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) do { list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.vm_set.list) { + shared.nonlinear) { cursor = (unsigned long) vma->vm_private_data; while ( cursor < max_nl_cursor && cursor < vma->vm_end - 
vma->vm_start) { @@ -1631,7 +1628,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) * in locked vmas). Reset cursor on all unreserved nonlinear * vmas, now forgetting on which ones it had fallen behind. */ - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) vma->vm_private_data = NULL; out: mutex_unlock(&mapping->i_mmap_mutex); @@ -1748,13 +1745,12 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = SWAP_AGAIN; if (!mapping) return ret; mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; -- cgit v1.2.3 From 147e615f83c2c36caf89e7a3bf78090ade6f266c Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:30 -0700 Subject: prio_tree: remove After both prio_tree users have been converted to use red-black trees, there is no need to keep around the prio tree library anymore. Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Hillf Danton Cc: Peter Zijlstra Cc: Catalin Marinas Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/00-INDEX | 2 - Documentation/prio_tree.txt | 107 ----------- include/linux/prio_tree.h | 120 ------------ init/main.c | 2 - lib/Kconfig.debug | 6 - lib/Makefile | 3 +- lib/prio_tree.c | 455 -------------------------------------------- lib/prio_tree_test.c | 106 ----------- 8 files changed, 1 insertion(+), 800 deletions(-) delete mode 100644 Documentation/prio_tree.txt delete mode 100644 include/linux/prio_tree.h delete mode 100644 lib/prio_tree.c delete mode 100644 lib/prio_tree_test.c (limited to 'lib') diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index 49c051380daf..f54273e2ac97 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -270,8 +270,6 @@ preempt-locking.txt - info on locking under a preemptive kernel. printk-formats.txt - how to get printk format specifiers right -prio_tree.txt - - info on radix-priority-search-tree use for indexing vmas. ramoops.txt - documentation of the ramoops oops/panic logging module. rbtree.txt diff --git a/Documentation/prio_tree.txt b/Documentation/prio_tree.txt deleted file mode 100644 index 3aa68f9a117b..000000000000 --- a/Documentation/prio_tree.txt +++ /dev/null @@ -1,107 +0,0 @@ -The prio_tree.c code indexes vmas using 3 different indexes: - * heap_index = vm_pgoff + vm_size_in_pages : end_vm_pgoff - * radix_index = vm_pgoff : start_vm_pgoff - * size_index = vm_size_in_pages - -A regular radix-priority-search-tree indexes vmas using only heap_index and -radix_index. The conditions for indexing are: - * ->heap_index >= ->left->heap_index && - ->heap_index >= ->right->heap_index - * if (->heap_index == ->left->heap_index) - then ->radix_index < ->left->radix_index; - * if (->heap_index == ->right->heap_index) - then ->radix_index < ->right->radix_index; - * nodes are hashed to left or right subtree using radix_index - similar to a pure binary radix tree. - -A regular radix-priority-search-tree helps to store and query -intervals (vmas). 
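A stored vma [radix_index, heap_index] answers a query interval [qr, qh] exactly when the two closed intervals overlap. A minimal sketch of that test (editorial; the function name is hypothetical, but lib/prio_tree.c implements the identical check in its overlap() helper):

	/* Closed intervals [r1, h1] and [r2, h2] overlap iff each one
	 * starts no later than the other one ends. */
	static int intervals_overlap(unsigned long r1, unsigned long h1,
				     unsigned long r2, unsigned long h2)
	{
		return h1 >= r2 && r1 <= h2;
	}

For example, [2, 6] and [6, 9] overlap (they share page 6), while [2, 6] and [7, 9] do not.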
However, a regular radix-priority-search-tree is only -suitable for storing vmas with different radix indices (vm_pgoff). - -Therefore, the prio_tree.c extends the regular radix-priority-search-tree -to handle many vmas with the same vm_pgoff. Such vmas are handled in -2 different ways: 1) All vmas with the same radix _and_ heap indices are -linked using vm_set.list, 2) if there are many vmas with the same radix -index, but different heap indices and if the regular radix-priority-search -tree cannot index them all, we build an overflow-sub-tree that indexes such -vmas using heap and size indices instead of heap and radix indices. For -example, in the figure below some vmas with vm_pgoff = 0 (zero) are -indexed by regular radix-priority-search-tree whereas others are pushed -into an overflow-subtree. Note that all vmas in an overflow-sub-tree have -the same vm_pgoff (radix_index) and if necessary we build different -overflow-sub-trees to handle each possible radix_index. For example, -in figure we have 3 overflow-sub-trees corresponding to radix indices -0, 2, and 4. - -In the final tree the first few (prio_tree_root->index_bits) levels -are indexed using heap and radix indices whereas the overflow-sub-trees below -those levels (i.e. levels prio_tree_root->index_bits + 1 and higher) are -indexed using heap and size indices. In overflow-sub-trees the size_index -is used for hashing the nodes to appropriate places. - -Now, an example prio_tree: - - vmas are represented [radix_index, size_index, heap_index] - i.e., [start_vm_pgoff, vm_size_in_pages, end_vm_pgoff] - -level prio_tree_root->index_bits = 3 ------ - _ - 0 [0,7,7] | - / \ | - ------------------ ------------ | Regular - / \ | radix priority - 1 [1,6,7] [4,3,7] | search tree - / \ / \ | - ------- ----- ------ ----- | heap-and-radix - / \ / \ | indexed - 2 [0,6,6] [2,5,7] [5,2,7] [6,1,7] | - / \ / \ / \ / \ | - 3 [0,5,5] [1,5,6] [2,4,6] [3,4,7] [4,2,6] [5,1,6] [6,0,6] [7,0,7] | - / / / _ - / / / _ - 4 [0,4,4] [2,3,5] [4,1,5] | - / / / | - 5 [0,3,3] [2,2,4] [4,0,4] | Overflow-sub-trees - / / | - 6 [0,2,2] [2,1,3] | heap-and-size - / / | indexed - 7 [0,1,1] [2,0,2] | - / | - 8 [0,0,0] | - _ - -Note that we use prio_tree_root->index_bits to optimize the height -of the heap-and-radix indexed tree. Since prio_tree_root->index_bits is -set according to the maximum end_vm_pgoff mapped, we are sure that all -bits (in vm_pgoff) above prio_tree_root->index_bits are 0 (zero). Therefore, -we only use the first prio_tree_root->index_bits as radix_index. -Whenever index_bits is increased in prio_tree_expand, we shuffle the tree -to make sure that the first prio_tree_root->index_bits levels of the tree -is indexed properly using heap and radix indices. - -We do not optimize the height of overflow-sub-trees using index_bits. -The reason is: there can be many such overflow-sub-trees and all of -them have to be suffled whenever the index_bits increases. This may involve -walking the whole prio_tree in prio_tree_insert->prio_tree_expand code -path which is not desirable. Hence, we do not optimize the height of the -heap-and-size indexed overflow-sub-trees using prio_tree->index_bits. -Instead the overflow sub-trees are indexed using full BITS_PER_LONG bits -of size_index. This may lead to skewed sub-trees because most of the -higher significant bits of the size_index are likely to be 0 (zero). In -the example above, all 3 overflow-sub-trees are skewed. This may marginally -affect the performance. 
However, processes rarely map many vmas with the -same start_vm_pgoff but different end_vm_pgoffs. Therefore, we normally -do not require overflow-sub-trees to index all vmas. - -From the above discussion it is clear that the maximum height of -a prio_tree can be prio_tree_root->index_bits + BITS_PER_LONG. -However, in most of the common cases we do not need overflow-sub-trees, -so the tree height in the common cases will be prio_tree_root->index_bits. - -It is fair to mention here that the prio_tree_root->index_bits -is increased on demand, however, the index_bits is not decreased when -vmas are removed from the prio_tree. That's tricky to do. Hence, it's -left as a home work problem. - - diff --git a/include/linux/prio_tree.h b/include/linux/prio_tree.h deleted file mode 100644 index db04abb557e0..000000000000 --- a/include/linux/prio_tree.h +++ /dev/null @@ -1,120 +0,0 @@ -#ifndef _LINUX_PRIO_TREE_H -#define _LINUX_PRIO_TREE_H - -/* - * K&R 2nd ed. A8.3 somewhat obliquely hints that initial sequences of struct - * fields with identical types should end up at the same location. We'll use - * this until we can scrap struct raw_prio_tree_node. - * - * Note: all this could be done more elegantly by using unnamed union/struct - * fields. However, gcc 2.95.3 and apparently also gcc 3.0.4 don't support this - * language extension. - */ - -struct raw_prio_tree_node { - struct prio_tree_node *left; - struct prio_tree_node *right; - struct prio_tree_node *parent; -}; - -struct prio_tree_node { - struct prio_tree_node *left; - struct prio_tree_node *right; - struct prio_tree_node *parent; - unsigned long start; - unsigned long last; /* last location _in_ interval */ -}; - -struct prio_tree_root { - struct prio_tree_node *prio_tree_node; - unsigned short index_bits; - unsigned short raw; - /* - * 0: nodes are of type struct prio_tree_node - * 1: nodes are of type raw_prio_tree_node - */ -}; - -struct prio_tree_iter { - struct prio_tree_node *cur; - unsigned long mask; - unsigned long value; - int size_level; - - struct prio_tree_root *root; - pgoff_t r_index; - pgoff_t h_index; -}; - -static inline void prio_tree_iter_init(struct prio_tree_iter *iter, - struct prio_tree_root *root, pgoff_t r_index, pgoff_t h_index) -{ - iter->root = root; - iter->r_index = r_index; - iter->h_index = h_index; - iter->cur = NULL; -} - -#define __INIT_PRIO_TREE_ROOT(ptr, _raw) \ -do { \ - (ptr)->prio_tree_node = NULL; \ - (ptr)->index_bits = 1; \ - (ptr)->raw = (_raw); \ -} while (0) - -#define INIT_PRIO_TREE_ROOT(ptr) __INIT_PRIO_TREE_ROOT(ptr, 0) -#define INIT_RAW_PRIO_TREE_ROOT(ptr) __INIT_PRIO_TREE_ROOT(ptr, 1) - -#define INIT_PRIO_TREE_NODE(ptr) \ -do { \ - (ptr)->left = (ptr)->right = (ptr)->parent = (ptr); \ -} while (0) - -#define INIT_PRIO_TREE_ITER(ptr) \ -do { \ - (ptr)->cur = NULL; \ - (ptr)->mask = 0UL; \ - (ptr)->value = 0UL; \ - (ptr)->size_level = 0; \ -} while (0) - -#define prio_tree_entry(ptr, type, member) \ - ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) - -static inline int prio_tree_empty(const struct prio_tree_root *root) -{ - return root->prio_tree_node == NULL; -} - -static inline int prio_tree_root(const struct prio_tree_node *node) -{ - return node->parent == node; -} - -static inline int prio_tree_left_empty(const struct prio_tree_node *node) -{ - return node->left == node; -} - -static inline int prio_tree_right_empty(const struct prio_tree_node *node) -{ - return node->right == node; -} - - -struct prio_tree_node *prio_tree_replace(struct prio_tree_root *root, - struct 
prio_tree_node *old, struct prio_tree_node *node); -struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root, - struct prio_tree_node *node); -void prio_tree_remove(struct prio_tree_root *root, struct prio_tree_node *node); -struct prio_tree_node *prio_tree_next(struct prio_tree_iter *iter); - -#define raw_prio_tree_replace(root, old, node) \ - prio_tree_replace(root, (struct prio_tree_node *) (old), \ - (struct prio_tree_node *) (node)) -#define raw_prio_tree_insert(root, node) \ - prio_tree_insert(root, (struct prio_tree_node *) (node)) -#define raw_prio_tree_remove(root, node) \ - prio_tree_remove(root, (struct prio_tree_node *) (node)) - -#endif /* _LINUX_PRIO_TREE_H */ diff --git a/init/main.c b/init/main.c index db34c0ec4711..313360fe1118 100644 --- a/init/main.c +++ b/init/main.c @@ -86,7 +86,6 @@ extern void init_IRQ(void); extern void fork_init(unsigned long); extern void mca_init(void); extern void sbus_init(void); -extern void prio_tree_init(void); extern void radix_tree_init(void); #ifndef CONFIG_DEBUG_RODATA static inline void mark_rodata_ro(void) { } @@ -547,7 +546,6 @@ asmlinkage void __init start_kernel(void) /* init some links before init_ISA_irqs() */ early_irq_init(); init_IRQ(); - prio_tree_init(); init_timers(); hrtimers_init(); softirq_init(); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ee9f030b6951..a6e7e7741523 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1289,12 +1289,6 @@ config RBTREE_TEST A benchmark measuring the performance of the rbtree library. Also includes rbtree invariant checks. -config PRIO_TREE_TEST - tristate "Prio tree test" - depends on m && DEBUG_KERNEL - help - A benchmark measuring the performance of the prio tree library - config INTERVAL_TREE_TEST tristate "Interval tree test" depends on m && DEBUG_KERNEL diff --git a/lib/Makefile b/lib/Makefile index 26f578bf616a..3128e357e286 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -9,7 +9,7 @@ endif lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o timerqueue.o\ - idr.o int_sqrt.o extable.o prio_tree.o \ + idr.o int_sqrt.o extable.o \ sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \ proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o @@ -141,7 +141,6 @@ $(foreach file, $(libfdt_files), \ lib-$(CONFIG_LIBFDT) += $(libfdt_files) obj-$(CONFIG_RBTREE_TEST) += rbtree_test.o -obj-$(CONFIG_PRIO_TREE_TEST) += prio_tree_test.o obj-$(CONFIG_INTERVAL_TREE_TEST) += interval_tree_test.o interval_tree_test-objs := interval_tree_test_main.o interval_tree.o diff --git a/lib/prio_tree.c b/lib/prio_tree.c deleted file mode 100644 index bba37148c15e..000000000000 --- a/lib/prio_tree.c +++ /dev/null @@ -1,455 +0,0 @@ -/* - * lib/prio_tree.c - priority search tree - * - * Copyright (C) 2004, Rajesh Venkatasubramanian - * - * This file is released under the GPL v2. - * - * Based on the radix priority search tree proposed by Edward M. McCreight - * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985 - * - * 02Feb2004 Initial version - */ - -#include <linux/init.h> -#include <linux/module.h> -#include <linux/mm.h> -#include <linux/prio_tree.h> - -/* - * A clever mix of heap and radix trees forms a radix priority search tree (PST) - * which is useful for storing intervals, e.g, we can consider a vma as a closed - * interval of file pages [offset_begin, offset_end], and store all vmas that - * map a file in a PST.
Then, using the PST, we can answer a stabbing query, - * i.e., selecting a set of stored intervals (vmas) that overlap with (map) a - * given input interval X (a set of consecutive file pages), in "O(log n + m)" - * time where 'log n' is the height of the PST, and 'm' is the number of stored - * intervals (vmas) that overlap (map) with the input interval X (the set of - * consecutive file pages). - * - * In our implementation, we store closed intervals of the form [radix_index, - * heap_index]. We assume that always radix_index <= heap_index. McCreight's PST - * is designed for storing intervals with unique radix indices, i.e., each - * interval have different radix_index. However, this limitation can be easily - * overcome by using the size, i.e., heap_index - radix_index, as part of the - * index, so we index the tree using [(radix_index,size), heap_index]. - * - * When the above-mentioned indexing scheme is used, theoretically, in a 32 bit - * machine, the maximum height of a PST can be 64. We can use a balanced version - * of the priority search tree to optimize the tree height, but the balanced - * tree proposed by McCreight is too complex and memory-hungry for our purpose. - */ - -/* - * The following macros are used for implementing prio_tree for i_mmap - */ - -static void get_index(const struct prio_tree_root *root, - const struct prio_tree_node *node, - unsigned long *radix, unsigned long *heap) -{ - *radix = node->start; - *heap = node->last; -} - -static unsigned long index_bits_to_maxindex[BITS_PER_LONG]; - -void __init prio_tree_init(void) -{ - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(index_bits_to_maxindex) - 1; i++) - index_bits_to_maxindex[i] = (1UL << (i + 1)) - 1; - index_bits_to_maxindex[ARRAY_SIZE(index_bits_to_maxindex) - 1] = ~0UL; -} - -/* - * Maximum heap_index that can be stored in a PST with index_bits bits - */ -static inline unsigned long prio_tree_maxindex(unsigned int bits) -{ - return index_bits_to_maxindex[bits - 1]; -} - -static void prio_set_parent(struct prio_tree_node *parent, - struct prio_tree_node *child, bool left) -{ - if (left) - parent->left = child; - else - parent->right = child; - - child->parent = parent; -} - -/* - * Extend a priority search tree so that it can store a node with heap_index - * max_heap_index. In the worst case, this algorithm takes O((log n)^2). - * However, this function is used rarely and the common case performance is - * not bad. 
- */ -static struct prio_tree_node *prio_tree_expand(struct prio_tree_root *root, - struct prio_tree_node *node, unsigned long max_heap_index) -{ - struct prio_tree_node *prev; - - if (max_heap_index > prio_tree_maxindex(root->index_bits)) - root->index_bits++; - - prev = node; - INIT_PRIO_TREE_NODE(node); - - while (max_heap_index > prio_tree_maxindex(root->index_bits)) { - struct prio_tree_node *tmp = root->prio_tree_node; - - root->index_bits++; - - if (prio_tree_empty(root)) - continue; - - prio_tree_remove(root, root->prio_tree_node); - INIT_PRIO_TREE_NODE(tmp); - - prio_set_parent(prev, tmp, true); - prev = tmp; - } - - if (!prio_tree_empty(root)) - prio_set_parent(prev, root->prio_tree_node, true); - - root->prio_tree_node = node; - return node; -} - -/* - * Replace a prio_tree_node with a new node and return the old node - */ -struct prio_tree_node *prio_tree_replace(struct prio_tree_root *root, - struct prio_tree_node *old, struct prio_tree_node *node) -{ - INIT_PRIO_TREE_NODE(node); - - if (prio_tree_root(old)) { - BUG_ON(root->prio_tree_node != old); - /* - * We can reduce root->index_bits here. However, it is complex - * and does not help much to improve performance (IMO). - */ - root->prio_tree_node = node; - } else - prio_set_parent(old->parent, node, old->parent->left == old); - - if (!prio_tree_left_empty(old)) - prio_set_parent(node, old->left, true); - - if (!prio_tree_right_empty(old)) - prio_set_parent(node, old->right, false); - - return old; -} - -/* - * Insert a prio_tree_node @node into a radix priority search tree @root. The - * algorithm typically takes O(log n) time where 'log n' is the number of bits - * required to represent the maximum heap_index. In the worst case, the algo - * can take O((log n)^2) - check prio_tree_expand. - * - * If a prior node with same radix_index and heap_index is already found in - * the tree, then returns the address of the prior node. Otherwise, inserts - * @node into the tree and returns @node. 
- */ -struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root, - struct prio_tree_node *node) -{ - struct prio_tree_node *cur, *res = node; - unsigned long radix_index, heap_index; - unsigned long r_index, h_index, index, mask; - int size_flag = 0; - - get_index(root, node, &radix_index, &heap_index); - - if (prio_tree_empty(root) || - heap_index > prio_tree_maxindex(root->index_bits)) - return prio_tree_expand(root, node, heap_index); - - cur = root->prio_tree_node; - mask = 1UL << (root->index_bits - 1); - - while (mask) { - get_index(root, cur, &r_index, &h_index); - - if (r_index == radix_index && h_index == heap_index) - return cur; - - if (h_index < heap_index || - (h_index == heap_index && r_index > radix_index)) { - struct prio_tree_node *tmp = node; - node = prio_tree_replace(root, cur, node); - cur = tmp; - /* swap indices */ - index = r_index; - r_index = radix_index; - radix_index = index; - index = h_index; - h_index = heap_index; - heap_index = index; - } - - if (size_flag) - index = heap_index - radix_index; - else - index = radix_index; - - if (index & mask) { - if (prio_tree_right_empty(cur)) { - INIT_PRIO_TREE_NODE(node); - prio_set_parent(cur, node, false); - return res; - } else - cur = cur->right; - } else { - if (prio_tree_left_empty(cur)) { - INIT_PRIO_TREE_NODE(node); - prio_set_parent(cur, node, true); - return res; - } else - cur = cur->left; - } - - mask >>= 1; - - if (!mask) { - mask = 1UL << (BITS_PER_LONG - 1); - size_flag = 1; - } - } - /* Should not reach here */ - BUG(); - return NULL; -} -EXPORT_SYMBOL(prio_tree_insert); - -/* - * Remove a prio_tree_node @node from a radix priority search tree @root. The - * algorithm takes O(log n) time where 'log n' is the number of bits required - * to represent the maximum heap_index. 
- */ -void prio_tree_remove(struct prio_tree_root *root, struct prio_tree_node *node) -{ - struct prio_tree_node *cur; - unsigned long r_index, h_index_right, h_index_left; - - cur = node; - - while (!prio_tree_left_empty(cur) || !prio_tree_right_empty(cur)) { - if (!prio_tree_left_empty(cur)) - get_index(root, cur->left, &r_index, &h_index_left); - else { - cur = cur->right; - continue; - } - - if (!prio_tree_right_empty(cur)) - get_index(root, cur->right, &r_index, &h_index_right); - else { - cur = cur->left; - continue; - } - - /* both h_index_left and h_index_right cannot be 0 */ - if (h_index_left >= h_index_right) - cur = cur->left; - else - cur = cur->right; - } - - if (prio_tree_root(cur)) { - BUG_ON(root->prio_tree_node != cur); - __INIT_PRIO_TREE_ROOT(root, root->raw); - return; - } - - if (cur->parent->right == cur) - cur->parent->right = cur->parent; - else - cur->parent->left = cur->parent; - - while (cur != node) - cur = prio_tree_replace(root, cur->parent, cur); -} -EXPORT_SYMBOL(prio_tree_remove); - -static void iter_walk_down(struct prio_tree_iter *iter) -{ - iter->mask >>= 1; - if (iter->mask) { - if (iter->size_level) - iter->size_level++; - return; - } - - if (iter->size_level) { - BUG_ON(!prio_tree_left_empty(iter->cur)); - BUG_ON(!prio_tree_right_empty(iter->cur)); - iter->size_level++; - iter->mask = ULONG_MAX; - } else { - iter->size_level = 1; - iter->mask = 1UL << (BITS_PER_LONG - 1); - } -} - -static void iter_walk_up(struct prio_tree_iter *iter) -{ - if (iter->mask == ULONG_MAX) - iter->mask = 1UL; - else if (iter->size_level == 1) - iter->mask = 1UL; - else - iter->mask <<= 1; - if (iter->size_level) - iter->size_level--; - if (!iter->size_level && (iter->value & iter->mask)) - iter->value ^= iter->mask; -} - -/* - * Following functions help to enumerate all prio_tree_nodes in the tree that - * overlap with the input interval X [radix_index, heap_index]. The enumeration - * takes O(log n + m) time where 'log n' is the height of the tree (which is - * proportional to # of bits required to represent the maximum heap_index) and - * 'm' is the number of prio_tree_nodes that overlap the interval X. 
- */ - -static struct prio_tree_node *prio_tree_left(struct prio_tree_iter *iter, - unsigned long *r_index, unsigned long *h_index) -{ - if (prio_tree_left_empty(iter->cur)) - return NULL; - - get_index(iter->root, iter->cur->left, r_index, h_index); - - if (iter->r_index <= *h_index) { - iter->cur = iter->cur->left; - iter_walk_down(iter); - return iter->cur; - } - - return NULL; -} - -static struct prio_tree_node *prio_tree_right(struct prio_tree_iter *iter, - unsigned long *r_index, unsigned long *h_index) -{ - unsigned long value; - - if (prio_tree_right_empty(iter->cur)) - return NULL; - - if (iter->size_level) - value = iter->value; - else - value = iter->value | iter->mask; - - if (iter->h_index < value) - return NULL; - - get_index(iter->root, iter->cur->right, r_index, h_index); - - if (iter->r_index <= *h_index) { - iter->cur = iter->cur->right; - iter_walk_down(iter); - return iter->cur; - } - - return NULL; -} - -static struct prio_tree_node *prio_tree_parent(struct prio_tree_iter *iter) -{ - iter->cur = iter->cur->parent; - iter_walk_up(iter); - return iter->cur; -} - -static inline int overlap(struct prio_tree_iter *iter, - unsigned long r_index, unsigned long h_index) -{ - return iter->h_index >= r_index && iter->r_index <= h_index; -} - -/* - * prio_tree_first: - * - * Get the first prio_tree_node that overlaps with the interval [radix_index, - * heap_index]. Note that always radix_index <= heap_index. We do a pre-order - * traversal of the tree. - */ -static struct prio_tree_node *prio_tree_first(struct prio_tree_iter *iter) -{ - struct prio_tree_root *root; - unsigned long r_index, h_index; - - INIT_PRIO_TREE_ITER(iter); - - root = iter->root; - if (prio_tree_empty(root)) - return NULL; - - get_index(root, root->prio_tree_node, &r_index, &h_index); - - if (iter->r_index > h_index) - return NULL; - - iter->mask = 1UL << (root->index_bits - 1); - iter->cur = root->prio_tree_node; - - while (1) { - if (overlap(iter, r_index, h_index)) - return iter->cur; - - if (prio_tree_left(iter, &r_index, &h_index)) - continue; - - if (prio_tree_right(iter, &r_index, &h_index)) - continue; - - break; - } - return NULL; -} - -/* - * prio_tree_next: - * - * Get the next prio_tree_node that overlaps with the input interval in iter - */ -struct prio_tree_node *prio_tree_next(struct prio_tree_iter *iter) -{ - unsigned long r_index, h_index; - - if (iter->cur == NULL) - return prio_tree_first(iter); - -repeat: - while (prio_tree_left(iter, &r_index, &h_index)) - if (overlap(iter, r_index, h_index)) - return iter->cur; - - while (!prio_tree_right(iter, &r_index, &h_index)) { - while (!prio_tree_root(iter->cur) && - iter->cur->parent->right == iter->cur) - prio_tree_parent(iter); - - if (prio_tree_root(iter->cur)) - return NULL; - - prio_tree_parent(iter); - } - - if (overlap(iter, r_index, h_index)) - return iter->cur; - - goto repeat; -} -EXPORT_SYMBOL(prio_tree_next); diff --git a/lib/prio_tree_test.c b/lib/prio_tree_test.c deleted file mode 100644 index c26084ddc6a4..000000000000 --- a/lib/prio_tree_test.c +++ /dev/null @@ -1,106 +0,0 @@ -#include <linux/module.h> -#include <linux/prio_tree.h> -#include <linux/random.h> -#include <asm/timex.h> - -#define NODES 100 -#define PERF_LOOPS 100000 -#define SEARCHES 100 -#define SEARCH_LOOPS 10000 - -static struct prio_tree_root root; -static struct prio_tree_node nodes[NODES]; -static u32 queries[SEARCHES]; - -static struct rnd_state rnd; - -static inline unsigned long -search(unsigned long query, struct prio_tree_root *root) -{ - struct prio_tree_iter iter; - unsigned long results = 0; -
prio_tree_iter_init(&iter, root, query, query); - while (prio_tree_next(&iter)) - results++; - return results; -} - -static void init(void) -{ - int i; - for (i = 0; i < NODES; i++) { - u32 a = prandom32(&rnd), b = prandom32(&rnd); - if (a <= b) { - nodes[i].start = a; - nodes[i].last = b; - } else { - nodes[i].start = b; - nodes[i].last = a; - } - } - for (i = 0; i < SEARCHES; i++) - queries[i] = prandom32(&rnd); -} - -static int prio_tree_test_init(void) -{ - int i, j; - unsigned long results; - cycles_t time1, time2, time; - - printk(KERN_ALERT "prio tree insert/remove"); - - prandom32_seed(&rnd, 3141592653589793238ULL); - INIT_PRIO_TREE_ROOT(&root); - init(); - - time1 = get_cycles(); - - for (i = 0; i < PERF_LOOPS; i++) { - for (j = 0; j < NODES; j++) - prio_tree_insert(&root, nodes + j); - for (j = 0; j < NODES; j++) - prio_tree_remove(&root, nodes + j); - } - - time2 = get_cycles(); - time = time2 - time1; - - time = div_u64(time, PERF_LOOPS); - printk(" -> %llu cycles\n", (unsigned long long)time); - - printk(KERN_ALERT "prio tree search"); - - for (j = 0; j < NODES; j++) - prio_tree_insert(&root, nodes + j); - - time1 = get_cycles(); - - results = 0; - for (i = 0; i < SEARCH_LOOPS; i++) - for (j = 0; j < SEARCHES; j++) - results += search(queries[j], &root); - - time2 = get_cycles(); - time = time2 - time1; - - time = div_u64(time, SEARCH_LOOPS); - results = div_u64(results, SEARCH_LOOPS); - printk(" -> %llu cycles (%lu results)\n", - (unsigned long long)time, results); - - return -EAGAIN; /* Fail will directly unload the module */ -} - -static void prio_tree_test_exit(void) -{ - printk(KERN_ALERT "test exit\n"); -} - -module_init(prio_tree_test_init) -module_exit(prio_tree_test_exit) - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Michel Lespinasse"); -MODULE_DESCRIPTION("Prio Tree test"); -- cgit v1.2.3 From 9c079add0d0f45220f4bb37febf0621137ec2d38 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:33 -0700 Subject: rbtree: move augmented rbtree functionality to rbtree_augmented.h Provide rb_insert_augmented() and rb_erase_augmented() through a new rbtree_augmented.h include file. rb_erase_augmented() is defined there as an __always_inline function, in order to allow inlining of augmented rbtree callbacks into it. Since this generates a relatively large function, each augmented rbtree user should make sure to have a single call site. Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Hillf Danton Cc: Peter Zijlstra Cc: Catalin Marinas Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/rbtree.txt | 13 +++ arch/x86/mm/pat_rbtree.c | 2 +- include/linux/interval_tree_tmpl.h | 8 +- include/linux/rbtree.h | 48 -------- include/linux/rbtree_augmented.h | 223 +++++++++++++++++++++++++++++++++++++ lib/rbtree.c | 162 ++------------------------- lib/rbtree_test.c | 2 +- 7 files changed, 255 insertions(+), 203 deletions(-) create mode 100644 include/linux/rbtree_augmented.h (limited to 'lib') diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt index 0a0b6dce3e08..61b6c48871a0 100644 --- a/Documentation/rbtree.txt +++ b/Documentation/rbtree.txt @@ -202,6 +202,14 @@ An rbtree user who wants this feature will have to call the augmentation functions with the user provided augmentation callback when inserting and erasing nodes. +C files implementing augmented rbtree manipulation must include +<linux/rbtree_augmented.h> instead of <linux/rbtree.h>.
Note that +linux/rbtree_augmented.h exposes some rbtree implementations details +you are not expected to rely on; please stick to the documented APIs +there and do not include <linux/rbtree_augmented.h> from header files +either so as to minimize chances of your users accidentally relying on +such implementation details. + On insertion, the user must update the augmented information on the path leading to the inserted node, then call rb_link_node() as usual and rb_augment_inserted() instead of the usual rb_insert_color() call. @@ -227,6 +235,11 @@ In both cases, the callbacks are provided through struct rb_augment_callbacks. subtree to a newly assigned subtree root AND recomputes the augmented information for the former subtree root. +The compiled code for rb_erase_augmented() may inline the propagation and +copy callbacks, which results in a large function, so each augmented rbtree +user should have a single rb_erase_augmented() call site in order to limit +compiled code size. + Sample usage: diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 4d116959075d..415f6c4ced36 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -12,7 +12,7 @@ #include <linux/debugfs.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/rbtree.h> +#include <linux/rbtree_augmented.h> #include <linux/sched.h> #include <linux/gfp.h> diff --git a/include/linux/interval_tree_tmpl.h b/include/linux/interval_tree_tmpl.h index c65deda31413..c1aeb922d65f 100644 --- a/include/linux/interval_tree_tmpl.h +++ b/include/linux/interval_tree_tmpl.h @@ -19,6 +19,8 @@ include/linux/interval_tree_tmpl.h */ +#include <linux/rbtree_augmented.h> + /* * Template for implementing interval trees * @@ -57,7 +59,8 @@ static inline ITTYPE IT(compute_subtree_last)(ITSTRUCT *node) return max; } -static void IT(augment_propagate)(struct rb_node *rb, struct rb_node *stop) +static inline void +IT(augment_propagate)(struct rb_node *rb, struct rb_node *stop) { while (rb != stop) { ITSTRUCT *node = rb_entry(rb, ITSTRUCT, ITRB); @@ -69,7 +72,8 @@ static void IT(augment_propagate)(struct rb_node *rb, struct rb_node *stop) } } -static void IT(augment_copy)(struct rb_node *rb_old, struct rb_node *rb_new) +static inline void +IT(augment_copy)(struct rb_node *rb_old, struct rb_node *rb_new) { ITSTRUCT *old = rb_entry(rb_old, ITSTRUCT, ITRB); ITSTRUCT *new = rb_entry(rb_new, ITSTRUCT, ITRB); diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 8d1e83b1c87b..0022c1bb1e26 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -62,54 +62,6 @@ extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); -struct rb_augment_callbacks { - void (*propagate)(struct rb_node *node, struct rb_node *stop); - void (*copy)(struct rb_node *old, struct rb_node *new); - void (*rotate)(struct rb_node *old, struct rb_node *new); -}; - -extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, - void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); -extern void rb_erase_augmented(struct rb_node *node, struct rb_root *root, - const struct rb_augment_callbacks *augment); -static inline void -rb_insert_augmented(struct rb_node *node, struct rb_root *root, - const struct rb_augment_callbacks *augment) -{ - __rb_insert_augmented(node, root, augment->rotate); -} - -#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ - rbtype, rbaugmented, rbcompute) \ -static void rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \ -{ \ - while (rb != stop) { \ - rbstruct *node = rb_entry(rb, rbstruct, rbfield); \ - rbtype augmented = rbcompute(node); \ - if
(node->rbaugmented == augmented) \ - break; \ - node->rbaugmented = augmented; \ - rb = rb_parent(&node->rbfield); \ - } \ -} \ -static void rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ -{ \ - rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ - rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ - new->rbaugmented = old->rbaugmented; \ -} \ -static void rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ -{ \ - rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ - rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ - new->rbaugmented = old->rbaugmented; \ - old->rbaugmented = rbcompute(old); \ -} \ -rbstatic const struct rb_augment_callbacks rbname = { \ - rbname ## _propagate, rbname ## _copy, rbname ## _rotate \ -}; - - /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); extern struct rb_node *rb_prev(const struct rb_node *); diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h new file mode 100644 index 000000000000..214caa33433b --- /dev/null +++ b/include/linux/rbtree_augmented.h @@ -0,0 +1,223 @@ +/* + Red Black Trees + (C) 1999 Andrea Arcangeli + (C) 2002 David Woodhouse + (C) 2012 Michel Lespinasse + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + linux/include/linux/rbtree_augmented.h +*/ + +#ifndef _LINUX_RBTREE_AUGMENTED_H +#define _LINUX_RBTREE_AUGMENTED_H + +#include <linux/rbtree.h> + +/* + * Please note - only struct rb_augment_callbacks and the prototypes for + * rb_insert_augmented() and rb_erase_augmented() are intended to be public. + * The rest are implementation details you are not expected to depend on. + * + * See Documentation/rbtree.txt for documentation and samples.
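One such sample, sketched editorially (all names hypothetical, not part of this patch): an rbtree augmented with subtree node counts, keyed by node address purely for brevity:

	#include <linux/rbtree_augmented.h>

	struct mynode {
		struct rb_node rb;
		unsigned long subtree_count;	/* nodes in this subtree */
	};

	static unsigned long mynode_compute_count(struct mynode *node)
	{
		unsigned long count = 1;

		if (node->rb.rb_left)
			count += rb_entry(node->rb.rb_left,
					  struct mynode, rb)->subtree_count;
		if (node->rb.rb_right)
			count += rb_entry(node->rb.rb_right,
					  struct mynode, rb)->subtree_count;
		return count;
	}

	RB_DECLARE_CALLBACKS(static, mynode_callbacks, struct mynode, rb,
			     unsigned long, subtree_count, mynode_compute_count)

	/* Insertion: update augmented data on the path, then rebalance. */
	static void mynode_insert(struct mynode *node, struct rb_root *root)
	{
		struct rb_node **link = &root->rb_node, *parent = NULL;

		while (*link) {
			struct mynode *p = rb_entry(*link, struct mynode, rb);

			parent = *link;
			p->subtree_count++;	/* node will land below p */
			link = node < p ? &p->rb.rb_left : &p->rb.rb_right;
		}
		node->subtree_count = 1;
		rb_link_node(&node->rb, parent, link);
		rb_insert_augmented(&node->rb, root, &mynode_callbacks);
	}

Erase is the mirror image and deliberately has a single call site, rb_erase_augmented(&node->rb, root, &mynode_callbacks), so that the inlined propagate/copy callbacks are paid for only once, as the commit message explains.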
+ */ + +struct rb_augment_callbacks { + void (*propagate)(struct rb_node *node, struct rb_node *stop); + void (*copy)(struct rb_node *old, struct rb_node *new); + void (*rotate)(struct rb_node *old, struct rb_node *new); +}; + +extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); +static inline void +rb_insert_augmented(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment) +{ + __rb_insert_augmented(node, root, augment->rotate); +} + +#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ + rbtype, rbaugmented, rbcompute) \ +static inline void \ +rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \ +{ \ + while (rb != stop) { \ + rbstruct *node = rb_entry(rb, rbstruct, rbfield); \ + rbtype augmented = rbcompute(node); \ + if (node->rbaugmented == augmented) \ + break; \ + node->rbaugmented = augmented; \ + rb = rb_parent(&node->rbfield); \ + } \ +} \ +static inline void \ +rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ +{ \ + rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ + rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ + new->rbaugmented = old->rbaugmented; \ +} \ +static void \ +rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ +{ \ + rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ + rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ + new->rbaugmented = old->rbaugmented; \ + old->rbaugmented = rbcompute(old); \ +} \ +rbstatic const struct rb_augment_callbacks rbname = { \ + rbname ## _propagate, rbname ## _copy, rbname ## _rotate \ +}; + + +#define RB_RED 0 +#define RB_BLACK 1 + +#define __rb_parent(pc) ((struct rb_node *)(pc & ~3)) + +#define __rb_color(pc) ((pc) & 1) +#define __rb_is_black(pc) __rb_color(pc) +#define __rb_is_red(pc) (!__rb_color(pc)) +#define rb_color(rb) __rb_color((rb)->__rb_parent_color) +#define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color) +#define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color) + +static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) +{ + rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; +} + +static inline void rb_set_parent_color(struct rb_node *rb, + struct rb_node *p, int color) +{ + rb->__rb_parent_color = (unsigned long)p | color; +} + +static inline void +__rb_change_child(struct rb_node *old, struct rb_node *new, + struct rb_node *parent, struct rb_root *root) +{ + if (parent) { + if (parent->rb_left == old) + parent->rb_left = new; + else + parent->rb_right = new; + } else + root->rb_node = new; +} + +extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); + +static __always_inline void +rb_erase_augmented(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment) +{ + struct rb_node *child = node->rb_right, *tmp = node->rb_left; + struct rb_node *parent, *rebalance; + unsigned long pc; + + if (!tmp) { + /* + * Case 1: node to erase has no more than 1 child (easy!) + * + * Note that if there is one child it must be red due to 5) + * and node must be black due to 4). We adjust colors locally + * so as to bypass __rb_erase_color() later on. + */ + pc = node->__rb_parent_color; + parent = __rb_parent(pc); + __rb_change_child(node, child, parent, root); + if (child) { + child->__rb_parent_color = pc; + rebalance = NULL; + } else + rebalance = __rb_is_black(pc) ? 
parent : NULL; + tmp = parent; + } else if (!child) { + /* Still case 1, but this time the child is node->rb_left */ + tmp->__rb_parent_color = pc = node->__rb_parent_color; + parent = __rb_parent(pc); + __rb_change_child(node, tmp, parent, root); + rebalance = NULL; + tmp = parent; + } else { + struct rb_node *successor = child, *child2; + tmp = child->rb_left; + if (!tmp) { + /* + * Case 2: node's successor is its right child + * + * (n) (s) + * / \ / \ + * (x) (s) -> (x) (c) + * \ + * (c) + */ + parent = successor; + child2 = successor->rb_right; + augment->copy(node, successor); + } else { + /* + * Case 3: node's successor is leftmost under + * node's right child subtree + * + * (n) (s) + * / \ / \ + * (x) (y) -> (x) (y) + * / / + * (p) (p) + * / / + * (s) (c) + * \ + * (c) + */ + do { + parent = successor; + successor = tmp; + tmp = tmp->rb_left; + } while (tmp); + parent->rb_left = child2 = successor->rb_right; + successor->rb_right = child; + rb_set_parent(child, successor); + augment->copy(node, successor); + augment->propagate(parent, successor); + } + + successor->rb_left = tmp = node->rb_left; + rb_set_parent(tmp, successor); + + pc = node->__rb_parent_color; + tmp = __rb_parent(pc); + __rb_change_child(node, successor, tmp, root); + if (child2) { + successor->__rb_parent_color = pc; + rb_set_parent_color(child2, parent, RB_BLACK); + rebalance = NULL; + } else { + unsigned long pc2 = successor->__rb_parent_color; + successor->__rb_parent_color = pc; + rebalance = __rb_is_black(pc2) ? parent : NULL; + } + tmp = successor; + } + + augment->propagate(tmp, NULL); + if (rebalance) + __rb_erase_color(rebalance, root, augment->rotate); +} + +#endif /* _LINUX_RBTREE_AUGMENTED_H */ diff --git a/lib/rbtree.c b/lib/rbtree.c index c0088ca345f9..4f56a11d67fa 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -21,7 +21,7 @@ linux/lib/rbtree.c */ -#include <linux/rbtree.h> +#include <linux/rbtree_augmented.h> #include <linux/export.h> /* @@ -44,52 +44,16 @@ * parentheses and have some accompanying text comment.
*/ -#define RB_RED 0 -#define RB_BLACK 1 - -#define __rb_parent(pc) ((struct rb_node *)(pc & ~3)) - -#define __rb_color(pc) ((pc) & 1) -#define __rb_is_black(pc) __rb_color(pc) -#define __rb_is_red(pc) (!__rb_color(pc)) -#define rb_color(rb) __rb_color((rb)->__rb_parent_color) -#define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color) -#define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color) - static inline void rb_set_black(struct rb_node *rb) { rb->__rb_parent_color |= RB_BLACK; } -static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) -{ - rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; -} - -static inline void rb_set_parent_color(struct rb_node *rb, - struct rb_node *p, int color) -{ - rb->__rb_parent_color = (unsigned long)p | color; -} - static inline struct rb_node *rb_red_parent(struct rb_node *red) { return (struct rb_node *)red->__rb_parent_color; } -static inline void -__rb_change_child(struct rb_node *old, struct rb_node *new, - struct rb_node *parent, struct rb_root *root) -{ - if (parent) { - if (parent->rb_left == old) - parent->rb_left = new; - else - parent->rb_right = new; - } else - root->rb_node = new; -} - /* * Helper function for rotations: * - old's parent and color get assigned to new @@ -230,9 +194,9 @@ __rb_insert(struct rb_node *node, struct rb_root *root, } } -static __always_inline void +__always_inline void __rb_erase_color(struct rb_node *parent, struct rb_root *root, - const struct rb_augment_callbacks *augment) + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { struct rb_node *node = NULL, *sibling, *tmp1, *tmp2; @@ -261,7 +225,7 @@ __rb_erase_color(struct rb_node *parent, struct rb_root *root, rb_set_parent_color(tmp1, parent, RB_BLACK); __rb_rotate_set_parents(parent, sibling, root, RB_RED); - augment->rotate(parent, sibling); + augment_rotate(parent, sibling); sibling = tmp1; } tmp1 = sibling->rb_right; @@ -313,7 +277,7 @@ __rb_erase_color(struct rb_node *parent, struct rb_root *root, if (tmp1) rb_set_parent_color(tmp1, sibling, RB_BLACK); - augment->rotate(sibling, tmp2); + augment_rotate(sibling, tmp2); tmp1 = sibling; sibling = tmp2; } @@ -336,7 +300,7 @@ __rb_erase_color(struct rb_node *parent, struct rb_root *root, rb_set_parent(tmp2, parent); __rb_rotate_set_parents(parent, sibling, root, RB_BLACK); - augment->rotate(parent, sibling); + augment_rotate(parent, sibling); break; } else { sibling = parent->rb_left; @@ -347,7 +311,7 @@ __rb_erase_color(struct rb_node *parent, struct rb_root *root, rb_set_parent_color(tmp1, parent, RB_BLACK); __rb_rotate_set_parents(parent, sibling, root, RB_RED); - augment->rotate(parent, sibling); + augment_rotate(parent, sibling); sibling = tmp1; } tmp1 = sibling->rb_left; @@ -374,7 +338,7 @@ __rb_erase_color(struct rb_node *parent, struct rb_root *root, if (tmp1) rb_set_parent_color(tmp1, sibling, RB_BLACK); - augment->rotate(sibling, tmp2); + augment_rotate(sibling, tmp2); tmp1 = sibling; sibling = tmp2; } @@ -386,109 +350,12 @@ __rb_erase_color(struct rb_node *parent, struct rb_root *root, rb_set_parent(tmp2, parent); __rb_rotate_set_parents(parent, sibling, root, RB_BLACK); - augment->rotate(parent, sibling); + augment_rotate(parent, sibling); break; } } } - -static __always_inline void -__rb_erase(struct rb_node *node, struct rb_root *root, - const struct rb_augment_callbacks *augment) -{ - struct rb_node *child = node->rb_right, *tmp = node->rb_left; - struct rb_node *parent, *rebalance; - unsigned long pc; - - if (!tmp) { - /* - * Case 1: node to erase has no 
more than 1 child (easy!) - * - * Note that if there is one child it must be red due to 5) - * and node must be black due to 4). We adjust colors locally - * so as to bypass __rb_erase_color() later on. - */ - pc = node->__rb_parent_color; - parent = __rb_parent(pc); - __rb_change_child(node, child, parent, root); - if (child) { - child->__rb_parent_color = pc; - rebalance = NULL; - } else - rebalance = __rb_is_black(pc) ? parent : NULL; - tmp = parent; - } else if (!child) { - /* Still case 1, but this time the child is node->rb_left */ - tmp->__rb_parent_color = pc = node->__rb_parent_color; - parent = __rb_parent(pc); - __rb_change_child(node, tmp, parent, root); - rebalance = NULL; - tmp = parent; - } else { - struct rb_node *successor = child, *child2; - tmp = child->rb_left; - if (!tmp) { - /* - * Case 2: node's successor is its right child - * - * (n) (s) - * / \ / \ - * (x) (s) -> (x) (c) - * \ - * (c) - */ - parent = successor; - child2 = successor->rb_right; - augment->copy(node, successor); - } else { - /* - * Case 3: node's successor is leftmost under - * node's right child subtree - * - * (n) (s) - * / \ / \ - * (x) (y) -> (x) (y) - * / / - * (p) (p) - * / / - * (s) (c) - * \ - * (c) - */ - do { - parent = successor; - successor = tmp; - tmp = tmp->rb_left; - } while (tmp); - parent->rb_left = child2 = successor->rb_right; - successor->rb_right = child; - rb_set_parent(child, successor); - augment->copy(node, successor); - augment->propagate(parent, successor); - } - - successor->rb_left = tmp = node->rb_left; - rb_set_parent(tmp, successor); - - pc = node->__rb_parent_color; - tmp = __rb_parent(pc); - __rb_change_child(node, successor, tmp, root); - if (child2) { - successor->__rb_parent_color = pc; - rb_set_parent_color(child2, parent, RB_BLACK); - rebalance = NULL; - } else { - unsigned long pc2 = successor->__rb_parent_color; - successor->__rb_parent_color = pc; - rebalance = __rb_is_black(pc2) ? parent : NULL; - } - tmp = successor; - } - - augment->propagate(tmp, NULL); - if (rebalance) - __rb_erase_color(rebalance, root, augment); -} +EXPORT_SYMBOL(__rb_erase_color); /* * Non-augmented rbtree manipulation functions. @@ -513,7 +380,7 @@ EXPORT_SYMBOL(rb_insert_color); void rb_erase(struct rb_node *node, struct rb_root *root) { - __rb_erase(node, root, &dummy_callbacks); + rb_erase_augmented(node, root, &dummy_callbacks); } EXPORT_SYMBOL(rb_erase); @@ -531,13 +398,6 @@ void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, } EXPORT_SYMBOL(__rb_insert_augmented); -void rb_erase_augmented(struct rb_node *node, struct rb_root *root, - const struct rb_augment_callbacks *augment) -{ - __rb_erase(node, root, augment); -} -EXPORT_SYMBOL(rb_erase_augmented); - /* * This function returns the first node (in sort order) of the tree. */ diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index b20e99969b0f..268b23951fec 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -1,5 +1,5 @@ #include <linux/module.h> -#include <linux/rbtree.h> +#include <linux/rbtree_augmented.h> #include <linux/random.h> #include <asm/timex.h> -- cgit v1.2.3 From 9826a516ff77c5820e591211e4f3e58ff36f46be Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:35 -0700 Subject: mm: interval tree updates Update the generic interval tree code that was introduced in "mm: replace vma prio_tree with an interval tree".
Changes: - fixed 'endpoing' typo noticed by Andrew Morton - replaced include/linux/interval_tree_tmpl.h, which was used as a template (including it automatically defined the interval tree functions) with include/linux/interval_tree_generic.h, which only defines a preprocessor macro INTERVAL_TREE_DEFINE(), which itself defines the interval tree functions when invoked. Now that is a very long macro which is unfortunate, but it does make the usage sites (lib/interval_tree.c and mm/interval_tree.c) a bit nicer than previously. - make use of RB_DECLARE_CALLBACKS() in the INTERVAL_TREE_DEFINE() macro, instead of duplicating that code in the interval tree template. - replaced vma_interval_tree_add(), which was actually handling the nonlinear and interval tree cases, with vma_interval_tree_insert_after() which handles only the interval tree case and has an API that is more consistent with the other interval tree handling functions. The nonlinear case is now handled explicitly in kernel/fork.c dup_mmap(). Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/interval_tree_generic.h | 191 +++++++++++++++++++++++++++++ include/linux/interval_tree_tmpl.h | 219 ---------------------------------- include/linux/mm.h | 6 +- kernel/fork.c | 7 +- lib/interval_tree.c | 15 +-- mm/interval_tree.c | 60 +++++----- 6 files changed, 235 insertions(+), 263 deletions(-) create mode 100644 include/linux/interval_tree_generic.h delete mode 100644 include/linux/interval_tree_tmpl.h (limited to 'lib') diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h new file mode 100644 index 000000000000..58370e1862ad --- /dev/null +++ b/include/linux/interval_tree_generic.h @@ -0,0 +1,191 @@ +/* + Interval Trees + (C) 2012 Michel Lespinasse + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + include/linux/interval_tree_generic.h +*/ + +#include <linux/rbtree_augmented.h> + +/* + * Template for implementing interval trees + * + * ITSTRUCT: struct type of the interval tree nodes + * ITRB: name of struct rb_node field within ITSTRUCT + * ITTYPE: type of the interval endpoints + * ITSUBTREE: name of ITTYPE field within ITSTRUCT holding last-in-subtree + * ITSTART(n): start endpoint of ITSTRUCT node n + * ITLAST(n): last endpoint of ITSTRUCT node n + * ITSTATIC: 'static' or empty + * ITPREFIX: prefix to use for the inline tree definitions + * + * Note - before using this, please consider if non-generic version + * (interval_tree.h) would work for you...
+ */ + +#define INTERVAL_TREE_DEFINE(ITSTRUCT, ITRB, ITTYPE, ITSUBTREE, \ + ITSTART, ITLAST, ITSTATIC, ITPREFIX) \ + \ +/* Callbacks for augmented rbtree insert and remove */ \ + \ +static inline ITTYPE ITPREFIX ## _compute_subtree_last(ITSTRUCT *node) \ +{ \ + ITTYPE max = ITLAST(node), subtree_last; \ + if (node->ITRB.rb_left) { \ + subtree_last = rb_entry(node->ITRB.rb_left, \ + ITSTRUCT, ITRB)->ITSUBTREE; \ + if (max < subtree_last) \ + max = subtree_last; \ + } \ + if (node->ITRB.rb_right) { \ + subtree_last = rb_entry(node->ITRB.rb_right, \ + ITSTRUCT, ITRB)->ITSUBTREE; \ + if (max < subtree_last) \ + max = subtree_last; \ + } \ + return max; \ +} \ + \ +RB_DECLARE_CALLBACKS(static, ITPREFIX ## _augment, ITSTRUCT, ITRB, \ + ITTYPE, ITSUBTREE, ITPREFIX ## _compute_subtree_last) \ + \ +/* Insert / remove interval nodes from the tree */ \ + \ +ITSTATIC void ITPREFIX ## _insert(ITSTRUCT *node, struct rb_root *root) \ +{ \ + struct rb_node **link = &root->rb_node, *rb_parent = NULL; \ + ITTYPE start = ITSTART(node), last = ITLAST(node); \ + ITSTRUCT *parent; \ + \ + while (*link) { \ + rb_parent = *link; \ + parent = rb_entry(rb_parent, ITSTRUCT, ITRB); \ + if (parent->ITSUBTREE < last) \ + parent->ITSUBTREE = last; \ + if (start < ITSTART(parent)) \ + link = &parent->ITRB.rb_left; \ + else \ + link = &parent->ITRB.rb_right; \ + } \ + \ + node->ITSUBTREE = last; \ + rb_link_node(&node->ITRB, rb_parent, link); \ + rb_insert_augmented(&node->ITRB, root, &ITPREFIX ## _augment); \ +} \ + \ +ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node, struct rb_root *root) \ +{ \ + rb_erase_augmented(&node->ITRB, root, &ITPREFIX ## _augment); \ +} \ + \ +/* \ + * Iterate over intervals intersecting [start;last] \ + * \ + * Note that a node's interval intersects [start;last] iff: \ + * Cond1: ITSTART(node) <= last \ + * and \ + * Cond2: start <= ITLAST(node) \ + */ \ + \ +static ITSTRUCT * \ +ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last) \ +{ \ + while (true) { \ + /* \ + * Loop invariant: start <= node->ITSUBTREE \ + * (Cond2 is satisfied by one of the subtree nodes) \ + */ \ + if (node->ITRB.rb_left) { \ + ITSTRUCT *left = rb_entry(node->ITRB.rb_left, \ + ITSTRUCT, ITRB); \ + if (start <= left->ITSUBTREE) { \ + /* \ + * Some nodes in left subtree satisfy Cond2. \ + * Iterate to find the leftmost such node N. \ + * If it also satisfies Cond1, that's the \ + * match we are looking for. Otherwise, there \ + * is no matching interval as nodes to the \ + * right of N can't satisfy Cond1 either. 
\ + */ \ + node = left; \ + continue; \ + } \ + } \ + if (ITSTART(node) <= last) { /* Cond1 */ \ + if (start <= ITLAST(node)) /* Cond2 */ \ + return node; /* node is leftmost match */ \ + if (node->ITRB.rb_right) { \ + node = rb_entry(node->ITRB.rb_right, \ + ITSTRUCT, ITRB); \ + if (start <= node->ITSUBTREE) \ + continue; \ + } \ + } \ + return NULL; /* No match */ \ + } \ +} \ + \ +ITSTATIC ITSTRUCT * \ +ITPREFIX ## _iter_first(struct rb_root *root, ITTYPE start, ITTYPE last) \ +{ \ + ITSTRUCT *node; \ + \ + if (!root->rb_node) \ + return NULL; \ + node = rb_entry(root->rb_node, ITSTRUCT, ITRB); \ + if (node->ITSUBTREE < start) \ + return NULL; \ + return ITPREFIX ## _subtree_search(node, start, last); \ +} \ + \ +ITSTATIC ITSTRUCT * \ +ITPREFIX ## _iter_next(ITSTRUCT *node, ITTYPE start, ITTYPE last) \ +{ \ + struct rb_node *rb = node->ITRB.rb_right, *prev; \ + \ + while (true) { \ + /* \ + * Loop invariants: \ + * Cond1: ITSTART(node) <= last \ + * rb == node->ITRB.rb_right \ + * \ + * First, search right subtree if suitable \ + */ \ + if (rb) { \ + ITSTRUCT *right = rb_entry(rb, ITSTRUCT, ITRB); \ + if (start <= right->ITSUBTREE) \ + return ITPREFIX ## _subtree_search(right, \ + start, last); \ + } \ + \ + /* Move up the tree until we come from a node's left child */ \ + do { \ + rb = rb_parent(&node->ITRB); \ + if (!rb) \ + return NULL; \ + prev = &node->ITRB; \ + node = rb_entry(rb, ITSTRUCT, ITRB); \ + rb = node->ITRB.rb_right; \ + } while (prev == rb); \ + \ + /* Check if the node intersects [start;last] */ \ + if (last < ITSTART(node)) /* !Cond1 */ \ + return NULL; \ + else if (start <= ITLAST(node)) /* Cond2 */ \ + return node; \ + } \ +} diff --git a/include/linux/interval_tree_tmpl.h b/include/linux/interval_tree_tmpl.h deleted file mode 100644 index c1aeb922d65f..000000000000 --- a/include/linux/interval_tree_tmpl.h +++ /dev/null @@ -1,219 +0,0 @@ -/* - Interval Trees - (C) 2012 Michel Lespinasse - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - - include/linux/interval_tree_tmpl.h -*/ - -#include - -/* - * Template for implementing interval trees - * - * ITSTRUCT: struct type of the interval tree nodes - * ITRB: name of struct rb_node field within ITSTRUCT - * ITTYPE: type of the interval endpoints - * ITSUBTREE: name of ITTYPE field within ITSTRUCT holding last-in-subtree - * ITSTART(n): start endpoint of ITSTRUCT node n - * ITLAST(n): last endpoing of ITSTRUCT node n - * ITSTATIC: 'static' or empty - * ITPREFIX: prefix to use for the inline tree definitions - */ - -/* IT(name) -> ITPREFIX_name */ -#define _ITNAME(prefix, name) prefix ## _ ## name -#define ITNAME(prefix, name) _ITNAME(prefix, name) -#define IT(name) ITNAME(ITPREFIX, name) - -/* Callbacks for augmented rbtree insert and remove */ - -static inline ITTYPE IT(compute_subtree_last)(ITSTRUCT *node) -{ - ITTYPE max = ITLAST(node), subtree_last; - if (node->ITRB.rb_left) { - subtree_last = rb_entry(node->ITRB.rb_left, - ITSTRUCT, ITRB)->ITSUBTREE; - if (max < subtree_last) - max = subtree_last; - } - if (node->ITRB.rb_right) { - subtree_last = rb_entry(node->ITRB.rb_right, - ITSTRUCT, ITRB)->ITSUBTREE; - if (max < subtree_last) - max = subtree_last; - } - return max; -} - -static inline void -IT(augment_propagate)(struct rb_node *rb, struct rb_node *stop) -{ - while (rb != stop) { - ITSTRUCT *node = rb_entry(rb, ITSTRUCT, ITRB); - ITTYPE subtree_last = IT(compute_subtree_last)(node); - if (node->ITSUBTREE == subtree_last) - break; - node->ITSUBTREE = subtree_last; - rb = rb_parent(&node->ITRB); - } -} - -static inline void -IT(augment_copy)(struct rb_node *rb_old, struct rb_node *rb_new) -{ - ITSTRUCT *old = rb_entry(rb_old, ITSTRUCT, ITRB); - ITSTRUCT *new = rb_entry(rb_new, ITSTRUCT, ITRB); - - new->ITSUBTREE = old->ITSUBTREE; -} - -static void IT(augment_rotate)(struct rb_node *rb_old, struct rb_node *rb_new) -{ - ITSTRUCT *old = rb_entry(rb_old, ITSTRUCT, ITRB); - ITSTRUCT *new = rb_entry(rb_new, ITSTRUCT, ITRB); - - new->ITSUBTREE = old->ITSUBTREE; - old->ITSUBTREE = IT(compute_subtree_last)(old); -} - -static const struct rb_augment_callbacks IT(augment_callbacks) = { - IT(augment_propagate), IT(augment_copy), IT(augment_rotate) -}; - -/* Insert / remove interval nodes from the tree */ - -ITSTATIC void IT(insert)(ITSTRUCT *node, struct rb_root *root) -{ - struct rb_node **link = &root->rb_node, *rb_parent = NULL; - ITTYPE start = ITSTART(node), last = ITLAST(node); - ITSTRUCT *parent; - - while (*link) { - rb_parent = *link; - parent = rb_entry(rb_parent, ITSTRUCT, ITRB); - if (parent->ITSUBTREE < last) - parent->ITSUBTREE = last; - if (start < ITSTART(parent)) - link = &parent->ITRB.rb_left; - else - link = &parent->ITRB.rb_right; - } - - node->ITSUBTREE = last; - rb_link_node(&node->ITRB, rb_parent, link); - rb_insert_augmented(&node->ITRB, root, &IT(augment_callbacks)); -} - -ITSTATIC void IT(remove)(ITSTRUCT *node, struct rb_root *root) -{ - rb_erase_augmented(&node->ITRB, root, &IT(augment_callbacks)); -} - -/* - * Iterate over intervals intersecting [start;last] - * - * Note that a node's interval intersects [start;last] iff: - * Cond1: ITSTART(node) <= last - * and - * Cond2: start <= ITLAST(node) - */ - -static ITSTRUCT *IT(subtree_search)(ITSTRUCT *node, ITTYPE start, ITTYPE last) -{ - while (true) { - /* - * Loop invariant: start <= node->ITSUBTREE - 
* (Cond2 is satisfied by one of the subtree nodes) - */ - if (node->ITRB.rb_left) { - ITSTRUCT *left = rb_entry(node->ITRB.rb_left, - ITSTRUCT, ITRB); - if (start <= left->ITSUBTREE) { - /* - * Some nodes in left subtree satisfy Cond2. - * Iterate to find the leftmost such node N. - * If it also satisfies Cond1, that's the match - * we are looking for. Otherwise, there is no - * matching interval as nodes to the right of N - * can't satisfy Cond1 either. - */ - node = left; - continue; - } - } - if (ITSTART(node) <= last) { /* Cond1 */ - if (start <= ITLAST(node)) /* Cond2 */ - return node; /* node is leftmost match */ - if (node->ITRB.rb_right) { - node = rb_entry(node->ITRB.rb_right, - ITSTRUCT, ITRB); - if (start <= node->ITSUBTREE) - continue; - } - } - return NULL; /* No match */ - } -} - -ITSTATIC ITSTRUCT *IT(iter_first)(struct rb_root *root, - ITTYPE start, ITTYPE last) -{ - ITSTRUCT *node; - - if (!root->rb_node) - return NULL; - node = rb_entry(root->rb_node, ITSTRUCT, ITRB); - if (node->ITSUBTREE < start) - return NULL; - return IT(subtree_search)(node, start, last); -} - -ITSTATIC ITSTRUCT *IT(iter_next)(ITSTRUCT *node, ITTYPE start, ITTYPE last) -{ - struct rb_node *rb = node->ITRB.rb_right, *prev; - - while (true) { - /* - * Loop invariants: - * Cond1: ITSTART(node) <= last - * rb == node->ITRB.rb_right - * - * First, search right subtree if suitable - */ - if (rb) { - ITSTRUCT *right = rb_entry(rb, ITSTRUCT, ITRB); - if (start <= right->ITSUBTREE) - return IT(subtree_search)(right, start, last); - } - - /* Move up the tree until we come from a node's left child */ - do { - rb = rb_parent(&node->ITRB); - if (!rb) - return NULL; - prev = &node->ITRB; - node = rb_entry(rb, ITSTRUCT, ITRB); - rb = node->ITRB.rb_right; - } while (prev == rb); - - /* Check if the node intersects [start;last] */ - if (last < ITSTART(node)) /* !Cond1 */ - return NULL; - else if (start <= ITLAST(node)) /* Cond2 */ - return node; - } -} diff --git a/include/linux/mm.h b/include/linux/mm.h index 0f671ef09eba..f1d9aaadb566 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1355,11 +1355,11 @@ extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); /* interval_tree.c */ -void vma_interval_tree_add(struct vm_area_struct *vma, - struct vm_area_struct *old, - struct address_space *mapping); void vma_interval_tree_insert(struct vm_area_struct *node, struct rb_root *root); +void vma_interval_tree_insert_after(struct vm_area_struct *node, + struct vm_area_struct *prev, + struct rb_root *root); void vma_interval_tree_remove(struct vm_area_struct *node, struct rb_root *root); struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root *root, diff --git a/kernel/fork.c b/kernel/fork.c index 90dace52715e..1cd7d581b3b2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -423,7 +423,12 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mapping->i_mmap_writable++; flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ - vma_interval_tree_add(tmp, mpnt, mapping); + if (unlikely(tmp->vm_flags & VM_NONLINEAR)) + vma_nonlinear_insert(tmp, + &mapping->i_mmap_nonlinear); + else + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } diff --git a/lib/interval_tree.c b/lib/interval_tree.c index 77a793e0644b..e6eb406f2d65 100644 --- a/lib/interval_tree.c +++ b/lib/interval_tree.c @@ -1,13 +1,10 @@ #include #include 
+#include -#define ITSTRUCT struct interval_tree_node -#define ITRB rb -#define ITTYPE unsigned long -#define ITSUBTREE __subtree_last -#define ITSTART(n) ((n)->start) -#define ITLAST(n) ((n)->last) -#define ITSTATIC -#define ITPREFIX interval_tree +#define START(node) ((node)->start) +#define LAST(node) ((node)->last) -#include +INTERVAL_TREE_DEFINE(struct interval_tree_node, rb, + unsigned long, __subtree_last, + START, LAST,, interval_tree) diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 7dc565660e56..4ab7b9ec3a56 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -8,40 +8,38 @@ #include #include +#include -#define ITSTRUCT struct vm_area_struct -#define ITRB shared.linear.rb -#define ITTYPE unsigned long -#define ITSUBTREE shared.linear.rb_subtree_last -#define ITSTART(n) ((n)->vm_pgoff) -#define ITLAST(n) ((n)->vm_pgoff + \ - (((n)->vm_end - (n)->vm_start) >> PAGE_SHIFT) - 1) -#define ITSTATIC -#define ITPREFIX vma_interval_tree - -#include - -/* Insert old immediately after vma in the interval tree */ -void vma_interval_tree_add(struct vm_area_struct *vma, - struct vm_area_struct *old, - struct address_space *mapping) +static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) +{ + return v->vm_pgoff; +} + +static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) +{ + return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; +} + +INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb, + unsigned long, shared.linear.rb_subtree_last, + vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) + +/* Insert node immediately after prev in the interval tree */ +void vma_interval_tree_insert_after(struct vm_area_struct *node, + struct vm_area_struct *prev, + struct rb_root *root) { struct rb_node **link; struct vm_area_struct *parent; - unsigned long last; - - if (unlikely(vma->vm_flags & VM_NONLINEAR)) { - list_add(&vma->shared.nonlinear, &old->shared.nonlinear); - return; - } + unsigned long last = vma_last_pgoff(node); - last = ITLAST(vma); + VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev)); - if (!old->shared.linear.rb.rb_right) { - parent = old; - link = &old->shared.linear.rb.rb_right; + if (!prev->shared.linear.rb.rb_right) { + parent = prev; + link = &prev->shared.linear.rb.rb_right; } else { - parent = rb_entry(old->shared.linear.rb.rb_right, + parent = rb_entry(prev->shared.linear.rb.rb_right, struct vm_area_struct, shared.linear.rb); if (parent->shared.linear.rb_subtree_last < last) parent->shared.linear.rb_subtree_last = last; @@ -54,8 +52,8 @@ void vma_interval_tree_add(struct vm_area_struct *vma, link = &parent->shared.linear.rb.rb_left; } - vma->shared.linear.rb_subtree_last = last; - rb_link_node(&vma->shared.linear.rb, &parent->shared.linear.rb, link); - rb_insert_augmented(&vma->shared.linear.rb, &mapping->i_mmap, - &vma_interval_tree_augment_callbacks); + node->shared.linear.rb_subtree_last = last; + rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link); + rb_insert_augmented(&node->shared.linear.rb, root, + &vma_interval_tree_augment); } -- cgit v1.2.3 From ed8ea8150182f8d715fceb3b175ef0a9ebacd872 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:45 -0700 Subject: mm: add CONFIG_DEBUG_VM_RB build option Add a CONFIG_DEBUG_VM_RB build option for the previously existing DEBUG_MM_RB code. Now that Andi Kleen modified it to avoid using recursive algorithms, we can expose it a bit more. 
Also extend this code to validate_mm() after stack expansion, and to check that the vma's start and last pgoffs have not changed since the nodes were inserted on the anon vma interval tree (as it is important that the nodes be reindexed after each such update). Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +++ include/linux/rmap.h | 3 +++ lib/Kconfig.debug | 9 +++++++++ mm/interval_tree.c | 41 ++++++++++++++++++++++++++++++++++++++++- mm/mmap.c | 19 +++++++++---------- 5 files changed, 64 insertions(+), 11 deletions(-) (limited to 'lib') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0cdab4e0f814..0e6f9c9f2123 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1386,6 +1386,9 @@ struct anon_vma_chain *anon_vma_interval_tree_iter_first( struct rb_root *root, unsigned long start, unsigned long last); struct anon_vma_chain *anon_vma_interval_tree_iter_next( struct anon_vma_chain *node, unsigned long start, unsigned long last); +#ifdef CONFIG_DEBUG_VM_RB +void anon_vma_interval_tree_verify(struct anon_vma_chain *node); +#endif #define anon_vma_interval_tree_foreach(avc, root, start, last) \ for (avc = anon_vma_interval_tree_iter_first(root, start, last); \ diff --git a/include/linux/rmap.h b/include/linux/rmap.h index dce44f7d3ed8..b2cce644ffc7 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -66,6 +66,9 @@ struct anon_vma_chain { struct list_head same_vma; /* locked by mmap_sem & page_table_lock */ struct rb_node rb; /* locked by anon_vma->mutex */ unsigned long rb_subtree_last; +#ifdef CONFIG_DEBUG_VM_RB + unsigned long cached_vma_start, cached_vma_last; +#endif }; #ifdef CONFIG_MMU diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a6e7e7741523..28e9d6c98941 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -798,6 +798,15 @@ config DEBUG_VM If unsure, say N. +config DEBUG_VM_RB + bool "Debug VM red-black trees" + depends on DEBUG_VM + help + Enable this to turn on more extended checks in the virtual-memory + system that may impact performance. + + If unsure, say N. 
+ config DEBUG_VIRTUAL bool "Debug VM translations" depends on DEBUG_KERNEL && X86 diff --git a/mm/interval_tree.c b/mm/interval_tree.c index f7c72cd35e1d..4a5822a586e6 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -70,4 +70,43 @@ static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc) } INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last, - avc_start_pgoff, avc_last_pgoff,, anon_vma_interval_tree) + avc_start_pgoff, avc_last_pgoff, + static inline, __anon_vma_interval_tree) + +void anon_vma_interval_tree_insert(struct anon_vma_chain *node, + struct rb_root *root) +{ +#ifdef CONFIG_DEBUG_VM_RB + node->cached_vma_start = avc_start_pgoff(node); + node->cached_vma_last = avc_last_pgoff(node); +#endif + __anon_vma_interval_tree_insert(node, root); +} + +void anon_vma_interval_tree_remove(struct anon_vma_chain *node, + struct rb_root *root) +{ + __anon_vma_interval_tree_remove(node, root); +} + +struct anon_vma_chain * +anon_vma_interval_tree_iter_first(struct rb_root *root, + unsigned long first, unsigned long last) +{ + return __anon_vma_interval_tree_iter_first(root, first, last); +} + +struct anon_vma_chain * +anon_vma_interval_tree_iter_next(struct anon_vma_chain *node, + unsigned long first, unsigned long last) +{ + return __anon_vma_interval_tree_iter_next(node, first, last); +} + +#ifdef CONFIG_DEBUG_VM_RB +void anon_vma_interval_tree_verify(struct anon_vma_chain *node) +{ + WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node)); + WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node)); +} +#endif diff --git a/mm/mmap.c b/mm/mmap.c index 2e580ed79211..deb422c39e21 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -51,12 +51,6 @@ static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); -/* - * WARNING: the debugging will use recursive algorithms so never enable this - * unless you know what you are doing. - */ -#undef DEBUG_MM_RB - /* description of effects of mapping type and prot in current implementation. * this is due to the limited x86 page protection hardware. 
The expected * behavior is in parens: @@ -303,7 +297,7 @@ out: return retval; } -#ifdef DEBUG_MM_RB +#ifdef CONFIG_DEBUG_VM_RB static int browse_rb(struct rb_root *root) { int i = 0, j; @@ -337,9 +331,12 @@ void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; - struct vm_area_struct *tmp = mm->mmap; - while (tmp) { - tmp = tmp->vm_next; + struct vm_area_struct *vma = mm->mmap; + while (vma) { + struct anon_vma_chain *avc; + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_verify(avc); + vma = vma->vm_next; i++; } if (i != mm->map_count) @@ -1790,6 +1787,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } vma_unlock_anon_vma(vma); khugepaged_enter_vma_merge(vma); + validate_mm(vma->vm_mm); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -1843,6 +1841,7 @@ int expand_downwards(struct vm_area_struct *vma, } vma_unlock_anon_vma(vma); khugepaged_enter_vma_merge(vma); + validate_mm(vma->vm_mm); return error; } -- cgit v1.2.3 From dbadc17683e6c673a69b236c0f041b931cc55c42 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 4 Oct 2012 14:21:23 +0100 Subject: X.509: Fix indefinite length element skip error handling asn1_find_indefinite_length() returns an error indicator of -1, which the caller asn1_ber_decoder() places in a size_t (which is unsigned) and then checks to see whether it is less than 0 (which it can't be). This can lead to the following warning: lib/asn1_decoder.c:320 asn1_ber_decoder() warn: unsigned 'len' is never less than zero. Instead, asn1_find_indefinite_length() updates the caller's idea of the data cursor and length separately from returning the error code. Reported-by: Dan Carpenter Signed-off-by: David Howells Signed-off-by: Rusty Russell --- lib/asn1_decoder.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'lib') diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c index 2e4196ddf06f..de2c8b5a715b 100644 --- a/lib/asn1_decoder.c +++ b/lib/asn1_decoder.c @@ -46,12 +46,18 @@ static const unsigned char asn1_op_lengths[ASN1_OP__NR] = { /* * Find the length of an indefinite length object + * @data: The data buffer + * @datalen: The end of the innermost containing element in the buffer + * @_dp: The data parse cursor (updated before returning) + * @_len: Where to return the size of the element. + * @_errmsg: Where to return a pointer to an error message on error */ static int asn1_find_indefinite_length(const unsigned char *data, size_t datalen, - const char **_errmsg, size_t *_err_dp) + size_t *_dp, size_t *_len, + const char **_errmsg) { unsigned char tag, tmp; - size_t dp = 0, len, n; + size_t dp = *_dp, len, n; int indef_level = 1; next_tag: @@ -67,8 +73,11 @@ next_tag: /* It appears to be an EOC.
*/ if (data[dp++] != 0) goto invalid_eoc; - if (--indef_level <= 0) - return dp; + if (--indef_level <= 0) { + *_len = dp - *_dp; + *_dp = dp; + return 0; + } goto next_tag; } @@ -122,7 +131,7 @@ data_overrun_error: missing_eoc: *_errmsg = "Missing EOC in indefinite len cons"; error: - *_err_dp = dp; + *_dp = dp; return -1; } @@ -315,13 +324,14 @@ next_op: skip_data: if (!(flags & FLAG_CONS)) { if (flags & FLAG_INDEFINITE_LENGTH) { - len = asn1_find_indefinite_length( - data + dp, datalen - dp, &errmsg, &dp); - if (len < 0) + ret = asn1_find_indefinite_length( + data, datalen, &dp, &len, &errmsg); + if (ret < 0) goto error; + } else { + dp += len; } pr_debug("- LEAF: %zu\n", len); - dp += len; } pc += asn1_op_lengths[op]; goto next_op; -- cgit v1.2.3 From 3e1aa66bd423950aa69c3d50d91818af1d16e0a7 Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Wed, 10 Oct 2012 15:54:04 -0700 Subject: lib/kasprintf.c: use kmalloc_track_caller() to get accurate traces for kvasprintf Previously, kvasprintf()'s allocation was done through kmalloc(), thus producing an inaccurate trace report. This is a common problem: in order to get accurate callsite tracing, a lib/ utility function shouldn't allocate with kmalloc() directly, but should instead use kmalloc_track_caller(). Signed-off-by: Ezequiel Garcia Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/kasprintf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/kasprintf.c b/lib/kasprintf.c index ae0de80c1c88..32f12150fc4f 100644 --- a/lib/kasprintf.c +++ b/lib/kasprintf.c @@ -21,7 +21,7 @@ char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap) len = vsnprintf(NULL, 0, fmt, aq); va_end(aq); - p = kmalloc(len+1, gfp); + p = kmalloc_track_caller(len+1, gfp); if (!p) return NULL; -- cgit v1.2.3 From fe73fbe1c5eda709084dedb66cbdd4b86826cce7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 19 Oct 2012 13:57:01 -0700 Subject: lib/dma-debug.c: fix __hash_bucket_find() If there is only one match, the unique matched entry should be returned. Without the fix, the upcoming dma debug interfaces ("dma-debug: new interfaces to debug dma mapping errors") can't work reliably because only device and dma_addr are passed to dma_mapping_error(). Signed-off-by: Ming Lei Reported-by: Wu Fengguang Cc: Joerg Roedel Tested-by: Shuah Khan Cc: Paul Gortmaker Cc: Jakub Kicinski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/dma-debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/dma-debug.c b/lib/dma-debug.c index b9087bff008b..d84beb994f36 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -264,7 +264,7 @@ static struct dma_debug_entry *__hash_bucket_find(struct hash_bucket *bucket, match_fn match) { struct dma_debug_entry *entry, *ret = NULL; - int matches = 0, match_lvl, last_lvl = 0; + int matches = 0, match_lvl, last_lvl = -1; list_for_each_entry(entry, &bucket->list, list) { if (!match(ref, entry)) @@ -293,7 +293,7 @@ static struct dma_debug_entry *__hash_bucket_find(struct hash_bucket *bucket, } else if (match_lvl > last_lvl) { /* * We found an entry that fits better then the - * previous one + * previous one or it is the 1st match.
*/ last_lvl = match_lvl; ret = entry; -- cgit v1.2.3 From eedce141cd2dad8d0cefc5468ef41898949a7031 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Thu, 25 Oct 2012 13:37:51 -0700 Subject: genalloc: stop crashing the system when destroying a pool The genalloc code uses the bitmap API from include/linux/bitmap.h and lib/bitmap.c, which is based on long values. Both bitmap_set from lib/bitmap.c and bitmap_set_ll, which is the lockless version from genalloc.c, use BITMAP_LAST_WORD_MASK to set the first bits in a long in the bitmap. That one uses (1 << bits) - 1, i.e. 0b111 if you are setting the first three bits. This means that the API counts from the least significant bits (LSB from now on) to the MSB. The LSB in the first long is bit 0, then. The same works for the lookup functions. The genalloc code uses longs for the bitmap, as it should. In include/linux/genalloc.h, struct gen_pool_chunk has unsigned long bits[0] as its last member. When allocating the struct, genalloc should reserve enough space for the bitmap. This should be a proper number of longs that can fit the amount of bits in the bitmap. However, genalloc allocates an integer number of bytes that fit the amount of bits, which may not be an integer amount of longs. 9 bytes, for example, could be allocated for 70 bits. This is a problem in itself if the Least Significant Bit in a long is in the byte with the largest address, which happens on Big Endian machines. This means genalloc is not allocating the byte in which it will try to set or check for a bit. This may end up in memory corruption, where genalloc will try to set bits it has not allocated. In fact, genalloc may not set these bits because it may find them already set, because they were not zeroed since they were not allocated. And that's what causes a BUG when gen_pool_destroy is called and checks for any set bits. What really happens is that genalloc uses kmalloc_node with __GFP_ZERO on gen_pool_add_virt. With SLAB and SLUB, this means the whole slab will be cleared, not only the requested bytes. Since struct gen_pool_chunk has a size that is a multiple of 8, and slab sizes are multiples of 8, we get lucky and allocate and clear the right amount of bytes. However, this is not the case with SLOB or with older code that did memset after allocating instead of using __GFP_ZERO. So, a simple module like this (running 3.6.0) will cause a crash when rmmod'ed. [root@phantom-lp2 foo]# cat foo.c #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/genalloc.h> MODULE_LICENSE("GPL"); MODULE_VERSION("0.1"); static struct gen_pool *foo_pool; static __init int foo_init(void) { int ret; foo_pool = gen_pool_create(10, -1); if (!foo_pool) return -ENOMEM; ret = gen_pool_add(foo_pool, 0xa0000000, 32 << 10, -1); if (ret) { gen_pool_destroy(foo_pool); return ret; } return 0; } static __exit void foo_exit(void) { gen_pool_destroy(foo_pool); } module_init(foo_init); module_exit(foo_exit); [root@phantom-lp2 foo]# zcat /proc/config.gz | grep SLOB CONFIG_SLOB=y [root@phantom-lp2 foo]# insmod ./foo.ko [root@phantom-lp2 foo]# rmmod foo ------------[ cut here ]------------ kernel BUG at lib/genalloc.c:243! cpu 0x4: Vector: 700 (Program Check) at [c0000000bb0e7960] pc: c0000000003cb50c: .gen_pool_destroy+0xac/0x110 lr: c0000000003cb4fc: .gen_pool_destroy+0x9c/0x110 sp: c0000000bb0e7be0 msr: 8000000000029032 current = 0xc0000000bb0e0000 paca = 0xc000000006d30e00 softe: 0 irq_happened: 0x01 pid = 13044, comm = rmmod kernel BUG at lib/genalloc.c:243!
[c0000000bb0e7ca0] d000000004b00020 .foo_exit+0x20/0x38 [foo] [c0000000bb0e7d20] c0000000000dff98 .SyS_delete_module+0x1a8/0x290 [c0000000bb0e7e30] c0000000000097d4 syscall_exit+0x0/0x94 --- Exception: c00 (System Call) at 000000800753d1a0 SP (fffd0b0e640) is in userspace Signed-off-by: Thadeu Lima de Souza Cascardo Cc: Paul Gortmaker Cc: Benjamin Gaignard Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/genalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/genalloc.c b/lib/genalloc.c index ca208a92628c..54920433705a 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -178,7 +178,7 @@ int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phy struct gen_pool_chunk *chunk; int nbits = size >> pool->min_alloc_order; int nbytes = sizeof(struct gen_pool_chunk) + - (nbits + BITS_PER_BYTE - 1) / BITS_PER_BYTE; + BITS_TO_LONGS(nbits) * sizeof(long); chunk = kmalloc_node(nbytes, GFP_KERNEL | __GFP_ZERO, nid); if (unlikely(chunk == NULL)) -- cgit v1.2.3
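A note on the genalloc fix above: the sizing arithmetic is easy to misread, so the following is a minimal standalone C sketch, written for this log rather than taken from the kernel tree; BITS_PER_BYTE and BITS_TO_LONGS are re-derived locally to mirror the kernel macros. It shows why 70 bits must get 16 bytes on a machine with 8-byte longs, not the 9 bytes the old byte-rounding produced: the bitmap helpers always read and write whole longs.

#include <stdio.h>

#define BITS_PER_BYTE 8
/* Round a bit count up to whole longs, as the kernel's BITS_TO_LONGS() does. */
#define BITS_TO_LONGS(nbits) \
	(((nbits) + BITS_PER_BYTE * sizeof(long) - 1) / (BITS_PER_BYTE * sizeof(long)))

int main(void)
{
	unsigned long nbits = 70;

	/* Old sizing: round up to whole bytes -> 9 bytes for 70 bits. */
	size_t old_nbytes = (nbits + BITS_PER_BYTE - 1) / BITS_PER_BYTE;

	/* Fixed sizing: round up to whole longs -> 16 bytes with 8-byte longs,
	 * so every long the bitmap helpers touch is fully allocated (and, in
	 * the kernel case, zeroed by __GFP_ZERO). */
	size_t new_nbytes = BITS_TO_LONGS(nbits) * sizeof(long);

	printf("%lu bits -> %zu bytes (old) vs %zu bytes (fixed)\n",
	       nbits, old_nbytes, new_nbytes);
	return 0;
}

On an LP64 system this prints "70 bits -> 9 bytes (old) vs 16 bytes (fixed)"; the one-line change to gen_pool_add_virt() in the diff above is exactly this switch from byte-rounding to BITS_TO_LONGS(nbits) * sizeof(long).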