From c38162be301d59278f568e0b34be31915b6fe3bb Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 16 Jun 2016 11:36:13 +0200 Subject: video: ARM CLCD: backlight support for OF If the device is probed from device tree, we can support backlight. This is used with some systems such as the ST Microelectronics Nomadik. We have to add HAS_IOMEM to the dependencies of CLCD since the backlight class device will now be selected, and if it gets selected on an arch that does not have IOMEM, compilation will fail. Cc: Pawel Moll Cc: Rob Herring Cc: Russell King Signed-off-by: Linus Walleij Signed-off-by: Tomi Valkeinen --- include/linux/amba/clcd.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/amba/clcd.h b/include/linux/amba/clcd.h index e82e3ee2c54a..e64c1ccebb76 100644 --- a/include/linux/amba/clcd.h +++ b/include/linux/amba/clcd.h @@ -93,6 +93,8 @@ enum { CLCD_CAP_ALL = CLCD_CAP_BGR | CLCD_CAP_RGB, }; +struct backlight_device; + struct clcd_panel { struct fb_videomode mode; signed short width; /* width in mm */ @@ -105,6 +107,7 @@ struct clcd_panel { fixedtimings:1, grayscale:1; unsigned int connector; + struct backlight_device *backlight; }; struct clcd_regs { -- cgit v1.2.3 From 03d14c36af98dd2191c2e35b5ed55ff93b59d345 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 16 Jun 2016 11:36:15 +0200 Subject: video: ARM CLCD: support pads connected in reverse order There are CLCDs connected with the pads in BGR rather than RGB order. It really doesn't matter since the CLCD has a flag and a bit to switch the position of the RGB and BGR components. This is needed to put something logical into the arm,pl11x,tft-r0g0b0-pads property of the device tree on the Nomadik which will then be <16 8 0>. Cc: Pawel Moll Cc: Rob Herring Cc: Russell King Signed-off-by: Linus Walleij Signed-off-by: Tomi Valkeinen --- drivers/video/fbdev/amba-clcd.c | 8 ++++++++ include/linux/amba/clcd.h | 18 +++++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/video/fbdev/amba-clcd.c b/drivers/video/fbdev/amba-clcd.c index 5bd20e8800bc..080e8a246faf 100644 --- a/drivers/video/fbdev/amba-clcd.c +++ b/drivers/video/fbdev/amba-clcd.c @@ -675,6 +675,7 @@ static int clcdfb_of_init_tft_panel(struct clcd_fb *fb, u32 r0, u32 g0, u32 b0) } panels[] = { { 0x110, 1, 7, 13, CLCD_CAP_5551 }, { 0x110, 0, 8, 16, CLCD_CAP_888 }, + { 0x110, 16, 8, 0, CLCD_CAP_888 }, { 0x111, 4, 14, 20, CLCD_CAP_444 }, { 0x111, 3, 11, 19, CLCD_CAP_444 | CLCD_CAP_5551 }, { 0x111, 3, 10, 19, CLCD_CAP_444 | CLCD_CAP_5551 | @@ -702,6 +703,13 @@ static int clcdfb_of_init_tft_panel(struct clcd_fb *fb, u32 r0, u32 g0, u32 b0) fb->panel->caps = panels[i].caps; } + /* + * If we actually physically connected the R lines to B and + * vice versa + */ + if (r0 != 0 && b0 == 0) + fb->panel->bgr_connection = true; + return fb->panel->caps ? 0 : -EINVAL; } diff --git a/include/linux/amba/clcd.h b/include/linux/amba/clcd.h index e64c1ccebb76..8b64ec0d574b 100644 --- a/include/linux/amba/clcd.h +++ b/include/linux/amba/clcd.h @@ -108,6 +108,12 @@ struct clcd_panel { grayscale:1; unsigned int connector; struct backlight_device *backlight; + /* + * If the B/R lines are switched between the CLCD + * and the panel we need to know this and not try to + * compensate with the BGR bit in the control register. + */ + bool bgr_connection; }; struct clcd_regs { @@ -234,16 +240,22 @@ static inline void clcdfb_decode(struct clcd_fb *fb, struct clcd_regs *regs) if (var->grayscale) val |= CNTL_LCDBW; - if (fb->panel->caps && fb->board->caps && - var->bits_per_pixel >= 16) { + if (fb->panel->caps && fb->board->caps && var->bits_per_pixel >= 16) { /* * if board and panel supply capabilities, we can support - * changing BGR/RGB depending on supplied parameters + * changing BGR/RGB depending on supplied parameters. Here + * we switch to what the framebuffer is providing if need + * be, so if the framebuffer is BGR but the display connection + * is RGB (first case) we switch it around. Vice versa mutatis + * mutandis if the framebuffer is RGB but the display connection + * is BGR, we flip it around. */ if (var->red.offset == 0) val &= ~CNTL_BGR; else val |= CNTL_BGR; + if (fb->panel->bgr_connection) + val ^= CNTL_BGR; } switch (var->bits_per_pixel) { -- cgit v1.2.3 From 046ad6cdeb3f83abcbfa2af88ce471afb2e7fc30 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 16 Jun 2016 11:36:16 +0200 Subject: video: ARM CLCD: support Nomadik variant The Nomadik variant has a few special quirks that need to be respected to make the driver work: - The block need to be clocked during writing of the TIMn registers or the bus will stall. - Special bits in the control register select how many of the output display lines get activated. - Special bits in the control register select how to manage the different 565 and 5551 modes. - There is a packed 24bit graphics mode, i.e 888 pixels can be stored in memory is three consecutive bytes, not evenly aligned to a 32bit word. This patch uses the vendor data pointer from the AMBA matching mechanism to track the quirks for this variant, and adds two hooks that variants can use to initialize boards and panels during start-up. These will later be used to adopt a Nomadik board profile. Cc: Pawel Moll Cc: Rob Herring Cc: Russell King Signed-off-by: Linus Walleij Signed-off-by: Tomi Valkeinen --- drivers/video/fbdev/amba-clcd.c | 106 ++++++++++++++++++++++++++++++++++++---- include/linux/amba/clcd.h | 42 ++++++++++++++++ 2 files changed, 139 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/video/fbdev/amba-clcd.c b/drivers/video/fbdev/amba-clcd.c index 080e8a246faf..371f4e2aea6c 100644 --- a/drivers/video/fbdev/amba-clcd.c +++ b/drivers/video/fbdev/amba-clcd.c @@ -225,6 +225,15 @@ clcdfb_set_bitfields(struct clcd_fb *fb, struct fb_var_screeninfo *var) var->blue.length = 4; } break; + case 24: + if (fb->vendor->packed_24_bit_pixels) { + var->red.length = 8; + var->green.length = 8; + var->blue.length = 8; + } else { + ret = -EINVAL; + } + break; case 32: /* If we can't do 888, reject */ caps &= CLCD_CAP_888; @@ -311,6 +320,12 @@ static int clcdfb_set_par(struct fb_info *info) clcdfb_disable(fb); + /* Some variants must be clocked here */ + if (fb->vendor->clock_timregs && !fb->clk_enabled) { + fb->clk_enabled = true; + clk_enable(fb->clk); + } + writel(regs.tim0, fb->regs + CLCD_TIM0); writel(regs.tim1, fb->regs + CLCD_TIM1); writel(regs.tim2, fb->regs + CLCD_TIM2); @@ -710,6 +725,42 @@ static int clcdfb_of_init_tft_panel(struct clcd_fb *fb, u32 r0, u32 g0, u32 b0) if (r0 != 0 && b0 == 0) fb->panel->bgr_connection = true; + if (fb->panel->caps && fb->vendor->st_bitmux_control) { + /* + * Set up the special bits for the Nomadik control register + * (other platforms tend to do this through an external + * register). + */ + + /* Offset of the highest used color */ + int maxoff = max3(r0, g0, b0); + /* Most significant bit out, highest used bit */ + int msb = 0; + + if (fb->panel->caps & CLCD_CAP_888) { + msb = maxoff + 8 - 1; + } else if (fb->panel->caps & CLCD_CAP_565) { + msb = maxoff + 5 - 1; + fb->panel->cntl |= CNTL_ST_1XBPP_565; + } else if (fb->panel->caps & CLCD_CAP_5551) { + msb = maxoff + 5 - 1; + fb->panel->cntl |= CNTL_ST_1XBPP_5551; + } else if (fb->panel->caps & CLCD_CAP_444) { + msb = maxoff + 4 - 1; + fb->panel->cntl |= CNTL_ST_1XBPP_444; + } + + /* Send out as many bits as we need */ + if (msb > 17) + fb->panel->cntl |= CNTL_ST_CDWID_24; + else if (msb > 15) + fb->panel->cntl |= CNTL_ST_CDWID_18; + else if (msb > 11) + fb->panel->cntl |= CNTL_ST_CDWID_16; + else + fb->panel->cntl |= CNTL_ST_CDWID_12; + } + return fb->panel->caps ? 0 : -EINVAL; } @@ -725,9 +776,21 @@ static int clcdfb_of_init_display(struct clcd_fb *fb) if (!fb->panel) return -ENOMEM; - endpoint = of_graph_get_next_endpoint(fb->dev->dev.of_node, NULL); - if (!endpoint) - return -ENODEV; + /* + * Fetch the panel endpoint. + */ + if (!endpoint) { + endpoint = of_graph_get_next_endpoint(fb->dev->dev.of_node, + NULL); + if (!endpoint) + return -ENODEV; + } + + if (fb->vendor->init_panel) { + err = fb->vendor->init_panel(fb, endpoint); + if (err) + return err; + } err = clcdfb_of_get_backlight(endpoint, fb->panel); if (err) @@ -764,11 +827,11 @@ static int clcdfb_of_init_display(struct clcd_fb *fb) if (of_property_read_u32_array(endpoint, "arm,pl11x,tft-r0g0b0-pads", - tft_r0b0g0, ARRAY_SIZE(tft_r0b0g0)) == 0) - return clcdfb_of_init_tft_panel(fb, tft_r0b0g0[0], - tft_r0b0g0[1], tft_r0b0g0[2]); + tft_r0b0g0, ARRAY_SIZE(tft_r0b0g0)) != 0) + return -ENOENT; - return -ENOENT; + return clcdfb_of_init_tft_panel(fb, tft_r0b0g0[0], + tft_r0b0g0[1], tft_r0b0g0[2]); } static int clcdfb_of_vram_setup(struct clcd_fb *fb) @@ -889,6 +952,7 @@ static struct clcd_board *clcdfb_of_get_board(struct amba_device *dev) static int clcdfb_probe(struct amba_device *dev, const struct amba_id *id) { struct clcd_board *board = dev_get_platdata(&dev->dev); + struct clcd_vendor_data *vendor = id->data; struct clcd_fb *fb; int ret; @@ -898,6 +962,12 @@ static int clcdfb_probe(struct amba_device *dev, const struct amba_id *id) if (!board) return -EINVAL; + if (vendor->init_board) { + ret = vendor->init_board(dev, board); + if (ret) + return ret; + } + ret = dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32)); if (ret) goto out; @@ -916,10 +986,11 @@ static int clcdfb_probe(struct amba_device *dev, const struct amba_id *id) } fb->dev = dev; + fb->vendor = vendor; fb->board = board; - dev_info(&fb->dev->dev, "PL%03x rev%u at 0x%08llx\n", - amba_part(dev), amba_rev(dev), + dev_info(&fb->dev->dev, "PL%03x designer %02x rev%u at 0x%08llx\n", + amba_part(dev), amba_manf(dev), amba_rev(dev), (unsigned long long)dev->res.start); ret = fb->board->setup(fb); @@ -962,10 +1033,27 @@ static int clcdfb_remove(struct amba_device *dev) return 0; } +static struct clcd_vendor_data vendor_arm = { + /* No special business */ +}; + +static struct clcd_vendor_data vendor_nomadik = { + .clock_timregs = true, + .packed_24_bit_pixels = true, + .st_bitmux_control = true, +}; + static struct amba_id clcdfb_id_table[] = { { .id = 0x00041110, .mask = 0x000ffffe, + .data = &vendor_arm, + }, + /* ST Electronics Nomadik variant */ + { + .id = 0x00180110, + .mask = 0x00fffffe, + .data = &vendor_nomadik, }, { 0, 0 }, }; diff --git a/include/linux/amba/clcd.h b/include/linux/amba/clcd.h index 8b64ec0d574b..1035879b322c 100644 --- a/include/linux/amba/clcd.h +++ b/include/linux/amba/clcd.h @@ -67,6 +67,17 @@ #define CNTL_LDMAFIFOTIME (1 << 15) #define CNTL_WATERMARK (1 << 16) +/* ST Microelectronics variant bits */ +#define CNTL_ST_1XBPP_444 0x0 +#define CNTL_ST_1XBPP_5551 (1 << 17) +#define CNTL_ST_1XBPP_565 (1 << 18) +#define CNTL_ST_CDWID_12 0x0 +#define CNTL_ST_CDWID_16 (1 << 19) +#define CNTL_ST_CDWID_18 (1 << 20) +#define CNTL_ST_CDWID_24 ((1 << 19)|(1 << 20)) +#define CNTL_ST_CEAEN (1 << 21) +#define CNTL_ST_LCDBPP24_PACKED (6 << 1) + enum { /* individual formats */ CLCD_CAP_RGB444 = (1 << 0), @@ -179,11 +190,38 @@ struct clcd_board { struct amba_device; struct clk; +/** + * struct clcd_vendor_data - holds hardware (IP-block) vendor-specific + * variant information + * + * @clock_timregs: the CLCD needs to be clocked when accessing the + * timer registers, or the hardware will hang. + * @packed_24_bit_pixels: this variant supports 24bit packed pixel data, + * so that RGB accesses 3 bytes at a time, not just on even 32bit + * boundaries, packing the pixel data in memory. ST Microelectronics + * have this. + * @st_bitmux_control: ST Microelectronics have implemented output + * bit line multiplexing into the CLCD control register. This indicates + * that we need to use this. + * @init_board: custom board init function for this variant + * @init_panel: custom panel init function for this variant + */ +struct clcd_vendor_data { + bool clock_timregs; + bool packed_24_bit_pixels; + bool st_bitmux_control; + int (*init_board)(struct amba_device *adev, + struct clcd_board *board); + int (*init_panel)(struct clcd_fb *fb, + struct device_node *panel); +}; + /* this data structure describes each frame buffer device we find */ struct clcd_fb { struct fb_info fb; struct amba_device *dev; struct clk *clk; + struct clcd_vendor_data *vendor; struct clcd_panel *panel; struct clcd_board *board; void *board_data; @@ -285,6 +323,10 @@ static inline void clcdfb_decode(struct clcd_fb *fb, struct clcd_regs *regs) else val |= CNTL_LCDBPP16_444; break; + case 24: + /* Modified variant supporting 24 bit packed pixels */ + val |= CNTL_ST_LCDBPP24_PACKED; + break; case 32: val |= CNTL_LCDBPP24; break; -- cgit v1.2.3 From 0733424c9ba9f42242409d1ece780777272f7ea1 Mon Sep 17 00:00:00 2001 From: David Hsu Date: Tue, 9 Aug 2016 14:57:46 -0700 Subject: pwm: Unexport children before chip removal Exported pwm channels aren't removed before the pwmchip and are leaked. This results in invalid sysfs files. This fix removes all exported pwm channels before chip removal. Signed-off-by: David Hsu Fixes: 76abbdde2d95 ("pwm: Add sysfs interface") Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 2 ++ drivers/pwm/sysfs.c | 18 ++++++++++++++++++ include/linux/pwm.h | 5 +++++ 3 files changed, 25 insertions(+) (limited to 'include') diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index 0dbd29e287db..172ef8245811 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -339,6 +339,8 @@ int pwmchip_remove(struct pwm_chip *chip) unsigned int i; int ret = 0; + pwmchip_sysfs_unexport_children(chip); + mutex_lock(&pwm_lock); for (i = 0; i < chip->npwm; i++) { diff --git a/drivers/pwm/sysfs.c b/drivers/pwm/sysfs.c index 18ed725594c3..0296d8178ae2 100644 --- a/drivers/pwm/sysfs.c +++ b/drivers/pwm/sysfs.c @@ -409,6 +409,24 @@ void pwmchip_sysfs_unexport(struct pwm_chip *chip) } } +void pwmchip_sysfs_unexport_children(struct pwm_chip *chip) +{ + struct device *parent; + unsigned int i; + + parent = class_find_device(&pwm_class, NULL, chip, + pwmchip_sysfs_match); + if (!parent) + return; + + for (i = 0; i < chip->npwm; i++) { + struct pwm_device *pwm = &chip->pwms[i]; + + if (test_bit(PWMF_EXPORTED, &pwm->flags)) + pwm_unexport_child(parent, pwm); + } +} + static int __init pwm_sysfs_init(void) { return class_register(&pwm_class); diff --git a/include/linux/pwm.h b/include/linux/pwm.h index f1bbae014889..2c6c5114c089 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -641,6 +641,7 @@ static inline void pwm_remove_table(struct pwm_lookup *table, size_t num) #ifdef CONFIG_PWM_SYSFS void pwmchip_sysfs_export(struct pwm_chip *chip); void pwmchip_sysfs_unexport(struct pwm_chip *chip); +void pwmchip_sysfs_unexport_children(struct pwm_chip *chip); #else static inline void pwmchip_sysfs_export(struct pwm_chip *chip) { @@ -649,6 +650,10 @@ static inline void pwmchip_sysfs_export(struct pwm_chip *chip) static inline void pwmchip_sysfs_unexport(struct pwm_chip *chip) { } + +static inline void pwmchip_sysfs_unexport_children(struct pwm_chip *chip) +{ +} #endif /* CONFIG_PWM_SYSFS */ #endif /* __LINUX_PWM_H */ -- cgit v1.2.3 From 3132e49ecef9dab43d858d8e7066662c6a1efb16 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Aug 2016 15:58:24 -0400 Subject: pnfs: track multiple layout types in fsinfo structure Current NFSv4.1/pNFS client assumes that MDS supports only one layout type. While it's true for most existing servers, nevertheless, this can be change in the near future. For now, this patch just plumbs in the ability to track a list of layouts in the fsinfo structure. The existing behavior of the client is preserved, by having it just select the first entry in the list. Signed-off-by: Tigran Mkrtchyan Signed-off-by: Jeff Layton Reviewed-by: J. Bruce Fields Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 2 +- fs/nfs/nfs4xdr.c | 23 ++++++++++------------- fs/nfs/pnfs.c | 27 ++++++++++++++++----------- fs/nfs/pnfs.h | 4 ++-- include/linux/nfs_xdr.h | 7 ++++++- 5 files changed, 35 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 1e106780a237..0c1b3a002dc2 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -785,7 +785,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs } fsinfo.fattr = fattr; - fsinfo.layouttype = 0; + memset(fsinfo.layouttype, 0, sizeof(fsinfo.layouttype)); error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); if (error < 0) goto out_error; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 7bd3a5c09d31..41a02f994976 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4725,14 +4725,13 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, } /* - * Decode potentially multiple layout types. Currently we only support - * one layout driver per file system. + * Decode potentially multiple layout types. */ -static int decode_first_pnfs_layout_type(struct xdr_stream *xdr, +static int decode_pnfs_layout_types(struct xdr_stream *xdr, uint32_t *layouttype) { __be32 *p; - int num; + uint32_t num, i; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) @@ -4741,18 +4740,17 @@ static int decode_first_pnfs_layout_type(struct xdr_stream *xdr, /* pNFS is not supported by the underlying file system */ if (num == 0) { - *layouttype = 0; return 0; } - if (num > 1) - printk(KERN_INFO "NFS: %s: Warning: Multiple pNFS layout " - "drivers per filesystem not supported\n", __func__); + if (num > NFS_MAX_LAYOUT_TYPES) + printk(KERN_INFO "NFS: %s: Warning: Too many (%d) pNFS layout types\n", __func__, num); /* Decode and set first layout type, move xdr->p past unused types */ p = xdr_inline_decode(xdr, num * 4); if (unlikely(!p)) goto out_overflow; - *layouttype = be32_to_cpup(p); + for(i = 0; i < num && i < NFS_MAX_LAYOUT_TYPES; i++) + layouttype[i] = be32_to_cpup(p++); return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -4772,10 +4770,9 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U))) return -EIO; if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) { - status = decode_first_pnfs_layout_type(xdr, layouttype); + status = decode_pnfs_layout_types(xdr, layouttype); bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES; - } else - *layouttype = 0; + } return status; } @@ -4856,7 +4853,7 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta); if (status != 0) goto xdr_error; - status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); + status = decode_attr_pnfstype(xdr, bitmap, fsinfo->layouttype); if (status != 0) goto xdr_error; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 2c93a85eda51..a6a683fbd230 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -102,32 +102,37 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss) * Try to set the server's pnfs module to the pnfs layout type specified by id. * Currently only one pNFS layout driver per filesystem is supported. * - * @id layout type. Zero (illegal layout type) indicates pNFS not in use. + * @ids array of layout types supported by MDS. */ void set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, - u32 id) + u32 *ids) { struct pnfs_layoutdriver_type *ld_type = NULL; + u32 id; - if (id == 0) - goto out_no_driver; if (!(server->nfs_client->cl_exchange_flags & (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) { - printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n", - __func__, id, server->nfs_client->cl_exchange_flags); + printk(KERN_ERR "NFS: %s: cl_exchange_flags 0x%x\n", + __func__, server->nfs_client->cl_exchange_flags); goto out_no_driver; } + + id = ids[0]; + if (!id) + goto out_no_driver; + ld_type = find_pnfs_driver(id); if (!ld_type) { request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id); ld_type = find_pnfs_driver(id); - if (!ld_type) { - dprintk("%s: No pNFS module found for %u.\n", - __func__, id); - goto out_no_driver; - } } + + if (!ld_type) { + dprintk("%s: No pNFS module found for %u.\n", __func__, id); + goto out_no_driver; + } + server->pnfs_curr_ld = ld_type; if (ld_type->set_layoutdriver && ld_type->set_layoutdriver(server, mntfh)) { diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 31d99b2927b0..be515e6a3823 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -236,7 +236,7 @@ void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_put_lseg(struct pnfs_layout_segment *lseg); void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg); -void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32); +void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32 *); void unset_pnfs_layoutdriver(struct nfs_server *); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); @@ -657,7 +657,7 @@ pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task) } static inline void set_pnfs_layoutdriver(struct nfs_server *s, - const struct nfs_fh *mntfh, u32 id) + const struct nfs_fh *mntfh, u32 *ids) { } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 7cc0deee5bde..f11b26ed001b 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -124,6 +124,11 @@ struct nfs_fattr { | NFS_ATTR_FATTR_SPACE_USED \ | NFS_ATTR_FATTR_V4_SECURITY_LABEL) +/* + * Maximal number of supported layout drivers. + */ +#define NFS_MAX_LAYOUT_TYPES 8 + /* * Info on the file system */ @@ -139,7 +144,7 @@ struct nfs_fsinfo { __u64 maxfilesize; struct timespec time_delta; /* server time granularity */ __u32 lease_time; /* in seconds */ - __u32 layouttype; /* supported pnfs layout driver */ + __u32 layouttype[NFS_MAX_LAYOUT_TYPES]; /* supported pnfs layout driver */ __u32 blksize; /* preferred pnfs io block size */ __u32 clone_blksize; /* granularity of a CLONE operation */ }; -- cgit v1.2.3 From 3b58a8a9049d5e191402665c339690a148504358 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 9 Sep 2016 09:22:23 -0400 Subject: SUNRPC rpc_clnt_xprt_switch_put Give the NFS layer access to the xprt_switch_put function Signed-off-by: Andy Adamson Signed-off-by: Anna Schumaker --- include/linux/sunrpc/clnt.h | 2 ++ net/sunrpc/clnt.c | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 5c02b0691587..c12f86b752cb 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -199,5 +199,7 @@ void rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo); const char *rpc_proc_name(const struct rpc_task *task); + +void rpc_clnt_xprt_switch_put(struct rpc_clnt *); #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_CLNT_H */ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 4bb526f57a09..0ff5cbf3d631 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2695,6 +2695,12 @@ rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo) } EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout); +void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt) +{ + xprt_switch_put(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); +} +EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_put); + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static void rpc_show_header(void) { -- cgit v1.2.3 From dd69171769cf4649a7ff3755e91cbd242a833727 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 9 Sep 2016 09:22:24 -0400 Subject: SUNRPC rpc_clnt_xprt_switch_add_xprt Give the NFS layer access to the rpc_xprt_switch_add_xprt function Signed-off-by: Andy Adamson Signed-off-by: Anna Schumaker --- include/linux/sunrpc/clnt.h | 1 + net/sunrpc/clnt.c | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index c12f86b752cb..b069d6e2c3d6 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -201,5 +201,6 @@ void rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, const char *rpc_proc_name(const struct rpc_task *task); void rpc_clnt_xprt_switch_put(struct rpc_clnt *); +void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *); #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_CLNT_H */ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 0ff5cbf3d631..43ec46547dc9 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2701,6 +2701,13 @@ void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt) } EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_put); +void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) +{ + rpc_xprt_switch_add_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch), + xprt); +} +EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_add_xprt); + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static void rpc_show_header(void) { -- cgit v1.2.3 From 39e5d2df959dd4aea81fa33d765d2a5cc67a0512 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 9 Sep 2016 09:22:25 -0400 Subject: SUNRPC search xprt switch for sockaddr Signed-off-by: Andy Adamson Signed-off-by: Anna Schumaker --- include/linux/sunrpc/clnt.h | 2 ++ include/linux/sunrpc/xprtmultipath.h | 2 ++ net/sunrpc/clnt.c | 15 +++++++++++++++ net/sunrpc/xprtmultipath.c | 24 +++++++++++++++++++++++- 4 files changed, 42 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index b069d6e2c3d6..35cc539e2921 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -202,5 +202,7 @@ const char *rpc_proc_name(const struct rpc_task *task); void rpc_clnt_xprt_switch_put(struct rpc_clnt *); void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *); +bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, + const struct sockaddr *sap); #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_CLNT_H */ diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h index 5a9acffa41be..507418c1c69e 100644 --- a/include/linux/sunrpc/xprtmultipath.h +++ b/include/linux/sunrpc/xprtmultipath.h @@ -66,4 +66,6 @@ extern struct rpc_xprt *xprt_iter_xprt(struct rpc_xprt_iter *xpi); extern struct rpc_xprt *xprt_iter_get_xprt(struct rpc_xprt_iter *xpi); extern struct rpc_xprt *xprt_iter_get_next(struct rpc_xprt_iter *xpi); +extern bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, + const struct sockaddr *sap); #endif diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 43ec46547dc9..8d68efd2026f 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2708,6 +2708,21 @@ void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) } EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_add_xprt); +bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, + const struct sockaddr *sap) +{ + struct rpc_xprt_switch *xps; + bool ret; + + xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); + + rcu_read_lock(); + ret = rpc_xprt_switch_has_addr(xps, sap); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_has_addr); + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static void rpc_show_header(void) { diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index 66c9d63f4797..ae92a9e9ba52 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -15,6 +15,7 @@ #include #include #include +#include #include typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct list_head *head, @@ -49,7 +50,8 @@ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps, if (xprt == NULL) return; spin_lock(&xps->xps_lock); - if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) + if ((xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) && + !rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr)) xprt_switch_add_xprt_locked(xps, xprt); spin_unlock(&xps->xps_lock); } @@ -232,6 +234,26 @@ struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi) return xprt_switch_find_current_entry(head, xpi->xpi_cursor); } +bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, + const struct sockaddr *sap) +{ + struct list_head *head; + struct rpc_xprt *pos; + + if (xps == NULL || sap == NULL) + return false; + + head = &xps->xps_xprt_list; + list_for_each_entry_rcu(pos, head, xprt_switch) { + if (rpc_cmp_addr_port(sap, (struct sockaddr *)&pos->addr)) { + pr_info("RPC: addr %s already in xprt switch\n", + pos->address_strings[RPC_DISPLAY_ADDR]); + return true; + } + } + return false; +} + static struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head, const struct rpc_xprt *cur) -- cgit v1.2.3 From fda0ab41170ee0a1c7a3781ff8cfb4395c3dd784 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 9 Sep 2016 09:22:26 -0400 Subject: SUNRPC: rpc_clnt_add_xprt setup function for NFS layer Use a setup function to call into the NFS layer to test an rpc_xprt for session trunking so as to not leak the rpc_xprt_switch into the nfs layer. Search for the address in the rpc_xprt_switch first so as not to put an unnecessary EXCHANGE_ID on the wire. Signed-off-by: Andy Adamson Signed-off-by: Anna Schumaker --- include/linux/sunrpc/clnt.h | 12 +++++++++ net/sunrpc/clnt.c | 64 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+) (limited to 'include') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 35cc539e2921..85cc819676e8 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -125,6 +125,13 @@ struct rpc_create_args { struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ }; +struct rpc_add_xprt_test { + int (*add_xprt_test)(struct rpc_clnt *, + struct rpc_xprt *, + void *calldata); + void *data; +}; + /* Values for "flags" field */ #define RPC_CLNT_CREATE_HARDRTRY (1UL << 0) #define RPC_CLNT_CREATE_AUTOBIND (1UL << 2) @@ -198,6 +205,11 @@ int rpc_clnt_add_xprt(struct rpc_clnt *, struct xprt_create *, void rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo); +int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *, + struct rpc_xprt_switch *, + struct rpc_xprt *, + void *); + const char *rpc_proc_name(const struct rpc_task *task); void rpc_clnt_xprt_switch_put(struct rpc_clnt *); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 8d68efd2026f..b614cb1356b9 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2613,6 +2613,70 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, } EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt); +/** + * rpc_clnt_setup_test_and_add_xprt() + * + * This is an rpc_clnt_add_xprt setup() function which returns 1 so: + * 1) caller of the test function must dereference the rpc_xprt_switch + * and the rpc_xprt. + * 2) test function must call rpc_xprt_switch_add_xprt, usually in + * the rpc_call_done routine. + * + * Upon success (return of 1), the test function adds the new + * transport to the rpc_clnt xprt switch + * + * @clnt: struct rpc_clnt to get the new transport + * @xps: the rpc_xprt_switch to hold the new transport + * @xprt: the rpc_xprt to test + * @data: a struct rpc_add_xprt_test pointer that holds the test function + * and test function call data + */ +int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt, + struct rpc_xprt_switch *xps, + struct rpc_xprt *xprt, + void *data) +{ + struct rpc_cred *cred; + struct rpc_task *task; + struct rpc_add_xprt_test *xtest = (struct rpc_add_xprt_test *)data; + int status = -EADDRINUSE; + + xprt = xprt_get(xprt); + xprt_switch_get(xps); + + if (rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr)) + goto out_err; + + /* Test the connection */ + cred = authnull_ops.lookup_cred(NULL, NULL, 0); + task = rpc_call_null_helper(clnt, xprt, cred, + RPC_TASK_SOFT | RPC_TASK_SOFTCONN, + NULL, NULL); + put_rpccred(cred); + if (IS_ERR(task)) { + status = PTR_ERR(task); + goto out_err; + } + status = task->tk_status; + rpc_put_task(task); + + if (status < 0) + goto out_err; + + /* rpc_xprt_switch and rpc_xprt are deferrenced by add_xprt_test() */ + xtest->add_xprt_test(clnt, xprt, xtest->data); + + /* so that rpc_clnt_add_xprt does not call rpc_xprt_switch_add_xprt */ + return 1; +out_err: + xprt_put(xprt); + xprt_switch_put(xps); + pr_info("RPC: rpc_clnt_test_xprt failed: %d addr %s not added\n", + status, xprt->address_strings[RPC_DISPLAY_ADDR]); + return status; +} +EXPORT_SYMBOL_GPL(rpc_clnt_setup_test_and_add_xprt); + /** * rpc_clnt_add_xprt - Add a new transport to a rpc_clnt * @clnt: pointer to struct rpc_clnt -- cgit v1.2.3 From b9c5bc03be6aae41990efd09f83cf70a89ac9f4b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Sep 2016 10:55:12 -0400 Subject: SUNRPC: Refactor rpc_xdr_buf_init() Clean up: there is some XDR initialization logic that is common to the forward channel and backchannel. Move it to an XDR header so it can be shared. rpc_rqst::rq_buffer points to a buffer containing big-endian data. Update its annotation as part of the clean up. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xdr.h | 12 ++++++++++++ include/linux/sunrpc/xprt.h | 2 +- net/sunrpc/backchannel_rqst.c | 8 +------- net/sunrpc/clnt.c | 24 ++++++------------------ net/sunrpc/xprtrdma/backchannel.c | 12 +----------- 5 files changed, 21 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 70c6b92e15a7..56c48c884a24 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -67,6 +67,18 @@ struct xdr_buf { len; /* Length of XDR encoded message */ }; +static inline void +xdr_buf_init(struct xdr_buf *buf, void *start, size_t len) +{ + buf->head[0].iov_base = start; + buf->head[0].iov_len = len; + buf->tail[0].iov_len = 0; + buf->page_len = 0; + buf->flags = 0; + buf->len = 0; + buf->buflen = len; +} + /* * pre-xdr'ed macros. */ diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index a16070dd03ee..6f1d41b559a3 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -83,7 +83,7 @@ struct rpc_rqst { void (*rq_release_snd_buf)(struct rpc_rqst *); /* release rq_enc_pages */ struct list_head rq_list; - __u32 * rq_buffer; /* XDR encode buffer */ + void *rq_buffer; /* Call XDR encode buffer */ size_t rq_callsize, rq_rcvsize; size_t rq_xmit_bytes_sent; /* total bytes sent */ diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 229956bf8457..ac701c28f44f 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -76,13 +76,7 @@ static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags) page = alloc_page(gfp_flags); if (page == NULL) return -ENOMEM; - buf->head[0].iov_base = page_address(page); - buf->head[0].iov_len = PAGE_SIZE; - buf->tail[0].iov_base = NULL; - buf->tail[0].iov_len = 0; - buf->page_len = 0; - buf->len = 0; - buf->buflen = PAGE_SIZE; + xdr_buf_init(buf, page_address(page), PAGE_SIZE); return 0; } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index b614cb1356b9..6481986be7a7 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1746,18 +1746,6 @@ rpc_task_force_reencode(struct rpc_task *task) task->tk_rqstp->rq_bytes_sent = 0; } -static inline void -rpc_xdr_buf_init(struct xdr_buf *buf, void *start, size_t len) -{ - buf->head[0].iov_base = start; - buf->head[0].iov_len = len; - buf->tail[0].iov_len = 0; - buf->page_len = 0; - buf->flags = 0; - buf->len = 0; - buf->buflen = len; -} - /* * 3. Encode arguments of an RPC call */ @@ -1770,12 +1758,12 @@ rpc_xdr_encode(struct rpc_task *task) dprint_status(task); - rpc_xdr_buf_init(&req->rq_snd_buf, - req->rq_buffer, - req->rq_callsize); - rpc_xdr_buf_init(&req->rq_rcv_buf, - (char *)req->rq_buffer + req->rq_callsize, - req->rq_rcvsize); + xdr_buf_init(&req->rq_snd_buf, + req->rq_buffer, + req->rq_callsize); + xdr_buf_init(&req->rq_rcv_buf, + (char *)req->rq_buffer + req->rq_callsize, + req->rq_rcvsize); p = rpc_encode_header(task); if (p == NULL) { diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 5f60ab2f858a..d3cfaf281e55 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -38,7 +38,6 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_regbuf *rb; struct rpcrdma_req *req; - struct xdr_buf *buf; size_t size; req = rpcrdma_create_req(r_xprt); @@ -60,16 +59,7 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, req->rl_sendbuf = rb; /* so that rpcr_to_rdmar works when receiving a request */ rqst->rq_buffer = (void *)req->rl_sendbuf->rg_base; - - buf = &rqst->rq_snd_buf; - buf->head[0].iov_base = rqst->rq_buffer; - buf->head[0].iov_len = 0; - buf->tail[0].iov_base = NULL; - buf->tail[0].iov_len = 0; - buf->page_len = 0; - buf->len = 0; - buf->buflen = size; - + xdr_buf_init(&rqst->rq_snd_buf, rqst->rq_buffer, size); return 0; out_fail: -- cgit v1.2.3 From 5fe6eaa1f9a00b9a5927e3b791ecad2f3eaab130 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Sep 2016 10:55:20 -0400 Subject: SUNRPC: Generalize the RPC buffer allocation API xprtrdma needs to allocate the Call and Reply buffers separately. TBH, the reliance on using a single buffer for the pair of XDR buffers is transport implementation-specific. Transports that want to allocate separate Call and Reply buffers will ignore the "size" argument anyway. Don't bother passing it. The buf_alloc method can't return two pointers. Instead, make the method's return value an error code, and set the rq_buffer pointer in the method itself. This gives call_allocate an opportunity to terminate an RPC instead of looping forever when a permanent problem occurs. If a request is just bogus, or the transport is in a state where it can't allocate resources for any request, there needs to be a way to kill the RPC right there and not loop. This immediately fixes a rare problem in the backchannel send path, which loops if the server happens to send a CB request whose call+reply size is larger than a page (which it shouldn't do yet). One more issue: looks like xprt_inject_disconnect was incorrectly placed in the failure path in call_allocate. It needs to be in the success path, as it is for other call-sites. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/sched.h | 2 +- include/linux/sunrpc/xprt.h | 2 +- net/sunrpc/clnt.c | 12 ++++++++---- net/sunrpc/sched.c | 24 +++++++++++++++--------- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 17 +++++++++-------- net/sunrpc/xprtrdma/transport.c | 26 ++++++++++++++++++-------- net/sunrpc/xprtsock.c | 17 +++++++++++------ 7 files changed, 63 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 817af0b4385e..38d4c1b378f2 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -239,7 +239,7 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *, void *); void rpc_wake_up_status(struct rpc_wait_queue *, int); void rpc_delay(struct rpc_task *, unsigned long); -void * rpc_malloc(struct rpc_task *, size_t); +int rpc_malloc(struct rpc_task *); void rpc_free(void *); int rpciod_up(void); void rpciod_down(void); diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 6f1d41b559a3..c01f468fb374 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -127,7 +127,7 @@ struct rpc_xprt_ops { void (*rpcbind)(struct rpc_task *task); void (*set_port)(struct rpc_xprt *xprt, unsigned short port); void (*connect)(struct rpc_xprt *xprt, struct rpc_task *task); - void * (*buf_alloc)(struct rpc_task *task, size_t size); + int (*buf_alloc)(struct rpc_task *task); void (*buf_free)(void *buffer); int (*send_request)(struct rpc_task *task); void (*set_retrans_timeout)(struct rpc_task *task); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 6481986be7a7..5499fda0c1f3 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1691,6 +1691,7 @@ call_allocate(struct rpc_task *task) struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct rpc_procinfo *proc = task->tk_msg.rpc_proc; + int status; dprint_status(task); @@ -1716,11 +1717,14 @@ call_allocate(struct rpc_task *task) req->rq_rcvsize = RPC_REPHDRSIZE + slack + proc->p_replen; req->rq_rcvsize <<= 2; - req->rq_buffer = xprt->ops->buf_alloc(task, - req->rq_callsize + req->rq_rcvsize); - if (req->rq_buffer != NULL) - return; + status = xprt->ops->buf_alloc(task); xprt_inject_disconnect(xprt); + if (status == 0) + return; + if (status != -ENOMEM) { + rpc_exit(task, status); + return; + } dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 9ae588511aaf..b964d40b259b 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -849,14 +849,17 @@ static void rpc_async_schedule(struct work_struct *work) } /** - * rpc_malloc - allocate an RPC buffer - * @task: RPC task that will use this buffer - * @size: requested byte size + * rpc_malloc - allocate RPC buffer resources + * @task: RPC task + * + * A single memory region is allocated, which is split between the + * RPC call and RPC reply that this task is being used for. When + * this RPC is retired, the memory is released by calling rpc_free. * * To prevent rpciod from hanging, this allocator never sleeps, - * returning NULL and suppressing warning if the request cannot be serviced - * immediately. - * The caller can arrange to sleep in a way that is safe for rpciod. + * returning -ENOMEM and suppressing warning if the request cannot + * be serviced immediately. The caller can arrange to sleep in a + * way that is safe for rpciod. * * Most requests are 'small' (under 2KiB) and can be serviced from a * mempool, ensuring that NFS reads and writes can always proceed, @@ -865,8 +868,10 @@ static void rpc_async_schedule(struct work_struct *work) * In order to avoid memory starvation triggering more writebacks of * NFS requests, we avoid using GFP_KERNEL. */ -void *rpc_malloc(struct rpc_task *task, size_t size) +int rpc_malloc(struct rpc_task *task) { + struct rpc_rqst *rqst = task->tk_rqstp; + size_t size = rqst->rq_callsize + rqst->rq_rcvsize; struct rpc_buffer *buf; gfp_t gfp = GFP_NOIO | __GFP_NOWARN; @@ -880,12 +885,13 @@ void *rpc_malloc(struct rpc_task *task, size_t size) buf = kmalloc(size, gfp); if (!buf) - return NULL; + return -ENOMEM; buf->len = size; dprintk("RPC: %5u allocated buffer of size %zu at %p\n", task->tk_pid, size, buf); - return &buf->data; + rqst->rq_buffer = buf->data; + return 0; } EXPORT_SYMBOL_GPL(rpc_malloc); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index a2a7519b0f23..124688ba67e5 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -159,29 +159,30 @@ out_unmap: /* Server-side transport endpoint wants a whole page for its send * buffer. The client RPC code constructs the RPC header in this * buffer before it invokes ->send_request. - * - * Returns NULL if there was a temporary allocation failure. */ -static void * -xprt_rdma_bc_allocate(struct rpc_task *task, size_t size) +static int +xprt_rdma_bc_allocate(struct rpc_task *task) { struct rpc_rqst *rqst = task->tk_rqstp; struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; + size_t size = rqst->rq_callsize; struct svcxprt_rdma *rdma; struct page *page; rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); - /* Prevent an infinite loop: try to make this case work */ - if (size > PAGE_SIZE) + if (size > PAGE_SIZE) { WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n", size); + return -EINVAL; + } page = alloc_page(RPCRDMA_DEF_GFP); if (!page) - return NULL; + return -ENOMEM; - return page_address(page); + rqst->rq_buffer = page_address(page); + return 0; } static void diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index be95eced0726..daa7d4d43fd8 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -477,7 +477,15 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) } } -/* +/** + * xprt_rdma_allocate - allocate transport resources for an RPC + * @task: RPC task + * + * Return values: + * 0: Success; rq_buffer points to RPC buffer to use + * ENOMEM: Out of memory, call again later + * EIO: A permanent error occurred, do not retry + * * The RDMA allocate/free functions need the task structure as a place * to hide the struct rpcrdma_req, which is necessary for the actual send/recv * sequence. @@ -486,11 +494,12 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) * (rq_send_buf and rq_rcv_buf are both part of a single contiguous buffer). * We may register rq_rcv_buf when using reply chunks. */ -static void * -xprt_rdma_allocate(struct rpc_task *task, size_t size) +static int +xprt_rdma_allocate(struct rpc_task *task) { - struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpc_rqst *rqst = task->tk_rqstp; + size_t size = rqst->rq_callsize + rqst->rq_rcvsize; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_regbuf *rb; struct rpcrdma_req *req; size_t min_size; @@ -498,7 +507,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) req = rpcrdma_buffer_get(&r_xprt->rx_buf); if (req == NULL) - return NULL; + return -ENOMEM; flags = RPCRDMA_DEF_GFP; if (RPC_IS_SWAPPER(task)) @@ -515,7 +524,8 @@ out: dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); req->rl_connect_cookie = 0; /* our reserved value */ req->rl_task = task; - return req->rl_sendbuf->rg_base; + rqst->rq_buffer = req->rl_sendbuf->rg_base; + return 0; out_rdmabuf: min_size = r_xprt->rx_data.inline_wsize; @@ -558,7 +568,7 @@ out_sendbuf: out_fail: rpcrdma_buffer_put(req); - return NULL; + return -ENOMEM; } /* diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index bf168838a029..bd30b4b18d72 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2533,23 +2533,28 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) * we allocate pages instead doing a kmalloc like rpc_malloc is because we want * to use the server side send routines. */ -static void *bc_malloc(struct rpc_task *task, size_t size) +static int bc_malloc(struct rpc_task *task) { + struct rpc_rqst *rqst = task->tk_rqstp; + size_t size = rqst->rq_callsize; struct page *page; struct rpc_buffer *buf; - WARN_ON_ONCE(size > PAGE_SIZE - sizeof(struct rpc_buffer)); - if (size > PAGE_SIZE - sizeof(struct rpc_buffer)) - return NULL; + if (size > PAGE_SIZE - sizeof(struct rpc_buffer)) { + WARN_ONCE(1, "xprtsock: large bc buffer request (size %zu)\n", + size); + return -EINVAL; + } page = alloc_page(GFP_KERNEL); if (!page) - return NULL; + return -ENOMEM; buf = page_address(page); buf->len = PAGE_SIZE; - return buf->data; + rqst->rq_buffer = buf->data; + return 0; } /* -- cgit v1.2.3 From 3435c74aed2d7b743ccbf34616c523ebee7be943 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Sep 2016 10:55:29 -0400 Subject: SUNRPC: Generalize the RPC buffer release API xprtrdma needs to allocate the Call and Reply buffers separately. TBH, the reliance on using a single buffer for the pair of XDR buffers is transport implementation-specific. Instead of passing just the rq_buffer into the buf_free method, pass the task structure and let buf_free take care of freeing both XDR buffers at once. There's a micro-optimization here. In the common case, both xprt_release and the transport's buf_free method were checking if rq_buffer was NULL. Now the check is done only once per RPC. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/sched.h | 2 +- include/linux/sunrpc/xprt.h | 2 +- net/sunrpc/sched.c | 10 ++++------ net/sunrpc/xprt.c | 2 +- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 2 +- net/sunrpc/xprtrdma/transport.c | 26 ++++++++++---------------- net/sunrpc/xprtrdma/xprt_rdma.h | 1 - net/sunrpc/xprtsock.c | 6 ++---- 8 files changed, 20 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 38d4c1b378f2..7ba040c797ec 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -240,7 +240,7 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *, void rpc_wake_up_status(struct rpc_wait_queue *, int); void rpc_delay(struct rpc_task *, unsigned long); int rpc_malloc(struct rpc_task *); -void rpc_free(void *); +void rpc_free(struct rpc_task *); int rpciod_up(void); void rpciod_down(void); int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *); diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index c01f468fb374..72c2aebc592b 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -128,7 +128,7 @@ struct rpc_xprt_ops { void (*set_port)(struct rpc_xprt *xprt, unsigned short port); void (*connect)(struct rpc_xprt *xprt, struct rpc_task *task); int (*buf_alloc)(struct rpc_task *task); - void (*buf_free)(void *buffer); + void (*buf_free)(struct rpc_task *task); int (*send_request)(struct rpc_task *task); void (*set_retrans_timeout)(struct rpc_task *task); void (*timer)(struct rpc_xprt *xprt, struct rpc_task *task); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index b964d40b259b..6690ebc774ed 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -896,18 +896,16 @@ int rpc_malloc(struct rpc_task *task) EXPORT_SYMBOL_GPL(rpc_malloc); /** - * rpc_free - free buffer allocated via rpc_malloc - * @buffer: buffer to free + * rpc_free - free RPC buffer resources allocated via rpc_malloc + * @task: RPC task * */ -void rpc_free(void *buffer) +void rpc_free(struct rpc_task *task) { + void *buffer = task->tk_rqstp->rq_buffer; size_t size; struct rpc_buffer *buf; - if (!buffer) - return; - buf = container_of(buffer, struct rpc_buffer, data); size = buf->len; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index ea244b29138b..685e6d225414 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1295,7 +1295,7 @@ void xprt_release(struct rpc_task *task) xprt_schedule_autodisconnect(xprt); spin_unlock_bh(&xprt->transport_lock); if (req->rq_buffer) - xprt->ops->buf_free(req->rq_buffer); + xprt->ops->buf_free(task); xprt_inject_disconnect(xprt); if (req->rq_cred != NULL) put_rpccred(req->rq_cred); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 124688ba67e5..fa893507d9eb 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -186,7 +186,7 @@ xprt_rdma_bc_allocate(struct rpc_task *task) } static void -xprt_rdma_bc_free(void *buffer) +xprt_rdma_bc_free(struct rpc_task *task) { /* No-op: ctxt and page have already been freed. */ } diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index daa7d4d43fd8..ebf14ba437c6 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -523,7 +523,6 @@ xprt_rdma_allocate(struct rpc_task *task) out: dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); req->rl_connect_cookie = 0; /* our reserved value */ - req->rl_task = task; rqst->rq_buffer = req->rl_sendbuf->rg_base; return 0; @@ -571,31 +570,26 @@ out_fail: return -ENOMEM; } -/* - * This function returns all RDMA resources to the pool. +/** + * xprt_rdma_free - release resources allocated by xprt_rdma_allocate + * @task: RPC task + * + * Caller guarantees rqst->rq_buffer is non-NULL. */ static void -xprt_rdma_free(void *buffer) +xprt_rdma_free(struct rpc_task *task) { - struct rpcrdma_req *req; - struct rpcrdma_xprt *r_xprt; - struct rpcrdma_regbuf *rb; - - if (buffer == NULL) - return; + struct rpc_rqst *rqst = task->tk_rqstp; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]); - req = rb->rg_owner; if (req->rl_backchannel) return; - r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf); - dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, - !RPC_IS_ASYNC(req->rl_task)); - + !RPC_IS_ASYNC(task)); rpcrdma_buffer_put(req); } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 9df47c857d27..4838a85bdcf6 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -283,7 +283,6 @@ struct rpcrdma_req { struct list_head rl_free; unsigned int rl_niovs; unsigned int rl_connect_cookie; - struct rpc_task *rl_task; struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index bd30b4b18d72..bde39f2ff6e5 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2560,13 +2560,11 @@ static int bc_malloc(struct rpc_task *task) /* * Free the space allocated in the bc_alloc routine */ -static void bc_free(void *buffer) +static void bc_free(struct rpc_task *task) { + void *buffer = task->tk_rqstp->rq_buffer; struct rpc_buffer *buf; - if (!buffer) - return; - buf = container_of(buffer, struct rpc_buffer, data); free_page((unsigned long)buf); } -- cgit v1.2.3 From 68778945e46f143ed7974b427a8065f69a4ce944 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Sep 2016 10:55:37 -0400 Subject: SUNRPC: Separate buffer pointers for RPC Call and Reply messages For xprtrdma, the RPC Call and Reply buffers are involved in real I/O operations. To start with, the DMA direction of the I/O for a Call is opposite that of a Reply. In the current arrangement, the Reply buffer address is on a four-byte alignment just past the call buffer. Would be friendlier on some platforms if that was at a DMA cache alignment instead. Because the current arrangement allocates a single memory region which contains both buffers, the RPC Reply buffer often contains a page boundary in it when the Call buffer is large enough (which is frequent). It would be a little nicer for setting up DMA operations (and possible registration of the Reply buffer) if the two buffers were separated, well-aligned, and contained as few page boundaries as possible. Now, I could just pad out the single memory region used for the pair of buffers. But frequently that would mean a lot of unused space to ensure the Reply buffer did not have a page boundary. Add a separate pointer to rpc_rqst that points right to the RPC Reply buffer. This makes no difference to xprtsock, but it will help xprtrdma in subsequent patches. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprt.h | 5 +++-- net/sunrpc/clnt.c | 2 +- net/sunrpc/sched.c | 1 + net/sunrpc/xprtrdma/transport.c | 1 + 4 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 72c2aebc592b..46f069efa056 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -84,8 +84,9 @@ struct rpc_rqst { struct list_head rq_list; void *rq_buffer; /* Call XDR encode buffer */ - size_t rq_callsize, - rq_rcvsize; + size_t rq_callsize; + void *rq_rbuffer; /* Reply XDR decode buffer */ + size_t rq_rcvsize; size_t rq_xmit_bytes_sent; /* total bytes sent */ size_t rq_reply_bytes_recvd; /* total reply bytes */ /* received */ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 5499fda0c1f3..34dd7b26ee5f 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1766,7 +1766,7 @@ rpc_xdr_encode(struct rpc_task *task) req->rq_buffer, req->rq_callsize); xdr_buf_init(&req->rq_rcv_buf, - (char *)req->rq_buffer + req->rq_callsize, + req->rq_rbuffer, req->rq_rcvsize); p = rpc_encode_header(task); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 6690ebc774ed..5db68b371db2 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -891,6 +891,7 @@ int rpc_malloc(struct rpc_task *task) dprintk("RPC: %5u allocated buffer of size %zu at %p\n", task->tk_pid, size, buf); rqst->rq_buffer = buf->data; + rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize; return 0; } EXPORT_SYMBOL_GPL(rpc_malloc); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index ebf14ba437c6..136caf3dd299 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -524,6 +524,7 @@ out: dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); req->rl_connect_cookie = 0; /* our reserved value */ rqst->rq_buffer = req->rl_sendbuf->rg_base; + rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_rcvsize; return 0; out_rdmabuf: -- cgit v1.2.3 From 5a6d1db4556940533f1a5b6521e522f3e46508ed Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Sep 2016 10:55:45 -0400 Subject: SUNRPC: Add a transport-specific private field in rpc_rqst Currently there's a hidden and indirect mechanism for finding the rpcrdma_req that goes with an rpc_rqst. It depends on getting from the rq_buffer pointer in struct rpc_rqst to the struct rpcrdma_regbuf that controls that buffer, and then to the struct rpcrdma_req it goes with. This was done back in the day to avoid the need to add a per-rqst pointer or to alter the buf_free API when support for RPC-over-RDMA was introduced. I'm about to change the way regbuf's work to support larger inline thresholds. Now is a good time to replace this indirect mechanism with something that is more straightforward. I guess this should be considered a clean up. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/xprtrdma/backchannel.c | 6 ++---- net/sunrpc/xprtrdma/transport.c | 2 +- net/sunrpc/xprtrdma/verbs.c | 1 - net/sunrpc/xprtrdma/xprt_rdma.h | 13 +++++++------ 5 files changed, 11 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 46f069efa056..a5da60b24d83 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -83,6 +83,7 @@ struct rpc_rqst { void (*rq_release_snd_buf)(struct rpc_rqst *); /* release rq_enc_pages */ struct list_head rq_list; + void *rq_xprtdata; /* Per-xprt private data */ void *rq_buffer; /* Call XDR encode buffer */ size_t rq_callsize; void *rq_rbuffer; /* Reply XDR decode buffer */ diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index d3cfaf281e55..c4904f881640 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -55,11 +55,9 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL); if (IS_ERR(rb)) goto out_fail; - rb->rg_owner = req; req->rl_sendbuf = rb; - /* so that rpcr_to_rdmar works when receiving a request */ - rqst->rq_buffer = (void *)req->rl_sendbuf->rg_base; - xdr_buf_init(&rqst->rq_snd_buf, rqst->rq_buffer, size); + xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base, size); + rpcrdma_set_xprtdata(rqst, req); return 0; out_fail: diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 136caf3dd299..d83bffa92dfc 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -523,6 +523,7 @@ xprt_rdma_allocate(struct rpc_task *task) out: dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); req->rl_connect_cookie = 0; /* our reserved value */ + rpcrdma_set_xprtdata(rqst, req); rqst->rq_buffer = req->rl_sendbuf->rg_base; rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_rcvsize; return 0; @@ -559,7 +560,6 @@ out_sendbuf: rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags); if (IS_ERR(rb)) goto out_fail; - rb->rg_owner = req; r_xprt->rx_stats.hardway_register_count += size; rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_sendbuf); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 799cce6cbe45..93def0bf07af 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1210,7 +1210,6 @@ rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags) iov->length = size; iov->lkey = ia->ri_pd->local_dma_lkey; rb->rg_size = size; - rb->rg_owner = NULL; return rb; out_free: diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 4838a85bdcf6..484855eddb85 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -113,7 +113,6 @@ struct rpcrdma_ep { struct rpcrdma_regbuf { size_t rg_size; - struct rpcrdma_req *rg_owner; struct ib_sge rg_iov; __be32 rg_base[0] __attribute__ ((aligned(256))); }; @@ -297,14 +296,16 @@ struct rpcrdma_req { struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; }; +static inline void +rpcrdma_set_xprtdata(struct rpc_rqst *rqst, struct rpcrdma_req *req) +{ + rqst->rq_xprtdata = req; +} + static inline struct rpcrdma_req * rpcr_to_rdmar(struct rpc_rqst *rqst) { - void *buffer = rqst->rq_buffer; - struct rpcrdma_regbuf *rb; - - rb = container_of(buffer, struct rpcrdma_regbuf, rg_base); - return rb->rg_owner; + return rqst->rq_xprtdata; } /* -- cgit v1.2.3 From ff06bd191e722393d9abf7d6f9767f195274e909 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Sep 2016 10:56:59 -0400 Subject: rpcrdma: RDMA/CM private message data structure Introduce data structure used by both client and server to exchange implementation details during RDMA/CM connection establishment. This is an experimental out-of-band exchange between Linux RPC-over-RDMA Version One implementations, replacing the deprecated CCP (see RFC 5666bis). The purpose of this extension is to enable prototyping of features that might be introduced in a subsequent version of RPC-over-RDMA. Suggested by Christoph Hellwig and Devesh Sharma. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: Anna Schumaker --- include/linux/sunrpc/rpc_rdma.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'include') diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h index 3b1ff38f0c37..a7da6bf56610 100644 --- a/include/linux/sunrpc/rpc_rdma.h +++ b/include/linux/sunrpc/rpc_rdma.h @@ -41,6 +41,7 @@ #define _LINUX_SUNRPC_RPC_RDMA_H #include +#include #define RPCRDMA_VERSION 1 #define rpcrdma_version cpu_to_be32(RPCRDMA_VERSION) @@ -129,4 +130,38 @@ enum rpcrdma_proc { #define rdma_done cpu_to_be32(RDMA_DONE) #define rdma_error cpu_to_be32(RDMA_ERROR) +/* + * Private extension to RPC-over-RDMA Version One. + * Message passed during RDMA-CM connection set-up. + * + * Add new fields at the end, and don't permute existing + * fields. + */ +struct rpcrdma_connect_private { + __be32 cp_magic; + u8 cp_version; + u8 cp_flags; + u8 cp_send_size; + u8 cp_recv_size; +} __packed; + +#define rpcrdma_cmp_magic __cpu_to_be32(0xf6ab0e18) + +enum { + RPCRDMA_CMP_VERSION = 1, + RPCRDMA_CMP_F_SND_W_INV_OK = BIT(0), +}; + +static inline u8 +rpcrdma_encode_buffer_size(unsigned int size) +{ + return (size >> 10) - 1; +} + +static inline unsigned int +rpcrdma_decode_buffer_size(u8 val) +{ + return ((unsigned int)val + 1) << 10; +} + #endif /* _LINUX_SUNRPC_RPC_RDMA_H */ -- cgit v1.2.3 From 87cfb9a0c85ce4a0c96a4f3d692a85519b933ade Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Sep 2016 10:57:07 -0400 Subject: xprtrdma: Client-side support for rpcrdma_connect_private Send an RDMA-CM private message on connect, and look for one during a connection-established event. Both sides can communicate their various implementation limits. Implementations that don't support this sideband protocol ignore it. Once the client knows the server's inline threshold maxima, it can adjust the use of Reply chunks, and eliminate most use of Position Zero Read chunks. Moderately-sized I/O can be done using a pure inline RDMA Send instead of RDMA operations that require memory registration. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/rpc_rdma.h | 4 ++++ net/sunrpc/xprtrdma/fmr_ops.c | 5 ++--- net/sunrpc/xprtrdma/frwr_ops.c | 5 ++--- net/sunrpc/xprtrdma/rpc_rdma.c | 8 +++++--- net/sunrpc/xprtrdma/verbs.c | 40 +++++++++++++++++++++++++++++++++++++--- net/sunrpc/xprtrdma/xprt_rdma.h | 6 +++--- 6 files changed, 53 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h index a7da6bf56610..cfda6adcf33c 100644 --- a/include/linux/sunrpc/rpc_rdma.h +++ b/include/linux/sunrpc/rpc_rdma.h @@ -46,6 +46,10 @@ #define RPCRDMA_VERSION 1 #define rpcrdma_version cpu_to_be32(RPCRDMA_VERSION) +enum { + RPCRDMA_V1_DEF_INLINE_SIZE = 1024, +}; + struct rpcrdma_segment { __be32 rs_handle; /* Registered memory handle */ __be32 rs_length; /* Length of the chunk in bytes */ diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 21cb3b150b37..16690a1b653e 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -160,9 +160,8 @@ static int fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_create_data_internal *cdata) { - rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1, - RPCRDMA_MAX_DATA_SEGS / - RPCRDMA_MAX_FMR_SGES)); + ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / + RPCRDMA_MAX_FMR_SGES); return 0; } diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 892b5e1d9b09..fcfcf3ac030c 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -242,9 +242,8 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, depth; } - rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1, - RPCRDMA_MAX_DATA_SEGS / - ia->ri_max_frmr_depth)); + ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / + ia->ri_max_frmr_depth); return 0; } diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index c2906e314287..ea734c2c7ddb 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -118,10 +118,12 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) return size; } -void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *ia, - struct rpcrdma_create_data_internal *cdata, - unsigned int maxsegs) +void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt) { + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + unsigned int maxsegs = ia->ri_max_segs; + ia->ri_max_inline_write = cdata->inline_wsize - rpcrdma_max_call_header_size(maxsegs); ia->ri_max_inline_read = cdata->inline_rsize - diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index a49c788aa59a..6bab8416a4fc 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -204,6 +204,33 @@ out_fail: goto out_schedule; } +static void +rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, + struct rdma_conn_param *param) +{ + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + const struct rpcrdma_connect_private *pmsg = param->private_data; + unsigned int rsize, wsize; + + rsize = RPCRDMA_V1_DEF_INLINE_SIZE; + wsize = RPCRDMA_V1_DEF_INLINE_SIZE; + + if (pmsg && + pmsg->cp_magic == rpcrdma_cmp_magic && + pmsg->cp_version == RPCRDMA_CMP_VERSION) { + rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size); + wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); + } + + if (rsize < cdata->inline_rsize) + cdata->inline_rsize = rsize; + if (wsize < cdata->inline_wsize) + cdata->inline_wsize = wsize; + pr_info("rpcrdma: max send %u, max recv %u\n", + cdata->inline_wsize, cdata->inline_rsize); + rpcrdma_set_max_header_sizes(r_xprt); +} + static int rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) { @@ -244,6 +271,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) " (%d initiator)\n", __func__, attr->max_dest_rd_atomic, attr->max_rd_atomic); + rpcrdma_update_connect_private(xprt, &event->param.conn); goto connected; case RDMA_CM_EVENT_CONNECT_ERROR: connstate = -ENOTCONN; @@ -454,6 +482,7 @@ int rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) { + struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private; struct ib_cq *sendcq, *recvcq; unsigned int max_qp_wr; int rc; @@ -536,9 +565,14 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* Initialize cma parameters */ memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma)); - /* RPC/RDMA does not use private data */ - ep->rep_remote_cma.private_data = NULL; - ep->rep_remote_cma.private_data_len = 0; + /* Prepare RDMA-CM private message */ + pmsg->cp_magic = rpcrdma_cmp_magic; + pmsg->cp_version = RPCRDMA_CMP_VERSION; + pmsg->cp_flags = 0; + pmsg->cp_send_size = rpcrdma_encode_buffer_size(cdata->inline_wsize); + pmsg->cp_recv_size = rpcrdma_encode_buffer_size(cdata->inline_rsize); + ep->rep_remote_cma.private_data = pmsg; + ep->rep_remote_cma.private_data_len = sizeof(*pmsg); /* Client offers RDMA Read but does not initiate */ ep->rep_remote_cma.initiator_depth = 0; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 9aabca68c49d..89df1680b1eb 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -70,6 +70,7 @@ struct rpcrdma_ia { struct ib_pd *ri_pd; struct completion ri_done; int ri_async_rc; + unsigned int ri_max_segs; unsigned int ri_max_frmr_depth; unsigned int ri_max_inline_write; unsigned int ri_max_inline_read; @@ -87,6 +88,7 @@ struct rpcrdma_ep { int rep_connected; struct ib_qp_init_attr rep_attr; wait_queue_head_t rep_connect_wait; + struct rpcrdma_connect_private rep_cm_private; struct rdma_conn_param rep_remote_cma; struct sockaddr_storage rep_remote_addr; struct delayed_work rep_connect_worker; @@ -523,9 +525,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *); * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c */ int rpcrdma_marshal_req(struct rpc_rqst *); -void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *, - struct rpcrdma_create_data_internal *, - unsigned int); +void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); /* RPC/RDMA module init - xprtrdma/transport.c */ -- cgit v1.2.3 From 44829d02d2d7a7064842ecf36239ea24df1cdf58 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Sep 2016 10:57:32 -0400 Subject: xprtrdma: Support larger inline thresholds The Version One default inline threshold is still 1KB. But allow testing with thresholds up to 64KB. This maximum is somewhat arbitrary. There's no fundamental architectural limit I'm aware of, but it's good to keep the size of Receive buffers reasonable. Now that Send can use a s/g list, a Send buffer is only as large as each RPC requires. Receive buffers are always the size of the inline threshold, however. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprtrdma.h | 4 ++-- net/sunrpc/xprtrdma/transport.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h index 39267dc3486a..221b7a2e5406 100644 --- a/include/linux/sunrpc/xprtrdma.h +++ b/include/linux/sunrpc/xprtrdma.h @@ -53,8 +53,8 @@ #define RPCRDMA_MAX_SLOT_TABLE (256U) #define RPCRDMA_MIN_INLINE (1024) /* min inline thresh */ -#define RPCRDMA_DEF_INLINE (1024) /* default inline thresh */ -#define RPCRDMA_MAX_INLINE (3068) /* max inline thresh */ +#define RPCRDMA_DEF_INLINE (4096) /* default inline thresh */ +#define RPCRDMA_MAX_INLINE (65536) /* max inline thresh */ /* Memory registration strategies, by number. * This is part of a kernel / user space API. Do not remove. */ diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 6a358ab6ce27..ed5e285fd2ea 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -97,7 +97,7 @@ static struct ctl_table xr_tunables_table[] = { .data = &xprt_rdma_max_inline_read, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &min_inline_size, .extra2 = &max_inline_size, }, @@ -106,7 +106,7 @@ static struct ctl_table xr_tunables_table[] = { .data = &xprt_rdma_max_inline_write, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &min_inline_size, .extra2 = &max_inline_size, }, -- cgit v1.2.3 From ca440c383a588091cae9fbce610b86a6e9d961ad Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 15 Sep 2016 14:40:49 -0400 Subject: pnfs: add a new mechanism to select a layout driver according to an ordered list Currently, the layout driver selection code always chooses the first one from the list. That's not really ideal however, as the server can send the list of layout types in any order that it likes. It's up to the client to select the best one for its needs. This patch adds an ordered list of preferred driver types and has the selection code sort the list of available layout drivers according to it. Any unrecognized layout type is sorted to the end of the list. For now, the order of preference is hardcoded, but it should be possible to make this configurable in the future. Signed-off-by: Jeff Layton Reviewed-by: J. Bruce Fields Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 1 + fs/nfs/nfs4proc.c | 2 +- fs/nfs/nfs4xdr.c | 31 +++++++++++++++------------ fs/nfs/pnfs.c | 56 ++++++++++++++++++++++++++++++++++++++++++------- fs/nfs/pnfs.h | 5 +++-- include/linux/nfs_xdr.h | 1 + 6 files changed, 72 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 535c2563b701..51136b03788d 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -788,6 +788,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs } fsinfo.fattr = fattr; + fsinfo.nlayouttypes = 0; memset(fsinfo.layouttype, 0, sizeof(fsinfo.layouttype)); error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); if (error < 0) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index cfdf45a14cc3..acf5a2caa423 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4312,7 +4312,7 @@ static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, s if (error == 0) { /* block layout checks this! */ server->pnfs_blksize = fsinfo->blksize; - set_pnfs_layoutdriver(server, fhandle, fsinfo->layouttype); + set_pnfs_layoutdriver(server, fhandle, fsinfo); } return error; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 41a02f994976..17b4e059c588 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4728,29 +4728,34 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, * Decode potentially multiple layout types. */ static int decode_pnfs_layout_types(struct xdr_stream *xdr, - uint32_t *layouttype) + struct nfs_fsinfo *fsinfo) { __be32 *p; - uint32_t num, i; + uint32_t i; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - num = be32_to_cpup(p); + fsinfo->nlayouttypes = be32_to_cpup(p); /* pNFS is not supported by the underlying file system */ - if (num == 0) { + if (fsinfo->nlayouttypes == 0) return 0; - } - if (num > NFS_MAX_LAYOUT_TYPES) - printk(KERN_INFO "NFS: %s: Warning: Too many (%d) pNFS layout types\n", __func__, num); /* Decode and set first layout type, move xdr->p past unused types */ - p = xdr_inline_decode(xdr, num * 4); + p = xdr_inline_decode(xdr, fsinfo->nlayouttypes * 4); if (unlikely(!p)) goto out_overflow; - for(i = 0; i < num && i < NFS_MAX_LAYOUT_TYPES; i++) - layouttype[i] = be32_to_cpup(p++); + + /* If we get too many, then just cap it at the max */ + if (fsinfo->nlayouttypes > NFS_MAX_LAYOUT_TYPES) { + printk(KERN_INFO "NFS: %s: Warning: Too many (%u) pNFS layout types\n", + __func__, fsinfo->nlayouttypes); + fsinfo->nlayouttypes = NFS_MAX_LAYOUT_TYPES; + } + + for(i = 0; i < fsinfo->nlayouttypes; ++i) + fsinfo->layouttype[i] = be32_to_cpup(p++); return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -4762,7 +4767,7 @@ out_overflow: * Note we must ensure that layouttype is set in any non-error case. */ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, - uint32_t *layouttype) + struct nfs_fsinfo *fsinfo) { int status = 0; @@ -4770,7 +4775,7 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U))) return -EIO; if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) { - status = decode_pnfs_layout_types(xdr, layouttype); + status = decode_pnfs_layout_types(xdr, fsinfo); bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES; } return status; @@ -4853,7 +4858,7 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta); if (status != 0) goto xdr_error; - status = decode_attr_pnfstype(xdr, bitmap, fsinfo->layouttype); + status = decode_attr_pnfstype(xdr, bitmap, fsinfo); if (status != 0) goto xdr_error; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index a6a683fbd230..b588ccf05045 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "internal.h" #include "pnfs.h" #include "iostat.h" @@ -98,6 +99,39 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss) nfss->pnfs_curr_ld = NULL; } +/* + * When the server sends a list of layout types, we choose one in the order + * given in the list below. + * + * FIXME: should this list be configurable in some fashion? module param? + * mount option? something else? + */ +static const u32 ld_prefs[] = { + LAYOUT_SCSI, + LAYOUT_BLOCK_VOLUME, + LAYOUT_OSD2_OBJECTS, + LAYOUT_FLEX_FILES, + LAYOUT_NFSV4_1_FILES, + 0 +}; + +static int +ld_cmp(const void *e1, const void *e2) +{ + u32 ld1 = *((u32 *)e1); + u32 ld2 = *((u32 *)e2); + int i; + + for (i = 0; ld_prefs[i] != 0; i++) { + if (ld1 == ld_prefs[i]) + return -1; + + if (ld2 == ld_prefs[i]) + return 1; + } + return 0; +} + /* * Try to set the server's pnfs module to the pnfs layout type specified by id. * Currently only one pNFS layout driver per filesystem is supported. @@ -106,10 +140,11 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss) */ void set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, - u32 *ids) + struct nfs_fsinfo *fsinfo) { struct pnfs_layoutdriver_type *ld_type = NULL; u32 id; + int i; if (!(server->nfs_client->cl_exchange_flags & (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) { @@ -118,18 +153,23 @@ set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, goto out_no_driver; } - id = ids[0]; - if (!id) - goto out_no_driver; + sort(fsinfo->layouttype, fsinfo->nlayouttypes, + sizeof(*fsinfo->layouttype), ld_cmp, NULL); - ld_type = find_pnfs_driver(id); - if (!ld_type) { - request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id); + for (i = 0; i < fsinfo->nlayouttypes; i++) { + id = fsinfo->layouttype[i]; ld_type = find_pnfs_driver(id); + if (!ld_type) { + request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, + id); + ld_type = find_pnfs_driver(id); + } + if (ld_type) + break; } if (!ld_type) { - dprintk("%s: No pNFS module found for %u.\n", __func__, id); + dprintk("%s: No pNFS module found!\n", __func__); goto out_no_driver; } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index be515e6a3823..5c295512c967 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -236,7 +236,7 @@ void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_put_lseg(struct pnfs_layout_segment *lseg); void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg); -void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32 *); +void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *); void unset_pnfs_layoutdriver(struct nfs_server *); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); @@ -657,7 +657,8 @@ pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task) } static inline void set_pnfs_layoutdriver(struct nfs_server *s, - const struct nfs_fh *mntfh, u32 *ids) + const struct nfs_fh *mntfh, + struct nfs_fsinfo *fsinfo) { } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index f11b26ed001b..beb1e10f446e 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -144,6 +144,7 @@ struct nfs_fsinfo { __u64 maxfilesize; struct timespec time_delta; /* server time granularity */ __u32 lease_time; /* in seconds */ + __u32 nlayouttypes; /* number of layouttypes */ __u32 layouttype[NFS_MAX_LAYOUT_TYPES]; /* supported pnfs layout driver */ __u32 blksize; /* preferred pnfs io block size */ __u32 clone_blksize; /* granularity of a CLONE operation */ -- cgit v1.2.3 From eed7c4143dd993ef380340af64b8f2e7a79102c8 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 17 Sep 2016 18:17:34 -0400 Subject: nfs: add a new NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK constant As defined in RFC 5661, section 18.16. Signed-off-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/uapi/linux/nfs4.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h index 2b871e0858d9..4ae62796bfde 100644 --- a/include/uapi/linux/nfs4.h +++ b/include/uapi/linux/nfs4.h @@ -39,8 +39,9 @@ #define NFS4_FH_VOL_MIGRATION 0x0004 #define NFS4_FH_VOL_RENAME 0x0008 -#define NFS4_OPEN_RESULT_CONFIRM 0x0002 -#define NFS4_OPEN_RESULT_LOCKTYPE_POSIX 0x0004 +#define NFS4_OPEN_RESULT_CONFIRM 0x0002 +#define NFS4_OPEN_RESULT_LOCKTYPE_POSIX 0x0004 +#define NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK 0x0020 #define NFS4_SHARE_ACCESS_MASK 0x000F #define NFS4_SHARE_ACCESS_READ 0x0001 -- cgit v1.2.3 From a1d617d8f134679741b0b35e8e1436b015ac5538 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 17 Sep 2016 18:17:39 -0400 Subject: nfs: allow blocking locks to be awoken by lock callbacks Add a waitqueue head to the client structure. Have clients set a wait on that queue prior to requesting a lock from the server. If the lock is blocked, then we can use that to wait for wakeups. Note that we do need to do this "manually" since we need to set the wait on the waitqueue prior to requesting the lock, but requesting a lock can involve activities that can block. However, only do that for NFSv4.1 locks, either by compiling out all of the waitqueue handling when CONFIG_NFS_V4_1 is disabled, or skipping all of it at runtime if we're dealing with v4.0, or v4.1 servers that don't send lock callbacks. Note too that even when we expect to get a lock callback, RFC5661 section 20.11.4 is pretty clear that we still need to poll for them, so we do still sleep on a timeout. We do however always poll at the longest interval in that case. Signed-off-by: Jeff Layton [Anna: nfs4_retry_setlk() "status" should default to -ERESTARTSYS] Signed-off-by: Anna Schumaker --- fs/nfs/callback_proc.c | 4 ++ fs/nfs/nfs4client.c | 3 ++ fs/nfs/nfs4proc.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++- include/linux/nfs_fs_sb.h | 3 ++ 4 files changed, 103 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 974881824414..e9aa235e9d10 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -638,6 +638,10 @@ __be32 nfs4_callback_notify_lock(struct cb_notify_lock_args *args, void *dummy, dprintk_rcu("NFS: CB_NOTIFY_LOCK request from %s\n", rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + /* Don't wake anybody if the string looked bogus */ + if (args->cbnl_valid) + __wake_up(&cps->clp->cl_lock_waitq, TASK_NORMAL, 0, args); + return htonl(NFS4_OK); } #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index a2c9654d018f..074ac7131459 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -199,6 +199,9 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_minorversion = cl_init->minorversion; clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; clp->cl_mig_gen = 1; +#if IS_ENABLED(CONFIG_NFS_V4_1) + init_waitqueue_head(&clp->cl_lock_waitq); +#endif return clp; error: diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a74c4167b5b0..5ec333f3a19b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6166,7 +6166,8 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock * #define NFS4_LOCK_MAXTIMEOUT (30 * HZ) static int -nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) +nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd, + struct file_lock *request) { int status = -ERESTARTSYS; unsigned long timeout = NFS4_LOCK_MINTIMEOUT; @@ -6183,6 +6184,97 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) return status; } +#ifdef CONFIG_NFS_V4_1 +struct nfs4_lock_waiter { + struct task_struct *task; + struct inode *inode; + struct nfs_lowner *owner; + bool notified; +}; + +static int +nfs4_wake_lock_waiter(wait_queue_t *wait, unsigned int mode, int flags, void *key) +{ + int ret; + struct cb_notify_lock_args *cbnl = key; + struct nfs4_lock_waiter *waiter = wait->private; + struct nfs_lowner *lowner = &cbnl->cbnl_owner, + *wowner = waiter->owner; + + /* Only wake if the callback was for the same owner */ + if (lowner->clientid != wowner->clientid || + lowner->id != wowner->id || + lowner->s_dev != wowner->s_dev) + return 0; + + /* Make sure it's for the right inode */ + if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh)) + return 0; + + waiter->notified = true; + + /* override "private" so we can use default_wake_function */ + wait->private = waiter->task; + ret = autoremove_wake_function(wait, mode, flags, key); + wait->private = waiter; + return ret; +} + +static int +nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + int status = -ERESTARTSYS; + unsigned long flags; + struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner; + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs_client *clp = server->nfs_client; + wait_queue_head_t *q = &clp->cl_lock_waitq; + struct nfs_lowner owner = { .clientid = clp->cl_clientid, + .id = lsp->ls_seqid.owner_id, + .s_dev = server->s_dev }; + struct nfs4_lock_waiter waiter = { .task = current, + .inode = state->inode, + .owner = &owner, + .notified = false }; + wait_queue_t wait; + + /* Don't bother with waitqueue if we don't expect a callback */ + if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags)) + return nfs4_retry_setlk_simple(state, cmd, request); + + init_wait(&wait); + wait.private = &waiter; + wait.func = nfs4_wake_lock_waiter; + add_wait_queue(q, &wait); + + while(!signalled()) { + status = nfs4_proc_setlk(state, cmd, request); + if ((status != -EAGAIN) || IS_SETLK(cmd)) + break; + + status = -ERESTARTSYS; + spin_lock_irqsave(&q->lock, flags); + if (waiter.notified) { + spin_unlock_irqrestore(&q->lock, flags); + continue; + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&q->lock, flags); + + freezable_schedule_timeout_interruptible(NFS4_LOCK_MAXTIMEOUT); + } + + finish_wait(q, &wait); + return status; +} +#else /* !CONFIG_NFS_V4_1 */ +static inline int +nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + return nfs4_retry_setlk_simple(state, cmd, request); +} +#endif + static int nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) { diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 14a762d2734d..b34097c67848 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -103,6 +103,9 @@ struct nfs_client { #define NFS_SP4_MACH_CRED_WRITE 5 /* WRITE */ #define NFS_SP4_MACH_CRED_COMMIT 6 /* COMMIT */ #define NFS_SP4_MACH_CRED_PNFS_CLEANUP 7 /* LAYOUTRETURN */ +#if IS_ENABLED(CONFIG_NFS_V4_1) + wait_queue_head_t cl_lock_waitq; +#endif /* CONFIG_NFS_V4_1 */ #endif /* CONFIG_NFS_V4 */ /* Our own IP address, as a null-terminated string. -- cgit v1.2.3 From cace564f8b6260e806f5e28d7f192fd0e0c603ed Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 13 Sep 2016 10:52:50 -0400 Subject: svcrdma: Tail iovec leaves an orphaned DMA mapping The ctxt's count field is overloaded to mean the number of pages in the ctxt->page array and the number of SGEs in the ctxt->sge array. Typically these two numbers are the same. However, when an inline RPC reply is constructed from an xdr_buf with a tail iovec, the head and tail often occupy the same page, but each are DMA mapped independently. In that case, ->count equals the number of pages, but it does not equal the number of SGEs. There's one more SGE, for the tail iovec. Hence there is one more DMA mapping than there are pages in the ctxt->page array. This isn't a real problem until the server's iommu is enabled. Then each RPC reply that has content in that iovec orphans a DMA mapping that consists of real resources. krb5i and krb5p always populate that tail iovec. After a couple million sent krb5i/p RPC replies, the NFS server starts behaving erratically. Reboot is needed to clear the problem. Fixes: 9d11b51ce7c1 ("svcrdma: Fix send_reply() scatter/gather set-up") Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 9 +++++++++ net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 2 +- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 2 +- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 22 ++++------------------ net/sunrpc/xprtrdma/svc_rdma_transport.c | 18 ++++++++++++------ 5 files changed, 27 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index d6917b896d3a..3584bc8864c4 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -86,6 +86,7 @@ struct svc_rdma_op_ctxt { unsigned long flags; enum dma_data_direction direction; int count; + unsigned int mapped_sges; struct ib_sge sge[RPCSVC_MAXPAGES]; struct page *pages[RPCSVC_MAXPAGES]; }; @@ -193,6 +194,14 @@ struct svcxprt_rdma { #define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD +/* Track DMA maps for this transport and context */ +static inline void svc_rdma_count_mappings(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt) +{ + ctxt->mapped_sges++; + atomic_inc(&rdma->sc_dma_used); +} + /* svc_rdma_backchannel.c */ extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp, diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index a2a7519b0f23..cd0c5581498c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -129,7 +129,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, ret = -EIO; goto out_unmap; } - atomic_inc(&rdma->sc_dma_used); + svc_rdma_count_mappings(rdma, ctxt); memset(&send_wr, 0, sizeof(send_wr)); ctxt->cqe.done = svc_rdma_wc_send; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 2c25606f2561..ad1df979b3f0 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -159,7 +159,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, ctxt->sge[pno].addr); if (ret) goto err; - atomic_inc(&xprt->sc_dma_used); + svc_rdma_count_mappings(xprt, ctxt); ctxt->sge[pno].lkey = xprt->sc_pd->local_dma_lkey; ctxt->sge[pno].length = len; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 54d533300620..e5b49e62420c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -280,7 +280,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, if (ib_dma_mapping_error(xprt->sc_cm_id->device, sge[sge_no].addr)) goto err; - atomic_inc(&xprt->sc_dma_used); + svc_rdma_count_mappings(xprt, ctxt); sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; ctxt->count++; sge_off = 0; @@ -489,7 +489,7 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->sge[0].length, DMA_TO_DEVICE); if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) goto err; - atomic_inc(&rdma->sc_dma_used); + svc_rdma_count_mappings(rdma, ctxt); ctxt->direction = DMA_TO_DEVICE; @@ -505,7 +505,7 @@ static int send_reply(struct svcxprt_rdma *rdma, if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[sge_no].addr)) goto err; - atomic_inc(&rdma->sc_dma_used); + svc_rdma_count_mappings(rdma, ctxt); ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; ctxt->sge[sge_no].length = sge_bytes; } @@ -523,23 +523,9 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->pages[page_no+1] = rqstp->rq_respages[page_no]; ctxt->count++; rqstp->rq_respages[page_no] = NULL; - /* - * If there are more pages than SGE, terminate SGE - * list so that svc_rdma_unmap_dma doesn't attempt to - * unmap garbage. - */ - if (page_no+1 >= sge_no) - ctxt->sge[page_no+1].length = 0; } rqstp->rq_next_page = rqstp->rq_respages + 1; - /* The loop above bumps sc_dma_used for each sge. The - * xdr_buf.tail gets a separate sge, but resides in the - * same page as xdr_buf.head. Don't count it twice. - */ - if (sge_no > ctxt->count) - atomic_dec(&rdma->sc_dma_used); - if (sge_no > rdma->sc_max_sge) { pr_err("svcrdma: Too many sges (%d)\n", sge_no); goto err; @@ -692,7 +678,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, svc_rdma_put_context(ctxt, 1); return; } - atomic_inc(&xprt->sc_dma_used); + svc_rdma_count_mappings(xprt, ctxt); /* Prepare SEND WR */ memset(&err_wr, 0, sizeof(err_wr)); diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index dd9440137834..924271c9ef3e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -198,6 +198,7 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) out: ctxt->count = 0; + ctxt->mapped_sges = 0; ctxt->frmr = NULL; return ctxt; @@ -221,22 +222,27 @@ out_empty: void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) { struct svcxprt_rdma *xprt = ctxt->xprt; - int i; - for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) { + struct ib_device *device = xprt->sc_cm_id->device; + u32 lkey = xprt->sc_pd->local_dma_lkey; + unsigned int i, count; + + for (count = 0, i = 0; i < ctxt->mapped_sges; i++) { /* * Unmap the DMA addr in the SGE if the lkey matches * the local_dma_lkey, otherwise, ignore it since it is * an FRMR lkey and will be unmapped later when the * last WR that uses it completes. */ - if (ctxt->sge[i].lkey == xprt->sc_pd->local_dma_lkey) { - atomic_dec(&xprt->sc_dma_used); - ib_dma_unmap_page(xprt->sc_cm_id->device, + if (ctxt->sge[i].lkey == lkey) { + count++; + ib_dma_unmap_page(device, ctxt->sge[i].addr, ctxt->sge[i].length, ctxt->direction); } } + ctxt->mapped_sges = 0; + atomic_sub(count, &xprt->sc_dma_used); } void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) @@ -600,7 +606,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags) DMA_FROM_DEVICE); if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa)) goto err_put_ctxt; - atomic_inc(&xprt->sc_dma_used); + svc_rdma_count_mappings(xprt, ctxt); ctxt->sge[sge_no].addr = pa; ctxt->sge[sge_no].length = PAGE_SIZE; ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; -- cgit v1.2.3 From 5d48709656584420f31b361c4b1a3ebf1d68b225 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 13 Sep 2016 10:53:07 -0400 Subject: rpcrdma: RDMA/CM private message data structure Introduce data structure used by both client and server to exchange implementation details during RDMA/CM connection establishment. This is an experimental out-of-band exchange between Linux RPC-over-RDMA Version One implementations, replacing the deprecated CCP (see RFC 5666bis). The purpose of this extension is to enable prototyping of features that might be introduced in a subsequent version of RPC-over-RDMA. Suggested by Christoph Hellwig and Devesh Sharma. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/rpc_rdma.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'include') diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h index 3b1ff38f0c37..a7da6bf56610 100644 --- a/include/linux/sunrpc/rpc_rdma.h +++ b/include/linux/sunrpc/rpc_rdma.h @@ -41,6 +41,7 @@ #define _LINUX_SUNRPC_RPC_RDMA_H #include +#include #define RPCRDMA_VERSION 1 #define rpcrdma_version cpu_to_be32(RPCRDMA_VERSION) @@ -129,4 +130,38 @@ enum rpcrdma_proc { #define rdma_done cpu_to_be32(RDMA_DONE) #define rdma_error cpu_to_be32(RDMA_ERROR) +/* + * Private extension to RPC-over-RDMA Version One. + * Message passed during RDMA-CM connection set-up. + * + * Add new fields at the end, and don't permute existing + * fields. + */ +struct rpcrdma_connect_private { + __be32 cp_magic; + u8 cp_version; + u8 cp_flags; + u8 cp_send_size; + u8 cp_recv_size; +} __packed; + +#define rpcrdma_cmp_magic __cpu_to_be32(0xf6ab0e18) + +enum { + RPCRDMA_CMP_VERSION = 1, + RPCRDMA_CMP_F_SND_W_INV_OK = BIT(0), +}; + +static inline u8 +rpcrdma_encode_buffer_size(unsigned int size) +{ + return (size >> 10) - 1; +} + +static inline unsigned int +rpcrdma_decode_buffer_size(u8 val) +{ + return ((unsigned int)val + 1) << 10; +} + #endif /* _LINUX_SUNRPC_RPC_RDMA_H */ -- cgit v1.2.3 From 25d55296dd3eac23adb2ae46b67b65bf73b22fb2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 13 Sep 2016 10:53:23 -0400 Subject: svcrdma: support Remote Invalidation Support Remote Invalidation. A private message is exchanged with the client upon RDMA transport connect that indicates whether Send With Invalidation may be used by the server to send RPC replies. The invalidate_rkey is arbitrarily chosen from among rkeys present in the RPC-over-RDMA header's chunk lists. Send With Invalidate improves performance only when clients can recognize, while processing an RPC reply, that an rkey has already been invalidated. That has been submitted as a separate change. In the future, the RPC-over-RDMA protocol might support Remote Invalidation properly. The protocol needs to enable signaling between peers to indicate when Remote Invalidation can be used for each individual RPC. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 + net/sunrpc/xprtrdma/svc_rdma_sendto.c | 58 ++++++++++++++++++++++++++++++-- net/sunrpc/xprtrdma/svc_rdma_transport.c | 12 +++++-- 3 files changed, 65 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 3584bc8864c4..cc3ae16eac68 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -137,6 +137,7 @@ struct svcxprt_rdma { int sc_ord; /* RDMA read limit */ int sc_max_sge; int sc_max_sge_rd; /* max sge for read target */ + bool sc_snd_w_inv; /* OK to use Send With Invalidate */ atomic_t sc_sq_count; /* Number of SQ WR on queue */ unsigned int sc_sq_depth; /* Depth of SQ */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 3b95b19fcf72..f5a91edcd233 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -225,6 +225,48 @@ svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp, return rp_ary; } +/* RPC-over-RDMA Version One private extension: Remote Invalidation. + * Responder's choice: requester signals it can handle Send With + * Invalidate, and responder chooses one rkey to invalidate. + * + * Find a candidate rkey to invalidate when sending a reply. Picks the + * first rkey it finds in the chunks lists. + * + * Returns zero if RPC's chunk lists are empty. + */ +static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp, + struct rpcrdma_write_array *wr_ary, + struct rpcrdma_write_array *rp_ary) +{ + struct rpcrdma_read_chunk *rd_ary; + struct rpcrdma_segment *arg_ch; + u32 inv_rkey; + + inv_rkey = 0; + + rd_ary = svc_rdma_get_read_chunk(rdma_argp); + if (rd_ary) { + inv_rkey = be32_to_cpu(rd_ary->rc_target.rs_handle); + goto out; + } + + if (wr_ary && be32_to_cpu(wr_ary->wc_nchunks)) { + arg_ch = &wr_ary->wc_array[0].wc_target; + inv_rkey = be32_to_cpu(arg_ch->rs_handle); + goto out; + } + + if (rp_ary && be32_to_cpu(rp_ary->wc_nchunks)) { + arg_ch = &rp_ary->wc_array[0].wc_target; + inv_rkey = be32_to_cpu(arg_ch->rs_handle); + goto out; + } + +out: + dprintk("svcrdma: Send With Invalidate rkey=%08x\n", inv_rkey); + return inv_rkey; +} + /* Assumptions: * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE */ @@ -464,7 +506,8 @@ static int send_reply(struct svcxprt_rdma *rdma, struct page *page, struct rpcrdma_msg *rdma_resp, struct svc_rdma_req_map *vec, - int byte_count) + int byte_count, + u32 inv_rkey) { struct svc_rdma_op_ctxt *ctxt; struct ib_send_wr send_wr; @@ -535,7 +578,11 @@ static int send_reply(struct svcxprt_rdma *rdma, send_wr.wr_cqe = &ctxt->cqe; send_wr.sg_list = ctxt->sge; send_wr.num_sge = sge_no; - send_wr.opcode = IB_WR_SEND; + if (inv_rkey) { + send_wr.opcode = IB_WR_SEND_WITH_INV; + send_wr.ex.invalidate_rkey = inv_rkey; + } else + send_wr.opcode = IB_WR_SEND; send_wr.send_flags = IB_SEND_SIGNALED; ret = svc_rdma_send(rdma, &send_wr); @@ -567,6 +614,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) int inline_bytes; struct page *res_page; struct svc_rdma_req_map *vec; + u32 inv_rkey; dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); @@ -577,6 +625,10 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) wr_ary = svc_rdma_get_write_array(rdma_argp); rp_ary = svc_rdma_get_reply_array(rdma_argp, wr_ary); + inv_rkey = 0; + if (rdma->sc_snd_w_inv) + inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_ary, rp_ary); + /* Build an req vec for the XDR */ vec = svc_rdma_get_req_map(rdma); ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL); @@ -619,7 +671,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) goto err1; ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec, - inline_bytes); + inline_bytes, inv_rkey); if (ret < 0) goto err0; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index f51e98a25263..b2464fc4c455 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -657,9 +657,14 @@ svc_rdma_parse_connect_private(struct svcxprt_rdma *newxprt, if (pmsg && pmsg->cp_magic == rpcrdma_cmp_magic && pmsg->cp_version == RPCRDMA_CMP_VERSION) { - dprintk("svcrdma: client send_size %u, recv_size %u\n", + newxprt->sc_snd_w_inv = pmsg->cp_flags & + RPCRDMA_CMP_F_SND_W_INV_OK; + + dprintk("svcrdma: client send_size %u, recv_size %u " + "remote inv %ssupported\n", rpcrdma_decode_buffer_size(pmsg->cp_send_size), - rpcrdma_decode_buffer_size(pmsg->cp_recv_size)); + rpcrdma_decode_buffer_size(pmsg->cp_recv_size), + newxprt->sc_snd_w_inv ? "" : "un"); } } @@ -1093,7 +1098,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dev->attrs.max_fast_reg_page_list_len; newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; newxprt->sc_reader = rdma_read_chunk_frmr; - } + } else + newxprt->sc_snd_w_inv = false; /* * Determine if a DMA MR is required and if so, what privs are required -- cgit v1.2.3 From df044e02206230c7d79a9aef96a6c087476f5533 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 31 Aug 2016 14:52:41 +0300 Subject: watchdog: add pretimeout support to the core Since the watchdog framework centrializes the IOCTL interfaces of device drivers now, SETPRETIMEOUT and GETPRETIMEOUT need to be added in the common code. Signed-off-by: Robin Gong Signed-off-by: Wolfram Sang [vzapolskiy: added conditional pretimeout sysfs attribute visibility] Signed-off-by: Vladimir Zapolskiy Reviewed-by: Guenter Roeck Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- Documentation/watchdog/watchdog-kernel-api.txt | 20 +++++++++ drivers/watchdog/watchdog_dev.c | 56 +++++++++++++++++++++++++- include/linux/watchdog.h | 11 +++++ 3 files changed, 85 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/Documentation/watchdog/watchdog-kernel-api.txt b/Documentation/watchdog/watchdog-kernel-api.txt index 7f31125c123e..3402dcad5b03 100644 --- a/Documentation/watchdog/watchdog-kernel-api.txt +++ b/Documentation/watchdog/watchdog-kernel-api.txt @@ -50,6 +50,7 @@ struct watchdog_device { const struct watchdog_ops *ops; unsigned int bootstatus; unsigned int timeout; + unsigned int pretimeout; unsigned int min_timeout; unsigned int max_timeout; unsigned int min_hw_heartbeat_ms; @@ -77,6 +78,7 @@ It contains following fields: * timeout: the watchdog timer's timeout value (in seconds). This is the time after which the system will reboot if user space does not send a heartbeat request if WDOG_ACTIVE is set. +* pretimeout: the watchdog timer's pretimeout value (in seconds). * min_timeout: the watchdog timer's minimum timeout value (in seconds). If set, the minimum configurable value for 'timeout'. * max_timeout: the watchdog timer's maximum timeout value (in seconds), @@ -121,6 +123,7 @@ struct watchdog_ops { int (*ping)(struct watchdog_device *); unsigned int (*status)(struct watchdog_device *); int (*set_timeout)(struct watchdog_device *, unsigned int); + int (*set_pretimeout)(struct watchdog_device *, unsigned int); unsigned int (*get_timeleft)(struct watchdog_device *); int (*restart)(struct watchdog_device *); void (*ref)(struct watchdog_device *) __deprecated; @@ -188,6 +191,23 @@ they are supported. These optional routines/operations are: If set_timeout is not provided but, WDIOF_SETTIMEOUT is set, the watchdog infrastructure updates the timeout value of the watchdog_device internally to the requested value. + If the pretimeout feature is used (WDIOF_PRETIMEOUT), then set_timeout must + also take care of checking if pretimeout is still valid and set up the timer + accordingly. This can't be done in the core without races, so it is the + duty of the driver. +* set_pretimeout: this routine checks and changes the pretimeout value of + the watchdog. It is optional because not all watchdogs support pretimeout + notification. The timeout value is not an absolute time, but the number of + seconds before the actual timeout would happen. It returns 0 on success, + -EINVAL for "parameter out of range" and -EIO for "could not write value to + the watchdog". A value of 0 disables pretimeout notification. + (Note: the WDIOF_PRETIMEOUT needs to be set in the options field of the + watchdog's info structure). + If the watchdog driver does not have to perform any action but setting the + watchdog_device.pretimeout, this callback can be omitted. That means if + set_pretimeout is not provided but WDIOF_PRETIMEOUT is set, the watchdog + infrastructure updates the pretimeout value of the watchdog_device internally + to the requested value. * get_timeleft: this routines returns the time that's left before a reset. * restart: this routine restarts the machine. It returns 0 on success or a negative errno code for failure. diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c index 040bf8382f46..4b381a6be2ff 100644 --- a/drivers/watchdog/watchdog_dev.c +++ b/drivers/watchdog/watchdog_dev.c @@ -335,16 +335,45 @@ static int watchdog_set_timeout(struct watchdog_device *wdd, if (watchdog_timeout_invalid(wdd, timeout)) return -EINVAL; - if (wdd->ops->set_timeout) + if (wdd->ops->set_timeout) { err = wdd->ops->set_timeout(wdd, timeout); - else + } else { wdd->timeout = timeout; + /* Disable pretimeout if it doesn't fit the new timeout */ + if (wdd->pretimeout >= wdd->timeout) + wdd->pretimeout = 0; + } watchdog_update_worker(wdd); return err; } +/* + * watchdog_set_pretimeout: set the watchdog timer pretimeout + * @wdd: the watchdog device to set the timeout for + * @timeout: pretimeout to set in seconds + */ + +static int watchdog_set_pretimeout(struct watchdog_device *wdd, + unsigned int timeout) +{ + int err = 0; + + if (!(wdd->info->options & WDIOF_PRETIMEOUT)) + return -EOPNOTSUPP; + + if (watchdog_pretimeout_invalid(wdd, timeout)) + return -EINVAL; + + if (wdd->ops->set_pretimeout) + err = wdd->ops->set_pretimeout(wdd, timeout); + else + wdd->pretimeout = timeout; + + return err; +} + /* * watchdog_get_timeleft: wrapper to get the time left before a reboot * @wdd: the watchdog device to get the remaining time from @@ -429,6 +458,15 @@ static ssize_t timeout_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RO(timeout); +static ssize_t pretimeout_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct watchdog_device *wdd = dev_get_drvdata(dev); + + return sprintf(buf, "%u\n", wdd->pretimeout); +} +static DEVICE_ATTR_RO(pretimeout); + static ssize_t identity_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -459,6 +497,9 @@ static umode_t wdt_is_visible(struct kobject *kobj, struct attribute *attr, if (attr == &dev_attr_timeleft.attr && !wdd->ops->get_timeleft) mode = 0; + else if (attr == &dev_attr_pretimeout.attr && + !(wdd->info->options & WDIOF_PRETIMEOUT)) + mode = 0; return mode; } @@ -466,6 +507,7 @@ static struct attribute *wdt_attrs[] = { &dev_attr_state.attr, &dev_attr_identity.attr, &dev_attr_timeout.attr, + &dev_attr_pretimeout.attr, &dev_attr_timeleft.attr, &dev_attr_bootstatus.attr, &dev_attr_status.attr, @@ -646,6 +688,16 @@ static long watchdog_ioctl(struct file *file, unsigned int cmd, break; err = put_user(val, p); break; + case WDIOC_SETPRETIMEOUT: + if (get_user(val, p)) { + err = -EFAULT; + break; + } + err = watchdog_set_pretimeout(wdd, val); + break; + case WDIOC_GETPRETIMEOUT: + err = put_user(wdd->pretimeout, p); + break; default: err = -ENOTTY; break; diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h index 7047bc7f8106..4035df7ec023 100644 --- a/include/linux/watchdog.h +++ b/include/linux/watchdog.h @@ -28,6 +28,7 @@ struct watchdog_core_data; * @ping: The routine that sends a keepalive ping to the watchdog device. * @status: The routine that shows the status of the watchdog device. * @set_timeout:The routine for setting the watchdog devices timeout value (in seconds). + * @set_pretimeout:The routine for setting the watchdog devices pretimeout. * @get_timeleft:The routine that gets the time left before a reset (in seconds). * @restart: The routine for restarting the machine. * @ioctl: The routines that handles extra ioctl calls. @@ -46,6 +47,7 @@ struct watchdog_ops { int (*ping)(struct watchdog_device *); unsigned int (*status)(struct watchdog_device *); int (*set_timeout)(struct watchdog_device *, unsigned int); + int (*set_pretimeout)(struct watchdog_device *, unsigned int); unsigned int (*get_timeleft)(struct watchdog_device *); int (*restart)(struct watchdog_device *, unsigned long, void *); long (*ioctl)(struct watchdog_device *, unsigned int, unsigned long); @@ -61,6 +63,7 @@ struct watchdog_ops { * @ops: Pointer to the list of watchdog operations. * @bootstatus: Status of the watchdog device at boot. * @timeout: The watchdog devices timeout value (in seconds). + * @pretimeout: The watchdog devices pre_timeout value. * @min_timeout:The watchdog devices minimum timeout value (in seconds). * @max_timeout:The watchdog devices maximum timeout value (in seconds) * as configurable from user space. Only relevant if @@ -96,6 +99,7 @@ struct watchdog_device { const struct watchdog_ops *ops; unsigned int bootstatus; unsigned int timeout; + unsigned int pretimeout; unsigned int min_timeout; unsigned int max_timeout; unsigned int min_hw_heartbeat_ms; @@ -163,6 +167,13 @@ static inline bool watchdog_timeout_invalid(struct watchdog_device *wdd, unsigne t > wdd->max_timeout); } +/* Use the following function to check if a pretimeout value is invalid */ +static inline bool watchdog_pretimeout_invalid(struct watchdog_device *wdd, + unsigned int t) +{ + return t && wdd->timeout && t >= wdd->timeout; +} + /* Use the following functions to manipulate watchdog driver specific data */ static inline void watchdog_set_drvdata(struct watchdog_device *wdd, void *data) { -- cgit v1.2.3 From b4c8eb0379c8d3300a54210edd2235fd1e81a8a6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 16 Sep 2016 16:28:26 -0400 Subject: nfs: add a new NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK constant As defined in RFC 5661, section 18.16. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- include/uapi/linux/nfs4.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h index 2b871e0858d9..4ae62796bfde 100644 --- a/include/uapi/linux/nfs4.h +++ b/include/uapi/linux/nfs4.h @@ -39,8 +39,9 @@ #define NFS4_FH_VOL_MIGRATION 0x0004 #define NFS4_FH_VOL_RENAME 0x0008 -#define NFS4_OPEN_RESULT_CONFIRM 0x0002 -#define NFS4_OPEN_RESULT_LOCKTYPE_POSIX 0x0004 +#define NFS4_OPEN_RESULT_CONFIRM 0x0002 +#define NFS4_OPEN_RESULT_LOCKTYPE_POSIX 0x0004 +#define NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK 0x0020 #define NFS4_SHARE_ACCESS_MASK 0x000F #define NFS4_SHARE_ACCESS_READ 0x0001 -- cgit v1.2.3 From 4a7069a32c99a81950de035535b0a064dcceaeba Mon Sep 17 00:00:00 2001 From: Rajendra Nayak Date: Thu, 5 May 2016 14:21:42 +0530 Subject: thermal: core: export apis to get slope and offset Add apis for platform thermal drivers to query for slope and offset attributes, which might be needed for temperature calculations. Signed-off-by: Rajendra Nayak Signed-off-by: Eduardo Valentin Signed-off-by: Zhang Rui --- Documentation/thermal/sysfs-api.txt | 12 ++++++++++++ drivers/thermal/thermal_core.c | 30 ++++++++++++++++++++++++++++++ include/linux/thermal.h | 8 ++++++++ 3 files changed, 50 insertions(+) (limited to 'include') diff --git a/Documentation/thermal/sysfs-api.txt b/Documentation/thermal/sysfs-api.txt index efc3f3d293c4..755476ea3fda 100644 --- a/Documentation/thermal/sysfs-api.txt +++ b/Documentation/thermal/sysfs-api.txt @@ -140,6 +140,18 @@ temperature) and throttle appropriate devices. Normally this function will not need to be called and the resource management code will ensure that the resource is freed. +1.1.7 int thermal_zone_get_slope(struct thermal_zone_device *tz) + + This interface is used to read the slope attribute value + for the thermal zone device, which might be useful for platform + drivers for temperature calculations. + +1.1.8 int thermal_zone_get_offset(struct thermal_zone_device *tz) + + This interface is used to read the offset attribute value + for the thermal zone device, which might be useful for platform + drivers for temperature calculations. + 1.2 thermal cooling device interface 1.2.1 struct thermal_cooling_device *thermal_cooling_device_register(char *name, void *devdata, struct thermal_cooling_device_ops *) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index e2fc6161dded..8728cc615452 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -2069,6 +2069,36 @@ exit: } EXPORT_SYMBOL_GPL(thermal_zone_get_zone_by_name); +/** + * thermal_zone_get_slope - return the slope attribute of the thermal zone + * @tz: thermal zone device with the slope attribute + * + * Return: If the thermal zone device has a slope attribute, return it, else + * return 1. + */ +int thermal_zone_get_slope(struct thermal_zone_device *tz) +{ + if (tz && tz->tzp) + return tz->tzp->slope; + return 1; +} +EXPORT_SYMBOL_GPL(thermal_zone_get_slope); + +/** + * thermal_zone_get_offset - return the offset attribute of the thermal zone + * @tz: thermal zone device with the offset attribute + * + * Return: If the thermal zone device has a offset attribute, return it, else + * return 0. + */ +int thermal_zone_get_offset(struct thermal_zone_device *tz) +{ + if (tz && tz->tzp) + return tz->tzp->offset; + return 0; +} +EXPORT_SYMBOL_GPL(thermal_zone_get_offset); + #ifdef CONFIG_NET static const struct genl_multicast_group thermal_event_mcgrps[] = { { .name = THERMAL_GENL_MCAST_GROUP_NAME, }, diff --git a/include/linux/thermal.h b/include/linux/thermal.h index ee517bef0db0..707d7353c28b 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -435,6 +435,8 @@ thermal_of_cooling_device_register(struct device_node *np, char *, void *, void thermal_cooling_device_unregister(struct thermal_cooling_device *); struct thermal_zone_device *thermal_zone_get_zone_by_name(const char *name); int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp); +int thermal_zone_get_slope(struct thermal_zone_device *tz); +int thermal_zone_get_offset(struct thermal_zone_device *tz); int get_tz_trend(struct thermal_zone_device *, int); struct thermal_instance *get_thermal_instance(struct thermal_zone_device *, @@ -492,6 +494,12 @@ static inline struct thermal_zone_device *thermal_zone_get_zone_by_name( static inline int thermal_zone_get_temp( struct thermal_zone_device *tz, int *temp) { return -ENODEV; } +static inline int thermal_zone_get_slope( + struct thermal_zone_device *tz) +{ return -ENODEV; } +static inline int thermal_zone_get_offset( + struct thermal_zone_device *tz) +{ return -ENODEV; } static inline int get_tz_trend(struct thermal_zone_device *tz, int trip) { return -ENODEV; } static inline struct thermal_instance * -- cgit v1.2.3 From 060c034a974187e930b790957cafc5047cc30a40 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Wed, 22 Jun 2016 16:42:01 +0800 Subject: thermal: Add support for hardware-tracked trip points This adds support for hardware-tracked trip points to the device tree thermal sensor framework. The framework supports an arbitrary number of trip points. Whenever the current temperature is updated, the trip points immediately below and above the current temperature are found. A .set_trips callback is then called with the temperatures. If there is no trip point above or below the current temperature, the passed trip temperature will be -INT_MAX or INT_MAX respectively. In this callback, the driver should program the hardware such that it is notified when either of these trip points are triggered. When a trip point is triggered, the driver should call `thermal_zone_device_update' for the respective thermal zone. This will cause the trip points to be updated again. If .set_trips is not implemented, the framework behaves as before. This patch is based on an earlier version from Mikko Perttunen Signed-off-by: Sascha Hauer Signed-off-by: Caesar Wang Cc: Zhang Rui Cc: Eduardo Valentin Cc: linux-pm@vger.kernel.org Reviewed-by: Javi Merino Signed-off-by: Eduardo Valentin Signed-off-by: Zhang Rui --- Documentation/thermal/sysfs-api.txt | 7 +++++ drivers/thermal/thermal_core.c | 55 +++++++++++++++++++++++++++++++++++++ include/linux/thermal.h | 10 +++++++ 3 files changed, 72 insertions(+) (limited to 'include') diff --git a/Documentation/thermal/sysfs-api.txt b/Documentation/thermal/sysfs-api.txt index 755476ea3fda..ef473dc7f55e 100644 --- a/Documentation/thermal/sysfs-api.txt +++ b/Documentation/thermal/sysfs-api.txt @@ -49,6 +49,9 @@ temperature) and throttle appropriate devices. .bind: bind the thermal zone device with a thermal cooling device. .unbind: unbind the thermal zone device with a thermal cooling device. .get_temp: get the current temperature of the thermal zone. + .set_trips: set the trip points window. Whenever the current temperature + is updated, the trip points immediately below and above the + current temperature are found. .get_mode: get the current mode (enabled/disabled) of the thermal zone. - "enabled" means the kernel thermal management is enabled. - "disabled" will prevent kernel thermal driver action upon trip points @@ -95,6 +98,10 @@ temperature) and throttle appropriate devices. get_temp: a pointer to a function that reads the sensor temperature. This is mandatory callback provided by sensor driver. + set_trips: a pointer to a function that sets a + temperature window. When this window is + left the driver must inform the thermal + core via thermal_zone_device_update. get_trend: a pointer to a function that reads the sensor temperature trend. set_emul_temp: a pointer to a function that sets diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 8728cc615452..f2d55e478b2a 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -520,6 +520,56 @@ exit: } EXPORT_SYMBOL_GPL(thermal_zone_get_temp); +void thermal_zone_set_trips(struct thermal_zone_device *tz) +{ + int low = -INT_MAX; + int high = INT_MAX; + int trip_temp, hysteresis; + int i, ret; + + mutex_lock(&tz->lock); + + if (!tz->ops->set_trips || !tz->ops->get_trip_hyst) + goto exit; + + for (i = 0; i < tz->trips; i++) { + int trip_low; + + tz->ops->get_trip_temp(tz, i, &trip_temp); + tz->ops->get_trip_hyst(tz, i, &hysteresis); + + trip_low = trip_temp - hysteresis; + + if (trip_low < tz->temperature && trip_low > low) + low = trip_low; + + if (trip_temp > tz->temperature && trip_temp < high) + high = trip_temp; + } + + /* No need to change trip points */ + if (tz->prev_low_trip == low && tz->prev_high_trip == high) + goto exit; + + tz->prev_low_trip = low; + tz->prev_high_trip = high; + + dev_dbg(&tz->device, + "new temperature boundaries: %d < x < %d\n", low, high); + + /* + * Set a temperature window. When this window is left the driver + * must inform the thermal core via thermal_zone_device_update. + */ + ret = tz->ops->set_trips(tz, low, high); + if (ret) + dev_err(&tz->device, "Failed to set trips: %d\n", ret); + +exit: + mutex_unlock(&tz->lock); +} +EXPORT_SYMBOL_GPL(thermal_zone_set_trips); + static void update_temperature(struct thermal_zone_device *tz) { int temp, ret; @@ -569,6 +619,8 @@ void thermal_zone_device_update(struct thermal_zone_device *tz) update_temperature(tz); + thermal_zone_set_trips(tz); + for (count = 0; count < tz->trips; count++) handle_thermal_trip(tz, count); } @@ -754,6 +806,9 @@ trip_point_hyst_store(struct device *dev, struct device_attribute *attr, */ ret = tz->ops->set_trip_hyst(tz, trip, temperature); + if (!ret) + thermal_zone_set_trips(tz); + return ret ? ret : count; } diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 707d7353c28b..54cdfeaaedd4 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -98,6 +98,7 @@ struct thermal_zone_device_ops { int (*unbind) (struct thermal_zone_device *, struct thermal_cooling_device *); int (*get_temp) (struct thermal_zone_device *, int *); + int (*set_trips) (struct thermal_zone_device *, int, int); int (*get_mode) (struct thermal_zone_device *, enum thermal_device_mode *); int (*set_mode) (struct thermal_zone_device *, @@ -168,6 +169,10 @@ struct thermal_attr { * @last_temperature: previous temperature read * @emul_temperature: emulated temperature when using CONFIG_THERMAL_EMULATION * @passive: 1 if you've crossed a passive trip point, 0 otherwise. + * @prev_low_trip: the low current temperature if you've crossed a passive + trip point. + * @prev_high_trip: the above current temperature if you've crossed a + passive trip point. * @forced_passive: If > 0, temperature at which to switch on all ACPI * processor cooling devices. Currently only used by the * step-wise governor. @@ -199,6 +204,8 @@ struct thermal_zone_device { int last_temperature; int emul_temperature; int passive; + int prev_low_trip; + int prev_high_trip; unsigned int forced_passive; atomic_t need_update; struct thermal_zone_device_ops *ops; @@ -426,6 +433,7 @@ int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int, int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int, struct thermal_cooling_device *); void thermal_zone_device_update(struct thermal_zone_device *); +void thermal_zone_set_trips(struct thermal_zone_device *); struct thermal_cooling_device *thermal_cooling_device_register(char *, void *, const struct thermal_cooling_device_ops *); @@ -477,6 +485,8 @@ static inline int thermal_zone_unbind_cooling_device( { return -ENODEV; } static inline void thermal_zone_device_update(struct thermal_zone_device *tz) { } +static inline void thermal_zone_set_trips(struct thermal_zone_device *tz) +{ } static inline struct thermal_cooling_device * thermal_cooling_device_register(char *type, void *devdata, const struct thermal_cooling_device_ops *ops) -- cgit v1.2.3 From 826386e73193e0b58c6d797fbbab409bc98b1d9c Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Wed, 22 Jun 2016 16:42:02 +0800 Subject: thermal: of: implement .set_trips for device tree thermal zones This patch implements .set_trips for device tree thermal zones. As the hardware-tracked trip points is supported by thermal core patch[0]. patch[0] "thermal: Add support for hardware-tracked trip points". Signed-off-by: Sascha Hauer Signed-off-by: Caesar Wang Cc: Zhang Rui Cc: Eduardo Valentin Reviewed-by: Javi Merino Signed-off-by: Eduardo Valentin Signed-off-by: Zhang Rui --- drivers/thermal/of-thermal.c | 19 +++++++++++++++++++ include/linux/thermal.h | 4 ++++ 2 files changed, 23 insertions(+) (limited to 'include') diff --git a/drivers/thermal/of-thermal.c b/drivers/thermal/of-thermal.c index b8e509c60848..2d2a06f155e2 100644 --- a/drivers/thermal/of-thermal.c +++ b/drivers/thermal/of-thermal.c @@ -101,6 +101,17 @@ static int of_thermal_get_temp(struct thermal_zone_device *tz, return data->ops->get_temp(data->sensor_data, temp); } +static int of_thermal_set_trips(struct thermal_zone_device *tz, + int low, int high) +{ + struct __thermal_zone *data = tz->devdata; + + if (!data->ops || !data->ops->set_trips) + return -EINVAL; + + return data->ops->set_trips(data->sensor_data, low, high); +} + /** * of_thermal_get_ntrips - function to export number of available trip * points. @@ -427,6 +438,14 @@ thermal_zone_of_add_sensor(struct device_node *zone, tzd->ops->get_temp = of_thermal_get_temp; tzd->ops->get_trend = of_thermal_get_trend; + + /* + * The thermal zone core will calculate the window if they have set the + * optional set_trips pointer. + */ + if (ops->set_trips) + tzd->ops->set_trips = of_thermal_set_trips; + tzd->ops->set_emul_temp = of_thermal_set_emul_temp; mutex_unlock(&tzd->lock); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 54cdfeaaedd4..20118b9ebeb7 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -340,6 +340,9 @@ struct thermal_genl_event { * * Optional: * @get_trend: a pointer to a function that reads the sensor temperature trend. + * @set_trips: a pointer to a function that sets a temperature window. When + * this window is left the driver must inform the thermal core via + * thermal_zone_device_update. * @set_emul_temp: a pointer to a function that sets sensor emulated * temperature. * @set_trip_temp: a pointer to a function that sets the trip temperature on @@ -348,6 +351,7 @@ struct thermal_genl_event { struct thermal_zone_of_device_ops { int (*get_temp)(void *, int *); int (*get_trend)(void *, long *); + int (*set_trips)(void *, int, int); int (*set_emul_temp)(void *, int); int (*set_trip_temp)(void *, int, int); }; -- cgit v1.2.3 From e78eaf45993a51e5d7120de48aa01f059ffe8d37 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Wed, 22 Jun 2016 16:42:03 +0800 Subject: thermal: streamline get_trend callbacks The .get_trend callback in struct thermal_zone_device_ops has the prototype: int (*get_trend) (struct thermal_zone_device *, int, enum thermal_trend *); whereas the .get_trend callback in struct thermal_zone_of_device_ops has: int (*get_trend)(void *, long *); Streamline both prototypes and add the trip argument to the OF callback aswell and use enum thermal_trend * instead of an integer pointer. While the OF prototype may be the better one, this should be decided at framework level and not on OF level. Signed-off-by: Sascha Hauer Signed-off-by: Caesar Wang Cc: Zhang Rui Cc: Eduardo Valentin Cc: linux-pm@vger.kernel.org Reviewed-by: Keerthy Signed-off-by: Eduardo Valentin Signed-off-by: Zhang Rui --- drivers/thermal/of-thermal.c | 16 +------------- drivers/thermal/qcom/tsens.c | 6 +++--- drivers/thermal/qcom/tsens.h | 4 +++- drivers/thermal/ti-soc-thermal/ti-thermal-common.c | 25 +++++++--------------- include/linux/thermal.h | 2 +- 5 files changed, 16 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/drivers/thermal/of-thermal.c b/drivers/thermal/of-thermal.c index 2d2a06f155e2..20822abc6682 100644 --- a/drivers/thermal/of-thermal.c +++ b/drivers/thermal/of-thermal.c @@ -202,25 +202,11 @@ static int of_thermal_get_trend(struct thermal_zone_device *tz, int trip, enum thermal_trend *trend) { struct __thermal_zone *data = tz->devdata; - long dev_trend; - int r; if (!data->ops->get_trend) return -EINVAL; - r = data->ops->get_trend(data->sensor_data, &dev_trend); - if (r) - return r; - - /* TODO: These intervals might have some thresholds, but in core code */ - if (dev_trend > 0) - *trend = THERMAL_TREND_RAISING; - else if (dev_trend < 0) - *trend = THERMAL_TREND_DROPPING; - else - *trend = THERMAL_TREND_STABLE; - - return 0; + return data->ops->get_trend(data->sensor_data, trip, trend); } static int of_thermal_bind(struct thermal_zone_device *thermal, diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c index b18227269286..446f70b5dbb2 100644 --- a/drivers/thermal/qcom/tsens.c +++ b/drivers/thermal/qcom/tsens.c @@ -29,13 +29,13 @@ static int tsens_get_temp(void *data, int *temp) return tmdev->ops->get_temp(tmdev, s->id, temp); } -static int tsens_get_trend(void *data, long *temp) +static int tsens_get_trend(void *p, int trip, enum thermal_trend *trend) { - const struct tsens_sensor *s = data; + const struct tsens_sensor *s = p; struct tsens_device *tmdev = s->tmdev; if (tmdev->ops->get_trend) - return tmdev->ops->get_trend(tmdev, s->id, temp); + return tmdev->ops->get_trend(tmdev, s->id, trend); return -ENOTSUPP; } diff --git a/drivers/thermal/qcom/tsens.h b/drivers/thermal/qcom/tsens.h index b0f8c47ff9e2..911c1978892b 100644 --- a/drivers/thermal/qcom/tsens.h +++ b/drivers/thermal/qcom/tsens.h @@ -17,6 +17,8 @@ #define ONE_PT_CALIB2 0x2 #define TWO_PT_CALIB 0x3 +#include + struct tsens_device; struct tsens_sensor { @@ -50,7 +52,7 @@ struct tsens_ops { void (*disable)(struct tsens_device *); int (*suspend)(struct tsens_device *); int (*resume)(struct tsens_device *); - int (*get_trend)(struct tsens_device *, int, long *); + int (*get_trend)(struct tsens_device *, int, enum thermal_trend *); }; /** diff --git a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c index 15c0a9ac2209..4a6757ca78f0 100644 --- a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c +++ b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c @@ -239,7 +239,7 @@ static int ti_thermal_get_trip_temp(struct thermal_zone_device *thermal, return 0; } -static int __ti_thermal_get_trend(void *p, long *trend) +static int __ti_thermal_get_trend(void *p, int trip, enum thermal_trend *trend) { struct ti_thermal_data *data = p; struct ti_bandgap *bgp; @@ -252,22 +252,6 @@ static int __ti_thermal_get_trend(void *p, long *trend) if (ret) return ret; - *trend = tr; - - return 0; -} - -/* Get the temperature trend callback functions for thermal zone */ -static int ti_thermal_get_trend(struct thermal_zone_device *thermal, - int trip, enum thermal_trend *trend) -{ - int ret; - long tr; - - ret = __ti_thermal_get_trend(thermal->devdata, &tr); - if (ret) - return ret; - if (tr > 0) *trend = THERMAL_TREND_RAISING; else if (tr < 0) @@ -278,6 +262,13 @@ static int ti_thermal_get_trend(struct thermal_zone_device *thermal, return 0; } +/* Get the temperature trend callback functions for thermal zone */ +static int ti_thermal_get_trend(struct thermal_zone_device *thermal, + int trip, enum thermal_trend *trend) +{ + return __ti_thermal_get_trend(thermal->devdata, trip, trend); +} + /* Get critical temperature callback functions for thermal zone */ static int ti_thermal_get_crit_temp(struct thermal_zone_device *thermal, int *temp) diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 20118b9ebeb7..b3c16f06fdc4 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -350,7 +350,7 @@ struct thermal_genl_event { */ struct thermal_zone_of_device_ops { int (*get_temp)(void *, int *); - int (*get_trend)(void *, long *); + int (*get_trend)(void *, int, enum thermal_trend *); int (*set_trips)(void *, int, int); int (*set_emul_temp)(void *, int); int (*set_trip_temp)(void *, int, int); -- cgit v1.2.3 From 6c7c324570847a459c21e7298bc5c92a40577103 Mon Sep 17 00:00:00 2001 From: Wei Ni Date: Wed, 11 May 2016 18:20:18 +0800 Subject: thermal: tegra: add hw-throttle for Tegra132 Tegra132 use CCROC throttle registers to configure pulse skiper, set these registers to enable throttle function for Tegra132. Signed-off-by: Wei Ni Signed-off-by: Zhang Rui --- drivers/thermal/tegra/soctherm.c | 234 +++++++++++++++++++++--- drivers/thermal/tegra/tegra132-soctherm.c | 17 ++ include/dt-bindings/thermal/tegra124-soctherm.h | 5 + 3 files changed, 226 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/drivers/thermal/tegra/soctherm.c b/drivers/thermal/tegra/soctherm.c index dc75f92e1d8e..7d2db23d71a3 100644 --- a/drivers/thermal/tegra/soctherm.c +++ b/drivers/thermal/tegra/soctherm.c @@ -136,6 +136,21 @@ #define CAR_SUPER_CCLKG_DIVIDER 0x36c #define CDIVG_USE_THERM_CONTROLS_MASK BIT(30) +/* ccroc register offsets needed for enabling HW throttling for Tegra132 */ +#define CCROC_SUPER_CCLKG_DIVIDER 0x024 + +#define CCROC_GLOBAL_CFG 0x148 + +#define CCROC_THROT_PSKIP_RAMP_CPU 0x150 +#define CCROC_THROT_PSKIP_RAMP_SEQ_BYPASS_MODE_MASK BIT(31) +#define CCROC_THROT_PSKIP_RAMP_DURATION_MASK (0xffff << 8) +#define CCROC_THROT_PSKIP_RAMP_STEP_MASK 0xff + +#define CCROC_THROT_PSKIP_CTRL_CPU 0x154 +#define CCROC_THROT_PSKIP_CTRL_ENB_MASK BIT(31) +#define CCROC_THROT_PSKIP_CTRL_DIVIDEND_MASK (0xff << 8) +#define CCROC_THROT_PSKIP_CTRL_DIVISOR_MASK 0xff + /* get val from register(r) mask bits(m) */ #define REG_GET_MASK(r, m) (((r) & (m)) >> (ffs(m) - 1)) /* set val(v) to mask bits(m) of register(r) */ @@ -158,6 +173,13 @@ #define THROT_DELAY_CTRL(throt) (THROT_DELAY_LITE + \ (THROT_OFFSET * throt)) +/* get CCROC_THROT_PSKIP_xxx offset per HIGH/MED/LOW vect*/ +#define CCROC_THROT_OFFSET 0x0c +#define CCROC_THROT_PSKIP_CTRL_CPU_REG(vect) (CCROC_THROT_PSKIP_CTRL_CPU + \ + (CCROC_THROT_OFFSET * vect)) +#define CCROC_THROT_PSKIP_RAMP_CPU_REG(vect) (CCROC_THROT_PSKIP_RAMP_CPU + \ + (CCROC_THROT_OFFSET * vect)) + /* get THERMCTL_LEVELx offset per CPU/GPU/MEM/TSENSE rg and LEVEL0~3 lv */ #define THERMCTL_LVL_REGS_SIZE 0x20 #define THERMCTL_LVL_REG(rg, lv) ((rg) + ((lv) * THERMCTL_LVL_REGS_SIZE)) @@ -195,6 +217,7 @@ struct soctherm_throt_cfg { const char *name; unsigned int id; u8 priority; + u8 cpu_throt_level; u32 cpu_throt_depth; struct thermal_cooling_device *cdev; bool init; @@ -206,6 +229,7 @@ struct tegra_soctherm { struct clk *clock_soctherm; void __iomem *regs; void __iomem *clk_regs; + void __iomem *ccroc_regs; u32 *calib; struct thermal_zone_device **thermctl_tzs; @@ -241,6 +265,31 @@ static inline u32 clk_readl(struct tegra_soctherm *ts, u32 reg) return readl(ts->clk_regs + reg); } +/** + * ccroc_writel() - writes a value to a CCROC register + * @ts: pointer to a struct tegra_soctherm + * @v: the value to write + * @reg: the register offset + * + * Writes @v to @reg. No return value. + */ +static inline void ccroc_writel(struct tegra_soctherm *ts, u32 value, u32 reg) +{ + writel(value, (ts->ccroc_regs + reg)); +} + +/** + * ccroc_readl() - reads specified register from CCROC IP block + * @ts: pointer to a struct tegra_soctherm + * @reg: register address to be read + * + * Return: the value of the register + */ +static inline u32 ccroc_readl(struct tegra_soctherm *ts, u32 reg) +{ + return readl(ts->ccroc_regs + reg); +} + static void enable_tsensor(struct tegra_soctherm *tegra, unsigned int i) { const struct tegra_tsensor *sensor = &tegra->soc->tsensors[i]; @@ -552,9 +601,6 @@ static int tegra_soctherm_set_hwtrips(struct device *dev, sg->name, temperature); set_throttle: - if (ts->soc->use_ccroc) - return 0; - ret = get_hot_temp(tz, &trip, &temperature); if (ret) { dev_warn(dev, "throttrip: %s: missing hot temperature\n", @@ -676,9 +722,6 @@ static int regs_show(struct seq_file *s, void *data) state = REG_GET_MASK(r, SENSOR_TEMP2_MEM_TEMP_MASK); seq_printf(s, " MEM(%d)\n", translate_temp(state)); - if (ts->soc->use_ccroc) - return 0; - for (i = 0; i < ts->soc->num_ttgs; i++) { seq_printf(s, "%s:\n", ttgs[i]->name); for (level = 0; level < 4; level++) { @@ -779,12 +822,17 @@ static int regs_show(struct seq_file *s, void *data) seq_printf(s, "enabled(%d)\n", state); r = readl(ts->regs + CPU_PSKIP_STATUS); - state = REG_GET_MASK(r, XPU_PSKIP_STATUS_M_MASK); - seq_printf(s, "CPU PSKIP STATUS: M(%d) ", state); - state = REG_GET_MASK(r, XPU_PSKIP_STATUS_N_MASK); - seq_printf(s, "N(%d) ", state); - state = REG_GET_MASK(r, XPU_PSKIP_STATUS_ENABLED_MASK); - seq_printf(s, "enabled(%d)\n", state); + if (ts->soc->use_ccroc) { + state = REG_GET_MASK(r, XPU_PSKIP_STATUS_ENABLED_MASK); + seq_printf(s, "CPU PSKIP STATUS: enabled(%d)\n", state); + } else { + state = REG_GET_MASK(r, XPU_PSKIP_STATUS_M_MASK); + seq_printf(s, "CPU PSKIP STATUS: M(%d) ", state); + state = REG_GET_MASK(r, XPU_PSKIP_STATUS_N_MASK); + seq_printf(s, "N(%d) ", state); + state = REG_GET_MASK(r, XPU_PSKIP_STATUS_ENABLED_MASK); + seq_printf(s, "enabled(%d)\n", state); + } return 0; } @@ -939,15 +987,29 @@ static void soctherm_init_hw_throt_cdev(struct platform_device *pdev) } stc->priority = val; - r = of_property_read_u32(np_stcc, "nvidia,cpu-throt-percent", - &val); - if (r) { - dev_info(dev, - "throttle-cfg: %s: missing cpu-throt-percent\n", - name); - continue; + if (ts->soc->use_ccroc) { + r = of_property_read_u32(np_stcc, + "nvidia,cpu-throt-level", + &val); + if (r) { + dev_info(dev, + "throttle-cfg: %s: missing cpu-throt-level\n", + name); + continue; + } + stc->cpu_throt_level = val; + } else { + r = of_property_read_u32(np_stcc, + "nvidia,cpu-throt-percent", + &val); + if (r) { + dev_info(dev, + "throttle-cfg: %s: missing cpu-throt-percent\n", + name); + continue; + } + stc->cpu_throt_depth = val; } - stc->cpu_throt_depth = val; tcd = thermal_of_cooling_device_register(np_stcc, (char *)name, ts, @@ -967,6 +1029,96 @@ static void soctherm_init_hw_throt_cdev(struct platform_device *pdev) of_node_put(np_stc); } +/** + * throttlectl_cpu_level_cfg() - programs CCROC NV_THERM level config + * @level: describing the level LOW/MED/HIGH of throttling + * + * It's necessary to set up the CPU-local CCROC NV_THERM instance with + * the M/N values desired for each level. This function does this. + * + * This function pre-programs the CCROC NV_THERM levels in terms of + * pre-configured "Low", "Medium" or "Heavy" throttle levels which are + * mapped to THROT_LEVEL_LOW, THROT_LEVEL_MED and THROT_LEVEL_HVY. + */ +static void throttlectl_cpu_level_cfg(struct tegra_soctherm *ts, int level) +{ + u8 depth, dividend; + u32 r; + + switch (level) { + case TEGRA_SOCTHERM_THROT_LEVEL_LOW: + depth = 50; + break; + case TEGRA_SOCTHERM_THROT_LEVEL_MED: + depth = 75; + break; + case TEGRA_SOCTHERM_THROT_LEVEL_HIGH: + depth = 80; + break; + case TEGRA_SOCTHERM_THROT_LEVEL_NONE: + return; + default: + return; + } + + dividend = THROT_DEPTH_DIVIDEND(depth); + + /* setup PSKIP in ccroc nv_therm registers */ + r = ccroc_readl(ts, CCROC_THROT_PSKIP_RAMP_CPU_REG(level)); + r = REG_SET_MASK(r, CCROC_THROT_PSKIP_RAMP_DURATION_MASK, 0xff); + r = REG_SET_MASK(r, CCROC_THROT_PSKIP_RAMP_STEP_MASK, 0xf); + ccroc_writel(ts, r, CCROC_THROT_PSKIP_RAMP_CPU_REG(level)); + + r = ccroc_readl(ts, CCROC_THROT_PSKIP_CTRL_CPU_REG(level)); + r = REG_SET_MASK(r, CCROC_THROT_PSKIP_CTRL_ENB_MASK, 1); + r = REG_SET_MASK(r, CCROC_THROT_PSKIP_CTRL_DIVIDEND_MASK, dividend); + r = REG_SET_MASK(r, CCROC_THROT_PSKIP_CTRL_DIVISOR_MASK, 0xff); + ccroc_writel(ts, r, CCROC_THROT_PSKIP_CTRL_CPU_REG(level)); +} + +/** + * throttlectl_cpu_level_select() - program CPU pulse skipper config + * @throt: the LIGHT/HEAVY of throttle event id + * + * Pulse skippers are used to throttle clock frequencies. This + * function programs the pulse skippers based on @throt and platform + * data. This function is used on SoCs which have CPU-local pulse + * skipper control, such as T13x. It programs soctherm's interface to + * Denver:CCROC NV_THERM in terms of Low, Medium and HIGH throttling + * vectors. PSKIP_BYPASS mode is set as required per HW spec. + */ +static void throttlectl_cpu_level_select(struct tegra_soctherm *ts, + enum soctherm_throttle_id throt) +{ + u32 r, throt_vect; + + /* Denver:CCROC NV_THERM interface N:3 Mapping */ + switch (ts->throt_cfgs[throt].cpu_throt_level) { + case TEGRA_SOCTHERM_THROT_LEVEL_LOW: + throt_vect = THROT_VECT_LOW; + break; + case TEGRA_SOCTHERM_THROT_LEVEL_MED: + throt_vect = THROT_VECT_MED; + break; + case TEGRA_SOCTHERM_THROT_LEVEL_HIGH: + throt_vect = THROT_VECT_HIGH; + break; + default: + throt_vect = THROT_VECT_NONE; + break; + } + + r = readl(ts->regs + THROT_PSKIP_CTRL(throt, THROTTLE_DEV_CPU)); + r = REG_SET_MASK(r, THROT_PSKIP_CTRL_ENABLE_MASK, 1); + r = REG_SET_MASK(r, THROT_PSKIP_CTRL_VECT_CPU_MASK, throt_vect); + r = REG_SET_MASK(r, THROT_PSKIP_CTRL_VECT2_CPU_MASK, throt_vect); + writel(r, ts->regs + THROT_PSKIP_CTRL(throt, THROTTLE_DEV_CPU)); + + /* bypass sequencer in soc_therm as it is programmed in ccroc */ + r = REG_SET_MASK(0, THROT_PSKIP_RAMP_SEQ_BYPASS_MODE_MASK, 1); + writel(r, ts->regs + THROT_PSKIP_RAMP(throt, THROTTLE_DEV_CPU)); +} + /** * throttlectl_cpu_mn() - program CPU pulse skipper configuration * @throt: the LIGHT/HEAVY of throttle event id @@ -1017,7 +1169,10 @@ static void soctherm_throttle_program(struct tegra_soctherm *ts, return; /* Setup PSKIP parameters */ - throttlectl_cpu_mn(ts, throt); + if (ts->soc->use_ccroc) + throttlectl_cpu_level_select(ts, throt); + else + throttlectl_cpu_mn(ts, throt); r = REG_SET_MASK(0, THROT_PRIORITY_LITE_PRIO_MASK, stc.priority); writel(r, ts->regs + THROT_PRIORITY_CTRL(throt)); @@ -1040,16 +1195,31 @@ static void tegra_soctherm_throttle(struct device *dev) u32 v; int i; + /* configure LOW, MED and HIGH levels for CCROC NV_THERM */ + if (ts->soc->use_ccroc) { + throttlectl_cpu_level_cfg(ts, TEGRA_SOCTHERM_THROT_LEVEL_LOW); + throttlectl_cpu_level_cfg(ts, TEGRA_SOCTHERM_THROT_LEVEL_MED); + throttlectl_cpu_level_cfg(ts, TEGRA_SOCTHERM_THROT_LEVEL_HIGH); + } + /* Thermal HW throttle programming */ for (i = 0; i < THROTTLE_SIZE; i++) soctherm_throttle_program(ts, i); v = REG_SET_MASK(0, THROT_GLOBAL_ENB_MASK, 1); - writel(v, ts->regs + THROT_GLOBAL_CFG); + if (ts->soc->use_ccroc) { + ccroc_writel(ts, v, CCROC_GLOBAL_CFG); - v = clk_readl(ts, CAR_SUPER_CCLKG_DIVIDER); - v = REG_SET_MASK(v, CDIVG_USE_THERM_CONTROLS_MASK, 1); - clk_writel(ts, v, CAR_SUPER_CCLKG_DIVIDER); + v = ccroc_readl(ts, CCROC_SUPER_CCLKG_DIVIDER); + v = REG_SET_MASK(v, CDIVG_USE_THERM_CONTROLS_MASK, 1); + ccroc_writel(ts, v, CCROC_SUPER_CCLKG_DIVIDER); + } else { + writel(v, ts->regs + THROT_GLOBAL_CFG); + + v = clk_readl(ts, CAR_SUPER_CCLKG_DIVIDER); + v = REG_SET_MASK(v, CDIVG_USE_THERM_CONTROLS_MASK, 1); + clk_writel(ts, v, CAR_SUPER_CCLKG_DIVIDER); + } /* initialize stats collection */ v = STATS_CTL_CLR_DN | STATS_CTL_EN_DN | @@ -1084,9 +1254,6 @@ static void soctherm_init(struct platform_device *pdev) writel(pdiv, tegra->regs + SENSOR_PDIV); writel(hotspot, tegra->regs + SENSOR_HOTSPOT_OFF); - if (tegra->soc->use_ccroc) - return; - /* Configure hw throttle */ tegra_soctherm_throttle(&pdev->dev); } @@ -1157,6 +1324,14 @@ static int tegra_soctherm_probe(struct platform_device *pdev) dev_err(&pdev->dev, "can't get car clk registers"); return PTR_ERR(tegra->clk_regs); } + } else { + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, + "ccroc-reg"); + tegra->ccroc_regs = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(tegra->ccroc_regs)) { + dev_err(&pdev->dev, "can't get ccroc registers"); + return PTR_ERR(tegra->ccroc_regs); + } } tegra->reset = devm_reset_control_get(&pdev->dev, "soctherm"); @@ -1207,8 +1382,7 @@ static int tegra_soctherm_probe(struct platform_device *pdev) if (err) return err; - if (!tegra->soc->use_ccroc) - soctherm_init_hw_throt_cdev(pdev); + soctherm_init_hw_throt_cdev(pdev); soctherm_init(pdev); diff --git a/drivers/thermal/tegra/tegra132-soctherm.c b/drivers/thermal/tegra/tegra132-soctherm.c index ae5d61af2df3..97fa30501eb1 100644 --- a/drivers/thermal/tegra/tegra132-soctherm.c +++ b/drivers/thermal/tegra/tegra132-soctherm.c @@ -28,7 +28,11 @@ #define TEGRA132_THERMTRIP_CPU_THRESH_MASK (0xff << 8) #define TEGRA132_THERMTRIP_TSENSE_THRESH_MASK 0xff +#define TEGRA132_THERMCTL_LVL0_UP_THRESH_MASK (0xff << 17) +#define TEGRA132_THERMCTL_LVL0_DN_THRESH_MASK (0xff << 9) + #define TEGRA132_THRESH_GRAIN 1000 +#define TEGRA132_BPTT 8 static const struct tegra_tsensor_configuration tegra132_tsensor_config = { .tall = 16300, @@ -51,6 +55,9 @@ static const struct tegra_tsensor_group tegra132_tsensor_group_cpu = { .thermtrip_any_en_mask = TEGRA132_THERMTRIP_ANY_EN_MASK, .thermtrip_enable_mask = TEGRA132_THERMTRIP_CPU_EN_MASK, .thermtrip_threshold_mask = TEGRA132_THERMTRIP_CPU_THRESH_MASK, + .thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_CPU, + .thermctl_lvl0_up_thresh_mask = TEGRA132_THERMCTL_LVL0_UP_THRESH_MASK, + .thermctl_lvl0_dn_thresh_mask = TEGRA132_THERMCTL_LVL0_DN_THRESH_MASK, }; static const struct tegra_tsensor_group tegra132_tsensor_group_gpu = { @@ -66,6 +73,9 @@ static const struct tegra_tsensor_group tegra132_tsensor_group_gpu = { .thermtrip_any_en_mask = TEGRA132_THERMTRIP_ANY_EN_MASK, .thermtrip_enable_mask = TEGRA132_THERMTRIP_GPU_EN_MASK, .thermtrip_threshold_mask = TEGRA132_THERMTRIP_GPUMEM_THRESH_MASK, + .thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_GPU, + .thermctl_lvl0_up_thresh_mask = TEGRA132_THERMCTL_LVL0_UP_THRESH_MASK, + .thermctl_lvl0_dn_thresh_mask = TEGRA132_THERMCTL_LVL0_DN_THRESH_MASK, }; static const struct tegra_tsensor_group tegra132_tsensor_group_pll = { @@ -79,6 +89,9 @@ static const struct tegra_tsensor_group tegra132_tsensor_group_pll = { .thermtrip_any_en_mask = TEGRA132_THERMTRIP_ANY_EN_MASK, .thermtrip_enable_mask = TEGRA132_THERMTRIP_TSENSE_EN_MASK, .thermtrip_threshold_mask = TEGRA132_THERMTRIP_TSENSE_THRESH_MASK, + .thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_TSENSE, + .thermctl_lvl0_up_thresh_mask = TEGRA132_THERMCTL_LVL0_UP_THRESH_MASK, + .thermctl_lvl0_dn_thresh_mask = TEGRA132_THERMCTL_LVL0_DN_THRESH_MASK, }; static const struct tegra_tsensor_group tegra132_tsensor_group_mem = { @@ -94,6 +107,9 @@ static const struct tegra_tsensor_group tegra132_tsensor_group_mem = { .thermtrip_any_en_mask = TEGRA132_THERMTRIP_ANY_EN_MASK, .thermtrip_enable_mask = TEGRA132_THERMTRIP_MEM_EN_MASK, .thermtrip_threshold_mask = TEGRA132_THERMTRIP_GPUMEM_THRESH_MASK, + .thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_MEM, + .thermctl_lvl0_up_thresh_mask = TEGRA132_THERMCTL_LVL0_UP_THRESH_MASK, + .thermctl_lvl0_dn_thresh_mask = TEGRA132_THERMCTL_LVL0_DN_THRESH_MASK, }; static const struct tegra_tsensor_group *tegra132_tsensor_groups[] = { @@ -193,5 +209,6 @@ const struct tegra_soctherm_soc tegra132_soctherm = { .num_ttgs = ARRAY_SIZE(tegra132_tsensor_groups), .tfuse = &tegra132_soctherm_fuse, .thresh_grain = TEGRA132_THRESH_GRAIN, + .bptt = TEGRA132_BPTT, .use_ccroc = true, }; diff --git a/include/dt-bindings/thermal/tegra124-soctherm.h b/include/dt-bindings/thermal/tegra124-soctherm.h index 729ab9fc325e..2a99f1d52bb5 100644 --- a/include/dt-bindings/thermal/tegra124-soctherm.h +++ b/include/dt-bindings/thermal/tegra124-soctherm.h @@ -11,4 +11,9 @@ #define TEGRA124_SOCTHERM_SENSOR_PLLX 3 #define TEGRA124_SOCTHERM_SENSOR_NUM 4 +#define TEGRA_SOCTHERM_THROT_LEVEL_LOW 0 +#define TEGRA_SOCTHERM_THROT_LEVEL_MED 1 +#define TEGRA_SOCTHERM_THROT_LEVEL_HIGH 2 +#define TEGRA_SOCTHERM_THROT_LEVEL_NONE -1 + #endif -- cgit v1.2.3 From 0e70f466fb910ae54c4c71243b99385129e93feb Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 26 Aug 2016 16:21:16 -0700 Subject: thermal: Enhance thermal_zone_device_update for events Added one additional parameter to thermal_zone_device_update() to provide caller with an optional capability to specify reason. Currently this event is used by user space governor to trigger different processing based on event code. Also it saves an additional call to read temperature when the event is received. The following events are cuurently defined: - Unspecified event - New temperature sample - Trip point violated - Trip point changed - thermal device up and down - thermal device power capability changed Signed-off-by: Srinivas Pandruvada Signed-off-by: Zhang Rui --- drivers/acpi/thermal.c | 3 ++- drivers/platform/x86/acerhdf.c | 2 +- drivers/regulator/max8973-regulator.c | 3 ++- drivers/thermal/db8500_thermal.c | 2 +- drivers/thermal/hisi_thermal.c | 3 ++- drivers/thermal/imx_thermal.c | 4 ++-- .../thermal/int340x_thermal/int340x_thermal_zone.h | 2 +- drivers/thermal/intel_bxt_pmic_thermal.c | 3 ++- drivers/thermal/intel_soc_dts_iosf.c | 3 ++- drivers/thermal/max77620_thermal.c | 3 ++- drivers/thermal/of-thermal.c | 2 +- drivers/thermal/qcom-spmi-temp-alarm.c | 2 +- drivers/thermal/rcar_thermal.c | 3 ++- drivers/thermal/rockchip_thermal.c | 3 ++- drivers/thermal/samsung/exynos_tmu.c | 2 +- drivers/thermal/st/st_thermal_memmap.c | 3 ++- drivers/thermal/thermal_core.c | 21 +++++++++++++-------- drivers/thermal/ti-soc-thermal/ti-thermal-common.c | 4 ++-- drivers/thermal/x86_pkg_temp_thermal.c | 3 ++- include/linux/thermal.h | 19 +++++++++++++++++-- 20 files changed, 60 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index f4ebe39539af..35e8fbca10ad 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -520,7 +520,8 @@ static void acpi_thermal_check(void *data) if (!tz->tz_enabled) return; - thermal_zone_device_update(tz->thermal_zone); + thermal_zone_device_update(tz->thermal_zone, + THERMAL_EVENT_UNSPECIFIED); } /* sys I/F for generic thermal sysfs support */ diff --git a/drivers/platform/x86/acerhdf.c b/drivers/platform/x86/acerhdf.c index 460fa6708bfc..2acdb0d6ea89 100644 --- a/drivers/platform/x86/acerhdf.c +++ b/drivers/platform/x86/acerhdf.c @@ -405,7 +405,7 @@ static inline void acerhdf_enable_kernelmode(void) kernelmode = 1; thz_dev->polling_delay = interval*1000; - thermal_zone_device_update(thz_dev); + thermal_zone_device_update(thz_dev, THERMAL_EVENT_UNSPECIFIED); pr_notice("kernel mode fan control ON\n"); } diff --git a/drivers/regulator/max8973-regulator.c b/drivers/regulator/max8973-regulator.c index 3958f50c5975..e0c747aa9f85 100644 --- a/drivers/regulator/max8973-regulator.c +++ b/drivers/regulator/max8973-regulator.c @@ -495,7 +495,8 @@ static irqreturn_t max8973_thermal_irq(int irq, void *data) { struct max8973_chip *mchip = data; - thermal_zone_device_update(mchip->tz_device); + thermal_zone_device_update(mchip->tz_device, + THERMAL_EVENT_UNSPECIFIED); return IRQ_HANDLED; } diff --git a/drivers/thermal/db8500_thermal.c b/drivers/thermal/db8500_thermal.c index 652acd8fbe48..e776cea80cfc 100644 --- a/drivers/thermal/db8500_thermal.c +++ b/drivers/thermal/db8500_thermal.c @@ -306,7 +306,7 @@ static void db8500_thermal_work(struct work_struct *work) if (cur_mode == THERMAL_DEVICE_DISABLED) return; - thermal_zone_device_update(pzone->therm_dev); + thermal_zone_device_update(pzone->therm_dev, THERMAL_EVENT_UNSPECIFIED); dev_dbg(&pzone->therm_dev->device, "thermal work finished.\n"); } diff --git a/drivers/thermal/hisi_thermal.c b/drivers/thermal/hisi_thermal.c index 97fad8f51e1c..f6429666a1cf 100644 --- a/drivers/thermal/hisi_thermal.c +++ b/drivers/thermal/hisi_thermal.c @@ -237,7 +237,8 @@ static irqreturn_t hisi_thermal_alarm_irq_thread(int irq, void *dev) if (!data->sensors[i].tzd) continue; - thermal_zone_device_update(data->sensors[i].tzd); + thermal_zone_device_update(data->sensors[i].tzd, + THERMAL_EVENT_UNSPECIFIED); } return IRQ_HANDLED; diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c index e473548b5d28..06912f0602b7 100644 --- a/drivers/thermal/imx_thermal.c +++ b/drivers/thermal/imx_thermal.c @@ -246,7 +246,7 @@ static int imx_set_mode(struct thermal_zone_device *tz, } data->mode = mode; - thermal_zone_device_update(tz); + thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); return 0; } @@ -457,7 +457,7 @@ static irqreturn_t imx_thermal_alarm_irq_thread(int irq, void *dev) dev_dbg(&data->tz->device, "THERMAL ALARM: T > %d\n", data->alarm_temp / 1000); - thermal_zone_device_update(data->tz); + thermal_zone_device_update(data->tz, THERMAL_EVENT_UNSPECIFIED); return IRQ_HANDLED; } diff --git a/drivers/thermal/int340x_thermal/int340x_thermal_zone.h b/drivers/thermal/int340x_thermal/int340x_thermal_zone.h index aaadf724ff2e..65116b1f19f1 100644 --- a/drivers/thermal/int340x_thermal/int340x_thermal_zone.h +++ b/drivers/thermal/int340x_thermal/int340x_thermal_zone.h @@ -62,7 +62,7 @@ static inline void *int340x_thermal_zone_get_priv_data( static inline void int340x_thermal_zone_device_update( struct int34x_thermal_zone *tzone) { - thermal_zone_device_update(tzone->zone); + thermal_zone_device_update(tzone->zone, THERMAL_EVENT_UNSPECIFIED); } #endif diff --git a/drivers/thermal/intel_bxt_pmic_thermal.c b/drivers/thermal/intel_bxt_pmic_thermal.c index 4ae3e0c1576a..0f19a393ddd8 100644 --- a/drivers/thermal/intel_bxt_pmic_thermal.c +++ b/drivers/thermal/intel_bxt_pmic_thermal.c @@ -204,7 +204,8 @@ static irqreturn_t pmic_thermal_irq_handler(int irq, void *data) trip = td->maps[i].trip_config[j].trip_num; tzd = thermal_zone_get_zone_by_name(td->maps[i].handle); if (!IS_ERR(tzd)) - thermal_zone_device_update(tzd); + thermal_zone_device_update(tzd, + THERMAL_EVENT_UNSPECIFIED); /* Clear the appropriate irq */ regmap_write(regmap, reg, reg_val & mask); diff --git a/drivers/thermal/intel_soc_dts_iosf.c b/drivers/thermal/intel_soc_dts_iosf.c index f72e1db3216f..e0813dfaa278 100644 --- a/drivers/thermal/intel_soc_dts_iosf.c +++ b/drivers/thermal/intel_soc_dts_iosf.c @@ -391,7 +391,8 @@ void intel_soc_dts_iosf_interrupt_handler(struct intel_soc_dts_sensors *sensors) for (i = 0; i < SOC_MAX_DTS_SENSORS; ++i) { pr_debug("TZD update for zone %d\n", i); - thermal_zone_device_update(sensors->soc_dts[i].tzone); + thermal_zone_device_update(sensors->soc_dts[i].tzone, + THERMAL_EVENT_UNSPECIFIED); } } else spin_unlock_irqrestore(&sensors->intr_notify_lock, flags); diff --git a/drivers/thermal/max77620_thermal.c b/drivers/thermal/max77620_thermal.c index 8d6162676f68..83905ff46e40 100644 --- a/drivers/thermal/max77620_thermal.c +++ b/drivers/thermal/max77620_thermal.c @@ -82,7 +82,8 @@ static irqreturn_t max77620_thermal_irq(int irq, void *data) else if (irq == mtherm->irq_tjalarm2) dev_crit(mtherm->dev, "Junction Temp Alarm2(140C) occurred\n"); - thermal_zone_device_update(mtherm->tz_device); + thermal_zone_device_update(mtherm->tz_device, + THERMAL_EVENT_UNSPECIFIED); return IRQ_HANDLED; } diff --git a/drivers/thermal/of-thermal.c b/drivers/thermal/of-thermal.c index ab6e5260a901..d04ec3b9e5ff 100644 --- a/drivers/thermal/of-thermal.c +++ b/drivers/thermal/of-thermal.c @@ -286,7 +286,7 @@ static int of_thermal_set_mode(struct thermal_zone_device *tz, mutex_unlock(&tz->lock); data->mode = mode; - thermal_zone_device_update(tz); + thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); return 0; } diff --git a/drivers/thermal/qcom-spmi-temp-alarm.c b/drivers/thermal/qcom-spmi-temp-alarm.c index f8a3c60bef94..819c6d5d7aa7 100644 --- a/drivers/thermal/qcom-spmi-temp-alarm.c +++ b/drivers/thermal/qcom-spmi-temp-alarm.c @@ -150,7 +150,7 @@ static irqreturn_t qpnp_tm_isr(int irq, void *data) { struct qpnp_tm_chip *chip = data; - thermal_zone_device_update(chip->tz_dev); + thermal_zone_device_update(chip->tz_dev, THERMAL_EVENT_UNSPECIFIED); return IRQ_HANDLED; } diff --git a/drivers/thermal/rcar_thermal.c b/drivers/thermal/rcar_thermal.c index b5c6442d82d6..6c73d3ecf33b 100644 --- a/drivers/thermal/rcar_thermal.c +++ b/drivers/thermal/rcar_thermal.c @@ -358,7 +358,8 @@ static void rcar_thermal_work(struct work_struct *work) return; if (nctemp != cctemp) - thermal_zone_device_update(priv->zone); + thermal_zone_device_update(priv->zone, + THERMAL_EVENT_UNSPECIFIED); } static u32 rcar_thermal_had_changed(struct rcar_thermal_priv *priv, u32 status) diff --git a/drivers/thermal/rockchip_thermal.c b/drivers/thermal/rockchip_thermal.c index 1f165c990c85..e227a9f0acf7 100644 --- a/drivers/thermal/rockchip_thermal.c +++ b/drivers/thermal/rockchip_thermal.c @@ -873,7 +873,8 @@ static irqreturn_t rockchip_thermal_alarm_irq_thread(int irq, void *dev) thermal->chip->irq_ack(thermal->regs); for (i = 0; i < thermal->chip->chn_num; i++) - thermal_zone_device_update(thermal->sensors[i].tzd); + thermal_zone_device_update(thermal->sensors[i].tzd, + THERMAL_EVENT_UNSPECIFIED); return IRQ_HANDLED; } diff --git a/drivers/thermal/samsung/exynos_tmu.c b/drivers/thermal/samsung/exynos_tmu.c index f3ce94ec73b5..ad1186dd6132 100644 --- a/drivers/thermal/samsung/exynos_tmu.c +++ b/drivers/thermal/samsung/exynos_tmu.c @@ -225,7 +225,7 @@ static void exynos_report_trigger(struct exynos_tmu_data *p) return; } - thermal_zone_device_update(tz); + thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); mutex_lock(&tz->lock); /* Find the level for which trip happened */ diff --git a/drivers/thermal/st/st_thermal_memmap.c b/drivers/thermal/st/st_thermal_memmap.c index fc0c9e198710..91d42319de27 100644 --- a/drivers/thermal/st/st_thermal_memmap.c +++ b/drivers/thermal/st/st_thermal_memmap.c @@ -42,7 +42,8 @@ static irqreturn_t st_mmap_thermal_trip_handler(int irq, void *sdata) { struct st_thermal_sensor *sensor = sdata; - thermal_zone_device_update(sensor->thermal_dev); + thermal_zone_device_update(sensor->thermal_dev, + THERMAL_EVENT_UNSPECIFIED); return IRQ_HANDLED; } diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index f2d55e478b2a..226b0b4aced6 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -607,7 +607,8 @@ static void thermal_zone_device_reset(struct thermal_zone_device *tz) pos->initialized = false; } -void thermal_zone_device_update(struct thermal_zone_device *tz) +void thermal_zone_device_update(struct thermal_zone_device *tz, + enum thermal_notify_event event) { int count; @@ -621,6 +622,8 @@ void thermal_zone_device_update(struct thermal_zone_device *tz) thermal_zone_set_trips(tz); + tz->notify_event = event; + for (count = 0; count < tz->trips; count++) handle_thermal_trip(tz, count); } @@ -631,7 +634,7 @@ static void thermal_zone_device_check(struct work_struct *work) struct thermal_zone_device *tz = container_of(work, struct thermal_zone_device, poll_queue.work); - thermal_zone_device_update(tz); + thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); } /* sys I/F for thermal zone */ @@ -755,7 +758,7 @@ trip_point_temp_store(struct device *dev, struct device_attribute *attr, if (ret) return ret; - thermal_zone_device_update(tz); + thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); return count; } @@ -877,7 +880,7 @@ passive_store(struct device *dev, struct device_attribute *attr, tz->forced_passive = state; - thermal_zone_device_update(tz); + thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); return count; } @@ -968,7 +971,7 @@ emul_temp_store(struct device *dev, struct device_attribute *attr, } if (!ret) - thermal_zone_device_update(tz); + thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); return ret ? ret : count; } @@ -1564,7 +1567,8 @@ __thermal_cooling_device_register(struct device_node *np, mutex_lock(&thermal_list_lock); list_for_each_entry(pos, &thermal_tz_list, node) if (atomic_cmpxchg(&pos->need_update, 1, 0)) - thermal_zone_device_update(pos); + thermal_zone_device_update(pos, + THERMAL_EVENT_UNSPECIFIED); mutex_unlock(&thermal_list_lock); return cdev; @@ -2007,7 +2011,7 @@ struct thermal_zone_device *thermal_zone_device_register(const char *type, thermal_zone_device_reset(tz); /* Update the new thermal zone and mark it as already updated. */ if (atomic_cmpxchg(&tz->need_update, 1, 0)) - thermal_zone_device_update(tz); + thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); return tz; @@ -2294,7 +2298,8 @@ static int thermal_pm_notify(struct notifier_block *nb, atomic_set(&in_suspend, 0); list_for_each_entry(tz, &thermal_tz_list, node) { thermal_zone_device_reset(tz); - thermal_zone_device_update(tz); + thermal_zone_device_update(tz, + THERMAL_EVENT_UNSPECIFIED); } break; default: diff --git a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c index 4a6757ca78f0..0586bd0f2bab 100644 --- a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c +++ b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c @@ -52,7 +52,7 @@ static void ti_thermal_work(struct work_struct *work) struct ti_thermal_data *data = container_of(work, struct ti_thermal_data, thermal_wq); - thermal_zone_device_update(data->ti_thermal); + thermal_zone_device_update(data->ti_thermal, THERMAL_EVENT_UNSPECIFIED); dev_dbg(&data->ti_thermal->device, "updated thermal zone %s\n", data->ti_thermal->type); @@ -205,7 +205,7 @@ static int ti_thermal_set_mode(struct thermal_zone_device *thermal, data->mode = mode; ti_bandgap_write_update_interval(bgp, data->sensor_id, data->ti_thermal->polling_delay); - thermal_zone_device_update(data->ti_thermal); + thermal_zone_device_update(data->ti_thermal, THERMAL_EVENT_UNSPECIFIED); dev_dbg(&thermal->device, "thermal polling set for duration=%d msec\n", data->ti_thermal->polling_delay); diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c index 97f0a2bd93ed..95f4c1bcdb4c 100644 --- a/drivers/thermal/x86_pkg_temp_thermal.c +++ b/drivers/thermal/x86_pkg_temp_thermal.c @@ -348,7 +348,8 @@ static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work) } if (notify) { pr_debug("thermal_zone_device_update\n"); - thermal_zone_device_update(phdev->tzone); + thermal_zone_device_update(phdev->tzone, + THERMAL_EVENT_UNSPECIFIED); } } diff --git a/include/linux/thermal.h b/include/linux/thermal.h index b3c16f06fdc4..511182a88e76 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -92,6 +92,17 @@ enum thermal_trend { THERMAL_TREND_DROP_FULL, /* apply lowest cooling action */ }; +/* Thermal notification reason */ +enum thermal_notify_event { + THERMAL_EVENT_UNSPECIFIED, /* Unspecified event */ + THERMAL_EVENT_TEMP_SAMPLE, /* New Temperature sample */ + THERMAL_TRIP_VIOLATED, /* TRIP Point violation */ + THERMAL_TRIP_CHANGED, /* TRIP Point temperature changed */ + THERMAL_DEVICE_DOWN, /* Thermal device is down */ + THERMAL_DEVICE_UP, /* Thermal device is up after a down event */ + THERMAL_DEVICE_POWER_CAPABILITY_CHANGED, /* power capability changed */ +}; + struct thermal_zone_device_ops { int (*bind) (struct thermal_zone_device *, struct thermal_cooling_device *); @@ -187,6 +198,7 @@ struct thermal_attr { * @lock: lock to protect thermal_instances list * @node: node in thermal_tz_list (in thermal_core.c) * @poll_queue: delayed work for polling + * @notify_event: Last notification event */ struct thermal_zone_device { int id; @@ -217,6 +229,7 @@ struct thermal_zone_device { struct mutex lock; struct list_head node; struct delayed_work poll_queue; + enum thermal_notify_event notify_event; }; /** @@ -436,7 +449,8 @@ int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int, unsigned int); int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int, struct thermal_cooling_device *); -void thermal_zone_device_update(struct thermal_zone_device *); +void thermal_zone_device_update(struct thermal_zone_device *, + enum thermal_notify_event); void thermal_zone_set_trips(struct thermal_zone_device *); struct thermal_cooling_device *thermal_cooling_device_register(char *, void *, @@ -487,7 +501,8 @@ static inline int thermal_zone_unbind_cooling_device( struct thermal_zone_device *tz, int trip, struct thermal_cooling_device *cdev) { return -ENODEV; } -static inline void thermal_zone_device_update(struct thermal_zone_device *tz) +static inline void thermal_zone_device_update(struct thermal_zone_device *tz, + enum thermal_notify_event event) { } static inline void thermal_zone_set_trips(struct thermal_zone_device *tz) { } -- cgit v1.2.3 From a38719b3dce002b7decc29b17312b8ba4a738db9 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 16 Sep 2016 10:32:39 +0200 Subject: video: fbdev: exynos: Remove old non-working MIPI driver The old non-DRM Exynos MIPI driver does not support DeviceTree and requires board files. Our platforms do not provide such so the driver is not usable since a long time ago. All features provided by the driver (and associated s6e8ax0 panel driver) are already supported by newer DRM version so the old code can be removed. Cc: Inki Dae Cc: Donghwa Lee Cc: Kyungmin Park Signed-off-by: Krzysztof Kozlowski Signed-off-by: Tomi Valkeinen --- MAINTAINERS | 9 - drivers/video/fbdev/Kconfig | 1 - drivers/video/fbdev/Makefile | 2 - drivers/video/fbdev/exynos/Kconfig | 32 - drivers/video/fbdev/exynos/Makefile | 9 - drivers/video/fbdev/exynos/exynos_mipi_dsi.c | 574 ------------- .../video/fbdev/exynos/exynos_mipi_dsi_common.c | 880 -------------------- .../video/fbdev/exynos/exynos_mipi_dsi_common.h | 46 -- .../video/fbdev/exynos/exynos_mipi_dsi_lowlevel.c | 618 -------------- .../video/fbdev/exynos/exynos_mipi_dsi_lowlevel.h | 112 --- drivers/video/fbdev/exynos/exynos_mipi_dsi_regs.h | 149 ---- drivers/video/fbdev/exynos/s6e8ax0.c | 887 --------------------- include/video/exynos_mipi_dsim.h | 358 --------- 13 files changed, 3677 deletions(-) delete mode 100644 drivers/video/fbdev/exynos/Kconfig delete mode 100644 drivers/video/fbdev/exynos/Makefile delete mode 100644 drivers/video/fbdev/exynos/exynos_mipi_dsi.c delete mode 100644 drivers/video/fbdev/exynos/exynos_mipi_dsi_common.c delete mode 100644 drivers/video/fbdev/exynos/exynos_mipi_dsi_common.h delete mode 100644 drivers/video/fbdev/exynos/exynos_mipi_dsi_lowlevel.c delete mode 100644 drivers/video/fbdev/exynos/exynos_mipi_dsi_lowlevel.h delete mode 100644 drivers/video/fbdev/exynos/exynos_mipi_dsi_regs.h delete mode 100644 drivers/video/fbdev/exynos/s6e8ax0.c delete mode 100644 include/video/exynos_mipi_dsim.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index fbb78a8522da..f8d0312dc749 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4695,15 +4695,6 @@ L: iommu@lists.linux-foundation.org S: Maintained F: drivers/iommu/exynos-iommu.c -EXYNOS MIPI DISPLAY DRIVERS -M: Inki Dae -M: Donghwa Lee -M: Kyungmin Park -L: linux-fbdev@vger.kernel.org -S: Maintained -F: drivers/video/fbdev/exynos/exynos_mipi* -F: include/video/exynos_mipi* - EZchip NPS platform support M: Noam Camus S: Supported diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig index 6c28ecc3958c..af2f117208f1 100644 --- a/drivers/video/fbdev/Kconfig +++ b/drivers/video/fbdev/Kconfig @@ -2447,7 +2447,6 @@ config FB_SIMPLE source "drivers/video/fbdev/omap/Kconfig" source "drivers/video/fbdev/omap2/Kconfig" -source "drivers/video/fbdev/exynos/Kconfig" source "drivers/video/fbdev/mmp/Kconfig" config FB_SH_MOBILE_MERAM diff --git a/drivers/video/fbdev/Makefile b/drivers/video/fbdev/Makefile index 5550944f0ad7..ee8c81405a7f 100644 --- a/drivers/video/fbdev/Makefile +++ b/drivers/video/fbdev/Makefile @@ -6,8 +6,6 @@ obj-y += core/ -obj-$(CONFIG_EXYNOS_VIDEO) += exynos/ - obj-$(CONFIG_FB_MACMODES) += macmodes.o obj-$(CONFIG_FB_WMT_GE_ROPS) += wmt_ge_rops.o diff --git a/drivers/video/fbdev/exynos/Kconfig b/drivers/video/fbdev/exynos/Kconfig deleted file mode 100644 index d916bef94f25..000000000000 --- a/drivers/video/fbdev/exynos/Kconfig +++ /dev/null @@ -1,32 +0,0 @@ -# -# Exynos Video configuration -# - -menuconfig EXYNOS_VIDEO - tristate "Exynos Video driver support" - depends on ARCH_S5PV210 || ARCH_EXYNOS - help - This enables support for EXYNOS Video device. - -if EXYNOS_VIDEO - -# -# MIPI DSI driver -# - -config EXYNOS_MIPI_DSI - tristate "EXYNOS MIPI DSI driver support." - select GENERIC_PHY - help - This enables support for MIPI-DSI device. - -config EXYNOS_LCD_S6E8AX0 - tristate "S6E8AX0 MIPI AMOLED LCD Driver" - depends on EXYNOS_MIPI_DSI && BACKLIGHT_CLASS_DEVICE - depends on (LCD_CLASS_DEVICE = y) - default n - help - If you have an S6E8AX0 MIPI AMOLED LCD Panel, say Y to enable its - LCD control driver. - -endif # EXYNOS_VIDEO diff --git a/drivers/video/fbdev/exynos/Makefile b/drivers/video/fbdev/exynos/Makefile deleted file mode 100644 index 02d8dc522fea..000000000000 --- a/drivers/video/fbdev/exynos/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# -# Makefile for the exynos video drivers. -# - -obj-$(CONFIG_EXYNOS_MIPI_DSI) += exynos-mipi-dsi-mod.o - -exynos-mipi-dsi-mod-objs += exynos_mipi_dsi.o exynos_mipi_dsi_common.o \ - exynos_mipi_dsi_lowlevel.o -obj-$(CONFIG_EXYNOS_LCD_S6E8AX0) += s6e8ax0.o diff --git a/drivers/video/fbdev/exynos/exynos_mipi_dsi.c b/drivers/video/fbdev/exynos/exynos_mipi_dsi.c deleted file mode 100644 index 92e4af3caaf8..000000000000 --- a/drivers/video/fbdev/exynos/exynos_mipi_dsi.c +++ /dev/null @@ -1,574 +0,0 @@ -/* linux/drivers/video/exynos/exynos_mipi_dsi.c - * - * Samsung SoC MIPI-DSIM driver. - * - * Copyright (c) 2012 Samsung Electronics Co., Ltd - * - * InKi Dae, - * Donghwa Lee, - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. -*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include