From 402dab548d0da38b260f3843225cdfd37d91f512 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 22 Oct 2020 10:08:24 +0300 Subject: [PATCH 01/27] hwmon: (pmbus/max20730) use scnprintf() instead of snprintf() The snprintf() function returns the number of characters which would have been printed if there were enough space, but the scnprintf() returns the number of characters which were actually printed. If the buffer is not large enough, then using snprintf() would result in a read overflow and an information leak. Fixes: 8910c0bd533d ("hwmon: (pmbus/max20730) add device monitoring via debugfs") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20201022070824.GC2817762@mwanda Signed-off-by: Guenter Roeck --- drivers/hwmon/pmbus/max20730.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/hwmon/pmbus/max20730.c b/drivers/hwmon/pmbus/max20730.c index 57923d72490c..be83b98411c7 100644 --- a/drivers/hwmon/pmbus/max20730.c +++ b/drivers/hwmon/pmbus/max20730.c @@ -122,8 +122,8 @@ static ssize_t max20730_debugfs_read(struct file *file, char __user *buf, switch (idx) { case MAX20730_DEBUGFS_VOUT_MIN: ret = VOLT_FROM_REG(data->mfr_voutmin * 10000); - len = snprintf(tbuf, DEBUG_FS_DATA_MAX, "%d.%d\n", - ret / 10000, ret % 10000); + len = scnprintf(tbuf, DEBUG_FS_DATA_MAX, "%d.%d\n", + ret / 10000, ret % 10000); break; case MAX20730_DEBUGFS_FREQUENCY: val = (data->mfr_devset1 & MAX20730_MFR_DEVSET1_FSW_MASK) @@ -141,7 +141,7 @@ static ssize_t max20730_debugfs_read(struct file *file, char __user *buf, ret = 800; else ret = 900; - len = snprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); + len = scnprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); break; case MAX20730_DEBUGFS_PG_DELAY: val = (data->mfr_devset1 & MAX20730_MFR_DEVSET1_TSTAT_MASK) @@ -223,7 +223,7 @@ static ssize_t max20730_debugfs_read(struct file *file, char __user *buf, case MAX20730_DEBUGFS_OC_PROTECT_MODE: ret = (data->mfr_devset2 & MAX20730_MFR_DEVSET2_OCPM_MASK) >> MAX20730_MFR_DEVSET2_OCPM_BIT_POS; - len = snprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); + len = scnprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); break; case MAX20730_DEBUGFS_SS_TIMING: val = (data->mfr_devset2 & MAX20730_MFR_DEVSET2_SS_MASK) @@ -241,32 +241,32 @@ static ssize_t max20730_debugfs_read(struct file *file, char __user *buf, case MAX20730_DEBUGFS_IMAX: ret = (data->mfr_devset2 & MAX20730_MFR_DEVSET2_IMAX_MASK) >> MAX20730_MFR_DEVSET2_IMAX_BIT_POS; - len = snprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); + len = scnprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); break; case MAX20730_DEBUGFS_OPERATION: ret = i2c_smbus_read_byte_data(psu->client, PMBUS_OPERATION); if (ret < 0) return ret; - len = snprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); + len = scnprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); break; case MAX20730_DEBUGFS_ON_OFF_CONFIG: ret = i2c_smbus_read_byte_data(psu->client, PMBUS_ON_OFF_CONFIG); if (ret < 0) return ret; - len = snprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); + len = scnprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); break; case MAX20730_DEBUGFS_SMBALERT_MASK: ret = i2c_smbus_read_word_data(psu->client, PMBUS_SMB_ALERT_MASK); if (ret < 0) return ret; - len = snprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); + len = scnprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); break; case MAX20730_DEBUGFS_VOUT_MODE: ret = i2c_smbus_read_byte_data(psu->client, PMBUS_VOUT_MODE); if (ret < 0) return ret; - len = snprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); + len = scnprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); break; case MAX20730_DEBUGFS_VOUT_COMMAND: ret = i2c_smbus_read_word_data(psu->client, PMBUS_VOUT_COMMAND); @@ -274,8 +274,8 @@ static ssize_t max20730_debugfs_read(struct file *file, char __user *buf, return ret; ret = VOLT_FROM_REG(ret * 10000); - len = snprintf(tbuf, DEBUG_FS_DATA_MAX, - "%d.%d\n", ret / 10000, ret % 10000); + len = scnprintf(tbuf, DEBUG_FS_DATA_MAX, + "%d.%d\n", ret / 10000, ret % 10000); break; case MAX20730_DEBUGFS_VOUT_MAX: ret = i2c_smbus_read_word_data(psu->client, PMBUS_VOUT_MAX); @@ -283,8 +283,8 @@ static ssize_t max20730_debugfs_read(struct file *file, char __user *buf, return ret; ret = VOLT_FROM_REG(ret * 10000); - len = snprintf(tbuf, DEBUG_FS_DATA_MAX, - "%d.%d\n", ret / 10000, ret % 10000); + len = scnprintf(tbuf, DEBUG_FS_DATA_MAX, + "%d.%d\n", ret / 10000, ret % 10000); break; default: len = strlcpy(tbuf, "Invalid\n", DEBUG_FS_DATA_MAX); From 8d8c3131248d7e9c6c8ab448e1c6cb6bd7755e9c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 26 Oct 2020 17:13:57 +0100 Subject: [PATCH 02/27] clk: define to_clk_regmap() as inline function Nesting container_of() causes warnings with W=2, which is annoying if it happens in headers and fills the build log like: In file included from drivers/clk/qcom/clk-alpha-pll.c:6: drivers/clk/qcom/clk-alpha-pll.c: In function 'clk_alpha_pll_hwfsm_enable': include/linux/kernel.h:852:8: warning: declaration of '__mptr' shadows a previous local [-Wshadow] 852 | void *__mptr = (void *)(ptr); \ | ^~~~~~ drivers/clk/qcom/clk-alpha-pll.c:155:31: note: in expansion of macro 'container_of' 155 | #define to_clk_alpha_pll(_hw) container_of(to_clk_regmap(_hw), \ | ^~~~~~~~~~~~ drivers/clk/qcom/clk-regmap.h:27:28: note: in expansion of macro 'container_of' 27 | #define to_clk_regmap(_hw) container_of(_hw, struct clk_regmap, hw) | ^~~~~~~~~~~~ drivers/clk/qcom/clk-alpha-pll.c:155:44: note: in expansion of macro 'to_clk_regmap' 155 | #define to_clk_alpha_pll(_hw) container_of(to_clk_regmap(_hw), \ | ^~~~~~~~~~~~~ drivers/clk/qcom/clk-alpha-pll.c:254:30: note: in expansion of macro 'to_clk_alpha_pll' 254 | struct clk_alpha_pll *pll = to_clk_alpha_pll(hw); | ^~~~~~~~~~~~~~~~ include/linux/kernel.h:852:8: note: shadowed declaration is here 852 | void *__mptr = (void *)(ptr); \ | ^~~~~~ Redefine two copies of the to_clk_regmap() macro as inline functions to avoid a lot of these. Fixes: ea11dda9e091 ("clk: meson: add regmap clocks") Fixes: 085d7a455444 ("clk: qcom: Add a regmap type clock struct") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20201026161411.3708639-1-arnd@kernel.org Acked-by: Jerome Brunet Signed-off-by: Stephen Boyd --- drivers/clk/meson/clk-regmap.h | 5 ++++- drivers/clk/qcom/clk-regmap.h | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/clk/meson/clk-regmap.h b/drivers/clk/meson/clk-regmap.h index c4a39604cffd..e365312da54e 100644 --- a/drivers/clk/meson/clk-regmap.h +++ b/drivers/clk/meson/clk-regmap.h @@ -26,7 +26,10 @@ struct clk_regmap { void *data; }; -#define to_clk_regmap(_hw) container_of(_hw, struct clk_regmap, hw) +static inline struct clk_regmap *to_clk_regmap(struct clk_hw *hw) +{ + return container_of(hw, struct clk_regmap, hw); +} /** * struct clk_regmap_gate_data - regmap backed gate specific data diff --git a/drivers/clk/qcom/clk-regmap.h b/drivers/clk/qcom/clk-regmap.h index 6cfc1bccb255..14ec659a3a77 100644 --- a/drivers/clk/qcom/clk-regmap.h +++ b/drivers/clk/qcom/clk-regmap.h @@ -24,7 +24,11 @@ struct clk_regmap { unsigned int enable_mask; bool enable_is_inverted; }; -#define to_clk_regmap(_hw) container_of(_hw, struct clk_regmap, hw) + +static inline struct clk_regmap *to_clk_regmap(struct clk_hw *hw) +{ + return container_of(hw, struct clk_regmap, hw); +} int clk_is_enabled_regmap(struct clk_hw *hw); int clk_enable_regmap(struct clk_hw *hw); From 18e8db7f6526928858dfa99b49d831497f0f8df8 Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Tue, 3 Nov 2020 13:33:15 -0600 Subject: [PATCH 03/27] hwmon: (pmbus) Add mutex locking for sysfs reads As part of commit a919ba06979a7 ("hwmon: (pmbus) Stop caching register values"), the update of the sensor value is now triggered directly by the sensor attribute value being read from sysfs. This created (or at least made much more likely) a locking issue, since nothing protected the device page selection from being unexpectedly modified by concurrent reads. If sensor values on different pages on the same device were being concurrently read by multiple threads, this could cause spurious read errors due to the page register not reading back the same value last written, or sensor values being read from the incorrect page. Add locking of the update_lock mutex in pmbus_show_sensor and pmbus_show_samples so that these cannot result in concurrent reads from the underlying device. Fixes: a919ba06979a7 ("hwmon: (pmbus) Stop caching register values") Signed-off-by: Robert Hancock Reviewed-by: Alex Qiu Link: https://lore.kernel.org/r/20201103193315.3011800-1-robert.hancock@calian.com Signed-off-by: Guenter Roeck --- drivers/hwmon/pmbus/pmbus_core.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c index 170a9f82ca61..b0e2820a2d57 100644 --- a/drivers/hwmon/pmbus/pmbus_core.c +++ b/drivers/hwmon/pmbus/pmbus_core.c @@ -941,12 +941,16 @@ static ssize_t pmbus_show_sensor(struct device *dev, struct i2c_client *client = to_i2c_client(dev->parent); struct pmbus_sensor *sensor = to_pmbus_sensor(devattr); struct pmbus_data *data = i2c_get_clientdata(client); + ssize_t ret; + mutex_lock(&data->update_lock); pmbus_update_sensor_data(client, sensor); if (sensor->data < 0) - return sensor->data; - - return snprintf(buf, PAGE_SIZE, "%lld\n", pmbus_reg2data(data, sensor)); + ret = sensor->data; + else + ret = snprintf(buf, PAGE_SIZE, "%lld\n", pmbus_reg2data(data, sensor)); + mutex_unlock(&data->update_lock); + return ret; } static ssize_t pmbus_set_sensor(struct device *dev, @@ -2012,8 +2016,11 @@ static ssize_t pmbus_show_samples(struct device *dev, int val; struct i2c_client *client = to_i2c_client(dev->parent); struct pmbus_samples_reg *reg = to_samples_reg(devattr); + struct pmbus_data *data = i2c_get_clientdata(client); + mutex_lock(&data->update_lock); val = _pmbus_read_word_data(client, reg->page, 0xff, reg->attr->reg); + mutex_unlock(&data->update_lock); if (val < 0) return val; From c277ca155d2f0028a5c79708426d3f79b54a5fc1 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Sun, 1 Nov 2020 19:23:54 +0800 Subject: [PATCH 04/27] clk: imx8m: fix bus critical clk registration noc/axi/ahb are bus clk, not peripheral clk. Since peripheral clk has a limitation that for peripheral clock slice, IP clock slices must be stopped to change the clock source. However if the bus clk is marked as critical clk peripheral, the assigned clock parent operation will fail. So we added CLK_SET_PARENT_GATE flag to avoid glitch. And add imx8m_clk_hw_composite_bus_critical for bus critical clock usage Fixes: 936c383673b9e ("clk: imx: fix composite peripheral flags") Reviewed-by: Abel Vesa Reported-by: Abel Vesa Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/1604229834-25594-1-git-send-email-peng.fan@nxp.com Signed-off-by: Stephen Boyd --- drivers/clk/imx/clk-imx8mm.c | 10 +++++----- drivers/clk/imx/clk-imx8mn.c | 6 +++--- drivers/clk/imx/clk-imx8mp.c | 10 +++++----- drivers/clk/imx/clk-imx8mq.c | 8 ++++---- drivers/clk/imx/clk.h | 5 +++++ 5 files changed, 22 insertions(+), 17 deletions(-) diff --git a/drivers/clk/imx/clk-imx8mm.c b/drivers/clk/imx/clk-imx8mm.c index 0de0be0cf548..f358ad907299 100644 --- a/drivers/clk/imx/clk-imx8mm.c +++ b/drivers/clk/imx/clk-imx8mm.c @@ -443,9 +443,9 @@ static int imx8mm_clocks_probe(struct platform_device *pdev) hws[IMX8MM_CLK_A53_CORE] = imx_clk_hw_mux2("arm_a53_core", base + 0x9880, 24, 1, imx8mm_a53_core_sels, ARRAY_SIZE(imx8mm_a53_core_sels)); /* BUS */ - hws[IMX8MM_CLK_MAIN_AXI] = imx8m_clk_hw_composite_critical("main_axi", imx8mm_main_axi_sels, base + 0x8800); + hws[IMX8MM_CLK_MAIN_AXI] = imx8m_clk_hw_composite_bus_critical("main_axi", imx8mm_main_axi_sels, base + 0x8800); hws[IMX8MM_CLK_ENET_AXI] = imx8m_clk_hw_composite_bus("enet_axi", imx8mm_enet_axi_sels, base + 0x8880); - hws[IMX8MM_CLK_NAND_USDHC_BUS] = imx8m_clk_hw_composite_critical("nand_usdhc_bus", imx8mm_nand_usdhc_sels, base + 0x8900); + hws[IMX8MM_CLK_NAND_USDHC_BUS] = imx8m_clk_hw_composite_bus_critical("nand_usdhc_bus", imx8mm_nand_usdhc_sels, base + 0x8900); hws[IMX8MM_CLK_VPU_BUS] = imx8m_clk_hw_composite_bus("vpu_bus", imx8mm_vpu_bus_sels, base + 0x8980); hws[IMX8MM_CLK_DISP_AXI] = imx8m_clk_hw_composite_bus("disp_axi", imx8mm_disp_axi_sels, base + 0x8a00); hws[IMX8MM_CLK_DISP_APB] = imx8m_clk_hw_composite_bus("disp_apb", imx8mm_disp_apb_sels, base + 0x8a80); @@ -453,11 +453,11 @@ static int imx8mm_clocks_probe(struct platform_device *pdev) hws[IMX8MM_CLK_USB_BUS] = imx8m_clk_hw_composite_bus("usb_bus", imx8mm_usb_bus_sels, base + 0x8b80); hws[IMX8MM_CLK_GPU_AXI] = imx8m_clk_hw_composite_bus("gpu_axi", imx8mm_gpu_axi_sels, base + 0x8c00); hws[IMX8MM_CLK_GPU_AHB] = imx8m_clk_hw_composite_bus("gpu_ahb", imx8mm_gpu_ahb_sels, base + 0x8c80); - hws[IMX8MM_CLK_NOC] = imx8m_clk_hw_composite_critical("noc", imx8mm_noc_sels, base + 0x8d00); - hws[IMX8MM_CLK_NOC_APB] = imx8m_clk_hw_composite_critical("noc_apb", imx8mm_noc_apb_sels, base + 0x8d80); + hws[IMX8MM_CLK_NOC] = imx8m_clk_hw_composite_bus_critical("noc", imx8mm_noc_sels, base + 0x8d00); + hws[IMX8MM_CLK_NOC_APB] = imx8m_clk_hw_composite_bus_critical("noc_apb", imx8mm_noc_apb_sels, base + 0x8d80); /* AHB */ - hws[IMX8MM_CLK_AHB] = imx8m_clk_hw_composite_critical("ahb", imx8mm_ahb_sels, base + 0x9000); + hws[IMX8MM_CLK_AHB] = imx8m_clk_hw_composite_bus_critical("ahb", imx8mm_ahb_sels, base + 0x9000); hws[IMX8MM_CLK_AUDIO_AHB] = imx8m_clk_hw_composite_bus("audio_ahb", imx8mm_audio_ahb_sels, base + 0x9100); /* IPG */ diff --git a/drivers/clk/imx/clk-imx8mn.c b/drivers/clk/imx/clk-imx8mn.c index e984de543f0b..f3c5e6cf55dd 100644 --- a/drivers/clk/imx/clk-imx8mn.c +++ b/drivers/clk/imx/clk-imx8mn.c @@ -431,7 +431,7 @@ static int imx8mn_clocks_probe(struct platform_device *pdev) hws[IMX8MN_CLK_A53_CORE] = imx_clk_hw_mux2("arm_a53_core", base + 0x9880, 24, 1, imx8mn_a53_core_sels, ARRAY_SIZE(imx8mn_a53_core_sels)); /* BUS */ - hws[IMX8MN_CLK_MAIN_AXI] = imx8m_clk_hw_composite_critical("main_axi", imx8mn_main_axi_sels, base + 0x8800); + hws[IMX8MN_CLK_MAIN_AXI] = imx8m_clk_hw_composite_bus_critical("main_axi", imx8mn_main_axi_sels, base + 0x8800); hws[IMX8MN_CLK_ENET_AXI] = imx8m_clk_hw_composite_bus("enet_axi", imx8mn_enet_axi_sels, base + 0x8880); hws[IMX8MN_CLK_NAND_USDHC_BUS] = imx8m_clk_hw_composite_bus("nand_usdhc_bus", imx8mn_nand_usdhc_sels, base + 0x8900); hws[IMX8MN_CLK_DISP_AXI] = imx8m_clk_hw_composite_bus("disp_axi", imx8mn_disp_axi_sels, base + 0x8a00); @@ -439,9 +439,9 @@ static int imx8mn_clocks_probe(struct platform_device *pdev) hws[IMX8MN_CLK_USB_BUS] = imx8m_clk_hw_composite_bus("usb_bus", imx8mn_usb_bus_sels, base + 0x8b80); hws[IMX8MN_CLK_GPU_AXI] = imx8m_clk_hw_composite_bus("gpu_axi", imx8mn_gpu_axi_sels, base + 0x8c00); hws[IMX8MN_CLK_GPU_AHB] = imx8m_clk_hw_composite_bus("gpu_ahb", imx8mn_gpu_ahb_sels, base + 0x8c80); - hws[IMX8MN_CLK_NOC] = imx8m_clk_hw_composite_critical("noc", imx8mn_noc_sels, base + 0x8d00); + hws[IMX8MN_CLK_NOC] = imx8m_clk_hw_composite_bus_critical("noc", imx8mn_noc_sels, base + 0x8d00); - hws[IMX8MN_CLK_AHB] = imx8m_clk_hw_composite_critical("ahb", imx8mn_ahb_sels, base + 0x9000); + hws[IMX8MN_CLK_AHB] = imx8m_clk_hw_composite_bus_critical("ahb", imx8mn_ahb_sels, base + 0x9000); hws[IMX8MN_CLK_AUDIO_AHB] = imx8m_clk_hw_composite_bus("audio_ahb", imx8mn_audio_ahb_sels, base + 0x9100); hws[IMX8MN_CLK_IPG_ROOT] = imx_clk_hw_divider2("ipg_root", "ahb", base + 0x9080, 0, 1); hws[IMX8MN_CLK_IPG_AUDIO_ROOT] = imx_clk_hw_divider2("ipg_audio_root", "audio_ahb", base + 0x9180, 0, 1); diff --git a/drivers/clk/imx/clk-imx8mp.c b/drivers/clk/imx/clk-imx8mp.c index 12ce4770f702..48e212477f52 100644 --- a/drivers/clk/imx/clk-imx8mp.c +++ b/drivers/clk/imx/clk-imx8mp.c @@ -557,9 +557,9 @@ static int imx8mp_clocks_probe(struct platform_device *pdev) /* CORE SEL */ hws[IMX8MP_CLK_A53_CORE] = imx_clk_hw_mux2("arm_a53_core", ccm_base + 0x9880, 24, 1, imx8mp_a53_core_sels, ARRAY_SIZE(imx8mp_a53_core_sels)); - hws[IMX8MP_CLK_MAIN_AXI] = imx8m_clk_hw_composite_critical("main_axi", imx8mp_main_axi_sels, ccm_base + 0x8800); + hws[IMX8MP_CLK_MAIN_AXI] = imx8m_clk_hw_composite_bus_critical("main_axi", imx8mp_main_axi_sels, ccm_base + 0x8800); hws[IMX8MP_CLK_ENET_AXI] = imx8m_clk_hw_composite_bus("enet_axi", imx8mp_enet_axi_sels, ccm_base + 0x8880); - hws[IMX8MP_CLK_NAND_USDHC_BUS] = imx8m_clk_hw_composite_critical("nand_usdhc_bus", imx8mp_nand_usdhc_sels, ccm_base + 0x8900); + hws[IMX8MP_CLK_NAND_USDHC_BUS] = imx8m_clk_hw_composite_bus_critical("nand_usdhc_bus", imx8mp_nand_usdhc_sels, ccm_base + 0x8900); hws[IMX8MP_CLK_VPU_BUS] = imx8m_clk_hw_composite_bus("vpu_bus", imx8mp_vpu_bus_sels, ccm_base + 0x8980); hws[IMX8MP_CLK_MEDIA_AXI] = imx8m_clk_hw_composite_bus("media_axi", imx8mp_media_axi_sels, ccm_base + 0x8a00); hws[IMX8MP_CLK_MEDIA_APB] = imx8m_clk_hw_composite_bus("media_apb", imx8mp_media_apb_sels, ccm_base + 0x8a80); @@ -567,12 +567,12 @@ static int imx8mp_clocks_probe(struct platform_device *pdev) hws[IMX8MP_CLK_HDMI_AXI] = imx8m_clk_hw_composite_bus("hdmi_axi", imx8mp_media_axi_sels, ccm_base + 0x8b80); hws[IMX8MP_CLK_GPU_AXI] = imx8m_clk_hw_composite_bus("gpu_axi", imx8mp_gpu_axi_sels, ccm_base + 0x8c00); hws[IMX8MP_CLK_GPU_AHB] = imx8m_clk_hw_composite_bus("gpu_ahb", imx8mp_gpu_ahb_sels, ccm_base + 0x8c80); - hws[IMX8MP_CLK_NOC] = imx8m_clk_hw_composite_critical("noc", imx8mp_noc_sels, ccm_base + 0x8d00); - hws[IMX8MP_CLK_NOC_IO] = imx8m_clk_hw_composite_critical("noc_io", imx8mp_noc_io_sels, ccm_base + 0x8d80); + hws[IMX8MP_CLK_NOC] = imx8m_clk_hw_composite_bus_critical("noc", imx8mp_noc_sels, ccm_base + 0x8d00); + hws[IMX8MP_CLK_NOC_IO] = imx8m_clk_hw_composite_bus_critical("noc_io", imx8mp_noc_io_sels, ccm_base + 0x8d80); hws[IMX8MP_CLK_ML_AXI] = imx8m_clk_hw_composite_bus("ml_axi", imx8mp_ml_axi_sels, ccm_base + 0x8e00); hws[IMX8MP_CLK_ML_AHB] = imx8m_clk_hw_composite_bus("ml_ahb", imx8mp_ml_ahb_sels, ccm_base + 0x8e80); - hws[IMX8MP_CLK_AHB] = imx8m_clk_hw_composite_critical("ahb_root", imx8mp_ahb_sels, ccm_base + 0x9000); + hws[IMX8MP_CLK_AHB] = imx8m_clk_hw_composite_bus_critical("ahb_root", imx8mp_ahb_sels, ccm_base + 0x9000); hws[IMX8MP_CLK_AUDIO_AHB] = imx8m_clk_hw_composite_bus("audio_ahb", imx8mp_audio_ahb_sels, ccm_base + 0x9100); hws[IMX8MP_CLK_MIPI_DSI_ESC_RX] = imx8m_clk_hw_composite_bus("mipi_dsi_esc_rx", imx8mp_mipi_dsi_esc_rx_sels, ccm_base + 0x9200); diff --git a/drivers/clk/imx/clk-imx8mq.c b/drivers/clk/imx/clk-imx8mq.c index 8265d1d48af4..06292d4a98ff 100644 --- a/drivers/clk/imx/clk-imx8mq.c +++ b/drivers/clk/imx/clk-imx8mq.c @@ -431,7 +431,7 @@ static int imx8mq_clocks_probe(struct platform_device *pdev) hws[IMX8MQ_CLK_A53_CORE] = imx_clk_hw_mux2("arm_a53_core", base + 0x9880, 24, 1, imx8mq_a53_core_sels, ARRAY_SIZE(imx8mq_a53_core_sels)); /* BUS */ - hws[IMX8MQ_CLK_MAIN_AXI] = imx8m_clk_hw_composite_critical("main_axi", imx8mq_main_axi_sels, base + 0x8800); + hws[IMX8MQ_CLK_MAIN_AXI] = imx8m_clk_hw_composite_bus_critical("main_axi", imx8mq_main_axi_sels, base + 0x8800); hws[IMX8MQ_CLK_ENET_AXI] = imx8m_clk_hw_composite_bus("enet_axi", imx8mq_enet_axi_sels, base + 0x8880); hws[IMX8MQ_CLK_NAND_USDHC_BUS] = imx8m_clk_hw_composite_bus("nand_usdhc_bus", imx8mq_nand_usdhc_sels, base + 0x8900); hws[IMX8MQ_CLK_VPU_BUS] = imx8m_clk_hw_composite_bus("vpu_bus", imx8mq_vpu_bus_sels, base + 0x8980); @@ -441,12 +441,12 @@ static int imx8mq_clocks_probe(struct platform_device *pdev) hws[IMX8MQ_CLK_USB_BUS] = imx8m_clk_hw_composite_bus("usb_bus", imx8mq_usb_bus_sels, base + 0x8b80); hws[IMX8MQ_CLK_GPU_AXI] = imx8m_clk_hw_composite_bus("gpu_axi", imx8mq_gpu_axi_sels, base + 0x8c00); hws[IMX8MQ_CLK_GPU_AHB] = imx8m_clk_hw_composite_bus("gpu_ahb", imx8mq_gpu_ahb_sels, base + 0x8c80); - hws[IMX8MQ_CLK_NOC] = imx8m_clk_hw_composite_critical("noc", imx8mq_noc_sels, base + 0x8d00); - hws[IMX8MQ_CLK_NOC_APB] = imx8m_clk_hw_composite_critical("noc_apb", imx8mq_noc_apb_sels, base + 0x8d80); + hws[IMX8MQ_CLK_NOC] = imx8m_clk_hw_composite_bus_critical("noc", imx8mq_noc_sels, base + 0x8d00); + hws[IMX8MQ_CLK_NOC_APB] = imx8m_clk_hw_composite_bus_critical("noc_apb", imx8mq_noc_apb_sels, base + 0x8d80); /* AHB */ /* AHB clock is used by the AHB bus therefore marked as critical */ - hws[IMX8MQ_CLK_AHB] = imx8m_clk_hw_composite_critical("ahb", imx8mq_ahb_sels, base + 0x9000); + hws[IMX8MQ_CLK_AHB] = imx8m_clk_hw_composite_bus_critical("ahb", imx8mq_ahb_sels, base + 0x9000); hws[IMX8MQ_CLK_AUDIO_AHB] = imx8m_clk_hw_composite_bus("audio_ahb", imx8mq_audio_ahb_sels, base + 0x9100); /* IPG */ diff --git a/drivers/clk/imx/clk.h b/drivers/clk/imx/clk.h index 3b796b3da249..1d7be0c86538 100644 --- a/drivers/clk/imx/clk.h +++ b/drivers/clk/imx/clk.h @@ -549,6 +549,11 @@ struct clk_hw *imx8m_clk_hw_composite_flags(const char *name, IMX_COMPOSITE_BUS, \ CLK_SET_RATE_NO_REPARENT | CLK_OPS_PARENT_ENABLE) +#define imx8m_clk_hw_composite_bus_critical(name, parent_names, reg) \ + imx8m_clk_hw_composite_flags(name, parent_names, ARRAY_SIZE(parent_names), reg, \ + IMX_COMPOSITE_BUS, \ + CLK_SET_RATE_NO_REPARENT | CLK_OPS_PARENT_ENABLE | CLK_IS_CRITICAL) + #define imx8m_clk_hw_composite_core(name, parent_names, reg) \ imx8m_clk_hw_composite_flags(name, parent_names, \ ARRAY_SIZE(parent_names), reg, \ From da3fecb0040324c08f1587e5bff1f15f36be1872 Mon Sep 17 00:00:00 2001 From: Can Guo Date: Mon, 2 Nov 2020 22:24:39 -0800 Subject: [PATCH 05/27] scsi: ufs: Fix unbalanced scsi_block_reqs_cnt caused by ufshcd_hold() The scsi_block_reqs_cnt increased in ufshcd_hold() is supposed to be decreased back in ufshcd_ungate_work() in a paired way. However, if specific ufshcd_hold/release sequences are met, it is possible that scsi_block_reqs_cnt is increased twice but only one ungate work is queued. To make sure scsi_block_reqs_cnt is handled by ufshcd_hold() and ufshcd_ungate_work() in a paired way, increase it only if queue_work() returns true. Link: https://lore.kernel.org/r/1604384682-15837-2-git-send-email-cang@codeaurora.org Reviewed-by: Hongwu Su Reviewed-by: Stanley Chu Reviewed-by: Bean Huo Signed-off-by: Can Guo Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index b8f573a02713..09deefe5904a 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -1627,12 +1627,12 @@ start: */ fallthrough; case CLKS_OFF: - ufshcd_scsi_block_requests(hba); hba->clk_gating.state = REQ_CLKS_ON; trace_ufshcd_clk_gating(dev_name(hba->dev), hba->clk_gating.state); - queue_work(hba->clk_gating.clk_gating_workq, - &hba->clk_gating.ungate_work); + if (queue_work(hba->clk_gating.clk_gating_workq, + &hba->clk_gating.ungate_work)) + ufshcd_scsi_block_requests(hba); /* * fall through to check if we should wait for this * work to be done or not. From 0f52fcb99ea2738a0a0f28e12cf4dd427069dd2a Mon Sep 17 00:00:00 2001 From: Can Guo Date: Mon, 2 Nov 2020 22:24:40 -0800 Subject: [PATCH 06/27] scsi: ufs: Try to save power mode change and UIC cmd completion timeout Use the uic_cmd->cmd_active as a flag to track the lifecycle of an UIC cmd. The flag is set before sending the UIC cmd and cleared in IRQ handler. When a PMC or UIC cmd completion timeout happens, if the flag is not set, instead of returning timeout error, we still treat it as a successful operation. This is to deal with the scenario in which completion has been raised but the one waiting for the completion cannot be awaken in time due to kernel scheduling problem. Link: https://lore.kernel.org/r/1604384682-15837-3-git-send-email-cang@codeaurora.org Reviewed-by: Stanley Chu Signed-off-by: Can Guo Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 26 ++++++++++++++++++++++++-- drivers/scsi/ufs/ufshcd.h | 2 ++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 09deefe5904a..5167c3e770a3 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -2115,10 +2115,20 @@ ufshcd_wait_for_uic_cmd(struct ufs_hba *hba, struct uic_command *uic_cmd) unsigned long flags; if (wait_for_completion_timeout(&uic_cmd->done, - msecs_to_jiffies(UIC_CMD_TIMEOUT))) + msecs_to_jiffies(UIC_CMD_TIMEOUT))) { ret = uic_cmd->argument2 & MASK_UIC_COMMAND_RESULT; - else + } else { ret = -ETIMEDOUT; + dev_err(hba->dev, + "uic cmd 0x%x with arg3 0x%x completion timeout\n", + uic_cmd->command, uic_cmd->argument3); + + if (!uic_cmd->cmd_active) { + dev_err(hba->dev, "%s: UIC cmd has been completed, return the result\n", + __func__); + ret = uic_cmd->argument2 & MASK_UIC_COMMAND_RESULT; + } + } spin_lock_irqsave(hba->host->host_lock, flags); hba->active_uic_cmd = NULL; @@ -2150,6 +2160,7 @@ __ufshcd_send_uic_cmd(struct ufs_hba *hba, struct uic_command *uic_cmd, if (completion) init_completion(&uic_cmd->done); + uic_cmd->cmd_active = 1; ufshcd_dispatch_uic_cmd(hba, uic_cmd); return 0; @@ -3807,10 +3818,18 @@ static int ufshcd_uic_pwr_ctrl(struct ufs_hba *hba, struct uic_command *cmd) dev_err(hba->dev, "pwr ctrl cmd 0x%x with mode 0x%x completion timeout\n", cmd->command, cmd->argument3); + + if (!cmd->cmd_active) { + dev_err(hba->dev, "%s: Power Mode Change operation has been completed, go check UPMCRS\n", + __func__); + goto check_upmcrs; + } + ret = -ETIMEDOUT; goto out; } +check_upmcrs: status = ufshcd_get_upmcrs(hba); if (status != PWR_LOCAL) { dev_err(hba->dev, @@ -4902,11 +4921,14 @@ static irqreturn_t ufshcd_uic_cmd_compl(struct ufs_hba *hba, u32 intr_status) ufshcd_get_uic_cmd_result(hba); hba->active_uic_cmd->argument3 = ufshcd_get_dme_attr_val(hba); + if (!hba->uic_async_done) + hba->active_uic_cmd->cmd_active = 0; complete(&hba->active_uic_cmd->done); retval = IRQ_HANDLED; } if ((intr_status & UFSHCD_UIC_PWR_MASK) && hba->uic_async_done) { + hba->active_uic_cmd->cmd_active = 0; complete(hba->uic_async_done); retval = IRQ_HANDLED; } diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h index 47eb1430274c..e0f00a42371c 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/drivers/scsi/ufs/ufshcd.h @@ -64,6 +64,7 @@ enum dev_cmd_type { * @argument1: UIC command argument 1 * @argument2: UIC command argument 2 * @argument3: UIC command argument 3 + * @cmd_active: Indicate if UIC command is outstanding * @done: UIC command completion */ struct uic_command { @@ -71,6 +72,7 @@ struct uic_command { u32 argument1; u32 argument2; u32 argument3; + int cmd_active; struct completion done; }; From 9a5085b3fad5d5d6019a3d160cdd70357d35c8b1 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Mon, 19 Oct 2020 23:10:49 +0200 Subject: [PATCH 07/27] um: Call pgtable_pmd_page_dtor() in __pmd_free_tlb() Commit b2b29d6d0119 ("mm: account PMD tables like PTE tables") uncovered a bug in uml, we forgot to call the destructor. While we are here, give x a sane name. Reported-by: Anton Ivanov Co-developed-by: Matthew Wilcox (Oracle) Signed-off-by: Richard Weinberger Tested-by: Christopher Obbard --- arch/um/include/asm/pgalloc.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index 5393e13e07e0..2bbf28cf3aa9 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h @@ -33,7 +33,13 @@ do { \ } while (0) #ifdef CONFIG_3_LEVEL_PGTABLES -#define __pmd_free_tlb(tlb,x, address) tlb_remove_page((tlb),virt_to_page(x)) + +#define __pmd_free_tlb(tlb, pmd, address) \ +do { \ + pgtable_pmd_page_dtor(virt_to_page(pmd)); \ + tlb_remove_page((tlb),virt_to_page(pmd)); \ +} while (0) \ + #endif #endif From 2e6f11a797a24d1e2141a214a6dd6dfbe709f55d Mon Sep 17 00:00:00 2001 From: Qinglang Miao Date: Tue, 10 Nov 2020 15:42:23 +0800 Subject: [PATCH 08/27] scsi: ufshcd: Fix missing destroy_workqueue() Add the missing destroy_workqueue() before return from ufshcd_init in the error handling case as well as in ufshcd_remove. Link: https://lore.kernel.org/r/20201110074223.41280-1-miaoqinglang@huawei.com Fixes: 4db7a2360597 ("scsi: ufs: Fix concurrency of error handler and other error recovery paths") Suggested-by: Avri Altman Reviewed-by: Asutosh Das Reviewed-by: Avri Altman Signed-off-by: Qinglang Miao Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 5167c3e770a3..7a160b86adc6 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -8928,6 +8928,7 @@ void ufshcd_remove(struct ufs_hba *hba) blk_mq_free_tag_set(&hba->tmf_tag_set); blk_cleanup_queue(hba->cmd_queue); scsi_remove_host(hba->host); + destroy_workqueue(hba->eh_wq); /* disable interrupts */ ufshcd_disable_intr(hba, hba->intr_mask); ufshcd_hba_stop(hba); @@ -9228,6 +9229,7 @@ out_remove_scsi_host: exit_gating: ufshcd_exit_clk_scaling(hba); ufshcd_exit_clk_gating(hba); + destroy_workqueue(hba->eh_wq); out_disable: hba->is_irq_enabled = false; ufshcd_hba_exit(hba); From fd8feec665fef840277515a5c2b9b7c3e3970fad Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Wed, 11 Nov 2020 16:46:43 +0000 Subject: [PATCH 09/27] hwmon: (pwm-fan) Fix RPM calculation To convert the number of pulses counted into an RPM estimation, we need to divide by the width of our measurement interval instead of multiplying by it. If the width of the measurement interval is zero we don't update the RPM value to avoid dividing by zero. We also don't need to do 64-bit division, with 32-bits we can handle a fan running at over 4 million RPM. Signed-off-by: Paul Barker Link: https://lore.kernel.org/r/20201111164643.7087-1-pbarker@konsulko.com Signed-off-by: Guenter Roeck --- drivers/hwmon/pwm-fan.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/hwmon/pwm-fan.c b/drivers/hwmon/pwm-fan.c index bdba2143021a..1f63807c0399 100644 --- a/drivers/hwmon/pwm-fan.c +++ b/drivers/hwmon/pwm-fan.c @@ -54,16 +54,18 @@ static irqreturn_t pulse_handler(int irq, void *dev_id) static void sample_timer(struct timer_list *t) { struct pwm_fan_ctx *ctx = from_timer(ctx, t, rpm_timer); + unsigned int delta = ktime_ms_delta(ktime_get(), ctx->sample_start); int pulses; - u64 tmp; - pulses = atomic_read(&ctx->pulses); - atomic_sub(pulses, &ctx->pulses); - tmp = (u64)pulses * ktime_ms_delta(ktime_get(), ctx->sample_start) * 60; - do_div(tmp, ctx->pulses_per_revolution * 1000); - ctx->rpm = tmp; + if (delta) { + pulses = atomic_read(&ctx->pulses); + atomic_sub(pulses, &ctx->pulses); + ctx->rpm = (unsigned int)(pulses * 1000 * 60) / + (ctx->pulses_per_revolution * delta); + + ctx->sample_start = ktime_get(); + } - ctx->sample_start = ktime_get(); mod_timer(&ctx->rpm_timer, jiffies + HZ); } From 4d64bb4ba5ecf4831448cdb2fe16d0ae91b2b40b Mon Sep 17 00:00:00 2001 From: Brad Campbell Date: Thu, 12 Nov 2020 14:08:23 +1100 Subject: [PATCH 10/27] hwmon: (applesmc) Re-work SMC comms Commit fff2d0f701e6 ("hwmon: (applesmc) avoid overlong udelay()") introduced an issue whereby communication with the SMC became unreliable with write errors like : [ 120.378614] applesmc: send_byte(0x00, 0x0300) fail: 0x40 [ 120.378621] applesmc: LKSB: write data fail [ 120.512782] applesmc: send_byte(0x00, 0x0300) fail: 0x40 [ 120.512787] applesmc: LKSB: write data fail The original code appeared to be timing sensitive and was not reliable with the timing changes in the aforementioned commit. This patch re-factors the SMC communication to remove the timing dependencies and restore function with the changes previously committed. Tested on : MacbookAir6,2 MacBookPro11,1 iMac12,2, MacBookAir1,1, MacBookAir3,1 Fixes: fff2d0f701e6 ("hwmon: (applesmc) avoid overlong udelay()") Reported-by: Andreas Kemnade Tested-by: Andreas Kemnade # MacBookAir6,2 Acked-by: Arnd Bergmann Signed-off-by: Brad Campbell Signed-off-by: Henrik Rydberg Link: https://lore.kernel.org/r/194a7d71-a781-765a-d177-c962ef296b90@fnarfbargle.com Signed-off-by: Guenter Roeck --- drivers/hwmon/applesmc.c | 130 ++++++++++++++++++++++++--------------- 1 file changed, 82 insertions(+), 48 deletions(-) diff --git a/drivers/hwmon/applesmc.c b/drivers/hwmon/applesmc.c index a18887990f4a..79b498f816fe 100644 --- a/drivers/hwmon/applesmc.c +++ b/drivers/hwmon/applesmc.c @@ -32,6 +32,7 @@ #include #include #include +#include /* data port used by Apple SMC */ #define APPLESMC_DATA_PORT 0x300 @@ -42,10 +43,13 @@ #define APPLESMC_MAX_DATA_LENGTH 32 -/* wait up to 128 ms for a status change. */ -#define APPLESMC_MIN_WAIT 0x0010 -#define APPLESMC_RETRY_WAIT 0x0100 -#define APPLESMC_MAX_WAIT 0x20000 +/* Apple SMC status bits */ +#define SMC_STATUS_AWAITING_DATA BIT(0) /* SMC has data waiting to be read */ +#define SMC_STATUS_IB_CLOSED BIT(1) /* Will ignore any input */ +#define SMC_STATUS_BUSY BIT(2) /* Command in progress */ + +/* Initial wait is 8us */ +#define APPLESMC_MIN_WAIT 0x0008 #define APPLESMC_READ_CMD 0x10 #define APPLESMC_WRITE_CMD 0x11 @@ -151,65 +155,84 @@ static unsigned int key_at_index; static struct workqueue_struct *applesmc_led_wq; /* - * wait_read - Wait for a byte to appear on SMC port. Callers must - * hold applesmc_lock. + * Wait for specific status bits with a mask on the SMC. + * Used before all transactions. + * This does 10 fast loops of 8us then exponentially backs off for a + * minimum total wait of 262ms. Depending on usleep_range this could + * run out past 500ms. */ -static int wait_read(void) + +static int wait_status(u8 val, u8 mask) { - unsigned long end = jiffies + (APPLESMC_MAX_WAIT * HZ) / USEC_PER_SEC; u8 status; int us; + int i; - for (us = APPLESMC_MIN_WAIT; us < APPLESMC_MAX_WAIT; us <<= 1) { - usleep_range(us, us * 16); + us = APPLESMC_MIN_WAIT; + for (i = 0; i < 24 ; i++) { status = inb(APPLESMC_CMD_PORT); - /* read: wait for smc to settle */ - if (status & 0x01) + if ((status & mask) == val) return 0; - /* timeout: give up */ - if (time_after(jiffies, end)) - break; + usleep_range(us, us * 2); + if (i > 9) + us <<= 1; } - - pr_warn("wait_read() fail: 0x%02x\n", status); return -EIO; } -/* - * send_byte - Write to SMC port, retrying when necessary. Callers - * must hold applesmc_lock. - */ +/* send_byte - Write to SMC data port. Callers must hold applesmc_lock. */ + static int send_byte(u8 cmd, u16 port) { - u8 status; - int us; - unsigned long end = jiffies + (APPLESMC_MAX_WAIT * HZ) / USEC_PER_SEC; + int status; + + status = wait_status(0, SMC_STATUS_IB_CLOSED); + if (status) + return status; + /* + * This needs to be a separate read looking for bit 0x04 + * after bit 0x02 falls. If consolidated with the wait above + * this extra read may not happen if status returns both + * simultaneously and this would appear to be required. + */ + status = wait_status(SMC_STATUS_BUSY, SMC_STATUS_BUSY); + if (status) + return status; outb(cmd, port); - for (us = APPLESMC_MIN_WAIT; us < APPLESMC_MAX_WAIT; us <<= 1) { - usleep_range(us, us * 16); - status = inb(APPLESMC_CMD_PORT); - /* write: wait for smc to settle */ - if (status & 0x02) - continue; - /* ready: cmd accepted, return */ - if (status & 0x04) - return 0; - /* timeout: give up */ - if (time_after(jiffies, end)) - break; - /* busy: long wait and resend */ - udelay(APPLESMC_RETRY_WAIT); - outb(cmd, port); - } - - pr_warn("send_byte(0x%02x, 0x%04x) fail: 0x%02x\n", cmd, port, status); - return -EIO; + return 0; } +/* send_command - Write a command to the SMC. Callers must hold applesmc_lock. */ + static int send_command(u8 cmd) { - return send_byte(cmd, APPLESMC_CMD_PORT); + int ret; + + ret = wait_status(0, SMC_STATUS_IB_CLOSED); + if (ret) + return ret; + outb(cmd, APPLESMC_CMD_PORT); + return 0; +} + +/* + * Based on logic from the Apple driver. This is issued before any interaction + * If busy is stuck high, issue a read command to reset the SMC state machine. + * If busy is stuck high after the command then the SMC is jammed. + */ + +static int smc_sane(void) +{ + int ret; + + ret = wait_status(0, SMC_STATUS_BUSY); + if (!ret) + return ret; + ret = send_command(APPLESMC_READ_CMD); + if (ret) + return ret; + return wait_status(0, SMC_STATUS_BUSY); } static int send_argument(const char *key) @@ -226,6 +249,11 @@ static int read_smc(u8 cmd, const char *key, u8 *buffer, u8 len) { u8 status, data = 0; int i; + int ret; + + ret = smc_sane(); + if (ret) + return ret; if (send_command(cmd) || send_argument(key)) { pr_warn("%.4s: read arg fail\n", key); @@ -239,7 +267,8 @@ static int read_smc(u8 cmd, const char *key, u8 *buffer, u8 len) } for (i = 0; i < len; i++) { - if (wait_read()) { + if (wait_status(SMC_STATUS_AWAITING_DATA | SMC_STATUS_BUSY, + SMC_STATUS_AWAITING_DATA | SMC_STATUS_BUSY)) { pr_warn("%.4s: read data[%d] fail\n", key, i); return -EIO; } @@ -250,19 +279,24 @@ static int read_smc(u8 cmd, const char *key, u8 *buffer, u8 len) for (i = 0; i < 16; i++) { udelay(APPLESMC_MIN_WAIT); status = inb(APPLESMC_CMD_PORT); - if (!(status & 0x01)) + if (!(status & SMC_STATUS_AWAITING_DATA)) break; data = inb(APPLESMC_DATA_PORT); } if (i) pr_warn("flushed %d bytes, last value is: %d\n", i, data); - return 0; + return wait_status(0, SMC_STATUS_BUSY); } static int write_smc(u8 cmd, const char *key, const u8 *buffer, u8 len) { int i; + int ret; + + ret = smc_sane(); + if (ret) + return ret; if (send_command(cmd) || send_argument(key)) { pr_warn("%s: write arg fail\n", key); @@ -281,7 +315,7 @@ static int write_smc(u8 cmd, const char *key, const u8 *buffer, u8 len) } } - return 0; + return wait_status(0, SMC_STATUS_BUSY); } static int read_register_count(unsigned int *count) From c350f8bea271782e2733419bd2ab9bf4ec2051ef Mon Sep 17 00:00:00 2001 From: Chen Zhou Date: Thu, 12 Nov 2020 21:53:32 +0800 Subject: [PATCH 11/27] selinux: Fix error return code in sel_ib_pkey_sid_slow() Fix to return a negative error code from the error handling case instead of 0 in function sel_ib_pkey_sid_slow(), as done elsewhere in this function. Cc: stable@vger.kernel.org Fixes: 409dcf31538a ("selinux: Add a cache for quicker retreival of PKey SIDs") Reported-by: Hulk Robot Signed-off-by: Chen Zhou Signed-off-by: Paul Moore --- security/selinux/ibpkey.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/security/selinux/ibpkey.c b/security/selinux/ibpkey.c index f68a7617cfb9..3a63a989e55e 100644 --- a/security/selinux/ibpkey.c +++ b/security/selinux/ibpkey.c @@ -151,8 +151,10 @@ static int sel_ib_pkey_sid_slow(u64 subnet_prefix, u16 pkey_num, u32 *sid) * is valid, it just won't be added to the cache. */ new = kzalloc(sizeof(*new), GFP_ATOMIC); - if (!new) + if (!new) { + ret = -ENOMEM; goto out; + } new->psec.subnet_prefix = subnet_prefix; new->psec.pkey = pkey_num; From 60268b0e8258fdea9a3c9f4b51e161c123571db3 Mon Sep 17 00:00:00 2001 From: Naveen Krishna Chatradhi Date: Thu, 12 Nov 2020 22:51:59 +0530 Subject: [PATCH 12/27] hwmon: (amd_energy) modify the visibility of the counters This patch limits the visibility to owner and groups only for the energy counters exposed through the hwmon based amd_energy driver. Cc: stable@vger.kernel.org Reviewed-by: Greg Kroah-Hartman Signed-off-by: Naveen Krishna Chatradhi Link: https://lore.kernel.org/r/20201112172159.8781-1-nchatrad@amd.com Signed-off-by: Guenter Roeck --- drivers/hwmon/amd_energy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/amd_energy.c b/drivers/hwmon/amd_energy.c index d06597303d5a..3197cda7bcd9 100644 --- a/drivers/hwmon/amd_energy.c +++ b/drivers/hwmon/amd_energy.c @@ -171,7 +171,7 @@ static umode_t amd_energy_is_visible(const void *_data, enum hwmon_sensor_types type, u32 attr, int channel) { - return 0444; + return 0440; } static int energy_accumulator(void *p) From 38935861d85a4d9a353d1dd5a156c97700e2765d Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 13 Nov 2020 22:51:40 -0800 Subject: [PATCH 13/27] mm/compaction: count pages and stop correctly during page isolation In isolate_migratepages_block, when cc->alloc_contig is true, we are able to isolate compound pages. But nr_migratepages and nr_isolated did not count compound pages correctly, causing us to isolate more pages than we thought. So count compound pages as the number of base pages they contain. Otherwise, we might be trapped in too_many_isolated while loop, since the actual isolated pages can go up to COMPACT_CLUSTER_MAX*512=16384, where COMPACT_CLUSTER_MAX is 32, since we stop isolation after cc->nr_migratepages reaches to COMPACT_CLUSTER_MAX. In addition, after we fix the issue above, cc->nr_migratepages could never be equal to COMPACT_CLUSTER_MAX if compound pages are isolated, thus page isolation could not stop as we intended. Change the isolation stop condition to '>='. The issue can be triggered as follows: In a system with 16GB memory and an 8GB CMA region reserved by hugetlb_cma, if we first allocate 10GB THPs and mlock them (so some THPs are allocated in the CMA region and mlocked), reserving 6 1GB hugetlb pages via /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages will get stuck (looping in too_many_isolated function) until we kill either task. With the patch applied, oom will kill the application with 10GB THPs and let hugetlb page reservation finish. [ziy@nvidia.com: v3] Link: https://lkml.kernel.org/r/20201030183809.3616803-1-zi.yan@sent.com Fixes: 1da2f328fa64 ("cmm,thp,compaction,cma: allow THP migration for CMA allocations") Signed-off-by: Zi Yan Signed-off-by: Andrew Morton Reviewed-by: Yang Shi Acked-by: Vlastimil Babka Cc: Rik van Riel Cc: Michal Hocko Cc: Mel Gorman Cc: Link: https://lkml.kernel.org/r/20201029200435.3386066-1-zi.yan@sent.com Signed-off-by: Linus Torvalds --- mm/compaction.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 6e0ee5641788..d549a4521edf 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1012,8 +1012,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, isolate_success: list_add(&page->lru, &cc->migratepages); - cc->nr_migratepages++; - nr_isolated++; + cc->nr_migratepages += compound_nr(page); + nr_isolated += compound_nr(page); /* * Avoid isolating too much unless this block is being @@ -1021,7 +1021,7 @@ isolate_success: * or a lock is contended. For contention, isolate quickly to * potentially remove one source of contention. */ - if (cc->nr_migratepages == COMPACT_CLUSTER_MAX && + if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX && !cc->rescan && !cc->contended) { ++low_pfn; break; @@ -1132,7 +1132,7 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, if (!pfn) break; - if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) + if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX) break; } From d20bdd571ee5c9966191568527ecdb1bd4b52368 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 13 Nov 2020 22:51:43 -0800 Subject: [PATCH 14/27] mm/compaction: stop isolation if too many pages are isolated and we have pages to migrate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In isolate_migratepages_block, if we have too many isolated pages and nr_migratepages is not zero, we should try to migrate what we have without wasting time on isolating. In theory it's possible that multiple parallel compactions will cause too_many_isolated() to become true even if each has isolated less than COMPACT_CLUSTER_MAX, and loop forever in the while loop. Bailing immediately prevents that. [vbabka@suse.cz: changelog addition] Fixes: 1da2f328fa64 (“mm,thp,compaction,cma: allow THP migration for CMA allocations”) Suggested-by: Vlastimil Babka Signed-off-by: Zi Yan Signed-off-by: Andrew Morton Cc: Cc: Mel Gorman Cc: Michal Hocko Cc: Rik van Riel Cc: Yang Shi Link: https://lkml.kernel.org/r/20201030183809.3616803-2-zi.yan@sent.com Signed-off-by: Linus Torvalds --- mm/compaction.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index d549a4521edf..13cb7a961b31 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -817,6 +817,10 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * delay for some time until fewer pages are isolated */ while (unlikely(too_many_isolated(pgdat))) { + /* stop isolation if there are still pages not migrated */ + if (cc->nr_migratepages) + return 0; + /* async migration should just abort */ if (cc->mode == MIGRATE_ASYNC) return 0; From 2da9f6305f306ffbbb44790675799328fb73119d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 13 Nov 2020 22:51:46 -0800 Subject: [PATCH 15/27] mm/vmscan: fix NR_ISOLATED_FILE corruption on 64-bit Previously the negated unsigned long would be cast back to signed long which would have the correct negative value. After commit 730ec8c01a2b ("mm/vmscan.c: change prototype for shrink_page_list"), the large unsigned int converts to a large positive signed long. Symptoms include CMA allocations hanging forever holding the cma_mutex due to alloc_contig_range->...->isolate_migratepages_block waiting forever in "while (unlikely(too_many_isolated(pgdat)))". [akpm@linux-foundation.org: fix -stat.nr_lazyfree_fail as well, per Michal] Fixes: 730ec8c01a2b ("mm/vmscan.c: change prototype for shrink_page_list") Signed-off-by: Nicholas Piggin Signed-off-by: Andrew Morton Acked-by: Michal Hocko Cc: Vaneet Narang Cc: Maninder Singh Cc: Amit Sahrawat Cc: Mel Gorman Cc: Vlastimil Babka Cc: Link: https://lkml.kernel.org/r/20201029032320.1448441-1-npiggin@gmail.com Signed-off-by: Linus Torvalds --- mm/vmscan.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1b8f0e059767..7b4e31eac2cf 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1516,7 +1516,8 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, TTU_IGNORE_ACCESS, &stat, true); list_splice(&clean_pages, page_list); - mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -nr_reclaimed); + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, + -(long)nr_reclaimed); /* * Since lazyfree pages are isolated from file LRU from the beginning, * they will rotate back to anonymous LRU in the end if it failed to @@ -1526,7 +1527,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, stat.nr_lazyfree_fail); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, - -stat.nr_lazyfree_fail); + -(long)stat.nr_lazyfree_fail); return nr_reclaimed; } From 044747e971ace469064e68a0e8b3666011f0f3bd Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 13 Nov 2020 22:51:49 -0800 Subject: [PATCH 16/27] mailmap: fix entry for Dmitry Baryshkov/Eremin-Solenikov Change back surname to new (old) one. Dmitry Baryshkov -> Dmitry Eremin-Solenikov -> Dmitry Baryshkov. Map several odd entries to main identity. Signed-off-by: Dmitry Baryshkov Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/20201103005158.1181426-1-dmitry.baryshkov@linaro.org Signed-off-by: Linus Torvalds --- .mailmap | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index 1e14566a3d56..505b3d771964 100644 --- a/.mailmap +++ b/.mailmap @@ -82,7 +82,10 @@ Dengcheng Zhu Dengcheng Zhu Dengcheng Zhu -Dmitry Eremin-Solenikov +Dmitry Baryshkov +Dmitry Baryshkov <[dbaryshkov@gmail.com]> +Dmitry Baryshkov +Dmitry Baryshkov Dmitry Safonov <0x7f454c46@gmail.com> Dmitry Safonov <0x7f454c46@gmail.com> Dmitry Safonov <0x7f454c46@gmail.com> From 22e4663e916321b72972c69ca0c6b962f529bd78 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Fri, 13 Nov 2020 22:51:53 -0800 Subject: [PATCH 17/27] mm/slub: fix panic in slab_alloc_node() While doing memory hot-unplug operation on a PowerPC VM running 1024 CPUs with 11TB of ram, I hit the following panic: BUG: Kernel NULL pointer dereference on read at 0x00000007 Faulting instruction address: 0xc000000000456048 Oops: Kernel access of bad area, sig: 11 [#2] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS= 2048 NUMA pSeries Modules linked in: rpadlpar_io rpaphp CPU: 160 PID: 1 Comm: systemd Tainted: G D 5.9.0 #1 NIP: c000000000456048 LR: c000000000455fd4 CTR: c00000000047b350 REGS: c00006028d1b77a0 TRAP: 0300 Tainted: G D (5.9.0) MSR: 8000000000009033 CR: 24004228 XER: 00000000 CFAR: c00000000000f1b0 DAR: 0000000000000007 DSISR: 40000000 IRQMASK: 0 GPR00: c000000000455fd4 c00006028d1b7a30 c000000001bec800 0000000000000000 GPR04: 0000000000000dc0 0000000000000000 00000000000374ef c00007c53df99320 GPR08: 000007c53c980000 0000000000000000 000007c53c980000 0000000000000000 GPR12: 0000000000004400 c00000001e8e4400 0000000000000000 0000000000000f6a GPR16: 0000000000000000 c000000001c25930 c000000001d62528 00000000000000c1 GPR20: c000000001d62538 c00006be469e9000 0000000fffffffe0 c0000000003c0ff8 GPR24: 0000000000000018 0000000000000000 0000000000000dc0 0000000000000000 GPR28: c00007c513755700 c000000001c236a4 c00007bc4001f800 0000000000000001 NIP [c000000000456048] __kmalloc_node+0x108/0x790 LR [c000000000455fd4] __kmalloc_node+0x94/0x790 Call Trace: kvmalloc_node+0x58/0x110 mem_cgroup_css_online+0x10c/0x270 online_css+0x48/0xd0 cgroup_apply_control_enable+0x2c4/0x470 cgroup_mkdir+0x408/0x5f0 kernfs_iop_mkdir+0x90/0x100 vfs_mkdir+0x138/0x250 do_mkdirat+0x154/0x1c0 system_call_exception+0xf8/0x200 system_call_common+0xf0/0x27c Instruction dump: e93e0000 e90d0030 39290008 7cc9402a e94d0030 e93e0000 7ce95214 7f89502a 2fbc0000 419e0018 41920230 e9270010 <89290007> 7f994800 419e0220 7ee6bb78 This pointing to the following code: mm/slub.c:2851 if (unlikely(!object || !node_match(page, node))) { c000000000456038: 00 00 bc 2f cmpdi cr7,r28,0 c00000000045603c: 18 00 9e 41 beq cr7,c000000000456054 <__kmalloc_node+0x114> node_match(): mm/slub.c:2491 if (node != NUMA_NO_NODE && page_to_nid(page) != node) c000000000456040: 30 02 92 41 beq cr4,c000000000456270 <__kmalloc_node+0x330> page_to_nid(): include/linux/mm.h:1294 c000000000456044: 10 00 27 e9 ld r9,16(r7) c000000000456048: 07 00 29 89 lbz r9,7(r9) <<<< r9 = NULL node_match(): mm/slub.c:2491 c00000000045604c: 00 48 99 7f cmpw cr7,r25,r9 c000000000456050: 20 02 9e 41 beq cr7,c000000000456270 <__kmalloc_node+0x330> The panic occurred in slab_alloc_node() when checking for the page's node: object = c->freelist; page = c->page; if (unlikely(!object || !node_match(page, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); stat(s, ALLOC_SLOWPATH); The issue is that object is not NULL while page is NULL which is odd but may happen if the cache flush happened after loading object but before loading page. Thus checking for the page pointer is required too. The cache flush is done through an inter processor interrupt when a piece of memory is off-lined. That interrupt is triggered when a memory hot-unplug operation is initiated and offline_pages() is calling the slub's MEM_GOING_OFFLINE callback slab_mem_going_offline_callback() which is calling flush_cpu_slab(). If that interrupt is caught between the reading of c->freelist and the reading of c->page, this could lead to such a situation. That situation is expected and the later call to this_cpu_cmpxchg_double() will detect the change to c->freelist and redo the whole operation. In commit 6159d0f5c03e ("mm/slub.c: page is always non-NULL in node_match()") check on the page pointer has been removed assuming that page is always valid when it is called. It happens that this is not true in that particular case, so check for page before calling node_match() here. Fixes: 6159d0f5c03e ("mm/slub.c: page is always non-NULL in node_match()") Signed-off-by: Laurent Dufour Signed-off-by: Andrew Morton Acked-by: Vlastimil Babka Acked-by: Christoph Lameter Cc: Wei Yang Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Nathan Lynch Cc: Scott Cheloha Cc: Michal Hocko Cc: Link: https://lkml.kernel.org/r/20201027190406.33283-1-ldufour@linux.ibm.com Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index b30be2385d1c..34dcc09e2ec9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2852,7 +2852,7 @@ redo: object = c->freelist; page = c->page; - if (unlikely(!object || !node_match(page, node))) { + if (unlikely(!object || !page || !node_match(page, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); } else { void *next_object = get_freepointer_safe(s, object); From 96e1fac162cc0086c50b2b14062112adb2ba640e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 13 Nov 2020 22:51:56 -0800 Subject: [PATCH 18/27] mm/gup: use unpin_user_pages() in __gup_longterm_locked() When FOLL_PIN is passed to __get_user_pages() the page list must be put back using unpin_user_pages() otherwise the page pin reference persists in a corrupted state. There are two places in the unwind of __gup_longterm_locked() that put the pages back without checking. Normally on error this function would return the partial page list making this the caller's responsibility, but in these two cases the caller is not allowed to see these pages at all. Fixes: 3faa52c03f44 ("mm/gup: track FOLL_PIN pages") Reported-by: Ira Weiny Signed-off-by: Jason Gunthorpe Signed-off-by: Andrew Morton Reviewed-by: Ira Weiny Reviewed-by: John Hubbard Cc: Aneesh Kumar K.V Cc: Dan Williams Cc: Link: https://lkml.kernel.org/r/0-v2-3ae7d9d162e2+2a7-gup_cma_fix_jgg@nvidia.com Signed-off-by: Linus Torvalds --- mm/gup.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 102877ed77a4..98eb8e6d2609 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1647,8 +1647,11 @@ check_again: /* * drop the above get_user_pages reference. */ - for (i = 0; i < nr_pages; i++) - put_page(pages[i]); + if (gup_flags & FOLL_PIN) + unpin_user_pages(pages, nr_pages); + else + for (i = 0; i < nr_pages; i++) + put_page(pages[i]); if (migrate_pages(&cma_page_list, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_CONTIG_RANGE)) { @@ -1728,8 +1731,11 @@ static long __gup_longterm_locked(struct mm_struct *mm, goto out; if (check_dax_vmas(vmas_tmp, rc)) { - for (i = 0; i < rc; i++) - put_page(pages[i]); + if (gup_flags & FOLL_PIN) + unpin_user_pages(pages, rc); + else + for (i = 0; i < rc; i++) + put_page(pages[i]); rc = -EOPNOTSUPP; goto out; } From 3347acc6fcd4ee71ad18a9ff9d9dac176b517329 Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Fri, 13 Nov 2020 22:51:59 -0800 Subject: [PATCH 19/27] compiler.h: fix barrier_data() on clang Commit 815f0ddb346c ("include/linux/compiler*.h: make compiler-*.h mutually exclusive") neglected to copy barrier_data() from compiler-gcc.h into compiler-clang.h. The definition in compiler-gcc.h was really to work around clang's more aggressive optimization, so this broke barrier_data() on clang, and consequently memzero_explicit() as well. For example, this results in at least the memzero_explicit() call in lib/crypto/sha256.c:sha256_transform() being optimized away by clang. Fix this by moving the definition of barrier_data() into compiler.h. Also move the gcc/clang definition of barrier() into compiler.h, __memory_barrier() is icc-specific (and barrier() is already defined using it in compiler-intel.h) and doesn't belong in compiler.h. [rdunlap@infradead.org: fix ALPHA builds when SMP is not enabled] Link: https://lkml.kernel.org/r/20201101231835.4589-1-rdunlap@infradead.org Fixes: 815f0ddb346c ("include/linux/compiler*.h: make compiler-*.h mutually exclusive") Signed-off-by: Arvind Sankar Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Tested-by: Nick Desaulniers Reviewed-by: Nick Desaulniers Reviewed-by: Kees Cook Cc: Link: https://lkml.kernel.org/r/20201014212631.207844-1-nivedita@alum.mit.edu Signed-off-by: Linus Torvalds --- include/asm-generic/barrier.h | 1 + include/linux/compiler-clang.h | 6 ------ include/linux/compiler-gcc.h | 19 ------------------- include/linux/compiler.h | 18 ++++++++++++++++-- 4 files changed, 17 insertions(+), 27 deletions(-) diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 798027bb89be..640f09479bdf 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -13,6 +13,7 @@ #ifndef __ASSEMBLY__ +#include #include #ifndef nop diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 230604e7f057..dd7233c48bf3 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -60,12 +60,6 @@ #define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 #endif -/* The following are for compatibility with GCC, from compiler-gcc.h, - * and may be redefined here because they should not be shared with other - * compilers, like ICC. - */ -#define barrier() __asm__ __volatile__("" : : : "memory") - #if __has_feature(shadow_call_stack) # define __noscs __attribute__((__no_sanitize__("shadow-call-stack"))) #endif diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 5deb37024574..74c6c0486eed 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -15,25 +15,6 @@ # error Sorry, your version of GCC is too old - please use 4.9 or newer. #endif -/* Optimization barrier */ - -/* The "volatile" is due to gcc bugs */ -#define barrier() __asm__ __volatile__("": : :"memory") -/* - * This version is i.e. to prevent dead stores elimination on @ptr - * where gcc and llvm may behave differently when otherwise using - * normal barrier(): while gcc behavior gets along with a normal - * barrier(), llvm needs an explicit input variable to be assumed - * clobbered. The issue is as follows: while the inline asm might - * access any memory it wants, the compiler could have fit all of - * @ptr into memory registers instead, and since @ptr never escaped - * from that, it proved that the inline asm wasn't touching any of - * it. This version works well with both compilers, i.e. we're telling - * the compiler that the inline asm absolutely may see the contents - * of @ptr. See also: https://llvm.org/bugs/show_bug.cgi?id=15495 - */ -#define barrier_data(ptr) __asm__ __volatile__("": :"r"(ptr) :"memory") - /* * This macro obfuscates arithmetic on a variable address so that gcc * shouldn't recognize the original var, and make assumptions about it. diff --git a/include/linux/compiler.h b/include/linux/compiler.h index e512f5505dad..b8fe0c23cfff 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -80,11 +80,25 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, /* Optimization barrier */ #ifndef barrier -# define barrier() __memory_barrier() +/* The "volatile" is due to gcc bugs */ +# define barrier() __asm__ __volatile__("": : :"memory") #endif #ifndef barrier_data -# define barrier_data(ptr) barrier() +/* + * This version is i.e. to prevent dead stores elimination on @ptr + * where gcc and llvm may behave differently when otherwise using + * normal barrier(): while gcc behavior gets along with a normal + * barrier(), llvm needs an explicit input variable to be assumed + * clobbered. The issue is as follows: while the inline asm might + * access any memory it wants, the compiler could have fit all of + * @ptr into memory registers instead, and since @ptr never escaped + * from that, it proved that the inline asm wasn't touching any of + * it. This version works well with both compilers, i.e. we're telling + * the compiler that the inline asm absolutely may see the contents + * of @ptr. See also: https://llvm.org/bugs/show_bug.cgi?id=15495 + */ +# define barrier_data(ptr) __asm__ __volatile__("": :"r"(ptr) :"memory") #endif /* workaround for GCC PR82365 if needed */ From 8b92c4ff4423aa9900cf838d3294fcade4dbda35 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Fri, 13 Nov 2020 22:52:02 -0800 Subject: [PATCH 20/27] Revert "kernel/reboot.c: convert simple_strtoul to kstrtoint" Patch series "fix parsing of reboot= cmdline", v3. The parsing of the reboot= cmdline has two major errors: - a missing bound check can crash the system on reboot - parsing of the cpu number only works if specified last Fix both. This patch (of 2): This reverts commit 616feab753972b97. kstrtoint() and simple_strtoul() have a subtle difference which makes them non interchangeable: if a non digit character is found amid the parsing, the former will return an error, while the latter will just stop parsing, e.g. simple_strtoul("123xyx") = 123. The kernel cmdline reboot= argument allows to specify the CPU used for rebooting, with the syntax `s####` among the other flags, e.g. "reboot=warm,s31,force", so if this flag is not the last given, it's silently ignored as well as the subsequent ones. Fixes: 616feab75397 ("kernel/reboot.c: convert simple_strtoul to kstrtoint") Signed-off-by: Matteo Croce Signed-off-by: Andrew Morton Cc: Guenter Roeck Cc: Petr Mladek Cc: Arnd Bergmann Cc: Mike Rapoport Cc: Kees Cook Cc: Pavel Tatashin Cc: Robin Holt Cc: Fabian Frederick Cc: Greg Kroah-Hartman Cc: Link: https://lkml.kernel.org/r/20201103214025.116799-2-mcroce@linux.microsoft.com Signed-off-by: Linus Torvalds --- kernel/reboot.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/kernel/reboot.c b/kernel/reboot.c index e7b78d5ae1ab..8fbba433725e 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -551,22 +551,15 @@ static int __init reboot_setup(char *str) break; case 's': - { - int rc; - - if (isdigit(*(str+1))) { - rc = kstrtoint(str+1, 0, &reboot_cpu); - if (rc) - return rc; - } else if (str[1] == 'm' && str[2] == 'p' && - isdigit(*(str+3))) { - rc = kstrtoint(str+3, 0, &reboot_cpu); - if (rc) - return rc; - } else + if (isdigit(*(str+1))) + reboot_cpu = simple_strtoul(str+1, NULL, 0); + else if (str[1] == 'm' && str[2] == 'p' && + isdigit(*(str+3))) + reboot_cpu = simple_strtoul(str+3, NULL, 0); + else *mode = REBOOT_SOFT; break; - } + case 'g': *mode = REBOOT_GPIO; break; From df5b0ab3e08a156701b537809914b339b0daa526 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Fri, 13 Nov 2020 22:52:07 -0800 Subject: [PATCH 21/27] reboot: fix overflow parsing reboot cpu number Limit the CPU number to num_possible_cpus(), because setting it to a value lower than INT_MAX but higher than NR_CPUS produces the following error on reboot and shutdown: BUG: unable to handle page fault for address: ffffffff90ab1bb0 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 1c09067 P4D 1c09067 PUD 1c0a063 PMD 0 Oops: 0000 [#1] SMP CPU: 1 PID: 1 Comm: systemd-shutdow Not tainted 5.9.0-rc8-kvm #110 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 RIP: 0010:migrate_to_reboot_cpu+0xe/0x60 Code: ea ea 00 48 89 fa 48 c7 c7 30 57 f1 81 e9 fa ef ff ff 66 2e 0f 1f 84 00 00 00 00 00 53 8b 1d d5 ea ea 00 e8 14 33 fe ff 89 da <48> 0f a3 15 ea fc bd 00 48 89 d0 73 29 89 c2 c1 e8 06 65 48 8b 3c RSP: 0018:ffffc90000013e08 EFLAGS: 00010246 RAX: ffff88801f0a0000 RBX: 0000000077359400 RCX: 0000000000000000 RDX: 0000000077359400 RSI: 0000000000000002 RDI: ffffffff81c199e0 RBP: ffffffff81c1e3c0 R08: ffff88801f41f000 R09: ffffffff81c1e348 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 00007f32bedf8830 R14: 00000000fee1dead R15: 0000000000000000 FS: 00007f32bedf8980(0000) GS:ffff88801f480000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffff90ab1bb0 CR3: 000000001d057000 CR4: 00000000000006a0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __do_sys_reboot.cold+0x34/0x5b do_syscall_64+0x2d/0x40 Fixes: 1b3a5d02ee07 ("reboot: move arch/x86 reboot= handling to generic kernel") Signed-off-by: Matteo Croce Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Fabian Frederick Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: Kees Cook Cc: Mike Rapoport Cc: Pavel Tatashin Cc: Petr Mladek Cc: Robin Holt Cc: Link: https://lkml.kernel.org/r/20201103214025.116799-3-mcroce@linux.microsoft.com Signed-off-by: Linus Torvalds --- kernel/reboot.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/reboot.c b/kernel/reboot.c index 8fbba433725e..af6f23d8bea1 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -558,6 +558,13 @@ static int __init reboot_setup(char *str) reboot_cpu = simple_strtoul(str+3, NULL, 0); else *mode = REBOOT_SOFT; + if (reboot_cpu >= num_possible_cpus()) { + pr_err("Ignoring the CPU number in reboot= option. " + "CPU %d exceeds possible cpu number %d\n", + reboot_cpu, num_possible_cpus()); + reboot_cpu = 0; + break; + } break; case 'g': From e7e046155af04cdca5e1157f28b07e1651eb317b Mon Sep 17 00:00:00 2001 From: Santosh Sivaraj Date: Fri, 13 Nov 2020 22:52:10 -0800 Subject: [PATCH 22/27] kernel/watchdog: fix watchdog_allowed_mask not used warning Define watchdog_allowed_mask only when SOFTLOCKUP_DETECTOR is enabled. Fixes: 7feeb9cd4f5b ("watchdog/sysctl: Clean up sysctl variable name space") Signed-off-by: Santosh Sivaraj Signed-off-by: Andrew Morton Reviewed-by: Petr Mladek Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20201106015025.1281561-1-santosh@fossix.org Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5abb5b22ad13..71109065bd8e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -44,8 +44,6 @@ int __read_mostly soft_watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; static int __read_mostly nmi_watchdog_available; -static struct cpumask watchdog_allowed_mask __read_mostly; - struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -162,6 +160,8 @@ static void lockup_detector_update_enable(void) int __read_mostly sysctl_softlockup_all_cpu_backtrace; #endif +static struct cpumask watchdog_allowed_mask __read_mostly; + /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; From 8b21ca0218d29cc6bb7028125c7e5a10dfb4730c Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 13 Nov 2020 22:52:13 -0800 Subject: [PATCH 23/27] mm: memcontrol: fix missing wakeup polling thread When we poll the swap.events, we can miss being woken up when the swap event occurs. Because we didn't notify. Fixes: f3a53a3a1e5b ("mm, memcontrol: implement memory.swap.events") Signed-off-by: Muchun Song Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Roman Gushchin Cc: Michal Hocko Cc: Yafang Shao Cc: Chris Down Cc: Tejun Heo Link: https://lkml.kernel.org/r/20201105161936.98312-1-songmuchun@bytedance.com Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e391e3c56de5..a80c59af2c60 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -900,12 +900,19 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { + bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || + event == MEMCG_SWAP_FAIL; + atomic_long_inc(&memcg->memory_events_local[event]); - cgroup_file_notify(&memcg->events_local_file); + if (!swap_event) + cgroup_file_notify(&memcg->events_local_file); do { atomic_long_inc(&memcg->memory_events[event]); - cgroup_file_notify(&memcg->events_file); + if (swap_event) + cgroup_file_notify(&memcg->swap_events_file); + else + cgroup_file_notify(&memcg->events_file); if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) break; From 336bf30eb76580b579dc711ded5d599d905c0217 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 13 Nov 2020 22:52:16 -0800 Subject: [PATCH 24/27] hugetlbfs: fix anon huge page migration race Qian Cai reported the following BUG in [1] LTP: starting move_pages12 BUG: unable to handle page fault for address: ffffffffffffffe0 ... RIP: 0010:anon_vma_interval_tree_iter_first+0xa2/0x170 avc_start_pgoff at mm/interval_tree.c:63 Call Trace: rmap_walk_anon+0x141/0xa30 rmap_walk_anon at mm/rmap.c:1864 try_to_unmap+0x209/0x2d0 try_to_unmap at mm/rmap.c:1763 migrate_pages+0x1005/0x1fb0 move_pages_and_store_status.isra.47+0xd7/0x1a0 __x64_sys_move_pages+0xa5c/0x1100 do_syscall_64+0x5f/0x310 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Hugh Dickins diagnosed this as a migration bug caused by code introduced to use i_mmap_rwsem for pmd sharing synchronization. Specifically, the routine unmap_and_move_huge_page() is always passing the TTU_RMAP_LOCKED flag to try_to_unmap() while holding i_mmap_rwsem. This is wrong for anon pages as the anon_vma_lock should be held in this case. Further analysis suggested that i_mmap_rwsem was not required to he held at all when calling try_to_unmap for anon pages as an anon page could never be part of a shared pmd mapping. Discussion also revealed that the hack in hugetlb_page_mapping_lock_write to drop page lock and acquire i_mmap_rwsem is wrong. There is no way to keep mapping valid while dropping page lock. This patch does the following: - Do not take i_mmap_rwsem and set TTU_RMAP_LOCKED for anon pages when calling try_to_unmap. - Remove the hacky code in hugetlb_page_mapping_lock_write. The routine will now simply do a 'trylock' while still holding the page lock. If the trylock fails, it will return NULL. This could impact the callers: - migration calling code will receive -EAGAIN and retry up to the hard coded limit (10). - memory error code will treat the page as BUSY. This will force killing (SIGKILL) instead of SIGBUS any mapping tasks. Do note that this change in behavior only happens when there is a race. None of the standard kernel testing suites actually hit this race, but it is possible. [1] https://lore.kernel.org/lkml/20200708012044.GC992@lca.pw/ [2] https://lore.kernel.org/linux-mm/alpine.LSU.2.11.2010071833100.2214@eggly.anvils/ Fixes: c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization") Reported-by: Qian Cai Suggested-by: Hugh Dickins Signed-off-by: Mike Kravetz Signed-off-by: Andrew Morton Acked-by: Naoya Horiguchi Cc: Link: https://lkml.kernel.org/r/20201105195058.78401-1-mike.kravetz@oracle.com Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 90 +++------------------------------------------ mm/memory-failure.c | 36 +++++++++--------- mm/migrate.c | 44 ++++++++++++---------- mm/rmap.c | 5 +-- 4 files changed, 47 insertions(+), 128 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5a620f690911..37f15c3c24dc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1567,104 +1567,24 @@ int PageHeadHuge(struct page *page_head) return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR; } -/* - * Find address_space associated with hugetlbfs page. - * Upon entry page is locked and page 'was' mapped although mapped state - * could change. If necessary, use anon_vma to find vma and associated - * address space. The returned mapping may be stale, but it can not be - * invalid as page lock (which is held) is required to destroy mapping. - */ -static struct address_space *_get_hugetlb_page_mapping(struct page *hpage) -{ - struct anon_vma *anon_vma; - pgoff_t pgoff_start, pgoff_end; - struct anon_vma_chain *avc; - struct address_space *mapping = page_mapping(hpage); - - /* Simple file based mapping */ - if (mapping) - return mapping; - - /* - * Even anonymous hugetlbfs mappings are associated with an - * underlying hugetlbfs file (see hugetlb_file_setup in mmap - * code). Find a vma associated with the anonymous vma, and - * use the file pointer to get address_space. - */ - anon_vma = page_lock_anon_vma_read(hpage); - if (!anon_vma) - return mapping; /* NULL */ - - /* Use first found vma */ - pgoff_start = page_to_pgoff(hpage); - pgoff_end = pgoff_start + pages_per_huge_page(page_hstate(hpage)) - 1; - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, - pgoff_start, pgoff_end) { - struct vm_area_struct *vma = avc->vma; - - mapping = vma->vm_file->f_mapping; - break; - } - - anon_vma_unlock_read(anon_vma); - return mapping; -} - /* * Find and lock address space (mapping) in write mode. * - * Upon entry, the page is locked which allows us to find the mapping - * even in the case of an anon page. However, locking order dictates - * the i_mmap_rwsem be acquired BEFORE the page lock. This is hugetlbfs - * specific. So, we first try to lock the sema while still holding the - * page lock. If this works, great! If not, then we need to drop the - * page lock and then acquire i_mmap_rwsem and reacquire page lock. Of - * course, need to revalidate state along the way. + * Upon entry, the page is locked which means that page_mapping() is + * stable. Due to locking order, we can only trylock_write. If we can + * not get the lock, simply return NULL to caller. */ struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) { - struct address_space *mapping, *mapping2; + struct address_space *mapping = page_mapping(hpage); - mapping = _get_hugetlb_page_mapping(hpage); -retry: if (!mapping) return mapping; - /* - * If no contention, take lock and return - */ if (i_mmap_trylock_write(mapping)) return mapping; - /* - * Must drop page lock and wait on mapping sema. - * Note: Once page lock is dropped, mapping could become invalid. - * As a hack, increase map count until we lock page again. - */ - atomic_inc(&hpage->_mapcount); - unlock_page(hpage); - i_mmap_lock_write(mapping); - lock_page(hpage); - atomic_add_negative(-1, &hpage->_mapcount); - - /* verify page is still mapped */ - if (!page_mapped(hpage)) { - i_mmap_unlock_write(mapping); - return NULL; - } - - /* - * Get address space again and verify it is the same one - * we locked. If not, drop lock and retry. - */ - mapping2 = _get_hugetlb_page_mapping(hpage); - if (mapping2 != mapping) { - i_mmap_unlock_write(mapping); - mapping = mapping2; - goto retry; - } - - return mapping; + return NULL; } pgoff_t __basepage_index(struct page *page) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c0bb186bba62..5d880d4eb9a2 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1057,27 +1057,25 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, if (!PageHuge(hpage)) { unmap_success = try_to_unmap(hpage, ttu); } else { - /* - * For hugetlb pages, try_to_unmap could potentially call - * huge_pmd_unshare. Because of this, take semaphore in - * write mode here and set TTU_RMAP_LOCKED to indicate we - * have taken the lock at this higer level. - * - * Note that the call to hugetlb_page_mapping_lock_write - * is necessary even if mapping is already set. It handles - * ugliness of potentially having to drop page lock to obtain - * i_mmap_rwsem. - */ - mapping = hugetlb_page_mapping_lock_write(hpage); - - if (mapping) { - unmap_success = try_to_unmap(hpage, + if (!PageAnon(hpage)) { + /* + * For hugetlb pages in shared mappings, try_to_unmap + * could potentially call huge_pmd_unshare. Because of + * this, take semaphore in write mode here and set + * TTU_RMAP_LOCKED to indicate we have taken the lock + * at this higer level. + */ + mapping = hugetlb_page_mapping_lock_write(hpage); + if (mapping) { + unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED); - i_mmap_unlock_write(mapping); + i_mmap_unlock_write(mapping); + } else { + pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn); + unmap_success = false; + } } else { - pr_info("Memory failure: %#lx: could not find mapping for mapped huge page\n", - pfn); - unmap_success = false; + unmap_success = try_to_unmap(hpage, ttu); } } if (!unmap_success) diff --git a/mm/migrate.c b/mm/migrate.c index 5ca5842df5db..5795cb82e27c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1328,34 +1328,38 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, goto put_anon; if (page_mapped(hpage)) { - /* - * try_to_unmap could potentially call huge_pmd_unshare. - * Because of this, take semaphore in write mode here and - * set TTU_RMAP_LOCKED to let lower levels know we have - * taken the lock. - */ - mapping = hugetlb_page_mapping_lock_write(hpage); - if (unlikely(!mapping)) - goto unlock_put_anon; + bool mapping_locked = false; + enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK| + TTU_IGNORE_ACCESS; - try_to_unmap(hpage, - TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS| - TTU_RMAP_LOCKED); + if (!PageAnon(hpage)) { + /* + * In shared mappings, try_to_unmap could potentially + * call huge_pmd_unshare. Because of this, take + * semaphore in write mode here and set TTU_RMAP_LOCKED + * to let lower levels know we have taken the lock. + */ + mapping = hugetlb_page_mapping_lock_write(hpage); + if (unlikely(!mapping)) + goto unlock_put_anon; + + mapping_locked = true; + ttu |= TTU_RMAP_LOCKED; + } + + try_to_unmap(hpage, ttu); page_was_mapped = 1; - /* - * Leave mapping locked until after subsequent call to - * remove_migration_ptes() - */ + + if (mapping_locked) + i_mmap_unlock_write(mapping); } if (!page_mapped(hpage)) rc = move_to_new_page(new_hpage, hpage, mode); - if (page_was_mapped) { + if (page_was_mapped) remove_migration_ptes(hpage, - rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, true); - i_mmap_unlock_write(mapping); - } + rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false); unlock_put_anon: unlock_page(new_hpage); diff --git a/mm/rmap.c b/mm/rmap.c index 1b84945d655c..31b29321adfe 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1413,9 +1413,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* * If sharing is possible, start and end will be adjusted * accordingly. - * - * If called for a huge page, caller must hold i_mmap_rwsem - * in write mode as it is possible to call huge_pmd_unshare. */ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); @@ -1462,7 +1459,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); address = pvmw.address; - if (PageHuge(page)) { + if (PageHuge(page) && !PageAnon(page)) { /* * To call huge_pmd_unshare, i_mmap_rwsem must be * held in write mode. Caller needs to explicitly From 2f31ad64a9cce8b2409d2d4563482adfb8664082 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 13 Nov 2020 22:52:20 -0800 Subject: [PATCH 25/27] panic: don't dump stack twice on warn Before commit 3f388f28639f ("panic: dump registers on panic_on_warn"), __warn() was calling show_regs() when regs was not NULL, and show_stack() otherwise. After that commit, show_stack() is called regardless of whether show_regs() has been called or not, leading to duplicated Call Trace: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 1 at arch/powerpc/mm/nohash/8xx.c:186 mmu_mark_initmem_nx+0x24/0x94 CPU: 0 PID: 1 Comm: swapper Not tainted 5.10.0-rc2-s3k-dev-01375-gf46ec0d3ecbd-dirty #4092 NIP: c00128b4 LR: c0010228 CTR: 00000000 REGS: c9023e40 TRAP: 0700 Not tainted (5.10.0-rc2-s3k-dev-01375-gf46ec0d3ecbd-dirty) MSR: 00029032 CR: 24000424 XER: 00000000 GPR00: c0010228 c9023ef8 c2100000 0074c000 ffffffff 00000000 c2151000 c07b3880 GPR08: ff000900 0074c000 c8000000 c33b53a8 24000822 00000000 c0003a20 00000000 GPR16: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 GPR24: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00800000 NIP [c00128b4] mmu_mark_initmem_nx+0x24/0x94 LR [c0010228] free_initmem+0x20/0x58 Call Trace: free_initmem+0x20/0x58 kernel_init+0x1c/0x114 ret_from_kernel_thread+0x14/0x1c Instruction dump: 7d291850 7d234b78 4e800020 9421ffe0 7c0802a6 bfc10018 3fe0c060 3bff0000 3fff4080 3bffffff 90010024 57ff0010 <0fe00000> 392001cd 7c3e0b78 953e0008 CPU: 0 PID: 1 Comm: swapper Not tainted 5.10.0-rc2-s3k-dev-01375-gf46ec0d3ecbd-dirty #4092 Call Trace: __warn+0x8c/0xd8 (unreliable) report_bug+0x11c/0x154 program_check_exception+0x1dc/0x6e0 ret_from_except_full+0x0/0x4 --- interrupt: 700 at mmu_mark_initmem_nx+0x24/0x94 LR = free_initmem+0x20/0x58 free_initmem+0x20/0x58 kernel_init+0x1c/0x114 ret_from_kernel_thread+0x14/0x1c ---[ end trace 31702cd2a9570752 ]--- Only call show_stack() when regs is NULL. Fixes: 3f388f28639f ("panic: dump registers on panic_on_warn") Signed-off-by: Christophe Leroy Signed-off-by: Andrew Morton Cc: Alexey Kardashevskiy Cc: Kefeng Wang Link: https://lkml.kernel.org/r/e8c055458b080707f1bc1a98ff8bea79d0cec445.1604748361.git.christophe.leroy@csgroup.eu Signed-off-by: Linus Torvalds --- kernel/panic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/panic.c b/kernel/panic.c index 396142ee43fd..332736a72a58 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -605,7 +605,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, panic("panic_on_warn set ...\n"); } - dump_stack(); + if (!regs) + dump_stack(); print_irqtrace_events(current); From f5785283dd64867a711ca1fb1f5bb172f252ecdf Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Fri, 13 Nov 2020 22:52:23 -0800 Subject: [PATCH 26/27] ocfs2: initialize ip_next_orphan Though problem if found on a lower 4.1.12 kernel, I think upstream has same issue. In one node in the cluster, there is the following callback trace: # cat /proc/21473/stack __ocfs2_cluster_lock.isra.36+0x336/0x9e0 [ocfs2] ocfs2_inode_lock_full_nested+0x121/0x520 [ocfs2] ocfs2_evict_inode+0x152/0x820 [ocfs2] evict+0xae/0x1a0 iput+0x1c6/0x230 ocfs2_orphan_filldir+0x5d/0x100 [ocfs2] ocfs2_dir_foreach_blk+0x490/0x4f0 [ocfs2] ocfs2_dir_foreach+0x29/0x30 [ocfs2] ocfs2_recover_orphans+0x1b6/0x9a0 [ocfs2] ocfs2_complete_recovery+0x1de/0x5c0 [ocfs2] process_one_work+0x169/0x4a0 worker_thread+0x5b/0x560 kthread+0xcb/0xf0 ret_from_fork+0x61/0x90 The above stack is not reasonable, the final iput shouldn't happen in ocfs2_orphan_filldir() function. Looking at the code, 2067 /* Skip inodes which are already added to recover list, since dio may 2068 * happen concurrently with unlink/rename */ 2069 if (OCFS2_I(iter)->ip_next_orphan) { 2070 iput(iter); 2071 return 0; 2072 } 2073 The logic thinks the inode is already in recover list on seeing ip_next_orphan is non-NULL, so it skip this inode after dropping a reference which incremented in ocfs2_iget(). While, if the inode is already in recover list, it should have another reference and the iput() at line 2070 should not be the final iput (dropping the last reference). So I don't think the inode is really in the recover list (no vmcore to confirm). Note that ocfs2_queue_orphans(), though not shown up in the call back trace, is holding cluster lock on the orphan directory when looking up for unlinked inodes. The on disk inode eviction could involve a lot of IOs which may need long time to finish. That means this node could hold the cluster lock for very long time, that can lead to the lock requests (from other nodes) to the orhpan directory hang for long time. Looking at more on ip_next_orphan, I found it's not initialized when allocating a new ocfs2_inode_info structure. This causes te reflink operations from some nodes hang for very long time waiting for the cluster lock on the orphan directory. Fix: initialize ip_next_orphan as NULL. Signed-off-by: Wengang Wang Signed-off-by: Andrew Morton Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: Link: https://lkml.kernel.org/r/20201109171746.27884-1-wen.gang.wang@oracle.com Signed-off-by: Linus Torvalds --- fs/ocfs2/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 1d91dd1e8711..2febc76e9de7 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1713,6 +1713,7 @@ static void ocfs2_inode_init_once(void *data) oi->ip_blkno = 0ULL; oi->ip_clusters = 0; + oi->ip_next_orphan = NULL; ocfs2_resv_init_once(&oi->ip_la_data_resv); From 3ad216ee73abc554ed8f13f4f8b70845a7bef6da Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 14 Nov 2020 17:27:57 +0000 Subject: [PATCH 27/27] afs: Fix afs_write_end() when called with copied == 0 [ver #3] When afs_write_end() is called with copied == 0, it tries to set the dirty region, but there's no way to actually encode a 0-length region in the encoding in page->private. "0,0", for example, indicates a 1-byte region at offset 0. The maths miscalculates this and sets it incorrectly. Fix it to just do nothing but unlock and put the page in this case. We don't actually need to mark the page dirty as nothing presumably changed. Fixes: 65dd2d6072d3 ("afs: Alter dirty range encoding in page->private") Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/afs/write.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/afs/write.c b/fs/afs/write.c index 50371207f327..c9195fc67fd8 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -169,11 +169,14 @@ int afs_write_end(struct file *file, struct address_space *mapping, unsigned int f, from = pos & (PAGE_SIZE - 1); unsigned int t, to = from + copied; loff_t i_size, maybe_i_size; - int ret; + int ret = 0; _enter("{%llx:%llu},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index); + if (copied == 0) + goto out; + maybe_i_size = pos + copied; i_size = i_size_read(&vnode->vfs_inode);