From 64a38e840ce5940253208eaba40265c73decc4ee Mon Sep 17 00:00:00 2001
From: Dave Wysochanski
Date: Fri, 26 Jul 2019 18:33:01 -0400
Subject: [PATCH 001/334] SUNRPC: Track writers of the 'channel' file to improve cache_listeners_exist

The sunrpc cache interface is susceptible to being fooled by a rogue
process just reading a 'channel' file. If this happens, the kernel may
think a valid daemon exists to service the cache when it does not. For
example, the following may fool the kernel:

  cat /proc/net/rpc/auth.unix.gid/channel

Change the tracking of readers to writers when considering whether a
listener exists, as all valid daemon processes open a channel file
either O_RDWR or O_WRONLY. While this does not prevent a rogue process
from "stealing" a message from the kernel, it does at least improve the
kernel's perception of whether a valid process servicing the cache
exists.

Signed-off-by: Dave Wysochanski
Signed-off-by: J. Bruce Fields
---
 include/linux/sunrpc/cache.h |  6 +++---
 net/sunrpc/cache.c           | 12 ++++++++----
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index c7f38e897174..f7d086b77a21 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -107,9 +107,9 @@ struct cache_detail {
 	/* fields for communication over channel */
 	struct list_head	queue;
-	atomic_t		readers;	/* how many time is /chennel open */
-	time_t			last_close;	/* if no readers, when did last close */
-	time_t			last_warn;	/* when we last warned about no readers */
+	atomic_t		writers;	/* how many time is /channel open */
+	time_t			last_close;	/* if no writers, when did last close */
+	time_t			last_warn;	/* when we last warned about no writers */
 	union {
 		struct proc_dir_entry	*procfs;
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 6f1528f271ee..a6a6190ad37a 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -373,7 +373,7 @@ void sunrpc_init_cache_detail(struct cache_detail *cd)
 	spin_lock(&cache_list_lock);
 	cd->nextcheck = 0;
 	cd->entries = 0;
-	atomic_set(&cd->readers, 0);
+	atomic_set(&cd->writers, 0);
 	cd->last_close = 0;
 	cd->last_warn = -1;
 	list_add(&cd->others, &cache_list);
@@ -1029,11 +1029,13 @@ static int cache_open(struct inode *inode, struct file *filp,
 		}
 		rp->offset = 0;
 		rp->q.reader = 1;
-		atomic_inc(&cd->readers);
+
 		spin_lock(&queue_lock);
 		list_add(&rp->q.list, &cd->queue);
 		spin_unlock(&queue_lock);
 	}
+	if (filp->f_mode & FMODE_WRITE)
+		atomic_inc(&cd->writers);
 	filp->private_data = rp;
 	return 0;
 }
@@ -1062,8 +1064,10 @@ static int cache_release(struct inode *inode, struct file *filp,
 		filp->private_data = NULL;
 		kfree(rp);
+	}
+	if (filp->f_mode & FMODE_WRITE) {
+		atomic_dec(&cd->writers);
 		cd->last_close = seconds_since_boot();
-		atomic_dec(&cd->readers);
 	}
 	module_put(cd->owner);
 	return 0;
@@ -1171,7 +1175,7 @@ static void warn_no_listener(struct cache_detail *detail)
 
 static bool cache_listeners_exist(struct cache_detail *detail)
 {
-	if (atomic_read(&detail->readers))
+	if (atomic_read(&detail->writers))
 		return true;
 	if (detail->last_close == 0)
 		/* This cache was never opened */

From 81b23ba645e6b2b446093b2d927c261a17f7dee3 Mon Sep 17 00:00:00 2001
From: Guo Ren
Date: Tue, 30 Jul 2019 14:08:07 +0800
Subject: [PATCH 002/334] csky: Fixup mb() synchronization problem

mb() is a superset of the dma and smp barriers. Using bar.xxx to
implement mb() causes problems when synchronizing data with a DMA
device, because bar.xxx cannot guarantee that bus transactions have
finished at the outside bus level.
We must use sync.s instead of bar.xxx for dma data synchronization, as
it guarantees retirement only after the bus response has been received.

Changes for V2:
 - Use sync.s for all of mb, rmb, wmb, dma_wmb, dma_rmb.

Signed-off-by: Guo Ren
Cc: Arnd Bergmann
---
 arch/csky/include/asm/barrier.h | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/csky/include/asm/barrier.h b/arch/csky/include/asm/barrier.h
index 476eb786f22d..a430e7fddf35 100644
--- a/arch/csky/include/asm/barrier.h
+++ b/arch/csky/include/asm/barrier.h
@@ -9,11 +9,12 @@
 #define nop()	asm volatile ("nop\n":::"memory")
 
 /*
- * sync:      completion barrier
- * sync.s:    completion barrier and shareable to other cores
- * sync.i:    completion barrier with flush cpu pipeline
- * sync.is:   completion barrier with flush cpu pipeline and shareable to
- *            other cores
+ * sync:      completion barrier, all sync.xx instructions
+ *            guarantee the last response recieved by bus transaction
+ *            made by ld/st instructions before sync.s
+ * sync.s:    inherit from sync, but also shareable to other cores
+ * sync.i:    inherit from sync, but also flush cpu pipeline
+ * sync.is:   the same with sync.i + sync.s
 *
 * bar.brwarw:  ordering barrier for all load/store instructions before it
 * bar.brwarws: ordering barrier for all load/store instructions before it
@@ -27,9 +28,7 @@
 */
 #ifdef CONFIG_CPU_HAS_CACHEV2
-#define mb()		asm volatile ("bar.brwarw\n":::"memory")
-#define rmb()		asm volatile ("bar.brar\n":::"memory")
-#define wmb()		asm volatile ("bar.bwaw\n":::"memory")
+#define mb()		asm volatile ("sync.s\n":::"memory")
 
 #ifdef CONFIG_SMP
 #define __smp_mb()	asm volatile ("bar.brwarws\n":::"memory")

From 7f80fe207de9602aaff028c79345caa68c90cd31 Mon Sep 17 00:00:00 2001
From: Guo Ren
Date: Tue, 30 Jul 2019 14:43:22 +0800
Subject: [PATCH 003/334] csky: Fixup dma_alloc_coherent with PAGE_SO attribute

This bug dates from commit 2b070ccdf8c0 ("fixup abiv2 mmap(... O_SYNC)
failed"). In that patch we removed _PAGE_SO for noncached memory
mappings, which causes problems when drivers use dma descriptors to
control transactions without dma_w/rmb(). Following other
architectures' implementations, pgprot_writecombine is introduced for
mmap(... O_SYNC).
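To see the effect from userspace, here is a minimal, hypothetical
sketch (not part of the patch; the device offset and length are
placeholders): an mmap() of /dev/mem opened with O_SYNC now reaches the
pgprot_writecombine() path in phys_mem_access_prot() below, yielding an
uncached mapping without the strongly-ordered _PAGE_SO attribute.

	/* Illustrative only: map device memory write-combined via O_SYNC. */
	#include <fcntl.h>
	#include <stddef.h>
	#include <sys/mman.h>
	#include <sys/types.h>
	#include <unistd.h>

	void *map_device_wc(off_t phys_addr, size_t len)
	{
		int fd = open("/dev/mem", O_RDWR | O_SYNC);
		void *va;

		if (fd < 0)
			return MAP_FAILED;
		va = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
			  fd, phys_addr);
		close(fd);	/* the mapping stays valid after close */
		return va;
	}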
Signed-off-by: Guo Ren
---
 arch/csky/include/asm/pgtable.h | 10 ++++++++++
 arch/csky/mm/ioremap.c          |  6 ++----
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h
index c429a6f347de..fc19ba446d62 100644
--- a/arch/csky/include/asm/pgtable.h
+++ b/arch/csky/include/asm/pgtable.h
@@ -258,6 +258,16 @@ static inline pgprot_t pgprot_noncached(pgprot_t _prot)
 {
 	unsigned long prot = pgprot_val(_prot);
 
+	prot = (prot & ~_CACHE_MASK) | _CACHE_UNCACHED | _PAGE_SO;
+
+	return __pgprot(prot);
+}
+
+#define pgprot_writecombine pgprot_writecombine
+static inline pgprot_t pgprot_writecombine(pgprot_t _prot)
+{
+	unsigned long prot = pgprot_val(_prot);
+
 	prot = (prot & ~_CACHE_MASK) | _CACHE_UNCACHED;
 
 	return __pgprot(prot);
diff --git a/arch/csky/mm/ioremap.c b/arch/csky/mm/ioremap.c
index 8473b6bdf512..48531115fd9d 100644
--- a/arch/csky/mm/ioremap.c
+++ b/arch/csky/mm/ioremap.c
@@ -29,8 +29,7 @@ void __iomem *ioremap(phys_addr_t addr, size_t size)
 
 	vaddr = (unsigned long)area->addr;
 
-	prot = __pgprot(_PAGE_PRESENT | __READABLE | __WRITEABLE |
-			_PAGE_GLOBAL | _CACHE_UNCACHED | _PAGE_SO);
+	prot = pgprot_noncached(PAGE_KERNEL);
 
 	if (ioremap_page_range(vaddr, vaddr + size, addr, prot)) {
 		free_vm_area(area);
@@ -51,10 +50,9 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 			      unsigned long size, pgprot_t vma_prot)
 {
 	if (!pfn_valid(pfn)) {
-		vma_prot.pgprot |= _PAGE_SO;
 		return pgprot_noncached(vma_prot);
 	} else if (file->f_flags & O_SYNC) {
-		return pgprot_noncached(vma_prot);
+		return pgprot_writecombine(vma_prot);
 	}
 
 	return vma_prot;

From b36f281f4a314de4be0a51d6511b794691f8a244 Mon Sep 17 00:00:00 2001
From: Mimi Zohar
Date: Fri, 19 Jul 2019 07:16:57 -0400
Subject: [PATCH 004/334] ima: initialize the "template" field with the default template

IMA policy rules are walked sequentially. Depending on the ordering of
the policy rules, the "template" field might be defined in one rule,
but then be replaced by subsequent applicable rules, even if those
rules do not explicitly define the "template" field. This patch
initializes the "template" once and only replaces it when a rule
explicitly defines the "template".
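A hypothetical pair of rules, using the syntax documented in
Documentation/ABI/testing/ima_policy, illustrates the problem:

	measure func=FILE_CHECK mask=MAY_READ template=ima-sig
	audit func=FILE_CHECK mask=MAY_READ

When a file access matched both rules, walking the second rule (which
does not define "template") previously overwrote the "ima-sig" choice
with the compiled-in default; with this change the default is set once
up front and only an explicit "template=" option replaces it.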
Fixes: 19453ce0bcfb ("IMA: support for per policy rule template formats") Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_policy.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 6df7f641ff66..36a0727f1d7a 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -491,6 +491,9 @@ int ima_match_policy(struct inode *inode, const struct cred *cred, u32 secid, struct ima_rule_entry *entry; int action = 0, actmask = flags | (flags << 1); + if (template_desc) + *template_desc = ima_template_desc_current(); + rcu_read_lock(); list_for_each_entry_rcu(entry, ima_rules, list) { @@ -510,6 +513,7 @@ int ima_match_policy(struct inode *inode, const struct cred *cred, u32 secid, action |= IMA_FAIL_UNVERIFIABLE_SIGS; } + if (entry->action & IMA_DO_MASK) actmask &= ~(entry->action | entry->action << 1); else @@ -520,8 +524,6 @@ int ima_match_policy(struct inode *inode, const struct cred *cred, u32 secid, if (template_desc && entry->template) *template_desc = entry->template; - else if (template_desc) - *template_desc = ima_template_desc_current(); if (!actmask) break; From c8424e776b093280d3fdd104d850706b3b229ac8 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 4 Jul 2019 15:57:34 -0300 Subject: [PATCH 005/334] MODSIGN: Export module signature definitions IMA will use the module_signature format for append signatures, so export the relevant definitions and factor out the code which verifies that the appended signature trailer is valid. Also, create a CONFIG_MODULE_SIG_FORMAT option so that IMA can select it and be able to use mod_check_sig() without having to depend on either CONFIG_MODULE_SIG or CONFIG_MODULES. s390 duplicated the definition of struct module_signature so now they can use the new header instead. Signed-off-by: Thiago Jung Bauermann Acked-by: Jessica Yu Reviewed-by: Philipp Rudo Cc: Heiko Carstens Signed-off-by: Mimi Zohar --- arch/s390/Kconfig | 2 +- arch/s390/kernel/machine_kexec_file.c | 24 +----------- include/linux/module.h | 3 -- include/linux/module_signature.h | 44 +++++++++++++++++++++ init/Kconfig | 6 ++- kernel/Makefile | 1 + kernel/module.c | 1 + kernel/module_signature.c | 46 ++++++++++++++++++++++ kernel/module_signing.c | 56 ++++----------------------- scripts/Makefile | 2 +- 10 files changed, 108 insertions(+), 77 deletions(-) create mode 100644 include/linux/module_signature.h create mode 100644 kernel/module_signature.c diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index a4ad2733eedf..e0ae0d51f985 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -538,7 +538,7 @@ config ARCH_HAS_KEXEC_PURGATORY config KEXEC_VERIFY_SIG bool "Verify kernel signature during kexec_file_load() syscall" - depends on KEXEC_FILE && SYSTEM_DATA_VERIFICATION + depends on KEXEC_FILE && MODULE_SIG_FORMAT help This option makes kernel signature verification mandatory for the kexec_file_load() syscall. diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index fbdd3ea73667..1ac9fbc6e01e 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include @@ -23,28 +23,6 @@ const struct kexec_file_ops * const kexec_file_loaders[] = { }; #ifdef CONFIG_KEXEC_VERIFY_SIG -/* - * Module signature information block. 
- * - * The constituents of the signature section are, in order: - * - * - Signer's name - * - Key identifier - * - Signature data - * - Information block - */ -struct module_signature { - u8 algo; /* Public-key crypto algorithm [0] */ - u8 hash; /* Digest algorithm [0] */ - u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */ - u8 signer_len; /* Length of signer's name [0] */ - u8 key_id_len; /* Length of key identifier [0] */ - u8 __pad[3]; - __be32 sig_len; /* Length of signature data */ -}; - -#define PKEY_ID_PKCS7 2 - int s390_verify_sig(const char *kernel, unsigned long kernel_len) { const unsigned long marker_len = sizeof(MODULE_SIG_STRING) - 1; diff --git a/include/linux/module.h b/include/linux/module.h index 1455812dd325..f6fc1dae74f4 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -26,9 +26,6 @@ #include #include -/* In stripped ARM and x86-64 modules, ~ is surprisingly rare. */ -#define MODULE_SIG_STRING "~Module signature appended~\n" - /* Not Yet Implemented */ #define MODULE_SUPPORTED_DEVICE(name) diff --git a/include/linux/module_signature.h b/include/linux/module_signature.h new file mode 100644 index 000000000000..523617fc5b6a --- /dev/null +++ b/include/linux/module_signature.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Module signature handling. + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#ifndef _LINUX_MODULE_SIGNATURE_H +#define _LINUX_MODULE_SIGNATURE_H + +/* In stripped ARM and x86-64 modules, ~ is surprisingly rare. */ +#define MODULE_SIG_STRING "~Module signature appended~\n" + +enum pkey_id_type { + PKEY_ID_PGP, /* OpenPGP generated key ID */ + PKEY_ID_X509, /* X.509 arbitrary subjectKeyIdentifier */ + PKEY_ID_PKCS7, /* Signature in PKCS#7 message */ +}; + +/* + * Module signature information block. + * + * The constituents of the signature section are, in order: + * + * - Signer's name + * - Key identifier + * - Signature data + * - Information block + */ +struct module_signature { + u8 algo; /* Public-key crypto algorithm [0] */ + u8 hash; /* Digest algorithm [0] */ + u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */ + u8 signer_len; /* Length of signer's name [0] */ + u8 key_id_len; /* Length of key identifier [0] */ + u8 __pad[3]; + __be32 sig_len; /* Length of signature data */ +}; + +int mod_check_sig(const struct module_signature *ms, size_t file_len, + const char *name); + +#endif /* _LINUX_MODULE_SIGNATURE_H */ diff --git a/init/Kconfig b/init/Kconfig index bd7d650d4a99..2dca877c9ed7 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1930,6 +1930,10 @@ config BASE_SMALL default 0 if BASE_FULL default 1 if !BASE_FULL +config MODULE_SIG_FORMAT + def_bool n + select SYSTEM_DATA_VERIFICATION + menuconfig MODULES bool "Enable loadable module support" option modules @@ -2007,7 +2011,7 @@ config MODULE_SRCVERSION_ALL config MODULE_SIG bool "Module signature verification" depends on MODULES - select SYSTEM_DATA_VERIFICATION + select MODULE_SIG_FORMAT help Check modules for valid signatures upon load: the signature is simply appended to the module. 
For more information see diff --git a/kernel/Makefile b/kernel/Makefile index a8d923b5481b..179b44ab8b46 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -58,6 +58,7 @@ endif obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SIG) += module_signing.o +obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_CRASH_CORE) += crash_core.o diff --git a/kernel/module.c b/kernel/module.c index 5933395af9a0..5ac22efc3685 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/module_signature.c b/kernel/module_signature.c new file mode 100644 index 000000000000..4224a1086b7d --- /dev/null +++ b/kernel/module_signature.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Module signature checker + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include +#include +#include + +/** + * mod_check_sig - check that the given signature is sane + * + * @ms: Signature to check. + * @file_len: Size of the file to which @ms is appended. + * @name: What is being checked. Used for error messages. + */ +int mod_check_sig(const struct module_signature *ms, size_t file_len, + const char *name) +{ + if (be32_to_cpu(ms->sig_len) >= file_len - sizeof(*ms)) + return -EBADMSG; + + if (ms->id_type != PKEY_ID_PKCS7) { + pr_err("%s: Module is not signed with expected PKCS#7 message\n", + name); + return -ENOPKG; + } + + if (ms->algo != 0 || + ms->hash != 0 || + ms->signer_len != 0 || + ms->key_id_len != 0 || + ms->__pad[0] != 0 || + ms->__pad[1] != 0 || + ms->__pad[2] != 0) { + pr_err("%s: PKCS#7 signature info has unexpected non-zero params\n", + name); + return -EBADMSG; + } + + return 0; +} diff --git a/kernel/module_signing.c b/kernel/module_signing.c index b10fb1986ca9..9d9fc678c91d 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -7,37 +7,13 @@ #include #include +#include +#include #include #include #include #include "module-internal.h" -enum pkey_id_type { - PKEY_ID_PGP, /* OpenPGP generated key ID */ - PKEY_ID_X509, /* X.509 arbitrary subjectKeyIdentifier */ - PKEY_ID_PKCS7, /* Signature in PKCS#7 message */ -}; - -/* - * Module signature information block. - * - * The constituents of the signature section are, in order: - * - * - Signer's name - * - Key identifier - * - Signature data - * - Information block - */ -struct module_signature { - u8 algo; /* Public-key crypto algorithm [0] */ - u8 hash; /* Digest algorithm [0] */ - u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */ - u8 signer_len; /* Length of signer's name [0] */ - u8 key_id_len; /* Length of key identifier [0] */ - u8 __pad[3]; - __be32 sig_len; /* Length of signature data */ -}; - /* * Verify the signature on a module. 
*/ @@ -45,6 +21,7 @@ int mod_verify_sig(const void *mod, struct load_info *info) { struct module_signature ms; size_t sig_len, modlen = info->len; + int ret; pr_devel("==>%s(,%zu)\n", __func__, modlen); @@ -52,32 +29,15 @@ int mod_verify_sig(const void *mod, struct load_info *info) return -EBADMSG; memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms)); - modlen -= sizeof(ms); + + ret = mod_check_sig(&ms, modlen, info->name); + if (ret) + return ret; sig_len = be32_to_cpu(ms.sig_len); - if (sig_len >= modlen) - return -EBADMSG; - modlen -= sig_len; + modlen -= sig_len + sizeof(ms); info->len = modlen; - if (ms.id_type != PKEY_ID_PKCS7) { - pr_err("%s: Module is not signed with expected PKCS#7 message\n", - info->name); - return -ENOPKG; - } - - if (ms.algo != 0 || - ms.hash != 0 || - ms.signer_len != 0 || - ms.key_id_len != 0 || - ms.__pad[0] != 0 || - ms.__pad[1] != 0 || - ms.__pad[2] != 0) { - pr_err("%s: PKCS#7 signature info has unexpected non-zero params\n", - info->name); - return -EBADMSG; - } - return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, VERIFY_USE_SECONDARY_KEYRING, VERIFYING_MODULE_SIGNATURE, diff --git a/scripts/Makefile b/scripts/Makefile index 16bcb8087899..532f7e0915c3 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -17,7 +17,7 @@ hostprogs-$(CONFIG_VT) += conmakehash hostprogs-$(BUILD_C_RECORDMCOUNT) += recordmcount hostprogs-$(CONFIG_BUILDTIME_EXTABLE_SORT) += sortextable hostprogs-$(CONFIG_ASN1) += asn1_compiler -hostprogs-$(CONFIG_MODULE_SIG) += sign-file +hostprogs-$(CONFIG_MODULE_SIG_FORMAT) += sign-file hostprogs-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += extract-cert hostprogs-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE) += insert-sys-cert From 2a7bf671186eb5d1a47ef192aefbdf788f5b38fe Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 27 Jun 2019 23:19:25 -0300 Subject: [PATCH 006/334] PKCS#7: Refactor verify_pkcs7_signature() IMA will need to verify a PKCS#7 signature which has already been parsed. For this reason, factor out the code which does that from verify_pkcs7_signature() into a new function which takes a struct pkcs7_message instead of a data buffer. Signed-off-by: Thiago Jung Bauermann Reviewed-by: Mimi Zohar Cc: David Howells Cc: David Woodhouse Cc: Herbert Xu Cc: "David S. Miller" Signed-off-by: Mimi Zohar --- certs/system_keyring.c | 61 ++++++++++++++++++++++++++---------- include/linux/verification.h | 10 ++++++ 2 files changed, 55 insertions(+), 16 deletions(-) diff --git a/certs/system_keyring.c b/certs/system_keyring.c index 1eba08a1af82..798291177186 100644 --- a/certs/system_keyring.c +++ b/certs/system_keyring.c @@ -190,33 +190,27 @@ late_initcall(load_system_certificate_list); #ifdef CONFIG_SYSTEM_DATA_VERIFICATION /** - * verify_pkcs7_signature - Verify a PKCS#7-based signature on system data. + * verify_pkcs7_message_sig - Verify a PKCS#7-based signature on system data. * @data: The data to be verified (NULL if expecting internal data). * @len: Size of @data. - * @raw_pkcs7: The PKCS#7 message that is the signature. - * @pkcs7_len: The size of @raw_pkcs7. + * @pkcs7: The PKCS#7 message that is the signature. * @trusted_keys: Trusted keys to use (NULL for builtin trusted keys only, * (void *)1UL for all trusted keys). * @usage: The use to which the key is being put. * @view_content: Callback to gain access to content. * @ctx: Context for callback. 
*/ -int verify_pkcs7_signature(const void *data, size_t len, - const void *raw_pkcs7, size_t pkcs7_len, - struct key *trusted_keys, - enum key_being_used_for usage, - int (*view_content)(void *ctx, - const void *data, size_t len, - size_t asn1hdrlen), - void *ctx) +int verify_pkcs7_message_sig(const void *data, size_t len, + struct pkcs7_message *pkcs7, + struct key *trusted_keys, + enum key_being_used_for usage, + int (*view_content)(void *ctx, + const void *data, size_t len, + size_t asn1hdrlen), + void *ctx) { - struct pkcs7_message *pkcs7; int ret; - pkcs7 = pkcs7_parse_message(raw_pkcs7, pkcs7_len); - if (IS_ERR(pkcs7)) - return PTR_ERR(pkcs7); - /* The data should be detached - so we need to supply it. */ if (data && pkcs7_supply_detached_data(pkcs7, data, len) < 0) { pr_err("PKCS#7 signature with non-detached data\n"); @@ -269,6 +263,41 @@ int verify_pkcs7_signature(const void *data, size_t len, } error: + pr_devel("<==%s() = %d\n", __func__, ret); + return ret; +} + +/** + * verify_pkcs7_signature - Verify a PKCS#7-based signature on system data. + * @data: The data to be verified (NULL if expecting internal data). + * @len: Size of @data. + * @raw_pkcs7: The PKCS#7 message that is the signature. + * @pkcs7_len: The size of @raw_pkcs7. + * @trusted_keys: Trusted keys to use (NULL for builtin trusted keys only, + * (void *)1UL for all trusted keys). + * @usage: The use to which the key is being put. + * @view_content: Callback to gain access to content. + * @ctx: Context for callback. + */ +int verify_pkcs7_signature(const void *data, size_t len, + const void *raw_pkcs7, size_t pkcs7_len, + struct key *trusted_keys, + enum key_being_used_for usage, + int (*view_content)(void *ctx, + const void *data, size_t len, + size_t asn1hdrlen), + void *ctx) +{ + struct pkcs7_message *pkcs7; + int ret; + + pkcs7 = pkcs7_parse_message(raw_pkcs7, pkcs7_len); + if (IS_ERR(pkcs7)) + return PTR_ERR(pkcs7); + + ret = verify_pkcs7_message_sig(data, len, pkcs7, trusted_keys, usage, + view_content, ctx); + pkcs7_free_message(pkcs7); pr_devel("<==%s() = %d\n", __func__, ret); return ret; diff --git a/include/linux/verification.h b/include/linux/verification.h index 32d990d163c4..911ab7c2b1ab 100644 --- a/include/linux/verification.h +++ b/include/linux/verification.h @@ -32,6 +32,7 @@ extern const char *const key_being_used_for[NR__KEY_BEING_USED_FOR]; #ifdef CONFIG_SYSTEM_DATA_VERIFICATION struct key; +struct pkcs7_message; extern int verify_pkcs7_signature(const void *data, size_t len, const void *raw_pkcs7, size_t pkcs7_len, @@ -41,6 +42,15 @@ extern int verify_pkcs7_signature(const void *data, size_t len, const void *data, size_t len, size_t asn1hdrlen), void *ctx); +extern int verify_pkcs7_message_sig(const void *data, size_t len, + struct pkcs7_message *pkcs7, + struct key *trusted_keys, + enum key_being_used_for usage, + int (*view_content)(void *ctx, + const void *data, + size_t len, + size_t asn1hdrlen), + void *ctx); #ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION extern int verify_pefile_signature(const void *pebuf, unsigned pelen, From e201af16d1ec76ccd19b90484d767984ff451f18 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 27 Jun 2019 23:19:26 -0300 Subject: [PATCH 007/334] PKCS#7: Introduce pkcs7_get_digest() IMA will need to access the digest of the PKCS7 message (as calculated by the kernel) before the signature is verified, so introduce pkcs7_get_digest() for that purpose. 
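As a hedged sketch of the intended use (the caller below is
hypothetical and not part of this patch; it assumes <crypto/pkcs7.h>
and <crypto/hash_info.h>), a caller holding a message parsed with
pkcs7_parse_message() could retrieve the kernel-computed digest like
this:

	/* Illustrative only: fetch the digest the PKCS#7 layer computed. */
	static int example_show_pkcs7_digest(struct pkcs7_message *pkcs7)
	{
		const u8 *digest;
		u32 digest_len;
		enum hash_algo algo;
		int rc;

		rc = pkcs7_get_digest(pkcs7, &digest, &digest_len, &algo);
		if (rc)
			return rc;	/* e.g. -EBADMSG for multiple signers */

		pr_info("digest algo %s, %u bytes\n",
			hash_algo_name[algo], digest_len);
		return 0;
	}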
Also, modify pkcs7_digest() to detect when the digest was already calculated so that it doesn't have to do redundant work. Verifying that sinfo->sig->digest isn't NULL is sufficient because both places which allocate sinfo->sig (pkcs7_parse_message() and pkcs7_note_signed_info()) use kzalloc() so sig->digest is always initialized to zero. Signed-off-by: Thiago Jung Bauermann Reviewed-by: Mimi Zohar Cc: David Howells Cc: David Woodhouse Cc: Herbert Xu Cc: "David S. Miller" Signed-off-by: Mimi Zohar --- crypto/asymmetric_keys/pkcs7_verify.c | 33 +++++++++++++++++++++++++++ include/crypto/pkcs7.h | 4 ++++ 2 files changed, 37 insertions(+) diff --git a/crypto/asymmetric_keys/pkcs7_verify.c b/crypto/asymmetric_keys/pkcs7_verify.c index 11bee67fa9cc..ce49820caa97 100644 --- a/crypto/asymmetric_keys/pkcs7_verify.c +++ b/crypto/asymmetric_keys/pkcs7_verify.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include "pkcs7_parser.h" @@ -29,6 +30,10 @@ static int pkcs7_digest(struct pkcs7_message *pkcs7, kenter(",%u,%s", sinfo->index, sinfo->sig->hash_algo); + /* The digest was calculated already. */ + if (sig->digest) + return 0; + if (!sinfo->sig->hash_algo) return -ENOPKG; @@ -117,6 +122,34 @@ error_no_desc: return ret; } +int pkcs7_get_digest(struct pkcs7_message *pkcs7, const u8 **buf, u32 *len, + enum hash_algo *hash_algo) +{ + struct pkcs7_signed_info *sinfo = pkcs7->signed_infos; + int i, ret; + + /* + * This function doesn't support messages with more than one signature. + */ + if (sinfo == NULL || sinfo->next != NULL) + return -EBADMSG; + + ret = pkcs7_digest(pkcs7, sinfo); + if (ret) + return ret; + + *buf = sinfo->sig->digest; + *len = sinfo->sig->digest_size; + + for (i = 0; i < HASH_ALGO__LAST; i++) + if (!strcmp(hash_algo_name[i], sinfo->sig->hash_algo)) { + *hash_algo = i; + break; + } + + return 0; +} + /* * Find the key (X.509 certificate) to use to verify a PKCS#7 message. PKCS#7 * uses the issuer's name and the issuing certificate serial number for diff --git a/include/crypto/pkcs7.h b/include/crypto/pkcs7.h index 96071bee03ac..38ec7f5f9041 100644 --- a/include/crypto/pkcs7.h +++ b/include/crypto/pkcs7.h @@ -9,6 +9,7 @@ #define _CRYPTO_PKCS7_H #include +#include #include struct key; @@ -40,4 +41,7 @@ extern int pkcs7_verify(struct pkcs7_message *pkcs7, extern int pkcs7_supply_detached_data(struct pkcs7_message *pkcs7, const void *data, size_t datalen); +extern int pkcs7_get_digest(struct pkcs7_message *pkcs7, const u8 **buf, + u32 *len, enum hash_algo *hash_algo); + #endif /* _CRYPTO_PKCS7_H */ From cf38fed1e183dd2410f62d49ae635fe593082f0c Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 27 Jun 2019 23:19:27 -0300 Subject: [PATCH 008/334] integrity: Select CONFIG_KEYS instead of depending on it This avoids a dependency cycle in soon-to-be-introduced CONFIG_IMA_APPRAISE_MODSIG: it will select CONFIG_MODULE_SIG_FORMAT which in turn selects CONFIG_KEYS. Kconfig then complains that CONFIG_INTEGRITY_SIGNATURE depends on CONFIG_KEYS. 
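For background, a generic, hypothetical Kconfig fragment (not from the
tree) shows why the two mechanisms behave differently in such cycles:
"select" forces a symbol on regardless of its own dependencies, while
"depends on" creates an ordering constraint the dependency resolver
must satisfy.

	config FOO
		bool "Feature foo"
		select KEYS	# forces KEYS on whenever FOO is enabled

	config BAR
		bool "Feature bar"
		depends on KEYS	# BAR is only available once KEYS is enabled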
Signed-off-by: Thiago Jung Bauermann Signed-off-by: Mimi Zohar --- security/integrity/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/integrity/Kconfig b/security/integrity/Kconfig index c352532b8f84..0bae6adb63a9 100644 --- a/security/integrity/Kconfig +++ b/security/integrity/Kconfig @@ -18,8 +18,8 @@ if INTEGRITY config INTEGRITY_SIGNATURE bool "Digital signature verification using multiple keyrings" - depends on KEYS default n + select KEYS select SIGNATURE help This option enables digital signature verification support From 9044d627fd18f9fca49b62d4619ee14914b91464 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 27 Jun 2019 23:19:28 -0300 Subject: [PATCH 009/334] ima: Add modsig appraise_type option for module-style appended signatures Introduce the modsig keyword to the IMA policy syntax to specify that a given hook should expect the file to have the IMA signature appended to it. Here is how it can be used in a rule: appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig With this rule, IMA will accept either a signature stored in the extended attribute or an appended signature. For now, the rule above will behave exactly the same as if appraise_type=imasig was specified. The actual modsig implementation will be introduced separately. Suggested-by: Mimi Zohar Signed-off-by: Thiago Jung Bauermann Signed-off-by: Mimi Zohar --- Documentation/ABI/testing/ima_policy | 6 +++++- security/integrity/ima/Kconfig | 10 +++++++++ security/integrity/ima/Makefile | 1 + security/integrity/ima/ima.h | 9 ++++++++ security/integrity/ima/ima_modsig.c | 31 ++++++++++++++++++++++++++++ security/integrity/ima/ima_policy.c | 12 +++++++++-- security/integrity/integrity.h | 1 + 7 files changed, 67 insertions(+), 3 deletions(-) create mode 100644 security/integrity/ima/ima_modsig.c diff --git a/Documentation/ABI/testing/ima_policy b/Documentation/ABI/testing/ima_policy index fc376a323908..29ebe9afdac4 100644 --- a/Documentation/ABI/testing/ima_policy +++ b/Documentation/ABI/testing/ima_policy @@ -37,7 +37,7 @@ Description: euid:= decimal value fowner:= decimal value lsm: are LSM specific - option: appraise_type:= [imasig] + option: appraise_type:= [imasig] [imasig|modsig] template:= name of a defined IMA template type (eg, ima-ng). Only valid when action is "measure". pcr:= decimal value @@ -105,3 +105,7 @@ Description: measure func=KEXEC_KERNEL_CHECK pcr=4 measure func=KEXEC_INITRAMFS_CHECK pcr=5 + + Example of appraise rule allowing modsig appended signatures: + + appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig index 2ced99dde694..8bf46646b185 100644 --- a/security/integrity/ima/Kconfig +++ b/security/integrity/ima/Kconfig @@ -233,6 +233,16 @@ config IMA_APPRAISE_BOOTPARAM This option enables the different "ima_appraise=" modes (eg. fix, log) from the boot command line. +config IMA_APPRAISE_MODSIG + bool "Support module-style signatures for appraisal" + depends on IMA_APPRAISE + default n + help + Adds support for signatures appended to files. The format of the + appended signature is the same used for signed kernel modules. + The modsig keyword can be used in the IMA policy to allow a hook + to accept such signatures. 
+ config IMA_TRUSTED_KEYRING bool "Require all keys on the .ima keyring be signed (deprecated)" depends on IMA_APPRAISE && SYSTEM_TRUSTED_KEYRING diff --git a/security/integrity/ima/Makefile b/security/integrity/ima/Makefile index d921dc4f9eb0..31d57cdf2421 100644 --- a/security/integrity/ima/Makefile +++ b/security/integrity/ima/Makefile @@ -9,5 +9,6 @@ obj-$(CONFIG_IMA) += ima.o ima-y := ima_fs.o ima_queue.o ima_init.o ima_main.o ima_crypto.o ima_api.o \ ima_policy.o ima_template.o ima_template_lib.o ima-$(CONFIG_IMA_APPRAISE) += ima_appraise.o +ima-$(CONFIG_IMA_APPRAISE_MODSIG) += ima_modsig.o ima-$(CONFIG_HAVE_IMA_KEXEC) += ima_kexec.o obj-$(CONFIG_IMA_BLACKLIST_KEYRING) += ima_mok.o diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 011b91c79351..e21b06942858 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -302,6 +302,15 @@ static inline int ima_read_xattr(struct dentry *dentry, #endif /* CONFIG_IMA_APPRAISE */ +#ifdef CONFIG_IMA_APPRAISE_MODSIG +bool ima_hook_supports_modsig(enum ima_hooks func); +#else +static inline bool ima_hook_supports_modsig(enum ima_hooks func) +{ + return false; +} +#endif /* CONFIG_IMA_APPRAISE_MODSIG */ + /* LSM based policy rules require audit */ #ifdef CONFIG_IMA_LSM_RULES diff --git a/security/integrity/ima/ima_modsig.c b/security/integrity/ima/ima_modsig.c new file mode 100644 index 000000000000..87503bfe8c8b --- /dev/null +++ b/security/integrity/ima/ima_modsig.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * IMA support for appraising module-style appended signatures. + * + * Copyright (C) 2019 IBM Corporation + * + * Author: + * Thiago Jung Bauermann + */ + +#include "ima.h" + +/** + * ima_hook_supports_modsig - can the policy allow modsig for this hook? + * + * modsig is only supported by hooks using ima_post_read_file(), because only + * they preload the contents of the file in a buffer. FILE_CHECK does that in + * some cases, but not when reached from vfs_open(). POLICY_CHECK can support + * it, but it's not useful in practice because it's a text file so deny. 
+ */ +bool ima_hook_supports_modsig(enum ima_hooks func) +{ + switch (func) { + case KEXEC_KERNEL_CHECK: + case KEXEC_INITRAMFS_CHECK: + case MODULE_CHECK: + return true; + default: + return false; + } +} diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 36a0727f1d7a..5b6061d6bce0 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -1130,6 +1130,10 @@ static int ima_parse_rule(char *rule, struct ima_rule_entry *entry) ima_log_string(ab, "appraise_type", args[0].from); if ((strcmp(args[0].from, "imasig")) == 0) entry->flags |= IMA_DIGSIG_REQUIRED; + else if (ima_hook_supports_modsig(entry->func) && + strcmp(args[0].from, "imasig|modsig") == 0) + entry->flags |= IMA_DIGSIG_REQUIRED | + IMA_MODSIG_ALLOWED; else result = -EINVAL; break; @@ -1449,8 +1453,12 @@ int ima_policy_show(struct seq_file *m, void *v) } if (entry->template) seq_printf(m, "template=%s ", entry->template->name); - if (entry->flags & IMA_DIGSIG_REQUIRED) - seq_puts(m, "appraise_type=imasig "); + if (entry->flags & IMA_DIGSIG_REQUIRED) { + if (entry->flags & IMA_MODSIG_ALLOWED) + seq_puts(m, "appraise_type=imasig|modsig "); + else + seq_puts(m, "appraise_type=imasig "); + } if (entry->flags & IMA_PERMIT_DIRECTIO) seq_puts(m, "permit_directio "); rcu_read_unlock(); diff --git a/security/integrity/integrity.h b/security/integrity/integrity.h index ed12d8e13d04..8c5736b68156 100644 --- a/security/integrity/integrity.h +++ b/security/integrity/integrity.h @@ -31,6 +31,7 @@ #define IMA_NEW_FILE 0x04000000 #define EVM_IMMUTABLE_DIGSIG 0x08000000 #define IMA_FAIL_UNVERIFIABLE_SIGS 0x10000000 +#define IMA_MODSIG_ALLOWED 0x20000000 #define IMA_DO_MASK (IMA_MEASURE | IMA_APPRAISE | IMA_AUDIT | \ IMA_HASH | IMA_APPRAISE_SUBMASK) From a5fbeb615ca42f913ace3291d636e96feabcc545 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 27 Jun 2019 23:19:29 -0300 Subject: [PATCH 010/334] ima: Factor xattr_verify() out of ima_appraise_measurement() Verify xattr signature in a separate function so that the logic in ima_appraise_measurement() remains clear when it gains the ability to also verify an appended module signature. The code in the switch statement is unchanged except for having to dereference the status and cause variables (since they're now pointers), and fixing the style of a block comment to appease checkpatch. Suggested-by: Mimi Zohar Signed-off-by: Thiago Jung Bauermann Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_appraise.c | 141 +++++++++++++++----------- 1 file changed, 81 insertions(+), 60 deletions(-) diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c index 89b83194d1dc..d3298045737f 100644 --- a/security/integrity/ima/ima_appraise.c +++ b/security/integrity/ima/ima_appraise.c @@ -199,6 +199,83 @@ int ima_read_xattr(struct dentry *dentry, return ret; } +/* + * xattr_verify - verify xattr digest or signature + * + * Verify whether the hash or signature matches the file contents. + * + * Return 0 on success, error code otherwise. 
+ */ +static int xattr_verify(enum ima_hooks func, struct integrity_iint_cache *iint, + struct evm_ima_xattr_data *xattr_value, int xattr_len, + enum integrity_status *status, const char **cause) +{ + int rc = -EINVAL, hash_start = 0; + + switch (xattr_value->type) { + case IMA_XATTR_DIGEST_NG: + /* first byte contains algorithm id */ + hash_start = 1; + /* fall through */ + case IMA_XATTR_DIGEST: + if (iint->flags & IMA_DIGSIG_REQUIRED) { + *cause = "IMA-signature-required"; + *status = INTEGRITY_FAIL; + break; + } + clear_bit(IMA_DIGSIG, &iint->atomic_flags); + if (xattr_len - sizeof(xattr_value->type) - hash_start >= + iint->ima_hash->length) + /* + * xattr length may be longer. md5 hash in previous + * version occupied 20 bytes in xattr, instead of 16 + */ + rc = memcmp(&xattr_value->data[hash_start], + iint->ima_hash->digest, + iint->ima_hash->length); + else + rc = -EINVAL; + if (rc) { + *cause = "invalid-hash"; + *status = INTEGRITY_FAIL; + break; + } + *status = INTEGRITY_PASS; + break; + case EVM_IMA_XATTR_DIGSIG: + set_bit(IMA_DIGSIG, &iint->atomic_flags); + rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, + (const char *)xattr_value, + xattr_len, + iint->ima_hash->digest, + iint->ima_hash->length); + if (rc == -EOPNOTSUPP) { + *status = INTEGRITY_UNKNOWN; + break; + } + if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && + func == KEXEC_KERNEL_CHECK) + rc = integrity_digsig_verify(INTEGRITY_KEYRING_PLATFORM, + (const char *)xattr_value, + xattr_len, + iint->ima_hash->digest, + iint->ima_hash->length); + if (rc) { + *cause = "invalid-signature"; + *status = INTEGRITY_FAIL; + } else { + *status = INTEGRITY_PASS; + } + break; + default: + *status = INTEGRITY_UNKNOWN; + *cause = "unknown-ima-data"; + break; + } + + return rc; +} + /* * ima_appraise_measurement - appraise file measurement * @@ -218,7 +295,7 @@ int ima_appraise_measurement(enum ima_hooks func, struct dentry *dentry = file_dentry(file); struct inode *inode = d_backing_inode(dentry); enum integrity_status status = INTEGRITY_UNKNOWN; - int rc = xattr_len, hash_start = 0; + int rc = xattr_len; if (!(inode->i_opflags & IOP_XATTR)) return INTEGRITY_UNKNOWN; @@ -256,65 +333,9 @@ int ima_appraise_measurement(enum ima_hooks func, WARN_ONCE(true, "Unexpected integrity status %d\n", status); } - switch (xattr_value->type) { - case IMA_XATTR_DIGEST_NG: - /* first byte contains algorithm id */ - hash_start = 1; - /* fall through */ - case IMA_XATTR_DIGEST: - if (iint->flags & IMA_DIGSIG_REQUIRED) { - cause = "IMA-signature-required"; - status = INTEGRITY_FAIL; - break; - } - clear_bit(IMA_DIGSIG, &iint->atomic_flags); - if (xattr_len - sizeof(xattr_value->type) - hash_start >= - iint->ima_hash->length) - /* xattr length may be longer. 
md5 hash in previous - version occupied 20 bytes in xattr, instead of 16 - */ - rc = memcmp(&xattr_value->data[hash_start], - iint->ima_hash->digest, - iint->ima_hash->length); - else - rc = -EINVAL; - if (rc) { - cause = "invalid-hash"; - status = INTEGRITY_FAIL; - break; - } - status = INTEGRITY_PASS; - break; - case EVM_IMA_XATTR_DIGSIG: - set_bit(IMA_DIGSIG, &iint->atomic_flags); - rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, - (const char *)xattr_value, - xattr_len, - iint->ima_hash->digest, - iint->ima_hash->length); - if (rc == -EOPNOTSUPP) { - status = INTEGRITY_UNKNOWN; - break; - } - if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && - func == KEXEC_KERNEL_CHECK) - rc = integrity_digsig_verify(INTEGRITY_KEYRING_PLATFORM, - (const char *)xattr_value, - xattr_len, - iint->ima_hash->digest, - iint->ima_hash->length); - if (rc) { - cause = "invalid-signature"; - status = INTEGRITY_FAIL; - } else { - status = INTEGRITY_PASS; - } - break; - default: - status = INTEGRITY_UNKNOWN; - cause = "unknown-ima-data"; - break; - } + if (xattr_value) + rc = xattr_verify(func, iint, xattr_value, xattr_len, &status, + &cause); out: /* From 39b07096364a42c516415d5f841069e885234e61 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 27 Jun 2019 23:19:30 -0300 Subject: [PATCH 011/334] ima: Implement support for module-style appended signatures Implement the appraise_type=imasig|modsig option, allowing IMA to read and verify modsig signatures. In case a file has both an xattr signature and an appended modsig, IMA will only use the appended signature if the key used by the xattr signature isn't present in the IMA or platform keyring. Because modsig verification needs to convert from an integrity keyring id to the keyring itself, add an integrity_keyring_from_id() function in digsig.c so that integrity_modsig_verify() can use it. 
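For reference, the appended signature parsed by the new
ima_read_modsig() (below) uses the module-signing layout:

	[ file contents | PKCS#7 data | struct module_signature |
	  "~Module signature appended~\n" ]

A minimal detection helper, mirroring the checks in ima_read_modsig()
(the helper name is hypothetical; it assumes
<linux/module_signature.h> and <linux/string.h>):

	static bool has_appended_modsig(const void *buf, size_t buf_len)
	{
		const size_t marker_len = strlen(MODULE_SIG_STRING);

		/* Too short to hold both the trailer struct and marker? */
		if (buf_len <= marker_len + sizeof(struct module_signature))
			return false;

		/* The marker string is the very last thing in the file. */
		return !memcmp(buf + buf_len - marker_len,
			       MODULE_SIG_STRING, marker_len);
	}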
Signed-off-by: Thiago Jung Bauermann Signed-off-by: Mimi Zohar --- security/integrity/digsig.c | 43 ++++++++++++---- security/integrity/ima/Kconfig | 3 ++ security/integrity/ima/ima.h | 22 ++++++++- security/integrity/ima/ima_appraise.c | 51 +++++++++++++++++-- security/integrity/ima/ima_main.c | 11 ++++- security/integrity/ima/ima_modsig.c | 71 +++++++++++++++++++++++++++ security/integrity/ima/ima_policy.c | 12 ++--- security/integrity/integrity.h | 19 +++++++ 8 files changed, 209 insertions(+), 23 deletions(-) diff --git a/security/integrity/digsig.c b/security/integrity/digsig.c index 868ade3e8970..ea1aae3d07b3 100644 --- a/security/integrity/digsig.c +++ b/security/integrity/digsig.c @@ -39,11 +39,10 @@ static const char * const keyring_name[INTEGRITY_KEYRING_MAX] = { #define restrict_link_to_ima restrict_link_by_builtin_trusted #endif -int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen, - const char *digest, int digestlen) +static struct key *integrity_keyring_from_id(const unsigned int id) { - if (id >= INTEGRITY_KEYRING_MAX || siglen < 2) - return -EINVAL; + if (id >= INTEGRITY_KEYRING_MAX) + return ERR_PTR(-EINVAL); if (!keyring[id]) { keyring[id] = @@ -52,23 +51,49 @@ int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen, int err = PTR_ERR(keyring[id]); pr_err("no %s keyring: %d\n", keyring_name[id], err); keyring[id] = NULL; - return err; + return ERR_PTR(err); } } + return keyring[id]; +} + +int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen, + const char *digest, int digestlen) +{ + struct key *keyring; + + if (siglen < 2) + return -EINVAL; + + keyring = integrity_keyring_from_id(id); + if (IS_ERR(keyring)) + return PTR_ERR(keyring); + switch (sig[1]) { case 1: /* v1 API expect signature without xattr type */ - return digsig_verify(keyring[id], sig + 1, siglen - 1, - digest, digestlen); + return digsig_verify(keyring, sig + 1, siglen - 1, digest, + digestlen); case 2: - return asymmetric_verify(keyring[id], sig, siglen, - digest, digestlen); + return asymmetric_verify(keyring, sig, siglen, digest, + digestlen); } return -EOPNOTSUPP; } +int integrity_modsig_verify(const unsigned int id, const struct modsig *modsig) +{ + struct key *keyring; + + keyring = integrity_keyring_from_id(id); + if (IS_ERR(keyring)) + return PTR_ERR(keyring); + + return ima_modsig_verify(keyring, modsig); +} + static int __init __integrity_init_keyring(const unsigned int id, key_perm_t perm, struct key_restriction *restriction) diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig index 8bf46646b185..897bafc59a33 100644 --- a/security/integrity/ima/Kconfig +++ b/security/integrity/ima/Kconfig @@ -236,6 +236,9 @@ config IMA_APPRAISE_BOOTPARAM config IMA_APPRAISE_MODSIG bool "Support module-style signatures for appraisal" depends on IMA_APPRAISE + depends on INTEGRITY_ASYMMETRIC_KEYS + select PKCS7_MESSAGE_PARSER + select MODULE_SIG_FORMAT default n help Adds support for signatures appended to files. 
The format of the diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index e21b06942858..ed8e19d38805 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -196,6 +196,10 @@ enum ima_hooks { __ima_hooks(__ima_hook_enumify) }; +extern const char *const func_tokens[]; + +struct modsig; + /* LIM API function definitions */ int ima_get_action(struct inode *inode, const struct cred *cred, u32 secid, int mask, enum ima_hooks func, int *pcr, @@ -249,7 +253,7 @@ int ima_appraise_measurement(enum ima_hooks func, struct integrity_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, - int xattr_len); + int xattr_len, const struct modsig *modsig); int ima_must_appraise(struct inode *inode, int mask, enum ima_hooks func); void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file); enum integrity_status ima_get_cache_status(struct integrity_iint_cache *iint, @@ -265,7 +269,8 @@ static inline int ima_appraise_measurement(enum ima_hooks func, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, - int xattr_len) + int xattr_len, + const struct modsig *modsig) { return INTEGRITY_UNKNOWN; } @@ -304,11 +309,24 @@ static inline int ima_read_xattr(struct dentry *dentry, #ifdef CONFIG_IMA_APPRAISE_MODSIG bool ima_hook_supports_modsig(enum ima_hooks func); +int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len, + struct modsig **modsig); +void ima_free_modsig(struct modsig *modsig); #else static inline bool ima_hook_supports_modsig(enum ima_hooks func) { return false; } + +static inline int ima_read_modsig(enum ima_hooks func, const void *buf, + loff_t buf_len, struct modsig **modsig) +{ + return -EOPNOTSUPP; +} + +static inline void ima_free_modsig(struct modsig *modsig) +{ +} #endif /* CONFIG_IMA_APPRAISE_MODSIG */ /* LSM based policy rules require audit */ diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c index d3298045737f..d21e40eaba42 100644 --- a/security/integrity/ima/ima_appraise.c +++ b/security/integrity/ima/ima_appraise.c @@ -276,6 +276,33 @@ static int xattr_verify(enum ima_hooks func, struct integrity_iint_cache *iint, return rc; } +/* + * modsig_verify - verify modsig signature + * + * Verify whether the signature matches the file contents. + * + * Return 0 on success, error code otherwise. 
+ */ +static int modsig_verify(enum ima_hooks func, const struct modsig *modsig, + enum integrity_status *status, const char **cause) +{ + int rc; + + rc = integrity_modsig_verify(INTEGRITY_KEYRING_IMA, modsig); + if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && + func == KEXEC_KERNEL_CHECK) + rc = integrity_modsig_verify(INTEGRITY_KEYRING_PLATFORM, + modsig); + if (rc) { + *cause = "invalid-signature"; + *status = INTEGRITY_FAIL; + } else { + *status = INTEGRITY_PASS; + } + + return rc; +} + /* * ima_appraise_measurement - appraise file measurement * @@ -288,7 +315,7 @@ int ima_appraise_measurement(enum ima_hooks func, struct integrity_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, - int xattr_len) + int xattr_len, const struct modsig *modsig) { static const char op[] = "appraise_data"; const char *cause = "unknown"; @@ -296,11 +323,14 @@ int ima_appraise_measurement(enum ima_hooks func, struct inode *inode = d_backing_inode(dentry); enum integrity_status status = INTEGRITY_UNKNOWN; int rc = xattr_len; + bool try_modsig = iint->flags & IMA_MODSIG_ALLOWED && modsig; - if (!(inode->i_opflags & IOP_XATTR)) + /* If not appraising a modsig, we need an xattr. */ + if (!(inode->i_opflags & IOP_XATTR) && !try_modsig) return INTEGRITY_UNKNOWN; - if (rc <= 0) { + /* If reading the xattr failed and there's no modsig, error out. */ + if (rc <= 0 && !try_modsig) { if (rc && rc != -ENODATA) goto out; @@ -323,6 +353,10 @@ int ima_appraise_measurement(enum ima_hooks func, case INTEGRITY_UNKNOWN: break; case INTEGRITY_NOXATTRS: /* No EVM protected xattrs. */ + /* It's fine not to have xattrs when using a modsig. */ + if (try_modsig) + break; + /* fall through */ case INTEGRITY_NOLABEL: /* No security.evm xattr. */ cause = "missing-HMAC"; goto out; @@ -337,6 +371,15 @@ int ima_appraise_measurement(enum ima_hooks func, rc = xattr_verify(func, iint, xattr_value, xattr_len, &status, &cause); + /* + * If we have a modsig and either no imasig or the imasig's key isn't + * known, then try verifying the modsig. + */ + if (try_modsig && + (!xattr_value || xattr_value->type == IMA_XATTR_DIGEST_NG || + rc == -ENOKEY)) + rc = modsig_verify(func, modsig, &status, &cause); + out: /* * File signatures on some filesystems can not be properly verified. @@ -353,7 +396,7 @@ out: op, cause, rc, 0); } else if (status != INTEGRITY_PASS) { /* Fix mode, but don't replace file signatures. 
*/ - if ((ima_appraise & IMA_APPRAISE_FIX) && + if ((ima_appraise & IMA_APPRAISE_FIX) && !try_modsig && (!xattr_value || xattr_value->type != EVM_IMA_XATTR_DIGSIG)) { if (!ima_fix_xattr(dentry, iint)) diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 584019728660..d8672e850615 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -202,6 +202,7 @@ static int process_measurement(struct file *file, const struct cred *cred, int rc = 0, action, must_appraise = 0; int pcr = CONFIG_IMA_MEASURE_PCR_IDX; struct evm_ima_xattr_data *xattr_value = NULL; + struct modsig *modsig = NULL; int xattr_len = 0; bool violation_check; enum hash_algo hash_algo; @@ -302,10 +303,15 @@ static int process_measurement(struct file *file, const struct cred *cred, } if ((action & IMA_APPRAISE_SUBMASK) || - strcmp(template_desc->name, IMA_TEMPLATE_IMA_NAME) != 0) + strcmp(template_desc->name, IMA_TEMPLATE_IMA_NAME) != 0) { /* read 'security.ima' */ xattr_len = ima_read_xattr(file_dentry(file), &xattr_value); + /* Read the appended modsig if allowed by the policy. */ + if (iint->flags & IMA_MODSIG_ALLOWED) + ima_read_modsig(func, buf, size, &modsig); + } + hash_algo = ima_get_hash_algo(xattr_value, xattr_len); rc = ima_collect_measurement(iint, file, buf, size, hash_algo); @@ -322,7 +328,7 @@ static int process_measurement(struct file *file, const struct cred *cred, if (rc == 0 && (action & IMA_APPRAISE_SUBMASK)) { inode_lock(inode); rc = ima_appraise_measurement(func, iint, file, pathname, - xattr_value, xattr_len); + xattr_value, xattr_len, modsig); inode_unlock(inode); if (!rc) rc = mmap_violation_check(func, file, &pathbuf, @@ -339,6 +345,7 @@ out_locked: rc = -EACCES; mutex_unlock(&iint->mutex); kfree(xattr_value); + ima_free_modsig(modsig); out: if (pathbuf) __putname(pathbuf); diff --git a/security/integrity/ima/ima_modsig.c b/security/integrity/ima/ima_modsig.c index 87503bfe8c8b..f41ebe370fa0 100644 --- a/security/integrity/ima/ima_modsig.c +++ b/security/integrity/ima/ima_modsig.c @@ -8,8 +8,17 @@ * Thiago Jung Bauermann */ +#include +#include +#include +#include + #include "ima.h" +struct modsig { + struct pkcs7_message *pkcs7_msg; +}; + /** * ima_hook_supports_modsig - can the policy allow modsig for this hook? * @@ -29,3 +38,65 @@ bool ima_hook_supports_modsig(enum ima_hooks func) return false; } } + +/* + * ima_read_modsig - Read modsig from buf. + * + * Return: 0 on success, error code otherwise. 
+ */ +int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len, + struct modsig **modsig) +{ + const size_t marker_len = strlen(MODULE_SIG_STRING); + const struct module_signature *sig; + struct modsig *hdr; + size_t sig_len; + const void *p; + int rc; + + if (buf_len <= marker_len + sizeof(*sig)) + return -ENOENT; + + p = buf + buf_len - marker_len; + if (memcmp(p, MODULE_SIG_STRING, marker_len)) + return -ENOENT; + + buf_len -= marker_len; + sig = (const struct module_signature *)(p - sizeof(*sig)); + + rc = mod_check_sig(sig, buf_len, func_tokens[func]); + if (rc) + return rc; + + sig_len = be32_to_cpu(sig->sig_len); + buf_len -= sig_len + sizeof(*sig); + + hdr = kmalloc(sizeof(*hdr), GFP_KERNEL); + if (!hdr) + return -ENOMEM; + + hdr->pkcs7_msg = pkcs7_parse_message(buf + buf_len, sig_len); + if (IS_ERR(hdr->pkcs7_msg)) { + kfree(hdr); + return PTR_ERR(hdr->pkcs7_msg); + } + + *modsig = hdr; + + return 0; +} + +int ima_modsig_verify(struct key *keyring, const struct modsig *modsig) +{ + return verify_pkcs7_message_sig(NULL, 0, modsig->pkcs7_msg, keyring, + VERIFYING_MODULE_SIGNATURE, NULL, NULL); +} + +void ima_free_modsig(struct modsig *modsig) +{ + if (!modsig) + return; + + pkcs7_free_message(modsig->pkcs7_msg); + kfree(modsig); +} diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 5b6061d6bce0..873dd7edaa78 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -1258,6 +1258,12 @@ void ima_delete_rules(void) } } +#define __ima_hook_stringify(str) (#str), + +const char *const func_tokens[] = { + __ima_hooks(__ima_hook_stringify) +}; + #ifdef CONFIG_IMA_READ_POLICY enum { mask_exec = 0, mask_write, mask_read, mask_append @@ -1270,12 +1276,6 @@ static const char *const mask_tokens[] = { "^MAY_APPEND" }; -#define __ima_hook_stringify(str) (#str), - -static const char *const func_tokens[] = { - __ima_hooks(__ima_hook_stringify) -}; - void *ima_policy_start(struct seq_file *m, loff_t *pos) { loff_t l = *pos; diff --git a/security/integrity/integrity.h b/security/integrity/integrity.h index 8c5736b68156..d9323d31a3a8 100644 --- a/security/integrity/integrity.h +++ b/security/integrity/integrity.h @@ -148,10 +148,13 @@ int integrity_kernel_read(struct file *file, loff_t offset, extern struct dentry *integrity_dir; +struct modsig; + #ifdef CONFIG_INTEGRITY_SIGNATURE int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen, const char *digest, int digestlen); +int integrity_modsig_verify(unsigned int id, const struct modsig *modsig); int __init integrity_init_keyring(const unsigned int id); int __init integrity_load_x509(const unsigned int id, const char *path); @@ -166,6 +169,12 @@ static inline int integrity_digsig_verify(const unsigned int id, return -EOPNOTSUPP; } +static inline int integrity_modsig_verify(unsigned int id, + const struct modsig *modsig) +{ + return -EOPNOTSUPP; +} + static inline int integrity_init_keyring(const unsigned int id) { return 0; @@ -191,6 +200,16 @@ static inline int asymmetric_verify(struct key *keyring, const char *sig, } #endif +#ifdef CONFIG_IMA_APPRAISE_MODSIG +int ima_modsig_verify(struct key *keyring, const struct modsig *modsig); +#else +static inline int ima_modsig_verify(struct key *keyring, + const struct modsig *modsig) +{ + return -EOPNOTSUPP; +} +#endif + #ifdef CONFIG_IMA_LOAD_X509 void __init ima_load_x509(void); #else From 15588227e086ec662d59df144e48af82e3e592f1 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 
27 Jun 2019 23:19:31 -0300 Subject: [PATCH 012/334] ima: Collect modsig Obtain the modsig and calculate its corresponding hash in ima_collect_measurement(). Signed-off-by: Thiago Jung Bauermann Signed-off-by: Mimi Zohar --- security/integrity/ima/ima.h | 8 ++++- security/integrity/ima/ima_api.c | 5 ++- security/integrity/ima/ima_appraise.c | 2 +- security/integrity/ima/ima_main.c | 2 +- security/integrity/ima/ima_modsig.c | 48 ++++++++++++++++++++++++++- 5 files changed, 60 insertions(+), 5 deletions(-) diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index ed8e19d38805..0bc764c80327 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -207,7 +207,7 @@ int ima_get_action(struct inode *inode, const struct cred *cred, u32 secid, int ima_must_measure(struct inode *inode, int mask, enum ima_hooks func); int ima_collect_measurement(struct integrity_iint_cache *iint, struct file *file, void *buf, loff_t size, - enum hash_algo algo); + enum hash_algo algo, struct modsig *modsig); void ima_store_measurement(struct integrity_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, @@ -311,6 +311,7 @@ static inline int ima_read_xattr(struct dentry *dentry, bool ima_hook_supports_modsig(enum ima_hooks func); int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len, struct modsig **modsig); +void ima_collect_modsig(struct modsig *modsig, const void *buf, loff_t size); void ima_free_modsig(struct modsig *modsig); #else static inline bool ima_hook_supports_modsig(enum ima_hooks func) @@ -324,6 +325,11 @@ static inline int ima_read_modsig(enum ima_hooks func, const void *buf, return -EOPNOTSUPP; } +static inline void ima_collect_modsig(struct modsig *modsig, const void *buf, + loff_t size) +{ +} + static inline void ima_free_modsig(struct modsig *modsig) { } diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c index f614e22bf39f..ff8b7fb03ea0 100644 --- a/security/integrity/ima/ima_api.c +++ b/security/integrity/ima/ima_api.c @@ -205,7 +205,7 @@ int ima_get_action(struct inode *inode, const struct cred *cred, u32 secid, */ int ima_collect_measurement(struct integrity_iint_cache *iint, struct file *file, void *buf, loff_t size, - enum hash_algo algo) + enum hash_algo algo, struct modsig *modsig) { const char *audit_cause = "failed"; struct inode *inode = file_inode(file); @@ -252,6 +252,9 @@ int ima_collect_measurement(struct integrity_iint_cache *iint, memcpy(iint->ima_hash, &hash, length); iint->version = i_version; + if (modsig) + ima_collect_modsig(modsig, buf, size); + /* Possibly temporary failure due to type of read (eg. 
O_DIRECT) */ if (!result) iint->flags |= IMA_COLLECTED; diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c index d21e40eaba42..136ae4e0ee92 100644 --- a/security/integrity/ima/ima_appraise.c +++ b/security/integrity/ima/ima_appraise.c @@ -435,7 +435,7 @@ void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file) !(iint->flags & IMA_HASH)) return; - rc = ima_collect_measurement(iint, file, NULL, 0, ima_hash_algo); + rc = ima_collect_measurement(iint, file, NULL, 0, ima_hash_algo, NULL); if (rc < 0) return; diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index d8672e850615..7c8d92cf2755 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -314,7 +314,7 @@ static int process_measurement(struct file *file, const struct cred *cred, hash_algo = ima_get_hash_algo(xattr_value, xattr_len); - rc = ima_collect_measurement(iint, file, buf, size, hash_algo); + rc = ima_collect_measurement(iint, file, buf, size, hash_algo, modsig); if (rc != 0 && rc != -EBADF && rc != -EINVAL) goto out_locked; diff --git a/security/integrity/ima/ima_modsig.c b/security/integrity/ima/ima_modsig.c index f41ebe370fa0..257387ce3b70 100644 --- a/security/integrity/ima/ima_modsig.c +++ b/security/integrity/ima/ima_modsig.c @@ -17,6 +17,19 @@ struct modsig { struct pkcs7_message *pkcs7_msg; + + enum hash_algo hash_algo; + + /* This digest will go in the 'd-modsig' field of the IMA template. */ + const u8 *digest; + u32 digest_size; + + /* + * This is what will go to the measurement list if the template requires + * storing the signature. + */ + int raw_pkcs7_len; + u8 raw_pkcs7[]; }; /** @@ -71,7 +84,8 @@ int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len, sig_len = be32_to_cpu(sig->sig_len); buf_len -= sig_len + sizeof(*sig); - hdr = kmalloc(sizeof(*hdr), GFP_KERNEL); + /* Allocate sig_len additional bytes to hold the raw PKCS#7 data. */ + hdr = kzalloc(sizeof(*hdr) + sig_len, GFP_KERNEL); if (!hdr) return -ENOMEM; @@ -81,11 +95,43 @@ int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len, return PTR_ERR(hdr->pkcs7_msg); } + memcpy(hdr->raw_pkcs7, buf + buf_len, sig_len); + hdr->raw_pkcs7_len = sig_len; + + /* We don't know the hash algorithm yet. */ + hdr->hash_algo = HASH_ALGO__LAST; + *modsig = hdr; return 0; } +/** + * ima_collect_modsig - Calculate the file hash without the appended signature. + * + * Since the modsig is part of the file contents, the hash used in its signature + * isn't the same one ordinarily calculated by IMA. Therefore PKCS7 code + * calculates a separate one for signature verification. + */ +void ima_collect_modsig(struct modsig *modsig, const void *buf, loff_t size) +{ + int rc; + + /* + * Provide the file contents (minus the appended sig) so that the PKCS7 + * code can calculate the file hash. + */ + size -= modsig->raw_pkcs7_len + strlen(MODULE_SIG_STRING) + + sizeof(struct module_signature); + rc = pkcs7_supply_detached_data(modsig->pkcs7_msg, buf, size); + if (rc) + return; + + /* Ask the PKCS7 code to calculate the file hash. 
*/ + rc = pkcs7_get_digest(modsig->pkcs7_msg, &modsig->digest, + &modsig->digest_size, &modsig->hash_algo); +} + int ima_modsig_verify(struct key *keyring, const struct modsig *modsig) { return verify_pkcs7_message_sig(NULL, 0, modsig->pkcs7_msg, keyring, From 3878d505aa718bcc7b1eb4089ab9b9fb27dee957 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 27 Jun 2019 23:19:32 -0300 Subject: [PATCH 013/334] ima: Define ima-modsig template Define new "d-modsig" template field which holds the digest that is expected to match the one contained in the modsig, and also new "modsig" template field which holds the appended file signature. Add a new "ima-modsig" defined template descriptor with the new fields as well as the ones from the "ima-sig" descriptor. Change ima_store_measurement() to accept a struct modsig * argument so that it can be passed along to the templates via struct ima_event_data. Suggested-by: Mimi Zohar Signed-off-by: Thiago Jung Bauermann Signed-off-by: Mimi Zohar --- Documentation/security/IMA-templates.rst | 3 ++ security/integrity/ima/ima.h | 20 ++++++- security/integrity/ima/ima_api.c | 5 +- security/integrity/ima/ima_main.c | 2 +- security/integrity/ima/ima_modsig.c | 19 +++++++ security/integrity/ima/ima_policy.c | 41 +++++++++++++++ security/integrity/ima/ima_template.c | 7 ++- security/integrity/ima/ima_template_lib.c | 64 ++++++++++++++++++++++- security/integrity/ima/ima_template_lib.h | 4 ++ 9 files changed, 159 insertions(+), 6 deletions(-) diff --git a/Documentation/security/IMA-templates.rst b/Documentation/security/IMA-templates.rst index 3d1cca287aa4..c5a8432972ef 100644 --- a/Documentation/security/IMA-templates.rst +++ b/Documentation/security/IMA-templates.rst @@ -68,8 +68,10 @@ descriptors by adding their identifier to the format string - 'd-ng': the digest of the event, calculated with an arbitrary hash algorithm (field format: [:]digest, where the digest prefix is shown only if the hash algorithm is not SHA1 or MD5); + - 'd-modsig': the digest of the event without the appended modsig; - 'n-ng': the name of the event, without size limitations; - 'sig': the file signature; + - 'modsig' the appended file signature; - 'buf': the buffer data that was used to generate the hash without size limitations; @@ -79,6 +81,7 @@ Below, there is the list of defined template descriptors: - "ima-ng" (default): its format is ``d-ng|n-ng``; - "ima-sig": its format is ``d-ng|n-ng|sig``; - "ima-buf": its format is ``d-ng|n-ng|buf``; + - "ima-modsig": its format is ``d-ng|n-ng|sig|d-modsig|modsig``; Use diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 0bc764c80327..cfc8b0887776 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -60,6 +60,7 @@ struct ima_event_data { const unsigned char *filename; struct evm_ima_xattr_data *xattr_value; int xattr_len; + const struct modsig *modsig; const char *violation; const void *buf; int buf_len; @@ -211,7 +212,7 @@ int ima_collect_measurement(struct integrity_iint_cache *iint, void ima_store_measurement(struct integrity_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, - int xattr_len, int pcr, + int xattr_len, const struct modsig *modsig, int pcr, struct ima_template_desc *template_desc); void ima_audit_measurement(struct integrity_iint_cache *iint, const unsigned char *filename); @@ -312,6 +313,10 @@ bool ima_hook_supports_modsig(enum ima_hooks func); int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t 
buf_len, struct modsig **modsig); void ima_collect_modsig(struct modsig *modsig, const void *buf, loff_t size); +int ima_get_modsig_digest(const struct modsig *modsig, enum hash_algo *algo, + const u8 **digest, u32 *digest_size); +int ima_get_raw_modsig(const struct modsig *modsig, const void **data, + u32 *data_len); void ima_free_modsig(struct modsig *modsig); #else static inline bool ima_hook_supports_modsig(enum ima_hooks func) @@ -330,6 +335,19 @@ static inline void ima_collect_modsig(struct modsig *modsig, const void *buf, { } +static inline int ima_get_modsig_digest(const struct modsig *modsig, + enum hash_algo *algo, const u8 **digest, + u32 *digest_size) +{ + return -EOPNOTSUPP; +} + +static inline int ima_get_raw_modsig(const struct modsig *modsig, + const void **data, u32 *data_len) +{ + return -EOPNOTSUPP; +} + static inline void ima_free_modsig(struct modsig *modsig) { } diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c index ff8b7fb03ea0..ca930e2ebc2c 100644 --- a/security/integrity/ima/ima_api.c +++ b/security/integrity/ima/ima_api.c @@ -288,7 +288,7 @@ out: void ima_store_measurement(struct integrity_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, - int xattr_len, int pcr, + int xattr_len, const struct modsig *modsig, int pcr, struct ima_template_desc *template_desc) { static const char op[] = "add_template_measure"; @@ -300,7 +300,8 @@ void ima_store_measurement(struct integrity_iint_cache *iint, .file = file, .filename = filename, .xattr_value = xattr_value, - .xattr_len = xattr_len }; + .xattr_len = xattr_len, + .modsig = modsig }; int violation = 0; if (iint->measured_pcrs & (0x1 << pcr)) diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 7c8d92cf2755..c87645c2c4c0 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -323,7 +323,7 @@ static int process_measurement(struct file *file, const struct cred *cred, if (action & IMA_MEASURE) ima_store_measurement(iint, file, pathname, - xattr_value, xattr_len, pcr, + xattr_value, xattr_len, modsig, pcr, template_desc); if (rc == 0 && (action & IMA_APPRAISE_SUBMASK)) { inode_lock(inode); diff --git a/security/integrity/ima/ima_modsig.c b/security/integrity/ima/ima_modsig.c index 257387ce3b70..c412e31d1714 100644 --- a/security/integrity/ima/ima_modsig.c +++ b/security/integrity/ima/ima_modsig.c @@ -138,6 +138,25 @@ int ima_modsig_verify(struct key *keyring, const struct modsig *modsig) VERIFYING_MODULE_SIGNATURE, NULL, NULL); } +int ima_get_modsig_digest(const struct modsig *modsig, enum hash_algo *algo, + const u8 **digest, u32 *digest_size) +{ + *algo = modsig->hash_algo; + *digest = modsig->digest; + *digest_size = modsig->digest_size; + + return 0; +} + +int ima_get_raw_modsig(const struct modsig *modsig, const void **data, + u32 *data_len) +{ + *data = &modsig->raw_pkcs7; + *data_len = modsig->raw_pkcs7_len; + + return 0; +} + void ima_free_modsig(struct modsig *modsig) { if (!modsig) diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 873dd7edaa78..4badc4fcda98 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -6,6 +6,9 @@ * ima_policy.c * - initialize default measure policy rules */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -845,6 +848,38 @@ static void ima_log_string(struct audit_buffer *ab, char *key, char *value) ima_log_string_op(ab, 
key, value, NULL); } +/* + * Validating the appended signature included in the measurement list requires + * the file hash calculated without the appended signature (i.e., the 'd-modsig' + * field). Therefore, notify the user if they have the 'modsig' field but not + * the 'd-modsig' field in the template. + */ +static void check_template_modsig(const struct ima_template_desc *template) +{ +#define MSG "template with 'modsig' field also needs 'd-modsig' field\n" + bool has_modsig, has_dmodsig; + static bool checked; + int i; + + /* We only need to notify the user once. */ + if (checked) + return; + + has_modsig = has_dmodsig = false; + for (i = 0; i < template->num_fields; i++) { + if (!strcmp(template->fields[i]->field_id, "modsig")) + has_modsig = true; + else if (!strcmp(template->fields[i]->field_id, "d-modsig")) + has_dmodsig = true; + } + + if (has_modsig && !has_dmodsig) + pr_notice(MSG); + + checked = true; +#undef MSG +} + static int ima_parse_rule(char *rule, struct ima_rule_entry *entry) { struct audit_buffer *ab; @@ -1187,6 +1222,12 @@ static int ima_parse_rule(char *rule, struct ima_rule_entry *entry) else if (entry->action == APPRAISE) temp_ima_appraise |= ima_appraise_flag(entry->func); + if (!result && entry->flags & IMA_MODSIG_ALLOWED) { + template_desc = entry->template ? entry->template : + ima_template_desc_current(); + check_template_modsig(template_desc); + } + audit_log_format(ab, "res=%d", !result); audit_log_end(ab); return result; diff --git a/security/integrity/ima/ima_template.c b/security/integrity/ima/ima_template.c index cb349d7b2601..88d494ca6248 100644 --- a/security/integrity/ima/ima_template.c +++ b/security/integrity/ima/ima_template.c @@ -23,6 +23,7 @@ static struct ima_template_desc builtin_templates[] = { {.name = "ima-ng", .fmt = "d-ng|n-ng"}, {.name = "ima-sig", .fmt = "d-ng|n-ng|sig"}, {.name = "ima-buf", .fmt = "d-ng|n-ng|buf"}, + {.name = "ima-modsig", .fmt = "d-ng|n-ng|sig|d-modsig|modsig"}, {.name = "", .fmt = ""}, /* placeholder for a custom format */ }; @@ -42,6 +43,10 @@ static const struct ima_template_field supported_fields[] = { .field_show = ima_show_template_sig}, {.field_id = "buf", .field_init = ima_eventbuf_init, .field_show = ima_show_template_buf}, + {.field_id = "d-modsig", .field_init = ima_eventdigest_modsig_init, + .field_show = ima_show_template_digest_ng}, + {.field_id = "modsig", .field_init = ima_eventmodsig_init, + .field_show = ima_show_template_sig}, }; /* @@ -49,7 +54,7 @@ static const struct ima_template_field supported_fields[] = { * need to be accounted for since they shouldn't be defined in the same template * description as 'd-ng' and 'n-ng' respectively. 
*/ -#define MAX_TEMPLATE_NAME_LEN sizeof("d-ng|n-ng|sig|buf") +#define MAX_TEMPLATE_NAME_LEN sizeof("d-ng|n-ng|sig|buf|d-modisg|modsig") static struct ima_template_desc *ima_template; diff --git a/security/integrity/ima/ima_template_lib.c b/security/integrity/ima/ima_template_lib.c index 2fb9a10bc6b7..32ae05d88257 100644 --- a/security/integrity/ima/ima_template_lib.c +++ b/security/integrity/ima/ima_template_lib.c @@ -225,7 +225,8 @@ int ima_parse_buf(void *bufstartp, void *bufendp, void **bufcurp, return 0; } -static int ima_eventdigest_init_common(u8 *digest, u32 digestsize, u8 hash_algo, +static int ima_eventdigest_init_common(const u8 *digest, u32 digestsize, + u8 hash_algo, struct ima_field_data *field_data) { /* @@ -328,6 +329,41 @@ out: hash_algo, field_data); } +/* + * This function writes the digest of the file which is expected to match the + * digest contained in the file's appended signature. + */ +int ima_eventdigest_modsig_init(struct ima_event_data *event_data, + struct ima_field_data *field_data) +{ + enum hash_algo hash_algo; + const u8 *cur_digest; + u32 cur_digestsize; + + if (!event_data->modsig) + return 0; + + if (event_data->violation) { + /* Recording a violation. */ + hash_algo = HASH_ALGO_SHA1; + cur_digest = NULL; + cur_digestsize = 0; + } else { + int rc; + + rc = ima_get_modsig_digest(event_data->modsig, &hash_algo, + &cur_digest, &cur_digestsize); + if (rc) + return rc; + else if (hash_algo == HASH_ALGO__LAST || cur_digestsize == 0) + /* There was some error collecting the digest. */ + return -EINVAL; + } + + return ima_eventdigest_init_common(cur_digest, cur_digestsize, + hash_algo, field_data); +} + static int ima_eventname_init_common(struct ima_event_data *event_data, struct ima_field_data *field_data, bool size_limit) @@ -406,3 +442,29 @@ int ima_eventbuf_init(struct ima_event_data *event_data, event_data->buf_len, DATA_FMT_HEX, field_data); } + +/* + * ima_eventmodsig_init - include the appended file signature as part of the + * template data + */ +int ima_eventmodsig_init(struct ima_event_data *event_data, + struct ima_field_data *field_data) +{ + const void *data; + u32 data_len; + int rc; + + if (!event_data->modsig) + return 0; + + /* + * modsig is a runtime structure containing pointers. Get its raw data + * instead. 
+ */ + rc = ima_get_raw_modsig(event_data->modsig, &data, &data_len); + if (rc) + return rc; + + return ima_write_template_field_data(data, data_len, DATA_FMT_HEX, + field_data); +} diff --git a/security/integrity/ima/ima_template_lib.h b/security/integrity/ima/ima_template_lib.h index 652aa5de81ef..9a88c79a7a61 100644 --- a/security/integrity/ima/ima_template_lib.h +++ b/security/integrity/ima/ima_template_lib.h @@ -36,10 +36,14 @@ int ima_eventname_init(struct ima_event_data *event_data, struct ima_field_data *field_data); int ima_eventdigest_ng_init(struct ima_event_data *event_data, struct ima_field_data *field_data); +int ima_eventdigest_modsig_init(struct ima_event_data *event_data, + struct ima_field_data *field_data); int ima_eventname_ng_init(struct ima_event_data *event_data, struct ima_field_data *field_data); int ima_eventsig_init(struct ima_event_data *event_data, struct ima_field_data *field_data); int ima_eventbuf_init(struct ima_event_data *event_data, struct ima_field_data *field_data); +int ima_eventmodsig_init(struct ima_event_data *event_data, + struct ima_field_data *field_data); #endif /* __LINUX_IMA_TEMPLATE_LIB_H */ From e5092255bb3967bcc473dc86492dbbd5f7714023 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 27 Jun 2019 23:19:33 -0300 Subject: [PATCH 014/334] ima: Store the measurement again when appraising a modsig If the IMA template contains the "modsig" or "d-modsig" field, then the modsig should be added to the measurement list when the file is appraised. And that is what normally happens, but if a measurement rule caused a file containing a modsig to be measured before a different rule causes it to be appraised, the resulting measurement entry will not contain the modsig because it is only fetched during appraisal. When the appraisal rule triggers, it won't store a new measurement containing the modsig because the file was already measured. We need to detect that situation and store an additional measurement with the modsig. This is done by adding an IMA_MEASURE action flag if we read a modsig and the IMA template contains a modsig field. 
Suggested-by: Mimi Zohar Signed-off-by: Thiago Jung Bauermann Signed-off-by: Mimi Zohar --- security/integrity/ima/ima.h | 1 + security/integrity/ima/ima_api.c | 19 +++++++++++++++---- security/integrity/ima/ima_main.c | 15 ++++++++++++--- security/integrity/ima/ima_template.c | 19 +++++++++++++++++++ 4 files changed, 47 insertions(+), 7 deletions(-) diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index cfc8b0887776..19769bf5f6ab 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -150,6 +150,7 @@ int template_desc_init_fields(const char *template_fmt, int *num_fields); struct ima_template_desc *ima_template_desc_current(void); struct ima_template_desc *lookup_template_desc(const char *name); +bool ima_template_has_modsig(const struct ima_template_desc *ima_template); int ima_restore_measurement_entry(struct ima_template_entry *entry); int ima_restore_measurement_list(loff_t bufsize, void *buf); int ima_measurements_show(struct seq_file *m, void *v); diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c index ca930e2ebc2c..65224474675b 100644 --- a/security/integrity/ima/ima_api.c +++ b/security/integrity/ima/ima_api.c @@ -219,6 +219,14 @@ int ima_collect_measurement(struct integrity_iint_cache *iint, char digest[IMA_MAX_DIGEST_SIZE]; } hash; + /* + * Always collect the modsig, because IMA might have already collected + * the file digest without collecting the modsig in a previous + * measurement rule. + */ + if (modsig) + ima_collect_modsig(modsig, buf, size); + if (iint->flags & IMA_COLLECTED) goto out; @@ -252,9 +260,6 @@ int ima_collect_measurement(struct integrity_iint_cache *iint, memcpy(iint->ima_hash, &hash, length); iint->version = i_version; - if (modsig) - ima_collect_modsig(modsig, buf, size); - /* Possibly temporary failure due to type of read (eg. O_DIRECT) */ if (!result) iint->flags |= IMA_COLLECTED; @@ -304,7 +309,13 @@ void ima_store_measurement(struct integrity_iint_cache *iint, .modsig = modsig }; int violation = 0; - if (iint->measured_pcrs & (0x1 << pcr)) + /* + * We still need to store the measurement in the case of MODSIG because + * we only have its contents to put in the list at the time of + * appraisal, but a file measurement from earlier might already exist in + * the measurement list. + */ + if (iint->measured_pcrs & (0x1 << pcr) && !modsig) return; result = ima_alloc_init_template(&event_data, &entry, template_desc); diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index c87645c2c4c0..79c01516211b 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -307,9 +307,18 @@ static int process_measurement(struct file *file, const struct cred *cred, /* read 'security.ima' */ xattr_len = ima_read_xattr(file_dentry(file), &xattr_value); - /* Read the appended modsig if allowed by the policy. */ - if (iint->flags & IMA_MODSIG_ALLOWED) - ima_read_modsig(func, buf, size, &modsig); + /* + * Read the appended modsig if allowed by the policy, and allow + * an additional measurement list entry, if needed, based on the + * template format and whether the file was already measured. 
+ */ + if (iint->flags & IMA_MODSIG_ALLOWED) { + rc = ima_read_modsig(func, buf, size, &modsig); + + if (!rc && ima_template_has_modsig(template_desc) && + iint->flags & IMA_MEASURED) + action |= IMA_MEASURE; + } } hash_algo = ima_get_hash_algo(xattr_value, xattr_len); diff --git a/security/integrity/ima/ima_template.c b/security/integrity/ima/ima_template.c index 88d494ca6248..f5b950e0a955 100644 --- a/security/integrity/ima/ima_template.c +++ b/security/integrity/ima/ima_template.c @@ -58,6 +58,25 @@ static const struct ima_template_field supported_fields[] = { static struct ima_template_desc *ima_template; +/** + * ima_template_has_modsig - Check whether template has modsig-related fields. + * @ima_template: IMA template to check. + * + * Tells whether the given template has fields referencing a file's appended + * signature. + */ +bool ima_template_has_modsig(const struct ima_template_desc *ima_template) +{ + int i; + + for (i = 0; i < ima_template->num_fields; i++) + if (!strcmp(ima_template->fields[i]->field_id, "modsig") || + !strcmp(ima_template->fields[i]->field_id, "d-modsig")) + return true; + + return false; +} + static int __init ima_template_setup(char *str) { struct ima_template_desc *template_desc; From f5e1040196dbfe14c77ce3dfe3b7b08d2d961e88 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Tue, 2 Jul 2019 10:00:40 +0200 Subject: [PATCH 015/334] ima: always return negative code for error integrity_kernel_read() returns the number of bytes read. If this is a short read then this positive value is returned from ima_calc_file_hash_atfm(). Currently this is only indirectly called from ima_calc_file_hash() and this function only tests for the return value being zero or nonzero and also doesn't forward the return value. Nevertheless there's no point in returning a positive value as an error, so translate a short read into -EINVAL. Signed-off-by: Sascha Hauer Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_crypto.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index d4c7b8e1b083..7532b062be59 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -268,8 +268,11 @@ static int ima_calc_file_hash_atfm(struct file *file, rbuf_len = min_t(loff_t, i_size - offset, rbuf_size[active]); rc = integrity_kernel_read(file, offset, rbuf[active], rbuf_len); - if (rc != rbuf_len) + if (rc != rbuf_len) { + if (rc >= 0) + rc = -EINVAL; goto out3; + } if (rbuf[1] && offset) { /* Using two buffers, and it is not the first From 4ece3125f21b1d42b84896c5646dbf0e878464e1 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Tue, 2 Jul 2019 10:00:41 +0200 Subject: [PATCH 016/334] ima: fix freeing ongoing ahash_request integrity_kernel_read() can fail in which case we forward to call ahash_request_free() on a currently running request. We have to wait for its completion before we can free the request. This was observed by interrupting a "find / -type f -xdev -print0 | xargs -0 cat 1>/dev/null" with ctrl-c on an IMA enabled filesystem. 
Signed-off-by: Sascha Hauer Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_crypto.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index 7532b062be59..73044fc6a952 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -271,6 +271,11 @@ static int ima_calc_file_hash_atfm(struct file *file, if (rc != rbuf_len) { if (rc >= 0) rc = -EINVAL; + /* + * Forward current rc, do not overwrite with return value + * from ahash_wait() + */ + ahash_wait(ahash_rc, &wait); goto out3; } From 4af9027d3f4061992c0b065102a0a666b72f073b Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 30 Jul 2019 17:02:26 +0800 Subject: [PATCH 017/334] csky/dma: Fixup cache_op failed when cross memory ZONEs If the paddr and size are cross between NORMAL_ZONE and HIGHMEM_ZONE memory range, cache_op will panic in do_page_fault with bad_area. Optimize the code to support the range which cross memory ZONEs. Changes for V2: - Revert back to postcore_initcall Signed-off-by: Guo Ren Cc: Christoph Hellwig Cc: Arnd Bergmann --- arch/csky/mm/dma-mapping.c | 71 ++++++++++++++------------------------ 1 file changed, 26 insertions(+), 45 deletions(-) diff --git a/arch/csky/mm/dma-mapping.c b/arch/csky/mm/dma-mapping.c index 80783bb71c5c..65f531d54814 100644 --- a/arch/csky/mm/dma-mapping.c +++ b/arch/csky/mm/dma-mapping.c @@ -20,69 +20,50 @@ static int __init atomic_pool_init(void) } postcore_initcall(atomic_pool_init); -void arch_dma_prep_coherent(struct page *page, size_t size) -{ - if (PageHighMem(page)) { - unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; - - do { - void *ptr = kmap_atomic(page); - size_t _size = (size < PAGE_SIZE) ? size : PAGE_SIZE; - - memset(ptr, 0, _size); - dma_wbinv_range((unsigned long)ptr, - (unsigned long)ptr + _size); - - kunmap_atomic(ptr); - - page++; - size -= PAGE_SIZE; - count--; - } while (count); - } else { - void *ptr = page_address(page); - - memset(ptr, 0, size); - dma_wbinv_range((unsigned long)ptr, (unsigned long)ptr + size); - } -} - static inline void cache_op(phys_addr_t paddr, size_t size, void (*fn)(unsigned long start, unsigned long end)) { - struct page *page = pfn_to_page(paddr >> PAGE_SHIFT); - unsigned int offset = paddr & ~PAGE_MASK; - size_t left = size; - unsigned long start; + struct page *page = phys_to_page(paddr); + void *start = __va(page_to_phys(page)); + unsigned long offset = offset_in_page(paddr); + size_t left = size; do { size_t len = left; + if (offset + len > PAGE_SIZE) + len = PAGE_SIZE - offset; + if (PageHighMem(page)) { - void *addr; + start = kmap_atomic(page); - if (offset + len > PAGE_SIZE) { - if (offset >= PAGE_SIZE) { - page += offset >> PAGE_SHIFT; - offset &= ~PAGE_MASK; - } - len = PAGE_SIZE - offset; - } + fn((unsigned long)start + offset, + (unsigned long)start + offset + len); - addr = kmap_atomic(page); - start = (unsigned long)(addr + offset); - fn(start, start + len); - kunmap_atomic(addr); + kunmap_atomic(start); } else { - start = (unsigned long)phys_to_virt(paddr); - fn(start, start + size); + fn((unsigned long)start + offset, + (unsigned long)start + offset + len); } offset = 0; + page++; + start += PAGE_SIZE; left -= len; } while (left); } +static void dma_wbinv_set_zero_range(unsigned long start, unsigned long end) +{ + memset((void *)start, 0, end - start); + dma_wbinv_range(start, end); +} + +void arch_dma_prep_coherent(struct page *page, size_t size) +{ + cache_op(page_to_phys(page), size, 
dma_wbinv_set_zero_range); +} + void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, size_t size, enum dma_data_direction dir) { From ae76f635d4e1cffa6870cc5472567ca9d6940a22 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 30 Jul 2019 17:16:28 +0800 Subject: [PATCH 018/334] csky: Optimize arch_sync_dma_for_cpu/device with dma_inv_range DMA_FROM_DEVICE only need to read dma data of memory into CPU cache, so there is no need to clear cache before. Also clear + inv for DMA_FROM_DEVICE won't cause problem, because the memory range for dma won't be touched by software during dma working. Changes for V2: - Remove clr cache and ignore the DMA_TO_DEVICE in _for_cpu. - Change inv to wbinv cache with DMA_FROM_DEVICE in _for_device. Signed-off-by: Guo Ren Cc: Arnd Bergmann --- arch/csky/include/asm/cache.h | 1 + arch/csky/mm/cachev1.c | 7 ++++++- arch/csky/mm/cachev2.c | 11 ++++++++++- arch/csky/mm/dma-mapping.c | 5 ++--- 4 files changed, 19 insertions(+), 5 deletions(-) diff --git a/arch/csky/include/asm/cache.h b/arch/csky/include/asm/cache.h index d68373463676..1d5fc2f78fd7 100644 --- a/arch/csky/include/asm/cache.h +++ b/arch/csky/include/asm/cache.h @@ -24,6 +24,7 @@ void cache_wbinv_range(unsigned long start, unsigned long end); void cache_wbinv_all(void); void dma_wbinv_range(unsigned long start, unsigned long end); +void dma_inv_range(unsigned long start, unsigned long end); void dma_wb_range(unsigned long start, unsigned long end); #endif diff --git a/arch/csky/mm/cachev1.c b/arch/csky/mm/cachev1.c index b8a75cce0b8c..494ec912abff 100644 --- a/arch/csky/mm/cachev1.c +++ b/arch/csky/mm/cachev1.c @@ -120,7 +120,12 @@ void dma_wbinv_range(unsigned long start, unsigned long end) cache_op_range(start, end, DATA_CACHE|CACHE_CLR|CACHE_INV, 1); } +void dma_inv_range(unsigned long start, unsigned long end) +{ + cache_op_range(start, end, DATA_CACHE|CACHE_CLR|CACHE_INV, 1); +} + void dma_wb_range(unsigned long start, unsigned long end) { - cache_op_range(start, end, DATA_CACHE|CACHE_INV, 1); + cache_op_range(start, end, DATA_CACHE|CACHE_CLR|CACHE_INV, 1); } diff --git a/arch/csky/mm/cachev2.c b/arch/csky/mm/cachev2.c index baaf05d69f44..b61be6518e21 100644 --- a/arch/csky/mm/cachev2.c +++ b/arch/csky/mm/cachev2.c @@ -69,11 +69,20 @@ void dma_wbinv_range(unsigned long start, unsigned long end) sync_is(); } +void dma_inv_range(unsigned long start, unsigned long end) +{ + unsigned long i = start & ~(L1_CACHE_BYTES - 1); + + for (; i < end; i += L1_CACHE_BYTES) + asm volatile("dcache.iva %0\n"::"r"(i):"memory"); + sync_is(); +} + void dma_wb_range(unsigned long start, unsigned long end) { unsigned long i = start & ~(L1_CACHE_BYTES - 1); for (; i < end; i += L1_CACHE_BYTES) - asm volatile("dcache.civa %0\n"::"r"(i):"memory"); + asm volatile("dcache.cva %0\n"::"r"(i):"memory"); sync_is(); } diff --git a/arch/csky/mm/dma-mapping.c b/arch/csky/mm/dma-mapping.c index 65f531d54814..106ef02a8f89 100644 --- a/arch/csky/mm/dma-mapping.c +++ b/arch/csky/mm/dma-mapping.c @@ -85,11 +85,10 @@ void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, { switch (dir) { case DMA_TO_DEVICE: - cache_op(paddr, size, dma_wb_range); - break; + return; case DMA_FROM_DEVICE: case DMA_BIDIRECTIONAL: - cache_op(paddr, size, dma_wbinv_range); + cache_op(paddr, size, dma_inv_range); break; default: BUG(); From 70433f67ec3a54710744902d782f8954325e25b8 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 6 Aug 2019 12:15:19 +1000 Subject: [PATCH 019/334] MODSIGN: make new include file self contained 
Fixes: c8424e776b09 ("MODSIGN: Export module signature definitions") Signed-off-by: Stephen Rothwell Signed-off-by: Mimi Zohar --- include/linux/module_signature.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/module_signature.h b/include/linux/module_signature.h index 523617fc5b6a..7eb4b00381ac 100644 --- a/include/linux/module_signature.h +++ b/include/linux/module_signature.h @@ -9,6 +9,8 @@ #ifndef _LINUX_MODULE_SIGNATURE_H #define _LINUX_MODULE_SIGNATURE_H +#include <linux/types.h> + /* In stripped ARM and x86-64 modules, ~ is surprisingly rare. */ #define MODULE_SIG_STRING "~Module signature appended~\n" From a92c7ba982e3950a82fd63b3fd289248bd99e8f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valdis=20Kl=C4=93tnieks?= Date: Wed, 7 Aug 2019 19:22:34 -0400 Subject: [PATCH 020/334] fs/handle.c - fix up kerneldoc When building with W=1, we get some kerneldoc warnings: CC fs/fhandle.o fs/fhandle.c:259: warning: Function parameter or member 'flags' not described in 'sys_open_by_handle_at' fs/fhandle.c:259: warning: Excess function parameter 'flag' description in 'sys_open_by_handle_at' Fix the typo that caused it. Signed-off-by: Valdis Kletnieks Signed-off-by: Al Viro --- fs/fhandle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fhandle.c b/fs/fhandle.c index 0ee727485615..01263ffbc4c0 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -246,7 +246,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, * sys_open_by_handle_at: Open the file handle * @mountdirfd: directory file descriptor * @handle: file handle to be opened - * @flag: open flags. + * @flags: open flags. * * @mountdirfd indicate the directory file descriptor * of the mount point. file handle is decoded relative From 5336c17928cc464845ff765ce45b368c22f848e0 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Thu, 15 Aug 2019 16:24:56 +0800 Subject: [PATCH 021/334] csky: Fixup ioremap function losing Implement the following apis to meet usage in different scenarios. - ioremap (NonCache + StrongOrder) - ioremap_nocache (NonCache + StrongOrder) - ioremap_wc (NonCache + WeakOrder ) - ioremap_cache ( Cache + WeakOrder ) Also change flag VM_ALLOC to VM_IOREMAP in get_vm_area_caller. Signed-off-by: Guo Ren Cc: Arnd Bergmann Cc: Christoph Hellwig --- arch/csky/include/asm/io.h | 23 ++++++++++++----------- arch/csky/mm/ioremap.c | 23 +++++++++++++++++------ 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/arch/csky/include/asm/io.h b/arch/csky/include/asm/io.h index c1dfa9c10e36..80d071e2567f 100644 --- a/arch/csky/include/asm/io.h +++ b/arch/csky/include/asm/io.h @@ -4,17 +4,10 @@ #ifndef __ASM_CSKY_IO_H #define __ASM_CSKY_IO_H -#include +#include #include #include -extern void __iomem *ioremap(phys_addr_t offset, size_t size); - -extern void iounmap(void *addr); - -extern int remap_area_pages(unsigned long address, phys_addr_t phys_addr, - size_t size, unsigned long flags); - /* * I/O memory access primitives. Reads are ordered relative to any * following Normal memory access. Writes are ordered relative to any prior @@ -40,9 +33,17 @@ extern int remap_area_pages(unsigned long address, phys_addr_t phys_addr, #define writel(v,c) ({ wmb(); writel_relaxed((v),(c)); mb(); }) #endif -#define ioremap_nocache(phy, sz) ioremap(phy, sz) -#define ioremap_wc ioremap_nocache -#define ioremap_wt ioremap_nocache +/* + * I/O memory mapping functions.
+ */ +extern void __iomem *ioremap_cache(phys_addr_t addr, size_t size); +extern void __iomem *__ioremap(phys_addr_t addr, size_t size, pgprot_t prot); +extern void iounmap(void *addr); + +#define ioremap(addr, size) __ioremap((addr), (size), pgprot_noncached(PAGE_KERNEL)) +#define ioremap_wc(addr, size) __ioremap((addr), (size), pgprot_writecombine(PAGE_KERNEL)) +#define ioremap_nocache(addr, size) ioremap((addr), (size)) +#define ioremap_cache ioremap_cache #include diff --git a/arch/csky/mm/ioremap.c b/arch/csky/mm/ioremap.c index 48531115fd9d..e13cd3497628 100644 --- a/arch/csky/mm/ioremap.c +++ b/arch/csky/mm/ioremap.c @@ -8,12 +8,12 @@ #include -void __iomem *ioremap(phys_addr_t addr, size_t size) +static void __iomem *__ioremap_caller(phys_addr_t addr, size_t size, + pgprot_t prot, void *caller) { phys_addr_t last_addr; unsigned long offset, vaddr; struct vm_struct *area; - pgprot_t prot; last_addr = addr + size - 1; if (!size || last_addr < addr) @@ -23,14 +23,12 @@ void __iomem *ioremap(phys_addr_t addr, size_t size) addr &= PAGE_MASK; size = PAGE_ALIGN(size + offset); - area = get_vm_area_caller(size, VM_ALLOC, __builtin_return_address(0)); + area = get_vm_area_caller(size, VM_IOREMAP, caller); if (!area) return NULL; vaddr = (unsigned long)area->addr; - prot = pgprot_noncached(PAGE_KERNEL); - if (ioremap_page_range(vaddr, vaddr + size, addr, prot)) { free_vm_area(area); return NULL; @@ -38,7 +36,20 @@ void __iomem *ioremap(phys_addr_t addr, size_t size) return (void __iomem *)(vaddr + offset); } -EXPORT_SYMBOL(ioremap); + +void __iomem *__ioremap(phys_addr_t phys_addr, size_t size, pgprot_t prot) +{ + return __ioremap_caller(phys_addr, size, prot, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(__ioremap); + +void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size) +{ + return __ioremap_caller(phys_addr, size, PAGE_KERNEL, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_cache); void iounmap(void __iomem *addr) { From 10fa8acf0fa6ed79ddb662488addb3ba71e9db60 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 30 Jul 2019 19:06:38 -0400 Subject: [PATCH 022/334] nfsd: Remove unnecessary NULL checks "cb" is never actually NULL in these functions. On a quick skim of the history, they seem to have been there from the beginning. I'm not sure if they originally served a purpose. Reported-by: Jia-Ju Bai Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4callback.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 397eb7820929..524111420b48 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -512,11 +512,9 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, if (unlikely(status)) return status; - if (cb != NULL) { - status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status || cb->cb_seq_status)) - return status; - } + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status); } @@ -604,11 +602,10 @@ static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, if (unlikely(status)) return status; - if (cb) { - status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status || cb->cb_seq_status)) - return status; - } + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + return decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &cb->cb_status); } #endif /* CONFIG_NFSD_PNFS */ @@ -663,11 +660,10 @@ static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp, if (unlikely(status)) return status; - if (cb) { - status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status || cb->cb_seq_status)) - return status; - } + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + return decode_cb_op_status(xdr, OP_CB_NOTIFY_LOCK, &cb->cb_status); } @@ -759,11 +755,10 @@ static int nfs4_xdr_dec_cb_offload(struct rpc_rqst *rqstp, if (unlikely(status)) return status; - if (cb) { - status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status || cb->cb_seq_status)) - return status; - } + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + return decode_cb_op_status(xdr, OP_CB_OFFLOAD, &cb->cb_status); } /* From d6dfe43ec6062beea5ba1172b957e74a13c95b86 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 16 Aug 2019 17:48:36 -0400 Subject: [PATCH 023/334] svcrdma: Remove svc_rdma_wq Clean up: the system workqueue will work just as well. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 - net/sunrpc/xprtrdma/svc_rdma.c | 7 ------- net/sunrpc/xprtrdma/svc_rdma_transport.c | 3 ++- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 981f0d726ad4..edb39900fe04 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -200,7 +200,6 @@ extern struct svc_xprt_class svc_rdma_bc_class; #endif /* svc_rdma.c */ -extern struct workqueue_struct *svc_rdma_wq; extern int svc_rdma_init(void); extern void svc_rdma_cleanup(void); diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index abdb3004a1e3..97bca509a391 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -73,8 +73,6 @@ atomic_t rdma_stat_rq_prod; atomic_t rdma_stat_sq_poll; atomic_t rdma_stat_sq_prod; -struct workqueue_struct *svc_rdma_wq; - /* * This function implements reading and resetting an atomic_t stat * variable through read/write to a proc file. 
Any write to the file @@ -230,7 +228,6 @@ static struct ctl_table svcrdma_root_table[] = { void svc_rdma_cleanup(void) { dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); - destroy_workqueue(svc_rdma_wq); if (svcrdma_table_header) { unregister_sysctl_table(svcrdma_table_header); svcrdma_table_header = NULL; @@ -246,10 +243,6 @@ int svc_rdma_init(void) dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); - svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0); - if (!svc_rdma_wq) - return -ENOMEM; - if (!svcrdma_table_header) svcrdma_table_header = register_sysctl_table(svcrdma_root_table); diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 3fe665152d95..18d6eb3686e7 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -630,8 +630,9 @@ static void svc_rdma_free(struct svc_xprt *xprt) { struct svcxprt_rdma *rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); + INIT_WORK(&rdma->sc_work, __svc_rdma_free); - queue_work(svc_rdma_wq, &rdma->sc_work); + schedule_work(&rdma->sc_work); } static int svc_rdma_has_wspace(struct svc_xprt *xprt) From 4866073e6ddf03066c925d3237903d7f4ca68982 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 16 Aug 2019 17:49:38 -0400 Subject: [PATCH 024/334] svcrdma: Use llist for managing cache of recv_ctxts Use a wait-free mechanism for managing the svc_rdma_recv_ctxts free list. Subsequently, sc_recv_lock can be eliminated. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 5 +++-- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 24 ++++++++++-------------- net/sunrpc/xprtrdma/svc_rdma_transport.c | 3 +-- 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index edb39900fe04..40f65888dd38 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -42,6 +42,7 @@ #ifndef SVC_RDMA_H #define SVC_RDMA_H +#include #include #include #include @@ -107,8 +108,7 @@ struct svcxprt_rdma { struct list_head sc_read_complete_q; struct work_struct sc_work; - spinlock_t sc_recv_lock; - struct list_head sc_recv_ctxts; + struct llist_head sc_recv_ctxts; }; /* sc_flags */ #define RDMAXPRT_CONN_PENDING 3 @@ -125,6 +125,7 @@ enum { #define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD struct svc_rdma_recv_ctxt { + struct llist_node rc_node; struct list_head rc_list; struct ib_recv_wr rc_recv_wr; struct ib_cqe rc_cqe; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 65e2fb9aac65..96bccd398469 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -172,9 +172,10 @@ static void svc_rdma_recv_ctxt_destroy(struct svcxprt_rdma *rdma, void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma) { struct svc_rdma_recv_ctxt *ctxt; + struct llist_node *node; - while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_recv_ctxts))) { - list_del(&ctxt->rc_list); + while ((node = llist_del_first(&rdma->sc_recv_ctxts))) { + ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node); svc_rdma_recv_ctxt_destroy(rdma, ctxt); } } @@ -183,21 +184,18 @@ static struct svc_rdma_recv_ctxt * svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) { struct svc_rdma_recv_ctxt *ctxt; + struct llist_node *node; - spin_lock(&rdma->sc_recv_lock); - ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_recv_ctxts); - if (!ctxt) + 
node = llist_del_first(&rdma->sc_recv_ctxts); + if (!node) goto out_empty; - list_del(&ctxt->rc_list); - spin_unlock(&rdma->sc_recv_lock); + ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node); out: ctxt->rc_page_count = 0; return ctxt; out_empty: - spin_unlock(&rdma->sc_recv_lock); - ctxt = svc_rdma_recv_ctxt_alloc(rdma); if (!ctxt) return NULL; @@ -218,11 +216,9 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, for (i = 0; i < ctxt->rc_page_count; i++) put_page(ctxt->rc_pages[i]); - if (!ctxt->rc_temp) { - spin_lock(&rdma->sc_recv_lock); - list_add(&ctxt->rc_list, &rdma->sc_recv_ctxts); - spin_unlock(&rdma->sc_recv_lock); - } else + if (!ctxt->rc_temp) + llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts); + else svc_rdma_recv_ctxt_destroy(rdma, ctxt); } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 18d6eb3686e7..4182d569b5cf 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -140,14 +140,13 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); INIT_LIST_HEAD(&cma_xprt->sc_send_ctxts); - INIT_LIST_HEAD(&cma_xprt->sc_recv_ctxts); + init_llist_head(&cma_xprt->sc_recv_ctxts); INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); spin_lock_init(&cma_xprt->sc_send_lock); - spin_lock_init(&cma_xprt->sc_recv_lock); spin_lock_init(&cma_xprt->sc_rw_ctxt_lock); /* From f69d6d8eef7807f8d937b81da24bebd2e926e4d2 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:44 -0400 Subject: [PATCH 025/334] sunrpc: add a new cache_detail operation for when a cache is flushed When the exports table is changed, exportfs will usually write a new time to the "flush" file in the nfsd.export cache procfile. This tells the kernel to flush any entries that are older than that value. This gives us a mechanism to tell whether an unexport might have occurred. Add a new ->flush cache_detail operation that is called after flushing the cache whenever someone writes to a "flush" file. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/cache.h | 1 + net/sunrpc/cache.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index f7d086b77a21..f8603724fbee 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -87,6 +87,7 @@ struct cache_detail { int has_died); struct cache_head * (*alloc)(void); + void (*flush)(void); int (*match)(struct cache_head *orig, struct cache_head *new); void (*init)(struct cache_head *orig, struct cache_head *new); void (*update)(struct cache_head *orig, struct cache_head *new); diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index a6a6190ad37a..a349094f6fb7 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1524,6 +1524,9 @@ static ssize_t write_flush(struct file *file, const char __user *buf, cd->nextcheck = now; cache_flush(); + if (cd->flush) + cd->flush(); + *ppos += count; return count; } From 18f6622ebbdea56a83f8e553c159ce2d62d3ad0c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:45 -0400 Subject: [PATCH 026/334] locks: create a new notifier chain for lease attempts With the new file caching infrastructure in nfsd, we can end up holding files open for an indefinite period of time, even when they are still idle. This may prevent the kernel from handing out leases on the file, which is something we don't want to block. Fix this by running a SRCU notifier call chain whenever on any lease attempt. nfsd can then purge the cache for that inode before returning. Since SRCU is only conditionally compiled in, we must only define the new chain if it's enabled, and users of the chain must ensure that SRCU is enabled. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/locks.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 5 ++++ 2 files changed, 66 insertions(+) diff --git a/fs/locks.c b/fs/locks.c index 686eae21daf6..1913481bfbf7 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1990,6 +1990,64 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp, } EXPORT_SYMBOL(generic_setlease); +#if IS_ENABLED(CONFIG_SRCU) +/* + * Kernel subsystems can register to be notified on any attempt to set + * a new lease with the lease_notifier_chain. This is used by (e.g.) nfsd + * to close files that it may have cached when there is an attempt to set a + * conflicting lease. 
+ */ +static struct srcu_notifier_head lease_notifier_chain; + +static inline void +lease_notifier_chain_init(void) +{ + srcu_init_notifier_head(&lease_notifier_chain); +} + +static inline void +setlease_notifier(long arg, struct file_lock *lease) +{ + if (arg != F_UNLCK) + srcu_notifier_call_chain(&lease_notifier_chain, arg, lease); +} + +int lease_register_notifier(struct notifier_block *nb) +{ + return srcu_notifier_chain_register(&lease_notifier_chain, nb); +} +EXPORT_SYMBOL_GPL(lease_register_notifier); + +void lease_unregister_notifier(struct notifier_block *nb) +{ + srcu_notifier_chain_unregister(&lease_notifier_chain, nb); +} +EXPORT_SYMBOL_GPL(lease_unregister_notifier); + +#else /* !IS_ENABLED(CONFIG_SRCU) */ +static inline void +lease_notifier_chain_init(void) +{ +} + +static inline void +setlease_notifier(long arg, struct file_lock *lease) +{ +} + +int lease_register_notifier(struct notifier_block *nb) +{ + return 0; +} +EXPORT_SYMBOL_GPL(lease_register_notifier); + +void lease_unregister_notifier(struct notifier_block *nb) +{ +} +EXPORT_SYMBOL_GPL(lease_unregister_notifier); + +#endif /* IS_ENABLED(CONFIG_SRCU) */ + /** * vfs_setlease - sets a lease on an open file * @filp: file pointer @@ -2010,6 +2068,8 @@ EXPORT_SYMBOL(generic_setlease); int vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv) { + if (lease) + setlease_notifier(arg, *lease); if (filp->f_op->setlease) return filp->f_op->setlease(filp, arg, lease, priv); else @@ -2923,6 +2983,7 @@ static int __init filelock_init(void) INIT_HLIST_HEAD(&fll->hlist); } + lease_notifier_chain_init(); return 0; } core_initcall(filelock_init); diff --git a/include/linux/fs.h b/include/linux/fs.h index 56b8e358af5c..0f106c7f4bb9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1155,6 +1155,11 @@ extern void lease_get_mtime(struct inode *, struct timespec64 *time); extern int generic_setlease(struct file *, long, struct file_lock **, void **priv); extern int vfs_setlease(struct file *, long, struct file_lock **, void **); extern int lease_modify(struct file_lock *, int, struct list_head *); + +struct notifier_block; +extern int lease_register_notifier(struct notifier_block *); +extern void lease_unregister_notifier(struct notifier_block *); + struct files_struct; extern void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files); From b72679ee89a0a0ecd26f7b6fcae96cdaababff94 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 18 Aug 2019 14:18:46 -0400 Subject: [PATCH 027/334] notify: export symbols for use by the knfsd file cache The knfsd file cache will need to detect when files are unlinked, so that it can close the associated cached files. Export a minimal set of notifier functions to allow it to do so. Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. 
Bruce Fields --- fs/notify/fsnotify.h | 2 -- fs/notify/group.c | 2 ++ fs/notify/mark.c | 6 ++++++ include/linux/fsnotify_backend.h | 2 ++ 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 5a00121fb219..f3462828a0e2 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -54,8 +54,6 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) { fsnotify_destroy_marks(&sb->s_fsnotify_marks); } -/* Wait until all marks queued for destruction are destroyed */ -extern void fsnotify_wait_marks_destroyed(void); /* * update the dentry->d_flags of all of inode's children to indicate if inode cares diff --git a/fs/notify/group.c b/fs/notify/group.c index 0391190305cc..133f723aca07 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -108,6 +108,7 @@ void fsnotify_put_group(struct fsnotify_group *group) if (refcount_dec_and_test(&group->refcnt)) fsnotify_final_destroy_group(group); } +EXPORT_SYMBOL_GPL(fsnotify_put_group); /* * Create a new fsnotify_group and hold a reference for the group returned. @@ -137,6 +138,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) return group; } +EXPORT_SYMBOL_GPL(fsnotify_alloc_group); int fsnotify_fasync(int fd, struct file *file, int on) { diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 99ddd126f6f0..1d96216dffd1 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -276,6 +276,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) queue_delayed_work(system_unbound_wq, &reaper_work, FSNOTIFY_REAPER_DELAY); } +EXPORT_SYMBOL_GPL(fsnotify_put_mark); /* * Get mark reference when we found the mark via lockless traversal of object @@ -430,6 +431,7 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, mutex_unlock(&group->mark_mutex); fsnotify_free_mark(mark); } +EXPORT_SYMBOL_GPL(fsnotify_destroy_mark); /* * Sorting function for lists of fsnotify marks. @@ -685,6 +687,7 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, mutex_unlock(&group->mark_mutex); return ret; } +EXPORT_SYMBOL_GPL(fsnotify_add_mark); /* * Given a list of marks, find the mark associated with given group. 
If found @@ -711,6 +714,7 @@ struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp, spin_unlock(&conn->lock); return NULL; } +EXPORT_SYMBOL_GPL(fsnotify_find_mark); /* Clear any marks in a group with given type mask */ void fsnotify_clear_marks_by_group(struct fsnotify_group *group, @@ -809,6 +813,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, mark->group = group; WRITE_ONCE(mark->connector, NULL); } +EXPORT_SYMBOL_GPL(fsnotify_init_mark); /* * Destroy all marks in destroy_list, waits for SRCU period to finish before @@ -837,3 +842,4 @@ void fsnotify_wait_marks_destroyed(void) { flush_delayed_work(&reaper_work); } +EXPORT_SYMBOL_GPL(fsnotify_wait_marks_destroyed); diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 2de3b2ddd19a..1915bdba2fad 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -475,6 +475,8 @@ extern void fsnotify_destroy_mark(struct fsnotify_mark *mark, extern void fsnotify_detach_mark(struct fsnotify_mark *mark); /* free mark */ extern void fsnotify_free_mark(struct fsnotify_mark *mark); +/* Wait until all marks queued for destruction are destroyed */ +extern void fsnotify_wait_marks_destroyed(void); /* run all the marks in a group, and clear all of the marks attached to given object type */ extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int type); /* run all the marks in a group, and clear all of the vfsmount marks */ From 7239a40ca8bfd88dc5d2f66a14882054fe8e3b92 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 18 Aug 2019 14:18:47 -0400 Subject: [PATCH 028/334] vfs: Export flush_delayed_fput for use by knfsd. Allow knfsd to flush the delayed fput list so that it can ensure the cached struct file is closed before it is unlinked. Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/file_table.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/file_table.c b/fs/file_table.c index b07b53f24ff5..30d55c9a1744 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -327,6 +327,7 @@ void flush_delayed_fput(void) { delayed_fput(NULL); } +EXPORT_SYMBOL_GPL(flush_delayed_fput); static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); From 65294c1f2c5e72b15b76e16c8c8cfd9359fc9f6f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:48 -0400 Subject: [PATCH 029/334] nfsd: add a new struct file caching facility to nfsd Currently, NFSv2/3 reads and writes have to open a file, do the read or write and then close it again for each RPC. This is highly inefficient, especially when the underlying filesystem has a relatively slow open routine. This patch adds a new open file cache to knfsd. Rather than doing an open for each RPC, the read/write handlers can call into this cache to see if there is one already there for the correct filehandle and NFS_MAY_READ/WRITE flags. If there isn't an entry, then we create a new one and attempt to perform the open. If there is, then we wait until the entry is fully instantiated and return it if it is at the end of the wait. If it's not, then we attempt to take over construction. Since the main goal is to speed up NFSv2/3 I/O, we don't want to close these files on last put of these objects. We need to keep them around for a little while since we never know when the next READ/WRITE will come in. Cache entries have a hardcoded 1s timeout, and we have a recurring workqueue job that walks the cache and purges any entries that have expired. 
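Before the diffs, here is the lookup-or-create shape of such a cache, reduced to a single bucket and a single thread (field and function names are hypothetical; the real fs/nfsd/filecache.c below adds per-bucket locking, RCU, refcounting, construction hand-off and fsnotify integration):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define CACHE_TIMEOUT_SEC 1	/* mirrors the hardcoded 1s timeout */

struct cached_file {
	struct cached_file *next;
	char handle[16];	/* stands in for the NFS filehandle */
	unsigned int may;	/* stands in for NFSD_MAY_READ/WRITE */
	time_t last_used;
	int fd;			/* the long-lived open file */
};

static struct cached_file *bucket;

static struct cached_file *file_cache_get(const char handle[16], unsigned int may)
{
	time_t now = time(NULL);
	struct cached_file **pp = &bucket, *cf;

	while ((cf = *pp) != NULL) {
		if (now - cf->last_used > CACHE_TIMEOUT_SEC) {
			/* what the recurring workqueue job does */
			*pp = cf->next;
			/* a real cache would close(cf->fd) here */
			free(cf);
			continue;
		}
		if (!memcmp(cf->handle, handle, 16) && (cf->may & may) == may) {
			cf->last_used = now;	/* hit: reuse the open file */
			return cf;
		}
		pp = &cf->next;
	}

	/* miss: open once, then serve later RPCs from the cache */
	cf = calloc(1, sizeof(*cf));
	if (!cf)
		return NULL;
	memcpy(cf->handle, handle, 16);
	cf->may = may;
	cf->last_used = now;
	cf->fd = -1;		/* a real cache would open(2) the file here */
	cf->next = bucket;
	bucket = cf;
	return cf;
}

int main(void)
{
	char fh[16] = "fh:0001";
	struct cached_file *a = file_cache_get(fh, 1);
	struct cached_file *b = file_cache_get(fh, 1);

	printf("second lookup %s the first entry\n", a == b ? "reused" : "missed");
	return 0;
}

Keeping the struct file open across RPCs is the whole point: the open/close pair moves out of the per-RPC path and into the cache's miss and expiry paths.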
Signed-off-by: Jeff Layton Signed-off-by: Weston Andros Adamson Signed-off-by: Richard Sharpe Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/Kconfig | 1 + fs/nfsd/Makefile | 3 +- fs/nfsd/export.c | 13 + fs/nfsd/filecache.c | 885 ++++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/filecache.h | 60 +++ fs/nfsd/nfssvc.c | 9 +- fs/nfsd/trace.h | 140 +++++++ fs/nfsd/vfs.c | 65 ++-- fs/nfsd/vfs.h | 3 + 9 files changed, 1155 insertions(+), 24 deletions(-) create mode 100644 fs/nfsd/filecache.c create mode 100644 fs/nfsd/filecache.h diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index d25f6bbe7006..bff8456220e0 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -3,6 +3,7 @@ config NFSD tristate "NFS server support" depends on INET depends on FILE_LOCKING + depends on FSNOTIFY select LOCKD select SUNRPC select EXPORTFS diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 2bfb58eefad1..6a40b1afe703 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -11,7 +11,8 @@ obj-$(CONFIG_NFSD) += nfsd.o nfsd-y += trace.o nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ - export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o + export.o auth.o lockd.o nfscache.o nfsxdr.o \ + stats.o filecache.o nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index baa01956a5b3..052fac64b578 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -22,6 +22,7 @@ #include "nfsfh.h" #include "netns.h" #include "pnfs.h" +#include "filecache.h" #define NFSDDBG_FACILITY NFSDDBG_EXPORT @@ -232,6 +233,17 @@ static struct cache_head *expkey_alloc(void) return NULL; } +static void expkey_flush(void) +{ + /* + * Take the nfsd_mutex here to ensure that the file cache is not + * destroyed while we're in the middle of flushing. + */ + mutex_lock(&nfsd_mutex); + nfsd_file_cache_purge(); + mutex_unlock(&nfsd_mutex); +} + static const struct cache_detail svc_expkey_cache_template = { .owner = THIS_MODULE, .hash_size = EXPKEY_HASHMAX, @@ -244,6 +256,7 @@ static const struct cache_detail svc_expkey_cache_template = { .init = expkey_init, .update = expkey_update, .alloc = expkey_alloc, + .flush = expkey_flush, }; static int diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c new file mode 100644 index 000000000000..a2fcb251d2f6 --- /dev/null +++ b/fs/nfsd/filecache.c @@ -0,0 +1,885 @@ +/* + * Open file cache. + * + * (c) 2015 - Jeff Layton + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vfs.h" +#include "nfsd.h" +#include "nfsfh.h" +#include "filecache.h" +#include "trace.h" + +#define NFSDDBG_FACILITY NFSDDBG_FH + +/* FIXME: dynamically size this for the machine somehow? 
*/ +#define NFSD_FILE_HASH_BITS 12 +#define NFSD_FILE_HASH_SIZE (1 << NFSD_FILE_HASH_BITS) +#define NFSD_LAUNDRETTE_DELAY (2 * HZ) + +#define NFSD_FILE_LRU_RESCAN (0) +#define NFSD_FILE_SHUTDOWN (1) +#define NFSD_FILE_LRU_THRESHOLD (4096UL) +#define NFSD_FILE_LRU_LIMIT (NFSD_FILE_LRU_THRESHOLD << 2) + +/* We only care about NFSD_MAY_READ/WRITE for this cache */ +#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE) + +struct nfsd_fcache_bucket { + struct hlist_head nfb_head; + spinlock_t nfb_lock; + unsigned int nfb_count; + unsigned int nfb_maxcount; +}; + +static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); + +static struct kmem_cache *nfsd_file_slab; +static struct kmem_cache *nfsd_file_mark_slab; +static struct nfsd_fcache_bucket *nfsd_file_hashtbl; +static struct list_lru nfsd_file_lru; +static long nfsd_file_lru_flags; +static struct fsnotify_group *nfsd_file_fsnotify_group; +static atomic_long_t nfsd_filecache_count; +static struct delayed_work nfsd_filecache_laundrette; + +enum nfsd_file_laundrette_ctl { + NFSD_FILE_LAUNDRETTE_NOFLUSH = 0, + NFSD_FILE_LAUNDRETTE_MAY_FLUSH +}; + +static void +nfsd_file_schedule_laundrette(enum nfsd_file_laundrette_ctl ctl) +{ + long count = atomic_long_read(&nfsd_filecache_count); + + if (count == 0 || test_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags)) + return; + + /* Be more aggressive about scanning if over the threshold */ + if (count > NFSD_FILE_LRU_THRESHOLD) + mod_delayed_work(system_wq, &nfsd_filecache_laundrette, 0); + else + schedule_delayed_work(&nfsd_filecache_laundrette, NFSD_LAUNDRETTE_DELAY); + + if (ctl == NFSD_FILE_LAUNDRETTE_NOFLUSH) + return; + + /* ...and don't delay flushing if we're out of control */ + if (count >= NFSD_FILE_LRU_LIMIT) + flush_delayed_work(&nfsd_filecache_laundrette); +} + +static void +nfsd_file_slab_free(struct rcu_head *rcu) +{ + struct nfsd_file *nf = container_of(rcu, struct nfsd_file, nf_rcu); + + put_cred(nf->nf_cred); + kmem_cache_free(nfsd_file_slab, nf); +} + +static void +nfsd_file_mark_free(struct fsnotify_mark *mark) +{ + struct nfsd_file_mark *nfm = container_of(mark, struct nfsd_file_mark, + nfm_mark); + + kmem_cache_free(nfsd_file_mark_slab, nfm); +} + +static struct nfsd_file_mark * +nfsd_file_mark_get(struct nfsd_file_mark *nfm) +{ + if (!atomic_inc_not_zero(&nfm->nfm_ref)) + return NULL; + return nfm; +} + +static void +nfsd_file_mark_put(struct nfsd_file_mark *nfm) +{ + if (atomic_dec_and_test(&nfm->nfm_ref)) { + + fsnotify_destroy_mark(&nfm->nfm_mark, nfsd_file_fsnotify_group); + fsnotify_put_mark(&nfm->nfm_mark); + } +} + +static struct nfsd_file_mark * +nfsd_file_mark_find_or_create(struct nfsd_file *nf) +{ + int err; + struct fsnotify_mark *mark; + struct nfsd_file_mark *nfm = NULL, *new; + struct inode *inode = nf->nf_inode; + + do { + mutex_lock(&nfsd_file_fsnotify_group->mark_mutex); + mark = fsnotify_find_mark(&inode->i_fsnotify_marks, + nfsd_file_fsnotify_group); + if (mark) { + nfm = nfsd_file_mark_get(container_of(mark, + struct nfsd_file_mark, + nfm_mark)); + mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex); + fsnotify_put_mark(mark); + if (likely(nfm)) + break; + } else + mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex); + + /* allocate a new nfm */ + new = kmem_cache_alloc(nfsd_file_mark_slab, GFP_KERNEL); + if (!new) + return NULL; + fsnotify_init_mark(&new->nfm_mark, nfsd_file_fsnotify_group); + new->nfm_mark.mask = FS_ATTRIB|FS_DELETE_SELF; + atomic_set(&new->nfm_ref, 1); + + err = fsnotify_add_inode_mark(&new->nfm_mark, inode, 0); + + /* + * If the add 
was successful, then return the object. + * Otherwise, we need to put the reference we hold on the + * nfm_mark. The fsnotify code will take a reference and put + * it on failure, so we can't just free it directly. It's also + * not safe to call fsnotify_destroy_mark on it as the + * mark->group will be NULL. Thus, we can't let the nfm_ref + * counter drive the destruction at this point. + */ + if (likely(!err)) + nfm = new; + else + fsnotify_put_mark(&new->nfm_mark); + } while (unlikely(err == -EEXIST)); + + return nfm; +} + +static struct nfsd_file * +nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval) +{ + struct nfsd_file *nf; + + nf = kmem_cache_alloc(nfsd_file_slab, GFP_KERNEL); + if (nf) { + INIT_HLIST_NODE(&nf->nf_node); + INIT_LIST_HEAD(&nf->nf_lru); + nf->nf_file = NULL; + nf->nf_cred = get_current_cred(); + nf->nf_flags = 0; + nf->nf_inode = inode; + nf->nf_hashval = hashval; + atomic_set(&nf->nf_ref, 1); + nf->nf_may = may & NFSD_FILE_MAY_MASK; + if (may & NFSD_MAY_NOT_BREAK_LEASE) { + if (may & NFSD_MAY_WRITE) + __set_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags); + if (may & NFSD_MAY_READ) + __set_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags); + } + nf->nf_mark = NULL; + trace_nfsd_file_alloc(nf); + } + return nf; +} + +static bool +nfsd_file_free(struct nfsd_file *nf) +{ + bool flush = false; + + trace_nfsd_file_put_final(nf); + if (nf->nf_mark) + nfsd_file_mark_put(nf->nf_mark); + if (nf->nf_file) { + get_file(nf->nf_file); + filp_close(nf->nf_file, NULL); + fput(nf->nf_file); + flush = true; + } + call_rcu(&nf->nf_rcu, nfsd_file_slab_free); + return flush; +} + +static void +nfsd_file_do_unhash(struct nfsd_file *nf) +{ + lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); + + trace_nfsd_file_unhash(nf); + + --nfsd_file_hashtbl[nf->nf_hashval].nfb_count; + hlist_del_rcu(&nf->nf_node); + if (!list_empty(&nf->nf_lru)) + list_lru_del(&nfsd_file_lru, &nf->nf_lru); + atomic_long_dec(&nfsd_filecache_count); +} + +static bool +nfsd_file_unhash(struct nfsd_file *nf) +{ + if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { + nfsd_file_do_unhash(nf); + return true; + } + return false; +} + +/* + * Return true if the file was unhashed. 
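+ * If the hash table held the last remaining reference, that reference is + * kept and the entry is queued on the dispose list for the caller to + * release.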
+ */ +static bool +nfsd_file_unhash_and_release_locked(struct nfsd_file *nf, struct list_head *dispose) +{ + lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); + + trace_nfsd_file_unhash_and_release_locked(nf); + if (!nfsd_file_unhash(nf)) + return false; + /* keep final reference for nfsd_file_lru_dispose */ + if (atomic_add_unless(&nf->nf_ref, -1, 1)) + return true; + + list_add(&nf->nf_lru, dispose); + return true; +} + +static int +nfsd_file_put_noref(struct nfsd_file *nf) +{ + int count; + trace_nfsd_file_put(nf); + + count = atomic_dec_return(&nf->nf_ref); + if (!count) { + WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags)); + nfsd_file_free(nf); + } + return count; +} + +void +nfsd_file_put(struct nfsd_file *nf) +{ + bool is_hashed = test_bit(NFSD_FILE_HASHED, &nf->nf_flags) != 0; + + set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); + if (nfsd_file_put_noref(nf) == 1 && is_hashed) + nfsd_file_schedule_laundrette(NFSD_FILE_LAUNDRETTE_MAY_FLUSH); +} + +struct nfsd_file * +nfsd_file_get(struct nfsd_file *nf) +{ + if (likely(atomic_inc_not_zero(&nf->nf_ref))) + return nf; + return NULL; +} + +static void +nfsd_file_dispose_list(struct list_head *dispose) +{ + struct nfsd_file *nf; + + while(!list_empty(dispose)) { + nf = list_first_entry(dispose, struct nfsd_file, nf_lru); + list_del(&nf->nf_lru); + nfsd_file_put_noref(nf); + } +} + +static void +nfsd_file_dispose_list_sync(struct list_head *dispose) +{ + bool flush = false; + struct nfsd_file *nf; + + while(!list_empty(dispose)) { + nf = list_first_entry(dispose, struct nfsd_file, nf_lru); + list_del(&nf->nf_lru); + if (!atomic_dec_and_test(&nf->nf_ref)) + continue; + if (nfsd_file_free(nf)) + flush = true; + } + if (flush) + flush_delayed_fput(); +} + +/* + * Note this can deadlock with nfsd_file_cache_purge. + */ +static enum lru_status +nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru, + spinlock_t *lock, void *arg) + __releases(lock) + __acquires(lock) +{ + struct list_head *head = arg; + struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru); + + /* + * Do a lockless refcount check. The hashtable holds one reference, so + * we look to see if anything else has a reference, or if any have + * been put since the shrinker last ran. Those don't get unhashed and + * released. + * + * Note that in the put path, we set the flag and then decrement the + * counter. Here we check the counter and then test and clear the flag. + * That order is deliberate to ensure that we can do this locklessly. 
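+ * Concretely: if a racing nfsd_file_put() has already dropped the count + * by the time we sample it, its earlier set_bit(REFERENCED) is seen by the + * test_and_clear_bit() below, so a recently used entry is rescanned rather + * than reaped.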
+ */ + if (atomic_read(&nf->nf_ref) > 1) + goto out_skip; + if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) + goto out_rescan; + + if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) + goto out_skip; + + list_lru_isolate_move(lru, &nf->nf_lru, head); + return LRU_REMOVED; +out_rescan: + set_bit(NFSD_FILE_LRU_RESCAN, &nfsd_file_lru_flags); +out_skip: + return LRU_SKIP; +} + +static void +nfsd_file_lru_dispose(struct list_head *head) +{ + while(!list_empty(head)) { + struct nfsd_file *nf = list_first_entry(head, + struct nfsd_file, nf_lru); + list_del_init(&nf->nf_lru); + spin_lock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); + nfsd_file_do_unhash(nf); + spin_unlock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); + nfsd_file_put_noref(nf); + } +} + +static unsigned long +nfsd_file_lru_count(struct shrinker *s, struct shrink_control *sc) +{ + return list_lru_count(&nfsd_file_lru); +} + +static unsigned long +nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc) +{ + LIST_HEAD(head); + unsigned long ret; + + ret = list_lru_shrink_walk(&nfsd_file_lru, sc, nfsd_file_lru_cb, &head); + nfsd_file_lru_dispose(&head); + return ret; +} + +static struct shrinker nfsd_file_shrinker = { + .scan_objects = nfsd_file_lru_scan, + .count_objects = nfsd_file_lru_count, + .seeks = 1, +}; + +static void +__nfsd_file_close_inode(struct inode *inode, unsigned int hashval, + struct list_head *dispose) +{ + struct nfsd_file *nf; + struct hlist_node *tmp; + + spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); + hlist_for_each_entry_safe(nf, tmp, &nfsd_file_hashtbl[hashval].nfb_head, nf_node) { + if (inode == nf->nf_inode) + nfsd_file_unhash_and_release_locked(nf, dispose); + } + spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); +} + +/** + * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file + * @inode: inode of the file to attempt to remove + * + * Walk the whole hash bucket, looking for any files that correspond to "inode". + * If any do, then unhash them and put the hashtable reference to them and + * destroy any that had their last reference put. Also ensure that any of the + * fputs also have their final __fput done as well. + */ +void +nfsd_file_close_inode_sync(struct inode *inode) +{ + unsigned int hashval = (unsigned int)hash_long(inode->i_ino, + NFSD_FILE_HASH_BITS); + LIST_HEAD(dispose); + + __nfsd_file_close_inode(inode, hashval, &dispose); + trace_nfsd_file_close_inode_sync(inode, hashval, !list_empty(&dispose)); + nfsd_file_dispose_list_sync(&dispose); +} + +/** + * nfsd_file_close_inode - attempt to forcibly close a nfsd_file + * @inode: inode of the file to attempt to remove + * + * Walk the whole hash bucket, looking for any files that correspond to "inode". + * If any do, then unhash them and put the hashtable reference to them and + * destroy any that had their last reference put. + */ +static void +nfsd_file_close_inode(struct inode *inode) +{ + unsigned int hashval = (unsigned int)hash_long(inode->i_ino, + NFSD_FILE_HASH_BITS); + LIST_HEAD(dispose); + + __nfsd_file_close_inode(inode, hashval, &dispose); + trace_nfsd_file_close_inode(inode, hashval, !list_empty(&dispose)); + nfsd_file_dispose_list(&dispose); +} + +/** + * nfsd_file_delayed_close - close unused nfsd_files + * @work: dummy + * + * Walk the LRU list and close any entries that have not been used since + * the last scan. + * + * Note this can deadlock with nfsd_file_cache_purge.
+ */ +static void +nfsd_file_delayed_close(struct work_struct *work) +{ + LIST_HEAD(head); + + list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb, &head, LONG_MAX); + + if (test_and_clear_bit(NFSD_FILE_LRU_RESCAN, &nfsd_file_lru_flags)) + nfsd_file_schedule_laundrette(NFSD_FILE_LAUNDRETTE_NOFLUSH); + + if (!list_empty(&head)) { + nfsd_file_lru_dispose(&head); + flush_delayed_fput(); + } +} + +static int +nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg, + void *data) +{ + struct file_lock *fl = data; + + /* Only close files for F_SETLEASE leases */ + if (fl->fl_flags & FL_LEASE) + nfsd_file_close_inode_sync(file_inode(fl->fl_file)); + return 0; +} + +static struct notifier_block nfsd_file_lease_notifier = { + .notifier_call = nfsd_file_lease_notifier_call, +}; + +static int +nfsd_file_fsnotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + u32 mask, const void *data, int data_type, + const struct qstr *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info) +{ + trace_nfsd_file_fsnotify_handle_event(inode, mask); + + /* Should be no marks on non-regular files */ + if (!S_ISREG(inode->i_mode)) { + WARN_ON_ONCE(1); + return 0; + } + + /* don't close files if this was not the last link */ + if (mask & FS_ATTRIB) { + if (inode->i_nlink) + return 0; + } + + nfsd_file_close_inode(inode); + return 0; +} + + +static const struct fsnotify_ops nfsd_file_fsnotify_ops = { + .handle_event = nfsd_file_fsnotify_handle_event, + .free_mark = nfsd_file_mark_free, +}; + +int +nfsd_file_cache_init(void) +{ + int ret = -ENOMEM; + unsigned int i; + + clear_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags); + + if (nfsd_file_hashtbl) + return 0; + + nfsd_file_hashtbl = kcalloc(NFSD_FILE_HASH_SIZE, + sizeof(*nfsd_file_hashtbl), GFP_KERNEL); + if (!nfsd_file_hashtbl) { + pr_err("nfsd: unable to allocate nfsd_file_hashtbl\n"); + goto out_err; + } + + nfsd_file_slab = kmem_cache_create("nfsd_file", + sizeof(struct nfsd_file), 0, 0, NULL); + if (!nfsd_file_slab) { + pr_err("nfsd: unable to create nfsd_file_slab\n"); + goto out_err; + } + + nfsd_file_mark_slab = kmem_cache_create("nfsd_file_mark", + sizeof(struct nfsd_file_mark), 0, 0, NULL); + if (!nfsd_file_mark_slab) { + pr_err("nfsd: unable to create nfsd_file_mark_slab\n"); + goto out_err; + } + + + ret = list_lru_init(&nfsd_file_lru); + if (ret) { + pr_err("nfsd: failed to init nfsd_file_lru: %d\n", ret); + goto out_err; + } + + ret = register_shrinker(&nfsd_file_shrinker); + if (ret) { + pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret); + goto out_lru; + } + + ret = lease_register_notifier(&nfsd_file_lease_notifier); + if (ret) { + pr_err("nfsd: unable to register lease notifier: %d\n", ret); + goto out_shrinker; + } + + nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops); + if (IS_ERR(nfsd_file_fsnotify_group)) { + pr_err("nfsd: unable to create fsnotify group: %ld\n", + PTR_ERR(nfsd_file_fsnotify_group)); + nfsd_file_fsnotify_group = NULL; + goto out_notifier; + } + + for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) { + INIT_HLIST_HEAD(&nfsd_file_hashtbl[i].nfb_head); + spin_lock_init(&nfsd_file_hashtbl[i].nfb_lock); + } + + INIT_DELAYED_WORK(&nfsd_filecache_laundrette, nfsd_file_delayed_close); +out: + return ret; +out_notifier: + lease_unregister_notifier(&nfsd_file_lease_notifier); +out_shrinker: + unregister_shrinker(&nfsd_file_shrinker); +out_lru: + list_lru_destroy(&nfsd_file_lru); +out_err: + kmem_cache_destroy(nfsd_file_slab); + nfsd_file_slab = NULL; + 
kmem_cache_destroy(nfsd_file_mark_slab); + nfsd_file_mark_slab = NULL; + kfree(nfsd_file_hashtbl); + nfsd_file_hashtbl = NULL; + goto out; +} + +/* + * Note this can deadlock with nfsd_file_lru_cb. + */ +void +nfsd_file_cache_purge(void) +{ + unsigned int i; + struct nfsd_file *nf; + LIST_HEAD(dispose); + bool del; + + if (!nfsd_file_hashtbl) + return; + + for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) { + spin_lock(&nfsd_file_hashtbl[i].nfb_lock); + while(!hlist_empty(&nfsd_file_hashtbl[i].nfb_head)) { + nf = hlist_entry(nfsd_file_hashtbl[i].nfb_head.first, + struct nfsd_file, nf_node); + del = nfsd_file_unhash_and_release_locked(nf, &dispose); + + /* + * Deadlock detected! Something marked this entry as + * unhashed, but hasn't removed it from the hash list. + */ + WARN_ON_ONCE(!del); + } + spin_unlock(&nfsd_file_hashtbl[i].nfb_lock); + nfsd_file_dispose_list(&dispose); + } +} + +void +nfsd_file_cache_shutdown(void) +{ + LIST_HEAD(dispose); + + set_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags); + + lease_unregister_notifier(&nfsd_file_lease_notifier); + unregister_shrinker(&nfsd_file_shrinker); + /* + * make sure all callers of nfsd_file_lru_cb are done before + * calling nfsd_file_cache_purge + */ + cancel_delayed_work_sync(&nfsd_filecache_laundrette); + nfsd_file_cache_purge(); + list_lru_destroy(&nfsd_file_lru); + rcu_barrier(); + fsnotify_put_group(nfsd_file_fsnotify_group); + nfsd_file_fsnotify_group = NULL; + kmem_cache_destroy(nfsd_file_slab); + nfsd_file_slab = NULL; + fsnotify_wait_marks_destroyed(); + kmem_cache_destroy(nfsd_file_mark_slab); + nfsd_file_mark_slab = NULL; + kfree(nfsd_file_hashtbl); + nfsd_file_hashtbl = NULL; +} + +static bool +nfsd_match_cred(const struct cred *c1, const struct cred *c2) +{ + int i; + + if (!uid_eq(c1->fsuid, c2->fsuid)) + return false; + if (!gid_eq(c1->fsgid, c2->fsgid)) + return false; + if (c1->group_info == NULL || c2->group_info == NULL) + return c1->group_info == c2->group_info; + if (c1->group_info->ngroups != c2->group_info->ngroups) + return false; + for (i = 0; i < c1->group_info->ngroups; i++) { + if (!gid_eq(c1->group_info->gid[i], c2->group_info->gid[i])) + return false; + } + return true; +} + +static struct nfsd_file * +nfsd_file_find_locked(struct inode *inode, unsigned int may_flags, + unsigned int hashval) +{ + struct nfsd_file *nf; + unsigned char need = may_flags & NFSD_FILE_MAY_MASK; + + hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head, + nf_node) { + if ((need & nf->nf_may) != need) + continue; + if (nf->nf_inode != inode) + continue; + if (!nfsd_match_cred(nf->nf_cred, current_cred())) + continue; + if (nfsd_file_get(nf) != NULL) + return nf; + } + return NULL; +} + +/** + * nfsd_file_is_cached - are there any cached open files for this fh? + * @inode: inode of the file to check + * + * Scan the hashtable for open files that match this fh. Returns true if there + * are any, and false if not.
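+ * + * Note that this is an unlocked, best-effort check; no reference is taken, + * so the result may already be stale by the time the caller acts on it.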
+ */ +bool +nfsd_file_is_cached(struct inode *inode) +{ + bool ret = false; + struct nfsd_file *nf; + unsigned int hashval; + + hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS); + + rcu_read_lock(); + hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head, + nf_node) { + if (inode == nf->nf_inode) { + ret = true; + break; + } + } + rcu_read_unlock(); + trace_nfsd_file_is_cached(inode, hashval, (int)ret); + return ret; +} + +__be32 +nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **pnf) +{ + __be32 status; + struct nfsd_file *nf, *new; + struct inode *inode; + unsigned int hashval; + + /* FIXME: skip this if fh_dentry is already set? */ + status = fh_verify(rqstp, fhp, S_IFREG, + may_flags|NFSD_MAY_OWNER_OVERRIDE); + if (status != nfs_ok) + return status; + + inode = d_inode(fhp->fh_dentry); + hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS); +retry: + rcu_read_lock(); + nf = nfsd_file_find_locked(inode, may_flags, hashval); + rcu_read_unlock(); + if (nf) + goto wait_for_construction; + + new = nfsd_file_alloc(inode, may_flags, hashval); + if (!new) { + trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags, + NULL, nfserr_jukebox); + return nfserr_jukebox; + } + + spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); + nf = nfsd_file_find_locked(inode, may_flags, hashval); + if (nf == NULL) + goto open_file; + spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); + nfsd_file_slab_free(&new->nf_rcu); + +wait_for_construction: + wait_on_bit(&nf->nf_flags, NFSD_FILE_PENDING, TASK_UNINTERRUPTIBLE); + + /* Did construction of this file fail? */ + if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { + nfsd_file_put_noref(nf); + goto retry; + } + + this_cpu_inc(nfsd_file_cache_hits); + + if (!(may_flags & NFSD_MAY_NOT_BREAK_LEASE)) { + bool write = (may_flags & NFSD_MAY_WRITE); + + if (test_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags) || + (test_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags) && write)) { + status = nfserrno(nfsd_open_break_lease( + file_inode(nf->nf_file), may_flags)); + if (status == nfs_ok) { + clear_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags); + if (write) + clear_bit(NFSD_FILE_BREAK_WRITE, + &nf->nf_flags); + } + } + } +out: + if (status == nfs_ok) { + *pnf = nf; + } else { + nfsd_file_put(nf); + nf = NULL; + } + + trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags, nf, status); + return status; +open_file: + nf = new; + /* Take reference for the hashtable */ + atomic_inc(&nf->nf_ref); + __set_bit(NFSD_FILE_HASHED, &nf->nf_flags); + __set_bit(NFSD_FILE_PENDING, &nf->nf_flags); + list_lru_add(&nfsd_file_lru, &nf->nf_lru); + hlist_add_head_rcu(&nf->nf_node, &nfsd_file_hashtbl[hashval].nfb_head); + ++nfsd_file_hashtbl[hashval].nfb_count; + nfsd_file_hashtbl[hashval].nfb_maxcount = max(nfsd_file_hashtbl[hashval].nfb_maxcount, + nfsd_file_hashtbl[hashval].nfb_count); + spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); + atomic_long_inc(&nfsd_filecache_count); + + nf->nf_mark = nfsd_file_mark_find_or_create(nf); + if (nf->nf_mark) + status = nfsd_open_verified(rqstp, fhp, S_IFREG, + may_flags, &nf->nf_file); + else + status = nfserr_jukebox; + /* + * If construction failed, or we raced with a call to unlink() + * then unhash. 
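+ * Unhashing ensures that the entry cannot be found in the cache again; + * once NFSD_FILE_PENDING is cleared below, any concurrent waiters will + * see NFSD_FILE_HASHED clear and retry with a fresh open.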
+ */ + if (status != nfs_ok || inode->i_nlink == 0) { + bool do_free; + spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); + do_free = nfsd_file_unhash(nf); + spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); + if (do_free) + nfsd_file_put_noref(nf); + } + clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags); + smp_mb__after_atomic(); + wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING); + goto out; +} + +/* + * Note that fields may be added, removed or reordered in the future. Programs + * scraping this file for info should test the labels to ensure they're + * getting the correct field. + */ +static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) +{ + unsigned int i, count = 0, longest = 0; + unsigned long hits = 0; + + /* + * No need for spinlocks here since we're not terribly interested in + * accuracy. We do take the nfsd_mutex simply to ensure that we + * don't end up racing with server shutdown + */ + mutex_lock(&nfsd_mutex); + if (nfsd_file_hashtbl) { + for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) { + count += nfsd_file_hashtbl[i].nfb_count; + longest = max(longest, nfsd_file_hashtbl[i].nfb_count); + } + } + mutex_unlock(&nfsd_mutex); + + for_each_possible_cpu(i) + hits += per_cpu(nfsd_file_cache_hits, i); + + seq_printf(m, "total entries: %u\n", count); + seq_printf(m, "longest chain: %u\n", longest); + seq_printf(m, "cache hits: %lu\n", hits); + return 0; +} + +int nfsd_file_cache_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, nfsd_file_cache_stats_show, NULL); +} diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h new file mode 100644 index 000000000000..0c0c67166b87 --- /dev/null +++ b/fs/nfsd/filecache.h @@ -0,0 +1,60 @@ +#ifndef _FS_NFSD_FILECACHE_H +#define _FS_NFSD_FILECACHE_H + +#include + +/* + * This is the fsnotify_mark container that nfsd attaches to the files that it + * is holding open. Note that we have a separate refcount here aside from the + * one in the fsnotify_mark. We only want a single fsnotify_mark attached to + * the inode, and for each nfsd_file to hold a reference to it. + * + * The fsnotify_mark is itself refcounted, but that's not sufficient to tell us + * how to put that reference. If there are still outstanding nfsd_files that + * reference the mark, then we would want to call fsnotify_put_mark on it. + * If there were not, then we'd need to call fsnotify_destroy_mark. Since we + * can't really tell the difference, we use the nfm_mark to keep track of how + * many nfsd_files hold references to the mark. When that counter goes to zero + * then we know to call fsnotify_destroy_mark on it. + */ +struct nfsd_file_mark { + struct fsnotify_mark nfm_mark; + atomic_t nfm_ref; +}; + +/* + * A representation of a file that has been opened by knfsd. These are hashed + * in the hashtable by inode pointer value. Note that this object doesn't + * hold a reference to the inode by itself, so the nf_inode pointer should + * never be dereferenced, only used for comparison. 
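+ * (The cache takes no reference on the inode itself; any pinning is + * indirect, via the open file in nf_file, so the pointer must be treated + * purely as an opaque search key.)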
+ */ +struct nfsd_file { + struct hlist_node nf_node; + struct list_head nf_lru; + struct rcu_head nf_rcu; + struct file *nf_file; + const struct cred *nf_cred; +#define NFSD_FILE_HASHED (0) +#define NFSD_FILE_PENDING (1) +#define NFSD_FILE_BREAK_READ (2) +#define NFSD_FILE_BREAK_WRITE (3) +#define NFSD_FILE_REFERENCED (4) + unsigned long nf_flags; + struct inode *nf_inode; + unsigned int nf_hashval; + atomic_t nf_ref; + unsigned char nf_may; + struct nfsd_file_mark *nf_mark; +}; + +int nfsd_file_cache_init(void); +void nfsd_file_cache_purge(void); +void nfsd_file_cache_shutdown(void); +void nfsd_file_put(struct nfsd_file *nf); +struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); +void nfsd_file_close_inode_sync(struct inode *inode); +bool nfsd_file_is_cached(struct inode *inode); +__be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **nfp); +int nfsd_file_cache_stats_open(struct inode *, struct file *); +#endif /* _FS_NFSD_FILECACHE_H */ diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 18d94ea984ba..a6b1eab7b722 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -27,6 +27,7 @@ #include "cache.h" #include "vfs.h" #include "netns.h" +#include "filecache.h" #define NFSDDBG_FACILITY NFSDDBG_SVC @@ -313,6 +314,9 @@ static int nfsd_startup_generic(int nrservs) if (nfsd_users++) return 0; + ret = nfsd_file_cache_init(); + if (ret) + goto dec_users; /* * Readahead param cache - will no-op if it already exists. * (Note therefore results will be suboptimal if number of @@ -320,7 +324,7 @@ static int nfsd_startup_generic(int nrservs) */ ret = nfsd_racache_init(2*nrservs); if (ret) - goto dec_users; + goto out_file_cache; ret = nfs4_state_start(); if (ret) @@ -329,6 +333,8 @@ static int nfsd_startup_generic(int nrservs) out_racache: nfsd_racache_shutdown(); +out_file_cache: + nfsd_file_cache_shutdown(); dec_users: nfsd_users--; return ret; @@ -340,6 +346,7 @@ static void nfsd_shutdown_generic(void) return; nfs4_state_shutdown(); + nfsd_file_cache_shutdown(); nfsd_racache_shutdown(); } diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 80933e4334d8..ffc78a0e28b2 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -126,6 +126,8 @@ DEFINE_NFSD_ERR_EVENT(read_err); DEFINE_NFSD_ERR_EVENT(write_err); #include "state.h" +#include "filecache.h" +#include "vfs.h" DECLARE_EVENT_CLASS(nfsd_stateid_class, TP_PROTO(stateid_t *stp), @@ -164,6 +166,144 @@ DEFINE_STATEID_EVENT(layout_recall_done); DEFINE_STATEID_EVENT(layout_recall_fail); DEFINE_STATEID_EVENT(layout_recall_release); +#define show_nf_flags(val) \ + __print_flags(val, "|", \ + { 1 << NFSD_FILE_HASHED, "HASHED" }, \ + { 1 << NFSD_FILE_PENDING, "PENDING" }, \ + { 1 << NFSD_FILE_BREAK_READ, "BREAK_READ" }, \ + { 1 << NFSD_FILE_BREAK_WRITE, "BREAK_WRITE" }, \ + { 1 << NFSD_FILE_REFERENCED, "REFERENCED"}) + +/* FIXME: This should probably be fleshed out in the future. 
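+ * For now only the three NFSD_MAY flags that the file cache cares about + * (READ, WRITE, NOT_BREAK_LEASE) are decoded.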
*/ +#define show_nf_may(val) \ + __print_flags(val, "|", \ + { NFSD_MAY_READ, "READ" }, \ + { NFSD_MAY_WRITE, "WRITE" }, \ + { NFSD_MAY_NOT_BREAK_LEASE, "NOT_BREAK_LEASE" }) + +DECLARE_EVENT_CLASS(nfsd_file_class, + TP_PROTO(struct nfsd_file *nf), + TP_ARGS(nf), + TP_STRUCT__entry( + __field(unsigned int, nf_hashval) + __field(void *, nf_inode) + __field(int, nf_ref) + __field(unsigned long, nf_flags) + __field(unsigned char, nf_may) + __field(struct file *, nf_file) + ), + TP_fast_assign( + __entry->nf_hashval = nf->nf_hashval; + __entry->nf_inode = nf->nf_inode; + __entry->nf_ref = atomic_read(&nf->nf_ref); + __entry->nf_flags = nf->nf_flags; + __entry->nf_may = nf->nf_may; + __entry->nf_file = nf->nf_file; + ), + TP_printk("hash=0x%x inode=0x%p ref=%d flags=%s may=%s file=%p", + __entry->nf_hashval, + __entry->nf_inode, + __entry->nf_ref, + show_nf_flags(__entry->nf_flags), + show_nf_may(__entry->nf_may), + __entry->nf_file) +) + +#define DEFINE_NFSD_FILE_EVENT(name) \ +DEFINE_EVENT(nfsd_file_class, name, \ + TP_PROTO(struct nfsd_file *nf), \ + TP_ARGS(nf)) + +DEFINE_NFSD_FILE_EVENT(nfsd_file_alloc); +DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final); +DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash); +DEFINE_NFSD_FILE_EVENT(nfsd_file_put); +DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_release_locked); + +TRACE_EVENT(nfsd_file_acquire, + TP_PROTO(struct svc_rqst *rqstp, unsigned int hash, + struct inode *inode, unsigned int may_flags, + struct nfsd_file *nf, __be32 status), + + TP_ARGS(rqstp, hash, inode, may_flags, nf, status), + + TP_STRUCT__entry( + __field(__be32, xid) + __field(unsigned int, hash) + __field(void *, inode) + __field(unsigned int, may_flags) + __field(int, nf_ref) + __field(unsigned long, nf_flags) + __field(unsigned char, nf_may) + __field(struct file *, nf_file) + __field(__be32, status) + ), + + TP_fast_assign( + __entry->xid = rqstp->rq_xid; + __entry->hash = hash; + __entry->inode = inode; + __entry->may_flags = may_flags; + __entry->nf_ref = nf ? atomic_read(&nf->nf_ref) : 0; + __entry->nf_flags = nf ? nf->nf_flags : 0; + __entry->nf_may = nf ? nf->nf_may : 0; + __entry->nf_file = nf ? 
nf->nf_file : NULL; + __entry->status = status; + ), + + TP_printk("xid=0x%x hash=0x%x inode=0x%p may_flags=%s ref=%d nf_flags=%s nf_may=%s nf_file=0x%p status=%u", + be32_to_cpu(__entry->xid), __entry->hash, __entry->inode, + show_nf_may(__entry->may_flags), __entry->nf_ref, + show_nf_flags(__entry->nf_flags), + show_nf_may(__entry->nf_may), __entry->nf_file, + be32_to_cpu(__entry->status)) +); + +DECLARE_EVENT_CLASS(nfsd_file_search_class, + TP_PROTO(struct inode *inode, unsigned int hash, int found), + TP_ARGS(inode, hash, found), + TP_STRUCT__entry( + __field(struct inode *, inode) + __field(unsigned int, hash) + __field(int, found) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->hash = hash; + __entry->found = found; + ), + TP_printk("hash=0x%x inode=0x%p found=%d", __entry->hash, + __entry->inode, __entry->found) +); + +#define DEFINE_NFSD_FILE_SEARCH_EVENT(name) \ +DEFINE_EVENT(nfsd_file_search_class, name, \ + TP_PROTO(struct inode *inode, unsigned int hash, int found), \ + TP_ARGS(inode, hash, found)) + +DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode_sync); +DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode); +DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_is_cached); + +TRACE_EVENT(nfsd_file_fsnotify_handle_event, + TP_PROTO(struct inode *inode, u32 mask), + TP_ARGS(inode, mask), + TP_STRUCT__entry( + __field(struct inode *, inode) + __field(unsigned int, nlink) + __field(umode_t, mode) + __field(u32, mask) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->nlink = inode->i_nlink; + __entry->mode = inode->i_mode; + __entry->mask = mask; + ), + TP_printk("inode=0x%p nlink=%u mode=0%ho mask=0x%x", __entry->inode, + __entry->nlink, __entry->mode, __entry->mask) +); + #endif /* _NFSD_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c85783e536d5..5983206ab036 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -699,7 +699,7 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor } #endif /* CONFIG_NFSD_V3 */ -static int nfsd_open_break_lease(struct inode *inode, int access) +int nfsd_open_break_lease(struct inode *inode, int access) { unsigned int mode; @@ -715,8 +715,8 @@ static int nfsd_open_break_lease(struct inode *inode, int access) * and additional flags. * N.B. After this call fhp needs an fh_put */ -__be32 -nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, +static __be32 +__nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int may_flags, struct file **filp) { struct path path; @@ -726,25 +726,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, __be32 err; int host_err = 0; - validate_process_creds(); - - /* - * If we get here, then the client has already done an "open", - * and (hopefully) checked permission - so allow OWNER_OVERRIDE - * in case a chmod has now revoked permission. - * - * Arguably we should also allow the owner override for - * directories, but we never have and it doesn't seem to have - * caused anyone a problem. If we were to change this, note - * also that our filldir callbacks would need a variant of - * lookup_one_len that doesn't check permissions. 
- */ - if (type == S_IFREG) - may_flags |= NFSD_MAY_OWNER_OVERRIDE; - err = fh_verify(rqstp, fhp, type, may_flags); - if (err) - goto out; - path.mnt = fhp->fh_export->ex_path.mnt; path.dentry = fhp->fh_dentry; inode = d_inode(path.dentry); @@ -798,10 +779,50 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, out_nfserr: err = nfserrno(host_err); out: + return err; +} + +__be32 +nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, + int may_flags, struct file **filp) +{ + __be32 err; + + validate_process_creds(); + /* + * If we get here, then the client has already done an "open", + * and (hopefully) checked permission - so allow OWNER_OVERRIDE + * in case a chmod has now revoked permission. + * + * Arguably we should also allow the owner override for + * directories, but we never have and it doesn't seem to have + * caused anyone a problem. If we were to change this, note + * also that our filldir callbacks would need a variant of + * lookup_one_len that doesn't check permissions. + */ + if (type == S_IFREG) + may_flags |= NFSD_MAY_OWNER_OVERRIDE; + err = fh_verify(rqstp, fhp, type, may_flags); + if (!err) + err = __nfsd_open(rqstp, fhp, type, may_flags, filp); validate_process_creds(); return err; } +__be32 +nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, + int may_flags, struct file **filp) +{ + __be32 err; + + validate_process_creds(); + err = __nfsd_open(rqstp, fhp, type, may_flags, filp); + validate_process_creds(); + return err; +} + + + struct raparms * nfsd_init_raparms(struct file *file) { diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index db351247892d..31fdae34e028 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -75,8 +75,11 @@ __be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, loff_t, unsigned long); #endif /* CONFIG_NFSD_V3 */ +int nfsd_open_break_lease(struct inode *, int); __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); +__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, umode_t, + int, struct file **); struct raparms; __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, From b493523926f9b466db3c440ac64beb93d8068cdf Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:49 -0400 Subject: [PATCH 030/334] nfsd: hook up nfsd_write to the new nfsd_file cache Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. 
Bruce Fields --- fs/nfsd/vfs.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 5983206ab036..2f5b52004a18 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -44,6 +44,7 @@ #include "nfsd.h" #include "vfs.h" +#include "filecache.h" #include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_FILEOP @@ -1104,17 +1105,18 @@ __be32 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt, int stable) { - struct file *file = NULL; - __be32 err = 0; + struct nfsd_file *nf; + __be32 err; trace_nfsd_write_start(rqstp, fhp, offset, *cnt); - err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); + err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE, &nf); if (err) goto out; - err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stable); - fput(file); + err = nfsd_vfs_write(rqstp, fhp, nf->nf_file, offset, vec, + vlen, cnt, stable); + nfsd_file_put(nf); out: trace_nfsd_write_done(rqstp, fhp, offset, *cnt); return err; From 48cd7b51258c1a158293cefb97dda988080f5e13 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:50 -0400 Subject: [PATCH 031/334] nfsd: hook up nfsd_read to the nfsd_file cache Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 2f5b52004a18..8593c6423336 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1071,25 +1071,22 @@ out_nfserr: __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, struct kvec *vec, int vlen, unsigned long *count) { + struct nfsd_file *nf; struct file *file; - struct raparms *ra; __be32 err; trace_nfsd_read_start(rqstp, fhp, offset, *count); - err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); + err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf); if (err) return err; - ra = nfsd_init_raparms(file); - + file = nf->nf_file; if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) err = nfsd_splice_read(rqstp, fhp, file, offset, count); else err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count); - if (ra) - nfsd_put_raparams(file, ra); - fput(file); + nfsd_file_put(nf); trace_nfsd_read_done(rqstp, fhp, offset, *count); From 5920afa3c85ff38642f652b6e3880e79392fcc89 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:51 -0400 Subject: [PATCH 032/334] nfsd: hook nfsd_commit up to the nfsd_file cache Use cached filps if possible instead of opening a new one every time. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. 
Bruce Fields --- fs/nfsd/vfs.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 8593c6423336..ec254bff1893 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1133,9 +1133,9 @@ __be32 nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, unsigned long count) { - struct file *file; - loff_t end = LLONG_MAX; - __be32 err = nfserr_inval; + struct nfsd_file *nf; + loff_t end = LLONG_MAX; + __be32 err = nfserr_inval; if (offset < 0) goto out; @@ -1145,12 +1145,12 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; } - err = nfsd_open(rqstp, fhp, S_IFREG, - NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &file); + err = nfsd_file_acquire(rqstp, fhp, + NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &nf); if (err) goto out; if (EX_ISSYNC(fhp->fh_export)) { - int err2 = vfs_fsync_range(file, offset, end, 0); + int err2 = vfs_fsync_range(nf->nf_file, offset, end, 0); if (err2 != -EINVAL) err = nfserrno(err2); @@ -1158,7 +1158,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfserr_notsupp; } - fput(file); + nfsd_file_put(nf); out: return err; } From fd4f83fd7dfb1bce2f1af51fcbaf6575f4b9d189 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:52 -0400 Subject: [PATCH 033/334] nfsd: convert nfs4_file->fi_fds array to use nfsd_files Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 35 ++++++++++++++++++----------------- fs/nfsd/state.h | 2 +- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 7857942c5ca6..b126b86ba644 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -50,6 +50,7 @@ #include "netns.h" #include "pnfs.h" +#include "filecache.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -433,7 +434,7 @@ static struct file * __nfs4_get_fd(struct nfs4_file *f, int oflag) { if (f->fi_fds[oflag]) - return get_file(f->fi_fds[oflag]); + return get_file(f->fi_fds[oflag]->nf_file); return NULL; } @@ -590,17 +591,17 @@ static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) might_lock(&fp->fi_lock); if (atomic_dec_and_lock(&fp->fi_access[oflag], &fp->fi_lock)) { - struct file *f1 = NULL; - struct file *f2 = NULL; + struct nfsd_file *f1 = NULL; + struct nfsd_file *f2 = NULL; swap(f1, fp->fi_fds[oflag]); if (atomic_read(&fp->fi_access[1 - oflag]) == 0) swap(f2, fp->fi_fds[O_RDWR]); spin_unlock(&fp->fi_lock); if (f1) - fput(f1); + nfsd_file_put(f1); if (f2) - fput(f2); + nfsd_file_put(f2); } } @@ -2323,9 +2324,9 @@ static void states_stop(struct seq_file *s, void *v) spin_unlock(&clp->cl_lock); } -static void nfs4_show_superblock(struct seq_file *s, struct file *f) +static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f) { - struct inode *inode = file_inode(f); + struct inode *inode = f->nf_inode; seq_printf(s, "superblock: \"%02x:%02x:%ld\"", MAJOR(inode->i_sb->s_dev), @@ -2343,7 +2344,7 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) { struct nfs4_ol_stateid *ols; struct nfs4_file *nf; - struct file *file; + struct nfsd_file *file; struct nfs4_stateowner *oo; unsigned int access, deny; @@ -2370,7 +2371,7 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) seq_printf(s, ", "); nfs4_show_owner(s, oo); seq_printf(s, " }\n"); - fput(file); + nfsd_file_put(file); return 0; } @@ -2379,7 +2380,7 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid 
*st) { struct nfs4_ol_stateid *ols; struct nfs4_file *nf; - struct file *file; + struct nfsd_file *file; struct nfs4_stateowner *oo; ols = openlockstateid(st); @@ -2401,7 +2402,7 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st) seq_printf(s, ", "); nfs4_show_owner(s, oo); seq_printf(s, " }\n"); - fput(file); + nfsd_file_put(file); return 0; } @@ -4651,7 +4652,7 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) { - struct file *filp = NULL; + struct nfsd_file *nf = NULL; __be32 status; int oflag = nfs4_access_to_omode(open->op_share_access); int access = nfs4_access_to_access(open->op_share_access); @@ -4687,18 +4688,18 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, if (!fp->fi_fds[oflag]) { spin_unlock(&fp->fi_lock); - status = nfsd_open(rqstp, cur_fh, S_IFREG, access, &filp); + status = nfsd_file_acquire(rqstp, cur_fh, access, &nf); if (status) goto out_put_access; spin_lock(&fp->fi_lock); if (!fp->fi_fds[oflag]) { - fp->fi_fds[oflag] = filp; - filp = NULL; + fp->fi_fds[oflag] = nf; + nf = NULL; } } spin_unlock(&fp->fi_lock); - if (filp) - fput(filp); + if (nf) + nfsd_file_put(nf); status = nfsd4_truncate(rqstp, cur_fh, open); if (status) diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 5dbd16946e8e..8196bfb74f12 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -506,7 +506,7 @@ struct nfs4_file { }; struct list_head fi_clnt_odstate; /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ - struct file * fi_fds[3]; + struct nfsd_file *fi_fds[3]; /* * Each open or lock stateid contributes 0-4 to the counts * below depending on which bits are set in st_access_bitmap: From eb82dd393744107ebc365a53e7813c7c67cb203b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:53 -0400 Subject: [PATCH 034/334] nfsd: convert fi_deleg_file and ls_file fields to nfsd_file Have them keep an nfsd_file reference instead of a struct file. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/locks.c | 1 + fs/nfsd/blocklayout.c | 3 +- fs/nfsd/nfs4layouts.c | 12 ++-- fs/nfsd/nfs4state.c | 144 +++++++++++++++++++++--------------------- fs/nfsd/state.h | 6 +- 5 files changed, 85 insertions(+), 81 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index 1913481bfbf7..c31674d10567 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -212,6 +212,7 @@ struct file_lock_list_struct { static DEFINE_PER_CPU(struct file_lock_list_struct, file_lock_list); DEFINE_STATIC_PERCPU_RWSEM(file_rwsem); + /* * The blocked_hash is used to find POSIX lock loops for deadlock detection. * It is protected by blocked_lock_lock. 
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 66d4c55eb48e..9bbaa671c079 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -15,6 +15,7 @@ #include "blocklayoutxdr.h" #include "pnfs.h" +#include "filecache.h" #define NFSDDBG_FACILITY NFSDDBG_PNFS @@ -404,7 +405,7 @@ static void nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls) { struct nfs4_client *clp = ls->ls_stid.sc_client; - struct block_device *bdev = ls->ls_file->f_path.mnt->mnt_sb->s_bdev; + struct block_device *bdev = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_bdev; bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, nfsd4_scsi_pr_key(clp), 0, true); diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index a79e24b79095..2681c70283ce 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -169,8 +169,8 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid) spin_unlock(&fp->fi_lock); if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) - vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls); - fput(ls->ls_file); + vfs_setlease(ls->ls_file->nf_file, F_UNLCK, NULL, (void **)&ls); + nfsd_file_put(ls->ls_file); if (ls->ls_recalled) atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls); @@ -197,7 +197,7 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) fl->fl_end = OFFSET_MAX; fl->fl_owner = ls; fl->fl_pid = current->tgid; - fl->fl_file = ls->ls_file; + fl->fl_file = ls->ls_file->nf_file; status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL); if (status) { @@ -236,13 +236,13 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, NFSPROC4_CLNT_CB_LAYOUT); if (parent->sc_type == NFS4_DELEG_STID) - ls->ls_file = get_file(fp->fi_deleg_file); + ls->ls_file = nfsd_file_get(fp->fi_deleg_file); else ls->ls_file = find_any_file(fp); BUG_ON(!ls->ls_file); if (nfsd4_layout_setlease(ls)) { - fput(ls->ls_file); + nfsd_file_put(ls->ls_file); put_nfs4_file(fp); kmem_cache_free(nfs4_layout_stateid_cache, ls); return NULL; @@ -626,7 +626,7 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) argv[0] = (char *)nfsd_recall_failed; argv[1] = addr_str; - argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id; + argv[2] = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_id; argv[3] = NULL; error = call_usermodehelper(nfsd_recall_failed, argv, envp, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b126b86ba644..137f9b119a54 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -430,18 +430,18 @@ put_nfs4_file(struct nfs4_file *fi) } } -static struct file * +static struct nfsd_file * __nfs4_get_fd(struct nfs4_file *f, int oflag) { if (f->fi_fds[oflag]) - return get_file(f->fi_fds[oflag]->nf_file); + return nfsd_file_get(f->fi_fds[oflag]); return NULL; } -static struct file * +static struct nfsd_file * find_writeable_file_locked(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; lockdep_assert_held(&f->fi_lock); @@ -451,10 +451,10 @@ find_writeable_file_locked(struct nfs4_file *f) return ret; } -static struct file * +static struct nfsd_file * find_writeable_file(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; spin_lock(&f->fi_lock); ret = find_writeable_file_locked(f); @@ -463,9 +463,10 @@ find_writeable_file(struct nfs4_file *f) return ret; } -static struct file *find_readable_file_locked(struct nfs4_file *f) +static struct nfsd_file * +find_readable_file_locked(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; lockdep_assert_held(&f->fi_lock); @@ -475,10 +476,10 @@ static struct file 
*find_readable_file_locked(struct nfs4_file *f) return ret; } -static struct file * +static struct nfsd_file * find_readable_file(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; spin_lock(&f->fi_lock); ret = find_readable_file_locked(f); @@ -487,10 +488,10 @@ find_readable_file(struct nfs4_file *f) return ret; } -struct file * +struct nfsd_file * find_any_file(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; spin_lock(&f->fi_lock); ret = __nfs4_get_fd(f, O_RDWR); @@ -934,25 +935,25 @@ nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid) static void put_deleg_file(struct nfs4_file *fp) { - struct file *filp = NULL; + struct nfsd_file *nf = NULL; spin_lock(&fp->fi_lock); if (--fp->fi_delegees == 0) - swap(filp, fp->fi_deleg_file); + swap(nf, fp->fi_deleg_file); spin_unlock(&fp->fi_lock); - if (filp) - fput(filp); + if (nf) + nfsd_file_put(nf); } static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp) { struct nfs4_file *fp = dp->dl_stid.sc_file; - struct file *filp = fp->fi_deleg_file; + struct nfsd_file *nf = fp->fi_deleg_file; WARN_ON_ONCE(!fp->fi_delegees); - vfs_setlease(filp, F_UNLCK, NULL, (void **)&dp); + vfs_setlease(nf->nf_file, F_UNLCK, NULL, (void **)&dp); put_deleg_file(fp); } @@ -1290,11 +1291,14 @@ static void nfs4_free_lock_stateid(struct nfs4_stid *stid) { struct nfs4_ol_stateid *stp = openlockstateid(stid); struct nfs4_lockowner *lo = lockowner(stp->st_stateowner); - struct file *file; + struct nfsd_file *nf; - file = find_any_file(stp->st_stid.sc_file); - if (file) - filp_close(file, (fl_owner_t)lo); + nf = find_any_file(stp->st_stid.sc_file); + if (nf) { + get_file(nf->nf_file); + filp_close(nf->nf_file, (fl_owner_t)lo); + nfsd_file_put(nf); + } nfs4_free_ol_stateid(stid); } @@ -2411,7 +2415,7 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) { struct nfs4_delegation *ds; struct nfs4_file *nf; - struct file *file; + struct nfsd_file *file; ds = delegstateid(st); nf = st->sc_file; @@ -2434,7 +2438,7 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st) { struct nfs4_layout_stateid *ls; - struct file *file; + struct nfsd_file *file; ls = container_of(st, struct nfs4_layout_stateid, ls_stid); file = ls->ls_file; @@ -4768,7 +4772,7 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, fl->fl_end = OFFSET_MAX; fl->fl_owner = (fl_owner_t)dp; fl->fl_pid = current->tgid; - fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file; + fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file; return fl; } @@ -4778,7 +4782,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, { int status = 0; struct nfs4_delegation *dp; - struct file *filp; + struct nfsd_file *nf; struct file_lock *fl; /* @@ -4789,8 +4793,8 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, if (fp->fi_had_conflict) return ERR_PTR(-EAGAIN); - filp = find_readable_file(fp); - if (!filp) { + nf = find_readable_file(fp); + if (!nf) { /* We should always have a readable file here */ WARN_ON_ONCE(1); return ERR_PTR(-EBADF); @@ -4800,17 +4804,17 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, if (nfs4_delegation_exists(clp, fp)) status = -EAGAIN; else if (!fp->fi_deleg_file) { - fp->fi_deleg_file = filp; + fp->fi_deleg_file = nf; /* increment early to prevent fi_deleg_file from being * cleared */ fp->fi_delegees = 1; - filp = NULL; + nf = NULL; } else fp->fi_delegees++; 
spin_unlock(&fp->fi_lock); spin_unlock(&state_lock); - if (filp) - fput(filp); + if (nf) + nfsd_file_put(nf); if (status) return ERR_PTR(status); @@ -4823,7 +4827,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, if (!fl) goto out_clnt_odstate; - status = vfs_setlease(fp->fi_deleg_file, fl->fl_type, &fl, NULL); + status = vfs_setlease(fp->fi_deleg_file->nf_file, fl->fl_type, &fl, NULL); if (fl) locks_free_lock(fl); if (status) @@ -4843,7 +4847,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, return dp; out_unlock: - vfs_setlease(fp->fi_deleg_file, F_UNLCK, NULL, (void **)&dp); + vfs_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp); out_clnt_odstate: put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_stid(&dp->dl_stid); @@ -5514,7 +5518,7 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, return nfs_ok; } -static struct file * +static struct nfsd_file * nfs4_find_file(struct nfs4_stid *s, int flags) { if (!s) @@ -5524,7 +5528,7 @@ nfs4_find_file(struct nfs4_stid *s, int flags) case NFS4_DELEG_STID: if (WARN_ON_ONCE(!s->sc_file->fi_deleg_file)) return NULL; - return get_file(s->sc_file->fi_deleg_file); + return nfsd_file_get(s->sc_file->fi_deleg_file); case NFS4_OPEN_STID: case NFS4_LOCK_STID: if (flags & RD_STATE) @@ -5553,29 +5557,27 @@ nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s, struct file **filpp, bool *tmp_file, int flags) { int acc = (flags & RD_STATE) ? NFSD_MAY_READ : NFSD_MAY_WRITE; - struct file *file; + struct nfsd_file *nf; __be32 status; - file = nfs4_find_file(s, flags); - if (file) { + nf = nfs4_find_file(s, flags); + if (nf) { status = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, acc | NFSD_MAY_OWNER_OVERRIDE); - if (status) { - fput(file); - return status; - } - - *filpp = file; + if (status) + goto out; } else { - status = nfsd_open(rqstp, fhp, S_IFREG, acc, filpp); + status = nfsd_file_acquire(rqstp, fhp, acc, &nf); if (status) return status; if (tmp_file) *tmp_file = true; } - - return 0; + *filpp = get_file(nf->nf_file); +out: + nfsd_file_put(nf); + return status; } /* @@ -6393,7 +6395,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *lock_stp = NULL; struct nfs4_ol_stateid *open_stp = NULL; struct nfs4_file *fp; - struct file *filp = NULL; + struct nfsd_file *nf = NULL; struct nfsd4_blocked_lock *nbl = NULL; struct file_lock *file_lock = NULL; struct file_lock *conflock = NULL; @@ -6475,8 +6477,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Fallthrough */ case NFS4_READ_LT: spin_lock(&fp->fi_lock); - filp = find_readable_file_locked(fp); - if (filp) + nf = find_readable_file_locked(fp); + if (nf) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ); spin_unlock(&fp->fi_lock); fl_type = F_RDLCK; @@ -6487,8 +6489,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Fallthrough */ case NFS4_WRITE_LT: spin_lock(&fp->fi_lock); - filp = find_writeable_file_locked(fp); - if (filp) + nf = find_writeable_file_locked(fp); + if (nf) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE); spin_unlock(&fp->fi_lock); fl_type = F_WRLCK; @@ -6498,7 +6500,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - if (!filp) { + if (!nf) { status = nfserr_openmode; goto out; } @@ -6514,7 +6516,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, file_lock->fl_type = fl_type; file_lock->fl_owner = 
(fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner)); file_lock->fl_pid = current->tgid; - file_lock->fl_file = filp; + file_lock->fl_file = nf->nf_file; file_lock->fl_flags = fl_flags; file_lock->fl_lmops = &nfsd_posix_mng_ops; file_lock->fl_start = lock->lk_offset; @@ -6536,7 +6538,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, spin_unlock(&nn->blocked_locks_lock); } - err = vfs_lock_file(filp, F_SETLK, file_lock, conflock); + err = vfs_lock_file(nf->nf_file, F_SETLK, file_lock, conflock); switch (err) { case 0: /* success! */ nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid); @@ -6571,8 +6573,8 @@ out: } free_blocked_lock(nbl); } - if (filp) - fput(filp); + if (nf) + nfsd_file_put(nf); if (lock_stp) { /* Bump seqid manually if the 4.0 replay owner is openowner */ if (cstate->replay_owner && @@ -6699,7 +6701,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfsd4_locku *locku = &u->locku; struct nfs4_ol_stateid *stp; - struct file *filp = NULL; + struct nfsd_file *nf = NULL; struct file_lock *file_lock = NULL; __be32 status; int err; @@ -6717,8 +6719,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &stp, nn); if (status) goto out; - filp = find_any_file(stp->st_stid.sc_file); - if (!filp) { + nf = find_any_file(stp->st_stid.sc_file); + if (!nf) { status = nfserr_lock_range; goto put_stateid; } @@ -6726,13 +6728,13 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!file_lock) { dprintk("NFSD: %s: unable to allocate lock!\n", __func__); status = nfserr_jukebox; - goto fput; + goto put_file; } file_lock->fl_type = F_UNLCK; file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner)); file_lock->fl_pid = current->tgid; - file_lock->fl_file = filp; + file_lock->fl_file = nf->nf_file; file_lock->fl_flags = FL_POSIX; file_lock->fl_lmops = &nfsd_posix_mng_ops; file_lock->fl_start = locku->lu_offset; @@ -6741,14 +6743,14 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, locku->lu_length); nfs4_transform_lock_offset(file_lock); - err = vfs_lock_file(filp, F_SETLK, file_lock, NULL); + err = vfs_lock_file(nf->nf_file, F_SETLK, file_lock, NULL); if (err) { dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n"); goto out_nfserr; } nfs4_inc_and_copy_stateid(&locku->lu_stateid, &stp->st_stid); -fput: - fput(filp); +put_file: + nfsd_file_put(nf); put_stateid: mutex_unlock(&stp->st_mutex); nfs4_put_stid(&stp->st_stid); @@ -6760,7 +6762,7 @@ out: out_nfserr: status = nfserrno(err); - goto fput; + goto put_file; } /* @@ -6773,17 +6775,17 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) { struct file_lock *fl; int status = false; - struct file *filp = find_any_file(fp); + struct nfsd_file *nf = find_any_file(fp); struct inode *inode; struct file_lock_context *flctx; - if (!filp) { + if (!nf) { /* Any valid lock stateid should have some sort of access */ WARN_ON_ONCE(1); return status; } - inode = locks_inode(filp); + inode = locks_inode(nf->nf_file); flctx = inode->i_flctx; if (flctx && !list_empty_careful(&flctx->flc_posix)) { @@ -6796,7 +6798,7 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) } spin_unlock(&flctx->flc_lock); } - fput(filp); + nfsd_file_put(nf); return status; } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 8196bfb74f12..d89d1ade1254 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -516,7 +516,7 @@ struct nfs4_file { */ atomic_t 
fi_access[2]; u32 fi_share_deny; - struct file *fi_deleg_file; + struct nfsd_file *fi_deleg_file; int fi_delegees; struct knfsd_fh fi_fhandle; bool fi_had_conflict; @@ -565,7 +565,7 @@ struct nfs4_layout_stateid { spinlock_t ls_lock; struct list_head ls_layouts; u32 ls_layout_type; - struct file *ls_file; + struct nfsd_file *ls_file; struct nfsd4_callback ls_recall; stateid_t ls_recall_sid; bool ls_recalled; @@ -657,7 +657,7 @@ static inline void get_nfs4_file(struct nfs4_file *fi) { refcount_inc(&fi->fi_ref); } -struct file *find_any_file(struct nfs4_file *f); +struct nfsd_file *find_any_file(struct nfs4_file *f); /* grace period management */ void nfsd4_end_grace(struct nfsd_net *nn); From 5c4583b2b78eef4eb33fca9a4598e72e08dd514b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:54 -0400 Subject: [PATCH 035/334] nfsd: hook up nfs4_preprocess_stateid_op to the nfsd_file cache Have nfs4_preprocess_stateid_op pass back a nfsd_file instead of a filp. Since we now presume that the struct file will be persistent in most cases, we can stop fiddling with the raparms in the read code. This also means that we don't really care about the rd_tmp_file field anymore. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 83 +++++++++++++++++++++++---------------------- fs/nfsd/nfs4state.c | 24 ++++++------- fs/nfsd/nfs4xdr.c | 14 ++++---- fs/nfsd/state.h | 2 +- fs/nfsd/xdr4.h | 19 +++++------ 5 files changed, 68 insertions(+), 74 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 8beda999e134..cb51893ec1cd 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -761,7 +761,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_read *read = &u->read; __be32 status; - read->rd_filp = NULL; + read->rd_nf = NULL; if (read->rd_offset >= OFFSET_MAX) return nfserr_inval; @@ -782,7 +782,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* check stateid */ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &read->rd_stateid, RD_STATE, - &read->rd_filp, &read->rd_tmp_file); + &read->rd_nf); if (status) { dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); goto out; @@ -798,8 +798,8 @@ out: static void nfsd4_read_release(union nfsd4_op_u *u) { - if (u->read.rd_filp) - fput(u->read.rd_filp); + if (u->read.rd_nf) + nfsd_file_put(u->read.rd_nf); trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp, u->read.rd_offset, u->read.rd_length); } @@ -954,7 +954,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &setattr->sa_stateid, - WR_STATE, NULL, NULL); + WR_STATE, NULL); if (status) { dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); return status; @@ -993,7 +993,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfsd4_write *write = &u->write; stateid_t *stateid = &write->wr_stateid; - struct file *filp = NULL; + struct nfsd_file *nf = NULL; __be32 status = nfs_ok; unsigned long cnt; int nvecs; @@ -1005,7 +1005,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, trace_nfsd_write_start(rqstp, &cstate->current_fh, write->wr_offset, cnt); status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, - stateid, WR_STATE, &filp, NULL); + stateid, WR_STATE, &nf); if (status) 
{ dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); return status; @@ -1018,10 +1018,10 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &write->wr_head, write->wr_buflen); WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec)); - status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp, + status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf->nf_file, write->wr_offset, rqstp->rq_vec, nvecs, &cnt, write->wr_how_written); - fput(filp); + nfsd_file_put(nf); write->wr_bytes_written = cnt; trace_nfsd_write_done(rqstp, &cstate->current_fh, @@ -1031,8 +1031,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, static __be32 nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, - stateid_t *src_stateid, struct file **src, - stateid_t *dst_stateid, struct file **dst) + stateid_t *src_stateid, struct nfsd_file **src, + stateid_t *dst_stateid, struct nfsd_file **dst) { __be32 status; @@ -1040,22 +1040,22 @@ nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_nofilehandle; status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh, - src_stateid, RD_STATE, src, NULL); + src_stateid, RD_STATE, src); if (status) { dprintk("NFSD: %s: couldn't process src stateid!\n", __func__); goto out; } status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, - dst_stateid, WR_STATE, dst, NULL); + dst_stateid, WR_STATE, dst); if (status) { dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__); goto out_put_src; } /* fix up for NFS-specific error code */ - if (!S_ISREG(file_inode(*src)->i_mode) || - !S_ISREG(file_inode(*dst)->i_mode)) { + if (!S_ISREG(file_inode((*src)->nf_file)->i_mode) || + !S_ISREG(file_inode((*dst)->nf_file)->i_mode)) { status = nfserr_wrong_type; goto out_put_dst; } @@ -1063,9 +1063,9 @@ nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, out: return status; out_put_dst: - fput(*dst); + nfsd_file_put(*dst); out_put_src: - fput(*src); + nfsd_file_put(*src); goto out; } @@ -1074,7 +1074,7 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_clone *clone = &u->clone; - struct file *src, *dst; + struct nfsd_file *src, *dst; __be32 status; status = nfsd4_verify_copy(rqstp, cstate, &clone->cl_src_stateid, &src, @@ -1082,11 +1082,11 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; - status = nfsd4_clone_file_range(src, clone->cl_src_pos, - dst, clone->cl_dst_pos, clone->cl_count); + status = nfsd4_clone_file_range(src->nf_file, clone->cl_src_pos, + dst->nf_file, clone->cl_dst_pos, clone->cl_count); - fput(dst); - fput(src); + nfsd_file_put(dst); + nfsd_file_put(src); out: return status; } @@ -1176,8 +1176,9 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy) do { if (kthread_should_stop()) break; - bytes_copied = nfsd_copy_file_range(copy->file_src, src_pos, - copy->file_dst, dst_pos, bytes_total); + bytes_copied = nfsd_copy_file_range(copy->nf_src->nf_file, + src_pos, copy->nf_dst->nf_file, dst_pos, + bytes_total); if (bytes_copied <= 0) break; bytes_total -= bytes_copied; @@ -1204,8 +1205,8 @@ static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, bool sync) status = nfs_ok; } - fput(copy->file_src); - fput(copy->file_dst); + nfsd_file_put(copy->nf_src); + nfsd_file_put(copy->nf_dst); return status; } @@ -1218,16 +1219,16 @@ static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst) 
memcpy(&dst->cp_res, &src->cp_res, sizeof(src->cp_res)); memcpy(&dst->fh, &src->fh, sizeof(src->fh)); dst->cp_clp = src->cp_clp; - dst->file_dst = get_file(src->file_dst); - dst->file_src = get_file(src->file_src); + dst->nf_dst = nfsd_file_get(src->nf_dst); + dst->nf_src = nfsd_file_get(src->nf_src); memcpy(&dst->cp_stateid, &src->cp_stateid, sizeof(src->cp_stateid)); } static void cleanup_async_copy(struct nfsd4_copy *copy) { nfs4_free_cp_state(copy); - fput(copy->file_dst); - fput(copy->file_src); + nfsd_file_put(copy->nf_dst); + nfsd_file_put(copy->nf_src); spin_lock(©->cp_clp->async_lock); list_del(©->copies); spin_unlock(©->cp_clp->async_lock); @@ -1264,8 +1265,8 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_copy *async_copy = NULL; status = nfsd4_verify_copy(rqstp, cstate, ©->cp_src_stateid, - ©->file_src, ©->cp_dst_stateid, - ©->file_dst); + ©->nf_src, ©->cp_dst_stateid, + ©->nf_dst); if (status) goto out; @@ -1347,21 +1348,21 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_fallocate *fallocate, int flags) { __be32 status; - struct file *file; + struct nfsd_file *nf; status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &fallocate->falloc_stateid, - WR_STATE, &file, NULL); + WR_STATE, &nf); if (status != nfs_ok) { dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n"); return status; } - status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, file, + status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, nf->nf_file, fallocate->falloc_offset, fallocate->falloc_length, flags); - fput(file); + nfsd_file_put(nf); return status; } static __be32 @@ -1406,11 +1407,11 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_seek *seek = &u->seek; int whence; __be32 status; - struct file *file; + struct nfsd_file *nf; status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &seek->seek_stateid, - RD_STATE, &file, NULL); + RD_STATE, &nf); if (status) { dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n"); return status; @@ -1432,14 +1433,14 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * Note: This call does change file->f_pos, but nothing in NFSD * should ever file->f_pos. */ - seek->seek_pos = vfs_llseek(file, seek->seek_offset, whence); + seek->seek_pos = vfs_llseek(nf->nf_file, seek->seek_offset, whence); if (seek->seek_pos < 0) status = nfserrno(seek->seek_pos); - else if (seek->seek_pos >= i_size_read(file_inode(file))) + else if (seek->seek_pos >= i_size_read(file_inode(nf->nf_file))) seek->seek_eof = true; out: - fput(file); + nfsd_file_put(nf); return status; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 137f9b119a54..17b1ad4a619f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5554,7 +5554,7 @@ nfs4_check_olstateid(struct nfs4_ol_stateid *ols, int flags) static __be32 nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s, - struct file **filpp, bool *tmp_file, int flags) + struct nfsd_file **nfp, int flags) { int acc = (flags & RD_STATE) ? 
NFSD_MAY_READ : NFSD_MAY_WRITE; struct nfsd_file *nf; @@ -5564,19 +5564,17 @@ nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s, if (nf) { status = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, acc | NFSD_MAY_OWNER_OVERRIDE); - if (status) + if (status) { + nfsd_file_put(nf); goto out; + } } else { status = nfsd_file_acquire(rqstp, fhp, acc, &nf); if (status) return status; - - if (tmp_file) - *tmp_file = true; } - *filpp = get_file(nf->nf_file); + *nfp = nf; out: - nfsd_file_put(nf); return status; } @@ -5586,7 +5584,7 @@ out: __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct svc_fh *fhp, - stateid_t *stateid, int flags, struct file **filpp, bool *tmp_file) + stateid_t *stateid, int flags, struct nfsd_file **nfp) { struct inode *ino = d_inode(fhp->fh_dentry); struct net *net = SVC_NET(rqstp); @@ -5594,10 +5592,8 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, struct nfs4_stid *s = NULL; __be32 status; - if (filpp) - *filpp = NULL; - if (tmp_file) - *tmp_file = false; + if (nfp) + *nfp = NULL; if (grace_disallows_io(net, ino)) return nfserr_grace; @@ -5634,8 +5630,8 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, status = nfs4_check_fh(fhp, s); done: - if (!status && filpp) - status = nfs4_check_file(rqstp, fhp, s, filpp, tmp_file, flags); + if (status == nfs_ok && nfp) + status = nfs4_check_file(rqstp, fhp, s, nfp, flags); out: if (s) nfs4_put_stid(s); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 442811809f3d..7b03bcca0dfe 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -49,6 +49,7 @@ #include "cache.h" #include "netns.h" #include "pnfs.h" +#include "filecache.h" #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #include @@ -3574,11 +3575,14 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, { unsigned long maxcount; struct xdr_stream *xdr = &resp->xdr; - struct file *file = read->rd_filp; + struct file *file; int starting_len = xdr->buf->len; - struct raparms *ra = NULL; __be32 *p; + if (nfserr) + return nfserr; + file = read->rd_nf->nf_file; + p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */ if (!p) { WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)); @@ -3596,18 +3600,12 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, (xdr->buf->buflen - xdr->buf->len)); maxcount = min_t(unsigned long, maxcount, read->rd_length); - if (read->rd_tmp_file) - ra = nfsd_init_raparms(file); - if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount); else nfserr = nfsd4_encode_readv(resp, read, file, maxcount); - if (ra) - nfsd_put_raparams(file, ra); - if (nfserr) xdr_truncate_encode(xdr, starting_len); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index d89d1ade1254..d00c86d05beb 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -616,7 +616,7 @@ struct nfsd4_copy; extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct svc_fh *fhp, - stateid_t *stateid, int flags, struct file **filp, bool *tmp_file); + stateid_t *stateid, int flags, struct nfsd_file **filp); __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s, struct nfsd_net *nn); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index d64c870f998a..f4737d66ee98 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -273,15 +273,14 @@ struct nfsd4_open_downgrade { struct 
nfsd4_read { - stateid_t rd_stateid; /* request */ - u64 rd_offset; /* request */ - u32 rd_length; /* request */ - int rd_vlen; - struct file *rd_filp; - bool rd_tmp_file; + stateid_t rd_stateid; /* request */ + u64 rd_offset; /* request */ + u32 rd_length; /* request */ + int rd_vlen; + struct nfsd_file *rd_nf; - struct svc_rqst *rd_rqstp; /* response */ - struct svc_fh * rd_fhp; /* response */ + struct svc_rqst *rd_rqstp; /* response */ + struct svc_fh *rd_fhp; /* response */ }; struct nfsd4_readdir { @@ -538,8 +537,8 @@ struct nfsd4_copy { struct nfs4_client *cp_clp; - struct file *file_src; - struct file *file_dst; + struct nfsd_file *nf_src; + struct nfsd_file *nf_dst; stateid_t cp_stateid; From 6b556ca2872be223176c1a1e249606248ce0b201 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:55 -0400 Subject: [PATCH 036/334] nfsd: have nfsd_test_lock use the nfsd_file cache Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 17b1ad4a619f..906e214dcce8 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6605,11 +6605,11 @@ out: */ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock) { - struct file *file; - __be32 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); + struct nfsd_file *nf; + __be32 err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf); if (!err) { - err = nfserrno(vfs_test_lock(file, lock)); - fput(file); + err = nfserrno(vfs_test_lock(nf->nf_file, lock)); + nfsd_file_put(nf); } return err; } From 501cb1849f865960501d19d54e6a5af306f9b6fd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:56 -0400 Subject: [PATCH 037/334] nfsd: rip out the raparms cache The raparms cache was set up in order to ensure that we carry readahead information forward from one RPC call to the next. In other words, it was set up because each RPC call was forced to open a struct file, then close it, causing the loss of readahead information that is normally cached in that struct file, and used to keep the page cache filled when a user calls read() multiple times on the same file descriptor. Now that we cache the struct file, and reuse it for all the I/O calls to a given file by a given user, we no longer have to keep a separate readahead cache. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/nfssvc.c | 13 +---- fs/nfsd/vfs.c | 149 ----------------------------------------------- fs/nfsd/vfs.h | 6 -- 3 files changed, 1 insertion(+), 167 deletions(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index a6b1eab7b722..d02712ca2685 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -317,22 +317,12 @@ static int nfsd_startup_generic(int nrservs) ret = nfsd_file_cache_init(); if (ret) goto dec_users; - /* - * Readahead param cache - will no-op if it already exists. - * (Note therefore results will be suboptimal if number of - * threads is modified after nfsd start.) 
- */ - ret = nfsd_racache_init(2*nrservs); - if (ret) - goto out_file_cache; ret = nfs4_state_start(); if (ret) - goto out_racache; + goto out_file_cache; return 0; -out_racache: - nfsd_racache_shutdown(); out_file_cache: nfsd_file_cache_shutdown(); dec_users: @@ -347,7 +337,6 @@ static void nfsd_shutdown_generic(void) nfs4_state_shutdown(); nfsd_file_cache_shutdown(); - nfsd_racache_shutdown(); } static bool nfsd_needs_lockd(struct nfsd_net *nn) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index ec254bff1893..8e2c8f36eba3 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -49,34 +49,6 @@ #define NFSDDBG_FACILITY NFSDDBG_FILEOP - -/* - * This is a cache of readahead params that help us choose the proper - * readahead strategy. Initially, we set all readahead parameters to 0 - * and let the VFS handle things. - * If you increase the number of cached files very much, you'll need to - * add a hash table here. - */ -struct raparms { - struct raparms *p_next; - unsigned int p_count; - ino_t p_ino; - dev_t p_dev; - int p_set; - struct file_ra_state p_ra; - unsigned int p_hindex; -}; - -struct raparm_hbucket { - struct raparms *pb_head; - spinlock_t pb_lock; -} ____cacheline_aligned_in_smp; - -#define RAPARM_HASH_BITS 4 -#define RAPARM_HASH_SIZE (1<i_sb->s_dev; - ino_t ino = inode->i_ino; - struct raparms *ra, **rap, **frap = NULL; - int depth = 0; - unsigned int hash; - struct raparm_hbucket *rab; - - hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK; - rab = &raparm_hash[hash]; - - spin_lock(&rab->pb_lock); - for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) { - if (ra->p_ino == ino && ra->p_dev == dev) - goto found; - depth++; - if (ra->p_count == 0) - frap = rap; - } - depth = nfsdstats.ra_size; - if (!frap) { - spin_unlock(&rab->pb_lock); - return NULL; - } - rap = frap; - ra = *frap; - ra->p_dev = dev; - ra->p_ino = ino; - ra->p_set = 0; - ra->p_hindex = hash; -found: - if (rap != &rab->pb_head) { - *rap = ra->p_next; - ra->p_next = rab->pb_head; - rab->pb_head = ra; - } - ra->p_count++; - nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++; - spin_unlock(&rab->pb_lock); - - if (ra->p_set) - file->f_ra = ra->p_ra; - return ra; -} - -void nfsd_put_raparams(struct file *file, struct raparms *ra) -{ - struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; - - spin_lock(&rab->pb_lock); - ra->p_ra = file->f_ra; - ra->p_set = 1; - ra->p_count--; - spin_unlock(&rab->pb_lock); -} - /* * Grab and keep cached pages associated with a file in the svc_rqst * so that they can be passed to the network sendmsg/sendpage routines @@ -2094,63 +2005,3 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, return err? 
nfserrno(err) : 0; } - -void -nfsd_racache_shutdown(void) -{ - struct raparms *raparm, *last_raparm; - unsigned int i; - - dprintk("nfsd: freeing readahead buffers.\n"); - - for (i = 0; i < RAPARM_HASH_SIZE; i++) { - raparm = raparm_hash[i].pb_head; - while(raparm) { - last_raparm = raparm; - raparm = raparm->p_next; - kfree(last_raparm); - } - raparm_hash[i].pb_head = NULL; - } -} -/* - * Initialize readahead param cache - */ -int -nfsd_racache_init(int cache_size) -{ - int i; - int j = 0; - int nperbucket; - struct raparms **raparm = NULL; - - - if (raparm_hash[0].pb_head) - return 0; - nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); - nperbucket = max(2, nperbucket); - cache_size = nperbucket * RAPARM_HASH_SIZE; - - dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); - - for (i = 0; i < RAPARM_HASH_SIZE; i++) { - spin_lock_init(&raparm_hash[i].pb_lock); - - raparm = &raparm_hash[i].pb_head; - for (j = 0; j < nperbucket; j++) { - *raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL); - if (!*raparm) - goto out_nomem; - raparm = &(*raparm)->p_next; - } - *raparm = NULL; - } - - nfsdstats.ra_size = cache_size; - return 0; - -out_nomem: - dprintk("nfsd: kmalloc failed, freeing readahead buffers\n"); - nfsd_racache_shutdown(); - return -ENOMEM; -} diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 31fdae34e028..e0f7792165a6 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -40,8 +40,6 @@ typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned); /* nfsd/vfs.c */ -int nfsd_racache_init(int); -void nfsd_racache_shutdown(void); int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, struct svc_export **expp); __be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, @@ -80,7 +78,6 @@ __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); __be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); -struct raparms; __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, unsigned long *count); @@ -118,9 +115,6 @@ __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, __be32 nfsd_permission(struct svc_rqst *, struct svc_export *, struct dentry *, int); -struct raparms *nfsd_init_raparms(struct file *file); -void nfsd_put_raparams(struct file *file, struct raparms *ra); - static inline int fh_want_write(struct svc_fh *fh) { int ret; From 7775ec57f4c71309109f9e98cf8854a2ca8c9df1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:57 -0400 Subject: [PATCH 038/334] nfsd: close cached files prior to a REMOVE or RENAME that would replace target It's not uncommon for some workloads to do a bunch of I/O to a file and delete it just afterward. If knfsd has a cached open file, however, then the file may still be open when the dentry is unlinked. If the underlying filesystem is nfs, then that could trigger it to do a sillyrename. On a REMOVE or RENAME, scan the nfsd_file cache for open files that correspond to the inode, and proactively unhash and put their references. This should prevent any delete-on-last-close activity from occurring solely due to knfsd's open file cache. This must be done synchronously, though, so we use the variants that call flush_delayed_fput. There are deadlock possibilities if you call flush_delayed_fput while holding locks, however. In the case of nfsd_rename, we don't even do the lookups of the dentries to be renamed until we've locked for rename. 
Once we've figured out what the target dentry is for a rename, check to see whether there are cached open files associated with it. If there are, then unwind all of the locking, close them all, and then reattempt the rename. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 62 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 8e2c8f36eba3..84e87772c2b8 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1590,6 +1590,26 @@ out_nfserr: goto out_unlock; } +static void +nfsd_close_cached_files(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + + if (inode && S_ISREG(inode->i_mode)) + nfsd_file_close_inode_sync(inode); +} + +static bool +nfsd_has_cached_files(struct dentry *dentry) +{ + bool ret = false; + struct inode *inode = d_inode(dentry); + + if (inode && S_ISREG(inode->i_mode)) + ret = nfsd_file_is_cached(inode); + return ret; +} + /* * Rename a file * N.B. After this call _both_ ffhp and tfhp need an fh_put @@ -1602,6 +1622,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, struct inode *fdir, *tdir; __be32 err; int host_err; + bool has_cached = false; err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); if (err) @@ -1620,6 +1641,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) goto out; +retry: host_err = fh_want_write(ffhp); if (host_err) { err = nfserrno(host_err); @@ -1659,11 +1681,16 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) goto out_dput_new; - host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0); - if (!host_err) { - host_err = commit_metadata(tfhp); - if (!host_err) - host_err = commit_metadata(ffhp); + if (nfsd_has_cached_files(ndentry)) { + has_cached = true; + goto out_dput_old; + } else { + host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0); + if (!host_err) { + host_err = commit_metadata(tfhp); + if (!host_err) + host_err = commit_metadata(ffhp); + } } out_dput_new: dput(ndentry); @@ -1676,12 +1703,26 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, * as that would do the wrong thing if the two directories * were the same, so again we do it by hand. */ - fill_post_wcc(ffhp); - fill_post_wcc(tfhp); + if (!has_cached) { + fill_post_wcc(ffhp); + fill_post_wcc(tfhp); + } unlock_rename(tdentry, fdentry); ffhp->fh_locked = tfhp->fh_locked = false; fh_drop_write(ffhp); + /* + * If the target dentry has cached open files, then we need to try to + * close them prior to doing the rename. Flushing delayed fput + * shouldn't be done with locks held however, so we delay it until this + * point and then reattempt the whole shebang. 
+ */ + if (has_cached) { + has_cached = false; + nfsd_close_cached_files(ndentry); + dput(ndentry); + goto retry; + } out: return err; } @@ -1728,10 +1769,13 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (!type) type = d_inode(rdentry)->i_mode & S_IFMT; - if (type != S_IFDIR) + if (type != S_IFDIR) { + nfsd_close_cached_files(rdentry); host_err = vfs_unlink(dirp, rdentry, NULL); - else + } else { host_err = vfs_rmdir(dirp, rdentry); + } + if (!host_err) host_err = commit_metadata(fhp); dput(rdentry); From b96811cd02466c6bb65c17639a96f2a766b7db9c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 18 Aug 2019 14:18:58 -0400 Subject: [PATCH 039/334] nfsd: Fix up some unused variable warnings Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 7b03bcca0dfe..e08d890a1915 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1419,7 +1419,6 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, struct nfsd4_create_session *sess) { DECODE_HEAD; - u32 dummy; READ_BUF(16); COPYMEM(&sess->clientid, 8); @@ -1428,7 +1427,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, /* Fore channel attrs */ READ_BUF(28); - dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */ + p++; /* headerpadsz is always 0 */ sess->fore_channel.maxreq_sz = be32_to_cpup(p++); sess->fore_channel.maxresp_sz = be32_to_cpup(p++); sess->fore_channel.maxresp_cached = be32_to_cpup(p++); @@ -1445,7 +1444,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, /* Back channel attrs */ READ_BUF(28); - dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */ + p++; /* headerpadsz is always 0 */ sess->back_channel.maxreq_sz = be32_to_cpup(p++); sess->back_channel.maxresp_sz = be32_to_cpup(p++); sess->back_channel.maxresp_cached = be32_to_cpup(p++); @@ -1737,7 +1736,6 @@ static __be32 nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) { DECODE_HEAD; - unsigned int tmp; status = nfsd4_decode_stateid(argp, ©->cp_src_stateid); if (status) @@ -1752,7 +1750,7 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) p = xdr_decode_hyper(p, ©->cp_count); p++; /* ca_consecutive: we always do consecutive copies */ copy->cp_synchronous = be32_to_cpup(p++); - tmp = be32_to_cpup(p); /* Source server list not supported */ + /* tmp = be32_to_cpup(p); Source server list not supported */ DECODE_TAIL; } @@ -3218,9 +3216,8 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ if (!p) return nfserr_resource; encode_cinfo(p, &create->cr_cinfo); - nfserr = nfsd4_encode_bitmap(xdr, create->cr_bmval[0], + return nfsd4_encode_bitmap(xdr, create->cr_bmval[0], create->cr_bmval[1], create->cr_bmval[2]); - return 0; } static __be32 From ed9927533a64ae55fa5adc91bf7383ee12f5710d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 18 Aug 2019 14:18:59 -0400 Subject: [PATCH 040/334] nfsd: Fix the documentation for svcxdr_tmpalloc() Signed-off-by: Trond Myklebust Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4xdr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index e08d890a1915..565d2169902c 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -212,10 +212,10 @@ static int zero_clientid(clientid_t *clid) /** * svcxdr_tmpalloc - allocate memory to be freed after compound processing * @argp: NFSv4 compound argument structure - * @p: pointer to be freed (with kfree()) + * @len: length of buffer to allocate * - * Marks @p to be freed when processing the compound operation - * described in @argp finishes. + * Allocates a buffer of size @len to be freed when processing the compound + * operation described in @argp finishes. */ static void * svcxdr_tmpalloc(struct nfsd4_compoundargs *argp, u32 len) From e6b1db98cf4d54d9ea59cfcc195f70dc946fdd38 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:17:37 -0700 Subject: [PATCH 041/334] security: Support early LSMs The lockdown module is intended to allow for kernels to be locked down early in boot - sufficiently early that we don't have the ability to kmalloc() yet. Add support for early initialisation of some LSMs, and then add them to the list of names when we do full initialisation later. Early LSMs are initialised in link order and cannot be overridden via boot parameters, and cannot make use of kmalloc() (since the allocator isn't initialised yet). (Fixed by Stephen Rothwell to include a stub to fix builds when !CONFIG_SECURITY) Signed-off-by: Matthew Garrett Acked-by: Kees Cook Acked-by: Casey Schaufler Cc: Stephen Rothwell Signed-off-by: James Morris --- include/asm-generic/vmlinux.lds.h | 8 ++++- include/linux/lsm_hooks.h | 6 ++++ include/linux/security.h | 6 ++++ init/main.c | 1 + security/security.c | 50 ++++++++++++++++++++++++++----- 5 files changed, 62 insertions(+), 9 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 088987e9a3ea..c1807d14daa3 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -208,8 +208,13 @@ __start_lsm_info = .; \ KEEP(*(.lsm_info.init)) \ __end_lsm_info = .; +#define EARLY_LSM_TABLE() . 
= ALIGN(8); \ + __start_early_lsm_info = .; \ + KEEP(*(.early_lsm_info.init)) \ + __end_early_lsm_info = .; #else #define LSM_TABLE() +#define EARLY_LSM_TABLE() #endif #define ___OF_TABLE(cfg, name) _OF_TABLE_##cfg(name) @@ -609,7 +614,8 @@ ACPI_PROBE_TABLE(irqchip) \ ACPI_PROBE_TABLE(timer) \ EARLYCON_TABLE() \ - LSM_TABLE() + LSM_TABLE() \ + EARLY_LSM_TABLE() #define INIT_TEXT \ *(.init.text .init.text.*) \ diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 47f58cfb6a19..b02e8bb6654d 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -2104,12 +2104,18 @@ struct lsm_info { }; extern struct lsm_info __start_lsm_info[], __end_lsm_info[]; +extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[]; #define DEFINE_LSM(lsm) \ static struct lsm_info __lsm_##lsm \ __used __section(.lsm_info.init) \ __aligned(sizeof(unsigned long)) +#define DEFINE_EARLY_LSM(lsm) \ + static struct lsm_info __early_lsm_##lsm \ + __used __section(.early_lsm_info.init) \ + __aligned(sizeof(unsigned long)) + #ifdef CONFIG_SECURITY_SELINUX_DISABLE /* * Assuring the safety of deleting a security module is up to diff --git a/include/linux/security.h b/include/linux/security.h index 659071c2e57c..c5dd90981c98 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -195,6 +195,7 @@ int unregister_lsm_notifier(struct notifier_block *nb); /* prototypes */ extern int security_init(void); +extern int early_security_init(void); /* Security operations */ int security_binder_set_context_mgr(struct task_struct *mgr); @@ -423,6 +424,11 @@ static inline int security_init(void) return 0; } +static inline int early_security_init(void) +{ + return 0; +} + static inline int security_binder_set_context_mgr(struct task_struct *mgr) { return 0; diff --git a/init/main.c b/init/main.c index 66a196c5e4c3..598effd29a0a 100644 --- a/init/main.c +++ b/init/main.c @@ -569,6 +569,7 @@ asmlinkage __visible void __init start_kernel(void) boot_cpu_init(); page_address_init(); pr_notice("%s", linux_banner); + early_security_init(); setup_arch(&command_line); mm_init_cpumask(&init_mm); setup_command_line(command_line); diff --git a/security/security.c b/security/security.c index f493db0bf62a..ef4a0111c8b4 100644 --- a/security/security.c +++ b/security/security.c @@ -33,6 +33,7 @@ /* How many LSMs were built into the kernel? 
*/ #define LSM_COUNT (__end_lsm_info - __start_lsm_info) +#define EARLY_LSM_COUNT (__end_early_lsm_info - __start_early_lsm_info) struct security_hook_heads security_hook_heads __lsm_ro_after_init; static ATOMIC_NOTIFIER_HEAD(lsm_notifier_chain); @@ -277,6 +278,8 @@ static void __init ordered_lsm_parse(const char *order, const char *origin) static void __init lsm_early_cred(struct cred *cred); static void __init lsm_early_task(struct task_struct *task); +static int lsm_append(const char *new, char **result); + static void __init ordered_lsm_init(void) { struct lsm_info **lsm; @@ -323,6 +326,26 @@ static void __init ordered_lsm_init(void) kfree(ordered_lsms); } +int __init early_security_init(void) +{ + int i; + struct hlist_head *list = (struct hlist_head *) &security_hook_heads; + struct lsm_info *lsm; + + for (i = 0; i < sizeof(security_hook_heads) / sizeof(struct hlist_head); + i++) + INIT_HLIST_HEAD(&list[i]); + + for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) { + if (!lsm->enabled) + lsm->enabled = &lsm_enabled_true; + prepare_lsm(lsm); + initialize_lsm(lsm); + } + + return 0; +} + /** * security_init - initializes the security framework * @@ -330,14 +353,18 @@ static void __init ordered_lsm_init(void) */ int __init security_init(void) { - int i; - struct hlist_head *list = (struct hlist_head *) &security_hook_heads; + struct lsm_info *lsm; pr_info("Security Framework initializing\n"); - for (i = 0; i < sizeof(security_hook_heads) / sizeof(struct hlist_head); - i++) - INIT_HLIST_HEAD(&list[i]); + /* + * Append the names of the early LSM modules now that kmalloc() is + * available + */ + for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) { + if (lsm->enabled) + lsm_append(lsm->name, &lsm_names); + } /* Load LSMs in specified order. */ ordered_lsm_init(); @@ -384,7 +411,7 @@ static bool match_last_lsm(const char *list, const char *lsm) return !strcmp(last, lsm); } -static int lsm_append(char *new, char **result) +static int lsm_append(const char *new, char **result) { char *cp; @@ -422,8 +449,15 @@ void __init security_add_hooks(struct security_hook_list *hooks, int count, hooks[i].lsm = lsm; hlist_add_tail_rcu(&hooks[i].list, hooks[i].head); } - if (lsm_append(lsm, &lsm_names) < 0) - panic("%s - Cannot get early memory.\n", __func__); + + /* + * Don't try to append during early_security_init(), we'll come back + * and fix this up afterwards. + */ + if (slab_is_available()) { + if (lsm_append(lsm, &lsm_names) < 0) + panic("%s - Cannot get early memory.\n", __func__); + } } int call_lsm_notifier(enum lsm_event event, void *data) From 9e47d31d6a57b5babaca36d42b0d11b6db6019b7 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:17:38 -0700 Subject: [PATCH 042/334] security: Add a "locked down" LSM hook Add a mechanism to allow LSMs to make a policy decision around whether kernel functionality that would allow tampering with or examining the runtime state of the kernel should be permitted. 
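To make the calling convention concrete, here is a minimal, hypothetical call site for the new hook (the function below is purely illustrative; the hook, security_locked_down() and the lockdown_reason constants are introduced by the hunks that follow, and real callers arrive in later patches of this series):

	#include <linux/security.h>

	/* Hypothetical call site gating a feature that can modify the running kernel. */
	static int example_enable_risky_feature(void)
	{
		int err;

		/*
		 * security_locked_down() returns 0 when no LSM objects (or none
		 * implements the hook), and an error such as -EPERM when policy
		 * denies the operation; callers simply propagate the error.
		 */
		err = security_locked_down(LOCKDOWN_INTEGRITY_MAX);
		if (err)
			return err;

		/* ... proceed with the feature ... */
		return 0;
	}
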
Signed-off-by: Matthew Garrett Acked-by: Kees Cook Acked-by: Casey Schaufler Signed-off-by: James Morris --- include/linux/lsm_hooks.h | 7 +++++++ include/linux/security.h | 32 ++++++++++++++++++++++++++++++++ security/security.c | 6 ++++++ 3 files changed, 45 insertions(+) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index b02e8bb6654d..2f4ba9062fb8 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1446,6 +1446,11 @@ * @bpf_prog_free_security: * Clean up the security information stored inside bpf prog. * + * @locked_down + * Determine whether a kernel feature that potentially enables arbitrary + * code execution in kernel space should be permitted. + * + * @what: kernel feature being accessed */ union security_list_options { int (*binder_set_context_mgr)(struct task_struct *mgr); @@ -1807,6 +1812,7 @@ union security_list_options { int (*bpf_prog_alloc_security)(struct bpf_prog_aux *aux); void (*bpf_prog_free_security)(struct bpf_prog_aux *aux); #endif /* CONFIG_BPF_SYSCALL */ + int (*locked_down)(enum lockdown_reason what); }; struct security_hook_heads { @@ -2046,6 +2052,7 @@ struct security_hook_heads { struct hlist_head bpf_prog_alloc_security; struct hlist_head bpf_prog_free_security; #endif /* CONFIG_BPF_SYSCALL */ + struct hlist_head locked_down; } __randomize_layout; /* diff --git a/include/linux/security.h b/include/linux/security.h index c5dd90981c98..04cf48fab15d 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -77,6 +77,33 @@ enum lsm_event { LSM_POLICY_CHANGE, }; +/* + * These are reasons that can be passed to the security_locked_down() + * LSM hook. Lockdown reasons that protect kernel integrity (ie, the + * ability for userland to modify kernel code) are placed before + * LOCKDOWN_INTEGRITY_MAX. Lockdown reasons that protect kernel + * confidentiality (ie, the ability for userland to extract + * information from the running kernel that would otherwise be + * restricted) are placed before LOCKDOWN_CONFIDENTIALITY_MAX. + * + * LSM authors should note that the semantics of any given lockdown + * reason are not guaranteed to be stable - the same reason may block + * one set of features in one kernel release, and a slightly different + * set of features in a later kernel release. LSMs that seek to expose + * lockdown policy at any level of granularity other than "none", + * "integrity" or "confidentiality" are responsible for either + * ensuring that they expose a consistent level of functionality to + * userland, or ensuring that userland is aware that this is + * potentially a moving target. It is easy to misuse this information + * in a way that could break userspace. Please be careful not to do + * so. 
+ */ +enum lockdown_reason { + LOCKDOWN_NONE, + LOCKDOWN_INTEGRITY_MAX, + LOCKDOWN_CONFIDENTIALITY_MAX, +}; + /* These functions are in security/commoncap.c */ extern int cap_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts); @@ -393,6 +420,7 @@ void security_inode_invalidate_secctx(struct inode *inode); int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen); int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen); int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen); +int security_locked_down(enum lockdown_reason what); #else /* CONFIG_SECURITY */ static inline int call_lsm_notifier(enum lsm_event event, void *data) @@ -1210,6 +1238,10 @@ static inline int security_inode_getsecctx(struct inode *inode, void **ctx, u32 { return -EOPNOTSUPP; } +static inline int security_locked_down(enum lockdown_reason what) +{ + return 0; +} #endif /* CONFIG_SECURITY */ #ifdef CONFIG_SECURITY_NETWORK diff --git a/security/security.c b/security/security.c index ef4a0111c8b4..7fc373486d7a 100644 --- a/security/security.c +++ b/security/security.c @@ -2389,3 +2389,9 @@ void security_bpf_prog_free(struct bpf_prog_aux *aux) call_void_hook(bpf_prog_free_security, aux); } #endif /* CONFIG_BPF_SYSCALL */ + +int security_locked_down(enum lockdown_reason what) +{ + return call_int_hook(locked_down, 0, what); +} +EXPORT_SYMBOL(security_locked_down); From 000d388ed3bbed745f366ce71b2bb7c2ee70f449 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:17:39 -0700 Subject: [PATCH 043/334] security: Add a static lockdown policy LSM While existing LSMs can be extended to handle lockdown policy, distributions generally want to be able to apply a straightforward static policy. This patch adds a simple LSM that can be configured to reject either integrity or all lockdown queries, and can be configured at runtime (through securityfs), boot time (via a kernel parameter) or build time (via a kconfig option). Based on initial code by David Howells. Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook Cc: David Howells Signed-off-by: James Morris --- .../admin-guide/kernel-parameters.txt | 9 + include/linux/security.h | 3 + security/Kconfig | 11 +- security/Makefile | 2 + security/lockdown/Kconfig | 46 +++++ security/lockdown/Makefile | 1 + security/lockdown/lockdown.c | 169 ++++++++++++++++++ 7 files changed, 236 insertions(+), 5 deletions(-) create mode 100644 security/lockdown/Kconfig create mode 100644 security/lockdown/Makefile create mode 100644 security/lockdown/lockdown.c diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 138f6664b2e2..0f28350f1ee6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2244,6 +2244,15 @@ lockd.nlm_udpport=M [NFS] Assign UDP port. Format: + lockdown= [SECURITY] + { integrity | confidentiality } + Enable the kernel lockdown feature. If set to + integrity, kernel features that allow userland to + modify the running kernel are disabled. If set to + confidentiality, kernel features that allow userland + to extract confidential information from the kernel + are also disabled. + locktorture.nreaders_stress= [KNL] Set the number of locking read-acquisition kthreads. 
Defaults to being automatically set based on the diff --git a/include/linux/security.h b/include/linux/security.h index 04cf48fab15d..74787335d9ce 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -97,6 +97,9 @@ enum lsm_event { * potentially a moving target. It is easy to misuse this information * in a way that could break userspace. Please be careful not to do * so. + * + * If you add to this, remember to extend lockdown_reasons in + * security/lockdown/lockdown.c. */ enum lockdown_reason { LOCKDOWN_NONE, diff --git a/security/Kconfig b/security/Kconfig index 466cc1f8ffed..7c62d446e209 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -237,6 +237,7 @@ source "security/apparmor/Kconfig" source "security/loadpin/Kconfig" source "security/yama/Kconfig" source "security/safesetid/Kconfig" +source "security/lockdown/Kconfig" source "security/integrity/Kconfig" @@ -276,11 +277,11 @@ endchoice config LSM string "Ordered list of enabled LSMs" - default "yama,loadpin,safesetid,integrity,smack,selinux,tomoyo,apparmor" if DEFAULT_SECURITY_SMACK - default "yama,loadpin,safesetid,integrity,apparmor,selinux,smack,tomoyo" if DEFAULT_SECURITY_APPARMOR - default "yama,loadpin,safesetid,integrity,tomoyo" if DEFAULT_SECURITY_TOMOYO - default "yama,loadpin,safesetid,integrity" if DEFAULT_SECURITY_DAC - default "yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" + default "lockdown,yama,loadpin,safesetid,integrity,smack,selinux,tomoyo,apparmor" if DEFAULT_SECURITY_SMACK + default "lockdown,yama,loadpin,safesetid,integrity,apparmor,selinux,smack,tomoyo" if DEFAULT_SECURITY_APPARMOR + default "lockdown,yama,loadpin,safesetid,integrity,tomoyo" if DEFAULT_SECURITY_TOMOYO + default "lockdown,yama,loadpin,safesetid,integrity" if DEFAULT_SECURITY_DAC + default "lockdown,yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" help A comma-separated list of LSMs, in initialization order. Any LSMs left off this list will be ignored. This can be diff --git a/security/Makefile b/security/Makefile index c598b904938f..be1dd9d2cb2f 100644 --- a/security/Makefile +++ b/security/Makefile @@ -11,6 +11,7 @@ subdir-$(CONFIG_SECURITY_APPARMOR) += apparmor subdir-$(CONFIG_SECURITY_YAMA) += yama subdir-$(CONFIG_SECURITY_LOADPIN) += loadpin subdir-$(CONFIG_SECURITY_SAFESETID) += safesetid +subdir-$(CONFIG_SECURITY_LOCKDOWN_LSM) += lockdown # always enable default capabilities obj-y += commoncap.o @@ -27,6 +28,7 @@ obj-$(CONFIG_SECURITY_APPARMOR) += apparmor/ obj-$(CONFIG_SECURITY_YAMA) += yama/ obj-$(CONFIG_SECURITY_LOADPIN) += loadpin/ obj-$(CONFIG_SECURITY_SAFESETID) += safesetid/ +obj-$(CONFIG_SECURITY_LOCKDOWN_LSM) += lockdown/ obj-$(CONFIG_CGROUP_DEVICE) += device_cgroup.o # Object integrity file lists diff --git a/security/lockdown/Kconfig b/security/lockdown/Kconfig new file mode 100644 index 000000000000..7a1d213227a4 --- /dev/null +++ b/security/lockdown/Kconfig @@ -0,0 +1,46 @@ +config SECURITY_LOCKDOWN_LSM + bool "Basic module for enforcing kernel lockdown" + depends on SECURITY + help + Build support for an LSM that enforces a coarse kernel lockdown + behaviour. + +config SECURITY_LOCKDOWN_LSM_EARLY + bool "Enable lockdown LSM early in init" + depends on SECURITY_LOCKDOWN_LSM + help + Enable the lockdown LSM early in boot. This is necessary in order + to ensure that lockdown enforcement can be carried out on kernel + boot parameters that are otherwise parsed before the security + subsystem is fully initialised. 
If enabled, lockdown will + unconditionally be called before any other LSMs. + +choice + prompt "Kernel default lockdown mode" + default LOCK_DOWN_KERNEL_FORCE_NONE + depends on SECURITY_LOCKDOWN_LSM + help + The kernel can be configured to default to differing levels of + lockdown. + +config LOCK_DOWN_KERNEL_FORCE_NONE + bool "None" + help + No lockdown functionality is enabled by default. Lockdown may be + enabled via the kernel commandline or /sys/kernel/security/lockdown. + +config LOCK_DOWN_KERNEL_FORCE_INTEGRITY + bool "Integrity" + help + The kernel runs in integrity mode by default. Features that allow + the kernel to be modified at runtime are disabled. + +config LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY + bool "Confidentiality" + help + The kernel runs in confidentiality mode by default. Features that + allow the kernel to be modified at runtime or that permit userland + code to read confidential material held inside the kernel are + disabled. + +endchoice diff --git a/security/lockdown/Makefile b/security/lockdown/Makefile new file mode 100644 index 000000000000..e3634b9017e7 --- /dev/null +++ b/security/lockdown/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_SECURITY_LOCKDOWN_LSM) += lockdown.o diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c new file mode 100644 index 000000000000..7172ad75496b --- /dev/null +++ b/security/lockdown/lockdown.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Lock down the kernel + * + * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include + +static enum lockdown_reason kernel_locked_down; + +static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { + [LOCKDOWN_NONE] = "none", + [LOCKDOWN_INTEGRITY_MAX] = "integrity", + [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", +}; + +static enum lockdown_reason lockdown_levels[] = {LOCKDOWN_NONE, + LOCKDOWN_INTEGRITY_MAX, + LOCKDOWN_CONFIDENTIALITY_MAX}; + +/* + * Put the kernel into lock-down mode. 
+ */ +static int lock_kernel_down(const char *where, enum lockdown_reason level) +{ + if (kernel_locked_down >= level) + return -EPERM; + + kernel_locked_down = level; + pr_notice("Kernel is locked down from %s; see man kernel_lockdown.7\n", + where); + return 0; +} + +static int __init lockdown_param(char *level) +{ + if (!level) + return -EINVAL; + + if (strcmp(level, "integrity") == 0) + lock_kernel_down("command line", LOCKDOWN_INTEGRITY_MAX); + else if (strcmp(level, "confidentiality") == 0) + lock_kernel_down("command line", LOCKDOWN_CONFIDENTIALITY_MAX); + else + return -EINVAL; + + return 0; +} + +early_param("lockdown", lockdown_param); + +/** + * lockdown_is_locked_down - Find out if the kernel is locked down + * @what: Tag to use in notice generated if lockdown is in effect + */ +static int lockdown_is_locked_down(enum lockdown_reason what) +{ + if (kernel_locked_down >= what) { + if (lockdown_reasons[what]) + pr_notice("Lockdown: %s is restricted; see man kernel_lockdown.7\n", + lockdown_reasons[what]); + return -EPERM; + } + + return 0; +} + +static struct security_hook_list lockdown_hooks[] __lsm_ro_after_init = { + LSM_HOOK_INIT(locked_down, lockdown_is_locked_down), +}; + +static int __init lockdown_lsm_init(void) +{ +#if defined(CONFIG_LOCK_DOWN_KERNEL_FORCE_INTEGRITY) + lock_kernel_down("Kernel configuration", LOCKDOWN_INTEGRITY_MAX); +#elif defined(CONFIG_LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY) + lock_kernel_down("Kernel configuration", LOCKDOWN_CONFIDENTIALITY_MAX); +#endif + security_add_hooks(lockdown_hooks, ARRAY_SIZE(lockdown_hooks), + "lockdown"); + return 0; +} + +static ssize_t lockdown_read(struct file *filp, char __user *buf, size_t count, + loff_t *ppos) +{ + char temp[80]; + int i, offset = 0; + + for (i = 0; i < ARRAY_SIZE(lockdown_levels); i++) { + enum lockdown_reason level = lockdown_levels[i]; + + if (lockdown_reasons[level]) { + const char *label = lockdown_reasons[level]; + + if (kernel_locked_down == level) + offset += sprintf(temp+offset, "[%s] ", label); + else + offset += sprintf(temp+offset, "%s ", label); + } + } + + /* Convert the last space to a newline if needed. */ + if (offset > 0) + temp[offset-1] = '\n'; + + return simple_read_from_buffer(buf, count, ppos, temp, strlen(temp)); +} + +static ssize_t lockdown_write(struct file *file, const char __user *buf, + size_t n, loff_t *ppos) +{ + char *state; + int i, len, err = -EINVAL; + + state = memdup_user_nul(buf, n); + if (IS_ERR(state)) + return PTR_ERR(state); + + len = strlen(state); + if (len && state[len-1] == '\n') { + state[len-1] = '\0'; + len--; + } + + for (i = 0; i < ARRAY_SIZE(lockdown_levels); i++) { + enum lockdown_reason level = lockdown_levels[i]; + const char *label = lockdown_reasons[level]; + + if (label && !strcmp(state, label)) + err = lock_kernel_down("securityfs", level); + } + + kfree(state); + return err ? 
err : n; +} + +static const struct file_operations lockdown_ops = { + .read = lockdown_read, + .write = lockdown_write, +}; + +static int __init lockdown_secfs_init(void) +{ + struct dentry *dentry; + + dentry = securityfs_create_file("lockdown", 0600, NULL, NULL, + &lockdown_ops); + return PTR_ERR_OR_ZERO(dentry); +} + +core_initcall(lockdown_secfs_init); + +#ifdef CONFIG_SECURITY_LOCKDOWN_LSM_EARLY +DEFINE_EARLY_LSM(lockdown) = { +#else +DEFINE_LSM(lockdown) = { +#endif + .name = "lockdown", + .init = lockdown_lsm_init, +}; From 49fcf732bdae0550721ef73af7c45109ce26b2a9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:17:40 -0700 Subject: [PATCH 044/334] lockdown: Enforce module signatures if the kernel is locked down If the kernel is locked down, require that all modules have valid signatures that we can verify. I have adjusted the errors generated: (1) If there's no signature (ENODATA) or we can't check it (ENOPKG, ENOKEY), then: (a) If signatures are enforced then EKEYREJECTED is returned. (b) If there's no signature or we can't check it, but the kernel is locked down then EPERM is returned (this is then consistent with other lockdown cases). (2) If the signature is unparseable (EBADMSG, EINVAL), the signature fails the check (EKEYREJECTED) or a system error occurs (eg. ENOMEM), we return the error we got. Note that the X.509 code doesn't check for key expiry as the RTC might not be valid or might not have been transferred to the kernel's clock yet. [Modified by Matthew Garrett to remove the IMA integration. This will be replaced with integration with the IMA architecture policy patchset.] Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook Cc: Jessica Yu Signed-off-by: James Morris --- include/linux/security.h | 1 + init/Kconfig | 5 +++++ kernel/module.c | 39 ++++++++++++++++++++++++++++-------- security/lockdown/Kconfig | 1 + security/lockdown/lockdown.c | 1 + 5 files changed, 39 insertions(+), 8 deletions(-) diff --git a/include/linux/security.h b/include/linux/security.h index 74787335d9ce..9e8abb60a99f 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -103,6 +103,7 @@ enum lsm_event { */ enum lockdown_reason { LOCKDOWN_NONE, + LOCKDOWN_MODULE_SIGNATURE, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/init/Kconfig b/init/Kconfig index 0e2344389501..e6069368f278 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1939,6 +1939,11 @@ config MODULE_SIG kernel build dependency so that the signing tool can use its crypto library. + You should enable this option if you wish to use either + CONFIG_SECURITY_LOCKDOWN_LSM or lockdown functionality imposed via + another LSM - otherwise unsigned modules will be loadable regardless + of the lockdown policy. + !!!WARNING!!! If you enable this option, you MUST make sure that the module DOES NOT get stripped after being signed. 
This includes the debuginfo strip done by some packagers (such as rpmbuild) and diff --git a/kernel/module.c b/kernel/module.c index 80c7c09584cf..2206c08a5e10 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2753,8 +2753,9 @@ static inline void kmemleak_load_module(const struct module *mod, #ifdef CONFIG_MODULE_SIG static int module_sig_check(struct load_info *info, int flags) { - int err = -ENOKEY; + int err = -ENODATA; const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; + const char *reason; const void *mod = info->hdr; /* @@ -2769,16 +2770,38 @@ static int module_sig_check(struct load_info *info, int flags) err = mod_verify_sig(mod, info); } - if (!err) { + switch (err) { + case 0: info->sig_ok = true; return 0; + + /* We don't permit modules to be loaded into trusted kernels + * without a valid signature on them, but if we're not + * enforcing, certain errors are non-fatal. + */ + case -ENODATA: + reason = "Loading of unsigned module"; + goto decide; + case -ENOPKG: + reason = "Loading of module with unsupported crypto"; + goto decide; + case -ENOKEY: + reason = "Loading of module with unavailable key"; + decide: + if (is_module_sig_enforced()) { + pr_notice("%s is rejected\n", reason); + return -EKEYREJECTED; + } + + return security_locked_down(LOCKDOWN_MODULE_SIGNATURE); + + /* All other errors are fatal, including nomem, unparseable + * signatures and signature check failures - even if signatures + * aren't required. + */ + default: + return err; } - - /* Not having a signature is only an error if we're strict. */ - if (err == -ENOKEY && !is_module_sig_enforced()) - err = 0; - - return err; } #else /* !CONFIG_MODULE_SIG */ static int module_sig_check(struct load_info *info, int flags) diff --git a/security/lockdown/Kconfig b/security/lockdown/Kconfig index 7a1d213227a4..e84ddf484010 100644 --- a/security/lockdown/Kconfig +++ b/security/lockdown/Kconfig @@ -1,6 +1,7 @@ config SECURITY_LOCKDOWN_LSM bool "Basic module for enforcing kernel lockdown" depends on SECURITY + select MODULE_SIG if MODULES help Build support for an LSM that enforces a coarse kernel lockdown behaviour. diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 7172ad75496b..d8e42125a5dd 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -18,6 +18,7 @@ static enum lockdown_reason kernel_locked_down; static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_NONE] = "none", + [LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From 9b9d8dda1ed72e9bd560ab0ca93d322a9440510e Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:17:41 -0700 Subject: [PATCH 045/334] lockdown: Restrict /dev/{mem,kmem,port} when the kernel is locked down Allowing users to read and write to core kernel memory makes it possible for the kernel to be subverted, avoiding module loading restrictions, and also to steal cryptographic information. Disallow /dev/mem and /dev/kmem from being opened this when the kernel has been locked down to prevent this. Also disallow /dev/port from being opened to prevent raw ioport access and thus DMA from being used to accomplish the same thing. 
Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook Cc: x86@kernel.org Signed-off-by: James Morris --- drivers/char/mem.c | 7 +++++-- include/linux/security.h | 1 + security/lockdown/lockdown.c | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index b08dc50f9f26..d0148aee1aab 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -29,8 +29,8 @@ #include #include #include - #include +#include #ifdef CONFIG_IA64 # include @@ -786,7 +786,10 @@ static loff_t memory_lseek(struct file *file, loff_t offset, int orig) static int open_port(struct inode *inode, struct file *filp) { - return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + + return security_locked_down(LOCKDOWN_DEV_MEM); } #define zero_lseek null_lseek diff --git a/include/linux/security.h b/include/linux/security.h index 9e8abb60a99f..e5dd446ef35b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -104,6 +104,7 @@ enum lsm_event { enum lockdown_reason { LOCKDOWN_NONE, LOCKDOWN_MODULE_SIGNATURE, + LOCKDOWN_DEV_MEM, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index d8e42125a5dd..240ecaa10a1d 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -19,6 +19,7 @@ static enum lockdown_reason kernel_locked_down; static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_NONE] = "none", [LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading", + [LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From 7d31f4602f8d366072471ca138e4ea7b8edf9be0 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:17:42 -0700 Subject: [PATCH 046/334] kexec_load: Disable at runtime if the kernel is locked down The kexec_load() syscall permits the loading and execution of arbitrary code in ring 0, which is something that lock-down is meant to prevent. It makes sense to disable kexec_load() in this situation. This does not affect kexec_file_load() syscall which can check for a signature on the image to be booted. Signed-off-by: David Howells Signed-off-by: Matthew Garrett Acked-by: Dave Young Reviewed-by: Kees Cook cc: kexec@lists.infradead.org Signed-off-by: James Morris --- include/linux/security.h | 1 + kernel/kexec.c | 8 ++++++++ security/lockdown/lockdown.c | 1 + 3 files changed, 10 insertions(+) diff --git a/include/linux/security.h b/include/linux/security.h index e5dd446ef35b..b607a8ac97fe 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -105,6 +105,7 @@ enum lockdown_reason { LOCKDOWN_NONE, LOCKDOWN_MODULE_SIGNATURE, LOCKDOWN_DEV_MEM, + LOCKDOWN_KEXEC, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/kernel/kexec.c b/kernel/kexec.c index 1b018f1a6e0d..bc933c0db9bf 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -205,6 +205,14 @@ static inline int kexec_load_check(unsigned long nr_segments, if (result < 0) return result; + /* + * kexec can be used to circumvent module loading restrictions, so + * prevent loading in that case + */ + result = security_locked_down(LOCKDOWN_KEXEC); + if (result) + return result; + /* * Verify we have a legal set of flags * This leaves us room for future extensions. 
diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 240ecaa10a1d..aaf30ad351f9 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -20,6 +20,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_NONE] = "none", [LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading", [LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port", + [LOCKDOWN_KEXEC] = "kexec of unsigned images", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From fef5dad9876034253d59acbf8c0c314f4d94cf87 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Mon, 19 Aug 2019 17:17:43 -0700 Subject: [PATCH 047/334] lockdown: Copy secure_boot flag in boot params across kexec reboot A kexec reboot with secure boot enabled does not keep the secure boot mode in the new kernel, so afterwards an unsigned kernel can be loaded via the legacy kexec_load. In this state, the system is missing the protections provided by secure boot. The secure_boot flag in boot_params is set in the EFI stub, but kexec bypasses the stub; fix this by copying the secure_boot flag across the kexec reboot. Signed-off-by: Dave Young Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook cc: kexec@lists.infradead.org Signed-off-by: James Morris --- arch/x86/kernel/kexec-bzimage64.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index f03237e3f192..5d64a22f99ce 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -180,6 +180,7 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr, if (efi_enabled(EFI_OLD_MEMMAP)) return 0; + params->secure_boot = boot_params.secure_boot; ei->efi_loader_signature = current_ei->efi_loader_signature; ei->efi_systab = current_ei->efi_systab; ei->efi_systab_hi = current_ei->efi_systab_hi; From 99d5cadfde2b1acb7650021df5abaa5ec447dd10 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Mon, 19 Aug 2019 17:17:44 -0700 Subject: [PATCH 048/334] kexec_file: split KEXEC_VERIFY_SIG into KEXEC_SIG and KEXEC_SIG_FORCE This is a preparatory patch for kexec_file_load() lockdown. A locked down kernel needs to prevent unsigned kernel images from being loaded with kexec_file_load(). Currently, the only way to force the signature verification is compiling with KEXEC_VERIFY_SIG. This prevents loading unsigned images even when the kernel is not locked down at runtime. This patch splits KEXEC_VERIFY_SIG into KEXEC_SIG and KEXEC_SIG_FORCE. Analogous to the MODULE_SIG and MODULE_SIG_FORCE for modules, KEXEC_SIG turns on the signature verification but allows unsigned images to be loaded. KEXEC_SIG_FORCE disallows images without a valid signature.
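The resulting policy can be summarised as a small decision table; the sketch below (illustrative userspace C, not kernel code) enumerates the outcomes the commit message describes:

#include <stdbool.h>
#include <stdio.h>

enum sig_state { SIG_VALID, SIG_ABSENT, SIG_INVALID };

/* Mirrors the described semantics: KEXEC_SIG enables verification,
 * KEXEC_SIG_FORCE additionally rejects images without a valid signature.
 * A signature that is present but fails verification is rejected
 * regardless of FORCE. */
static const char *outcome(enum sig_state s, bool sig_force)
{
	if (s == SIG_VALID)
		return "load";
	if (s == SIG_INVALID)
		return "reject";
	return sig_force ? "reject" : "load";
}

int main(void)
{
	printf("unsigned, KEXEC_SIG only : %s\n", outcome(SIG_ABSENT, false));
	printf("unsigned, KEXEC_SIG_FORCE: %s\n", outcome(SIG_ABSENT, true));
	printf("bad signature            : %s\n", outcome(SIG_INVALID, false));
	return 0;
}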
Signed-off-by: Jiri Bohac Signed-off-by: David Howells Signed-off-by: Matthew Garrett cc: kexec@lists.infradead.org Signed-off-by: James Morris --- arch/arm64/Kconfig | 6 +-- arch/s390/Kconfig | 2 +- arch/s390/configs/debug_defconfig | 2 +- arch/s390/configs/defconfig | 2 +- arch/s390/configs/performance_defconfig | 2 +- arch/s390/kernel/kexec_elf.c | 4 +- arch/s390/kernel/kexec_image.c | 4 +- arch/s390/kernel/machine_kexec_file.c | 4 +- arch/x86/Kconfig | 22 ++++++--- arch/x86/kernel/ima_arch.c | 4 +- crypto/asymmetric_keys/verify_pefile.c | 4 +- include/linux/kexec.h | 4 +- kernel/kexec_file.c | 60 +++++++++++++++++++++---- security/integrity/ima/Kconfig | 2 +- security/integrity/ima/ima_main.c | 2 +- 15 files changed, 89 insertions(+), 35 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 697ea0510729..f940500a941b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -961,7 +961,7 @@ config KEXEC_FILE for kernel and initramfs as opposed to list of segments as accepted by previous system call. -config KEXEC_VERIFY_SIG +config KEXEC_SIG bool "Verify kernel signature during kexec_file_load() syscall" depends on KEXEC_FILE help @@ -976,13 +976,13 @@ config KEXEC_VERIFY_SIG config KEXEC_IMAGE_VERIFY_SIG bool "Enable Image signature verification support" default y - depends on KEXEC_VERIFY_SIG + depends on KEXEC_SIG depends on EFI && SIGNED_PE_FILE_VERIFICATION help Enable Image signature verification support. comment "Support for PE file signature verification disabled" - depends on KEXEC_VERIFY_SIG + depends on KEXEC_SIG depends on !EFI || !SIGNED_PE_FILE_VERIFICATION config CRASH_DUMP diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 109243fdb6ec..c4a423f30d49 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -555,7 +555,7 @@ config ARCH_HAS_KEXEC_PURGATORY def_bool y depends on KEXEC_FILE -config KEXEC_VERIFY_SIG +config KEXEC_SIG bool "Verify kernel signature during kexec_file_load() syscall" depends on KEXEC_FILE && SYSTEM_DATA_VERIFICATION help diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index b0920b35f87b..525e0a6addb9 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -64,7 +64,7 @@ CONFIG_NUMA=y CONFIG_PREEMPT=y CONFIG_HZ_100=y CONFIG_KEXEC_FILE=y -CONFIG_KEXEC_VERIFY_SIG=y +CONFIG_KEXEC_SIG=y CONFIG_EXPOLINE=y CONFIG_EXPOLINE_AUTO=y CONFIG_MEMORY_HOTPLUG=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index c59b922cb6c5..4c37279acdb4 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -39,7 +39,7 @@ CONFIG_NR_CPUS=256 CONFIG_NUMA=y CONFIG_HZ_100=y CONFIG_KEXEC_FILE=y -CONFIG_KEXEC_VERIFY_SIG=y +CONFIG_KEXEC_SIG=y CONFIG_CRASH_DUMP=y CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig index 09aa5cb14873..158ad0f0d433 100644 --- a/arch/s390/configs/performance_defconfig +++ b/arch/s390/configs/performance_defconfig @@ -65,7 +65,7 @@ CONFIG_NR_CPUS=512 CONFIG_NUMA=y CONFIG_HZ_100=y CONFIG_KEXEC_FILE=y -CONFIG_KEXEC_VERIFY_SIG=y +CONFIG_KEXEC_SIG=y CONFIG_EXPOLINE=y CONFIG_EXPOLINE_AUTO=y CONFIG_MEMORY_HOTPLUG=y diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c index 6d0635ceddd0..9b4f37a4edf1 100644 --- a/arch/s390/kernel/kexec_elf.c +++ b/arch/s390/kernel/kexec_elf.c @@ -130,7 +130,7 @@ static int s390_elf_probe(const char *buf, unsigned long len) const struct kexec_file_ops s390_kexec_elf_ops = { .probe = 
s390_elf_probe, .load = s390_elf_load, -#ifdef CONFIG_KEXEC_VERIFY_SIG +#ifdef CONFIG_KEXEC_SIG .verify_sig = s390_verify_sig, -#endif /* CONFIG_KEXEC_VERIFY_SIG */ +#endif /* CONFIG_KEXEC_SIG */ }; diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c index 58318bf89fd9..af23eff5774d 100644 --- a/arch/s390/kernel/kexec_image.c +++ b/arch/s390/kernel/kexec_image.c @@ -59,7 +59,7 @@ static int s390_image_probe(const char *buf, unsigned long len) const struct kexec_file_ops s390_kexec_image_ops = { .probe = s390_image_probe, .load = s390_image_load, -#ifdef CONFIG_KEXEC_VERIFY_SIG +#ifdef CONFIG_KEXEC_SIG .verify_sig = s390_verify_sig, -#endif /* CONFIG_KEXEC_VERIFY_SIG */ +#endif /* CONFIG_KEXEC_SIG */ }; diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index fbdd3ea73667..c0f33ba49a9a 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -22,7 +22,7 @@ const struct kexec_file_ops * const kexec_file_loaders[] = { NULL, }; -#ifdef CONFIG_KEXEC_VERIFY_SIG +#ifdef CONFIG_KEXEC_SIG /* * Module signature information block. * @@ -90,7 +90,7 @@ int s390_verify_sig(const char *kernel, unsigned long kernel_len) VERIFYING_MODULE_SIGNATURE, NULL, NULL); } -#endif /* CONFIG_KEXEC_VERIFY_SIG */ +#endif /* CONFIG_KEXEC_SIG */ static int kexec_file_update_purgatory(struct kimage *image, struct s390_load_data *data) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2bbbd4d1ba31..cd41998aa6e6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2006,20 +2006,30 @@ config KEXEC_FILE config ARCH_HAS_KEXEC_PURGATORY def_bool KEXEC_FILE -config KEXEC_VERIFY_SIG +config KEXEC_SIG bool "Verify kernel signature during kexec_file_load() syscall" depends on KEXEC_FILE + ---help--- + + This option makes the kexec_file_load() syscall check for a valid + signature of the kernel image. The image can still be loaded without + a valid signature unless you also enable KEXEC_SIG_FORCE, though if + there's a signature that we can check, then it must be valid. + + In addition to this option, you need to enable signature + verification for the corresponding kernel image type being + loaded in order for this to work. + +config KEXEC_SIG_FORCE + bool "Require a valid signature in kexec_file_load() syscall" + depends on KEXEC_SIG ---help--- This option makes kernel signature verification mandatory for the kexec_file_load() syscall. - In addition to that option, you need to enable signature - verification for the corresponding kernel image type being - loaded in order for this to work.
- config KEXEC_BZIMAGE_VERIFY_SIG bool "Enable bzImage signature verification support" - depends on KEXEC_VERIFY_SIG + depends on KEXEC_SIG depends on SIGNED_PE_FILE_VERIFICATION select SYSTEM_TRUSTED_KEYRING ---help--- diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c index 64b973f0e985..b98890894731 100644 --- a/arch/x86/kernel/ima_arch.c +++ b/arch/x86/kernel/ima_arch.c @@ -66,9 +66,9 @@ bool arch_ima_get_secureboot(void) /* secureboot arch rules */ static const char * const sb_arch_rules[] = { -#if !IS_ENABLED(CONFIG_KEXEC_VERIFY_SIG) +#if !IS_ENABLED(CONFIG_KEXEC_SIG) "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig", -#endif /* CONFIG_KEXEC_VERIFY_SIG */ +#endif /* CONFIG_KEXEC_SIG */ "measure func=KEXEC_KERNEL_CHECK", #if !IS_ENABLED(CONFIG_MODULE_SIG) "appraise func=MODULE_CHECK appraise_type=imasig", diff --git a/crypto/asymmetric_keys/verify_pefile.c b/crypto/asymmetric_keys/verify_pefile.c index 3b303fe2f061..cc9dbcecaaca 100644 --- a/crypto/asymmetric_keys/verify_pefile.c +++ b/crypto/asymmetric_keys/verify_pefile.c @@ -96,7 +96,7 @@ static int pefile_parse_binary(const void *pebuf, unsigned int pelen, if (!ddir->certs.virtual_address || !ddir->certs.size) { pr_debug("Unsigned PE binary\n"); - return -EKEYREJECTED; + return -ENODATA; } chkaddr(ctx->header_size, ddir->certs.virtual_address, @@ -403,6 +403,8 @@ error_no_desc: * (*) 0 if at least one signature chain intersects with the keys in the trust * keyring, or: * + * (*) -ENODATA if there is no signature present. + * * (*) -ENOPKG if a suitable crypto module couldn't be found for a check on a * chain. * diff --git a/include/linux/kexec.h b/include/linux/kexec.h index b9b1bc5f9669..58b27c7bdc2b 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -125,7 +125,7 @@ typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf, unsigned long cmdline_len); typedef int (kexec_cleanup_t)(void *loader_data); -#ifdef CONFIG_KEXEC_VERIFY_SIG +#ifdef CONFIG_KEXEC_SIG typedef int (kexec_verify_sig_t)(const char *kernel_buf, unsigned long kernel_len); #endif @@ -134,7 +134,7 @@ struct kexec_file_ops { kexec_probe_t *probe; kexec_load_t *load; kexec_cleanup_t *cleanup; -#ifdef CONFIG_KEXEC_VERIFY_SIG +#ifdef CONFIG_KEXEC_SIG kexec_verify_sig_t *verify_sig; #endif }; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index ef7b951a8087..972931201995 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -88,7 +88,7 @@ int __weak arch_kimage_file_post_load_cleanup(struct kimage *image) return kexec_image_post_load_cleanup_default(image); } -#ifdef CONFIG_KEXEC_VERIFY_SIG +#ifdef CONFIG_KEXEC_SIG static int kexec_image_verify_sig_default(struct kimage *image, void *buf, unsigned long buf_len) { @@ -177,6 +177,51 @@ void kimage_file_post_load_cleanup(struct kimage *image) image->image_loader_data = NULL; } +#ifdef CONFIG_KEXEC_SIG +static int +kimage_validate_signature(struct kimage *image) +{ + const char *reason; + int ret; + + ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, + image->kernel_buf_len); + switch (ret) { + case 0: + break; + + /* Certain verification errors are non-fatal if we're not + * checking errors, provided we aren't mandating that there + * must be a valid signature. 
+ */ + case -ENODATA: + reason = "kexec of unsigned image"; + goto decide; + case -ENOPKG: + reason = "kexec of image with unsupported crypto"; + goto decide; + case -ENOKEY: + reason = "kexec of image with unavailable key"; + decide: + if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) { + pr_notice("%s rejected\n", reason); + return ret; + } + + return 0; + + /* All other errors are fatal, including nomem, unparseable + * signatures and signature check failures - even if signatures + * aren't required. + */ + default: + pr_notice("kernel signature verification failed (%d).\n", ret); + } + + return ret; +} +#endif + /* * In file mode list of segments is prepared by kernel. Copy relevant * data from user space, do error checking, prepare segment list @@ -186,7 +231,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, const char __user *cmdline_ptr, unsigned long cmdline_len, unsigned flags) { - int ret = 0; + int ret; void *ldata; loff_t size; @@ -205,14 +250,11 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, if (ret) goto out; -#ifdef CONFIG_KEXEC_VERIFY_SIG - ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, - image->kernel_buf_len); - if (ret) { - pr_debug("kernel signature verification failed.\n"); +#ifdef CONFIG_KEXEC_SIG + ret = kimage_validate_signature(image); + + if (ret) goto out; - } - pr_debug("kernel signature verification successful.\n"); #endif /* It is possible that there no initramfs is being loaded */ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig index 2692c7358c2c..32cd25fa44a5 100644 --- a/security/integrity/ima/Kconfig +++ b/security/integrity/ima/Kconfig @@ -160,7 +160,7 @@ config IMA_APPRAISE config IMA_ARCH_POLICY bool "Enable loading an IMA architecture specific policy" - depends on KEXEC_VERIFY_SIG || IMA_APPRAISE && INTEGRITY_ASYMMETRIC_KEYS + depends on KEXEC_SIG || IMA_APPRAISE && INTEGRITY_ASYMMETRIC_KEYS default n help This option enables loading an IMA architecture specific policy diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index f556e6c18f9b..1cffda4412b7 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -541,7 +541,7 @@ int ima_load_data(enum kernel_load_data_id id) switch (id) { case LOADING_KEXEC_IMAGE: - if (IS_ENABLED(CONFIG_KEXEC_VERIFY_SIG) + if (IS_ENABLED(CONFIG_KEXEC_SIG) && arch_ima_get_secureboot()) { pr_err("impossible to appraise a kernel image without a file descriptor; try using kexec_file_load syscall.\n"); return -EACCES; From 155bdd30af17e90941589b5db4dab9a29b28c112 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Mon, 19 Aug 2019 17:17:45 -0700 Subject: [PATCH 049/334] kexec_file: Restrict at runtime if the kernel is locked down When KEXEC_SIG is not enabled, the kernel should not load images through the kexec_file_load() system call if the kernel is locked down. [Modified by David Howells to fit with modifications to the previous patch and to return -EPERM if the kernel is locked down for consistency with other lockdowns. Modified by Matthew Garrett to remove the IMA integration, which will be replaced by integrating with the IMA architecture policy patches.]
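For reference, this is how the restriction looks from userspace; a minimal probe (a sketch only, assuming an x86-64 glibc that exposes SYS_kexec_file_load, and a hypothetical default image path) should get EPERM on a locked-down kernel without signature appraisal:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#define KEXEC_FILE_NO_INITRAMFS 0x00000004	/* from <linux/kexec.h> */

int main(int argc, char **argv)
{
	const char *kernel = argc > 1 ? argv[1] : "/boot/vmlinuz"; /* example path */
	int kfd = open(kernel, O_RDONLY);

	if (kfd < 0) {
		perror("open kernel image");
		return 1;
	}
	/* initrd_fd of -1 plus NO_INITRAMFS loads without an initrd */
	if (syscall(SYS_kexec_file_load, kfd, -1, 0UL, NULL,
		    (unsigned long)KEXEC_FILE_NO_INITRAMFS) < 0)
		printf("kexec_file_load: %s\n", strerror(errno)); /* EPERM under lockdown */
	else
		printf("kexec_file_load: image loaded\n");
	close(kfd);
	return 0;
}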
Signed-off-by: Jiri Bohac Signed-off-by: David Howells Signed-off-by: Matthew Garrett cc: kexec@lists.infradead.org Signed-off-by: James Morris --- kernel/kexec_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 972931201995..43109ef4d6bf 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -208,7 +208,7 @@ kimage_validate_signature(struct kimage *image) return ret; } - return 0; + return security_locked_down(LOCKDOWN_KEXEC); /* All other errors are fatal, including nomem, unparseable * signatures and signature check failures - even if signatures From 38bd94b8a1bd46e6d3d9718c7ff582e4c8ccb440 Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Mon, 19 Aug 2019 17:17:46 -0700 Subject: [PATCH 050/334] hibernate: Disable when the kernel is locked down There is currently no way to verify the resume image when returning from hibernate. This might compromise the signed modules trust model, so until we can work with signed hibernate images we disable it when the kernel is locked down. Signed-off-by: Josh Boyer Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook Cc: rjw@rjwysocki.net Cc: pavel@ucw.cz cc: linux-pm@vger.kernel.org Signed-off-by: James Morris --- include/linux/security.h | 1 + kernel/power/hibernate.c | 3 ++- security/lockdown/lockdown.c | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/security.h b/include/linux/security.h index b607a8ac97fe..80ac7fb27aa9 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -106,6 +106,7 @@ enum lockdown_reason { LOCKDOWN_MODULE_SIGNATURE, LOCKDOWN_DEV_MEM, LOCKDOWN_KEXEC, + LOCKDOWN_HIBERNATION, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index cd7434e6000d..3c0a5a8170b0 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include "power.h" @@ -68,7 +69,7 @@ static const struct platform_hibernation_ops *hibernation_ops; bool hibernation_available(void) { - return (nohibernate == 0); + return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION); } /** diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index aaf30ad351f9..3462f7edcaac 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -21,6 +21,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading", [LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port", [LOCKDOWN_KEXEC] = "kexec of unsigned images", + [LOCKDOWN_HIBERNATION] = "hibernation", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From eb627e17727ebeede70697ae1798688b0d328b54 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:17:47 -0700 Subject: [PATCH 051/334] PCI: Lock down BAR access when the kernel is locked down Any hardware that can potentially generate DMA has to be locked down in order to avoid it being possible for an attacker to modify kernel code, allowing them to circumvent disabled module loading or module signing. Default to paranoid - in future we can potentially relax this for sufficiently IOMMU-isolated devices. 
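As a userspace illustration (a sketch only; the PCI address below is hypothetical and should be replaced with a real device that has a memory BAR), mapping a BAR through sysfs should now fail with EPERM on a locked-down kernel:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const char *res = argc > 1 ? argv[1] :
		"/sys/bus/pci/devices/0000:00:02.0/resource0"; /* hypothetical device */
	int fd = open(res, O_RDWR);
	void *p;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		printf("mmap: %s\n", strerror(errno)); /* EPERM when locked down */
	else
		munmap(p, 4096);
	close(fd);
	return 0;
}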
Signed-off-by: David Howells Signed-off-by: Matthew Garrett Acked-by: Bjorn Helgaas Reviewed-by: Kees Cook cc: linux-pci@vger.kernel.org Signed-off-by: James Morris --- drivers/pci/pci-sysfs.c | 16 ++++++++++++++++ drivers/pci/proc.c | 14 ++++++++++++-- drivers/pci/syscall.c | 4 +++- include/linux/security.h | 1 + security/lockdown/lockdown.c | 1 + 5 files changed, 33 insertions(+), 3 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 6d27475e39b2..ec103a7e13fc 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -903,6 +903,11 @@ static ssize_t pci_write_config(struct file *filp, struct kobject *kobj, unsigned int size = count; loff_t init_off = off; u8 *data = (u8 *) buf; + int ret; + + ret = security_locked_down(LOCKDOWN_PCI_ACCESS); + if (ret) + return ret; if (off > dev->cfg_size) return 0; @@ -1164,6 +1169,11 @@ static int pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr, int bar = (unsigned long)attr->private; enum pci_mmap_state mmap_type; struct resource *res = &pdev->resource[bar]; + int ret; + + ret = security_locked_down(LOCKDOWN_PCI_ACCESS); + if (ret) + return ret; if (res->flags & IORESOURCE_MEM && iomem_is_exclusive(res->start)) return -EINVAL; @@ -1240,6 +1250,12 @@ static ssize_t pci_write_resource_io(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { + int ret; + + ret = security_locked_down(LOCKDOWN_PCI_ACCESS); + if (ret) + return ret; + return pci_resource_io(filp, kobj, attr, buf, off, count, true); } diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c index 445b51db75b0..e29b0d5ced62 100644 --- a/drivers/pci/proc.c +++ b/drivers/pci/proc.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "pci.h" @@ -115,7 +116,11 @@ static ssize_t proc_bus_pci_write(struct file *file, const char __user *buf, struct pci_dev *dev = PDE_DATA(ino); int pos = *ppos; int size = dev->cfg_size; - int cnt; + int cnt, ret; + + ret = security_locked_down(LOCKDOWN_PCI_ACCESS); + if (ret) + return ret; if (pos >= size) return 0; @@ -196,6 +201,10 @@ static long proc_bus_pci_ioctl(struct file *file, unsigned int cmd, #endif /* HAVE_PCI_MMAP */ int ret = 0; + ret = security_locked_down(LOCKDOWN_PCI_ACCESS); + if (ret) + return ret; + switch (cmd) { case PCIIOC_CONTROLLER: ret = pci_domain_nr(dev->bus); @@ -238,7 +247,8 @@ static int proc_bus_pci_mmap(struct file *file, struct vm_area_struct *vma) struct pci_filp_private *fpriv = file->private_data; int i, ret, write_combine = 0, res_bit = IORESOURCE_MEM; - if (!capable(CAP_SYS_RAWIO)) + if (!capable(CAP_SYS_RAWIO) || + security_locked_down(LOCKDOWN_PCI_ACCESS)) return -EPERM; if (fpriv->mmap_state == pci_mmap_io) { diff --git a/drivers/pci/syscall.c b/drivers/pci/syscall.c index d96626c614f5..31e39558d49d 100644 --- a/drivers/pci/syscall.c +++ b/drivers/pci/syscall.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include "pci.h" @@ -90,7 +91,8 @@ SYSCALL_DEFINE5(pciconfig_write, unsigned long, bus, unsigned long, dfn, u32 dword; int err = 0; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN) || + security_locked_down(LOCKDOWN_PCI_ACCESS)) return -EPERM; dev = pci_get_domain_bus_and_slot(0, bus, dfn); diff --git a/include/linux/security.h b/include/linux/security.h index 80ac7fb27aa9..2b763f0ee352 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -107,6 +107,7 @@ enum lockdown_reason { LOCKDOWN_DEV_MEM, LOCKDOWN_KEXEC, LOCKDOWN_HIBERNATION, + 
LOCKDOWN_PCI_ACCESS, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 3462f7edcaac..410e90eda848 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -22,6 +22,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port", [LOCKDOWN_KEXEC] = "kexec of unsigned images", [LOCKDOWN_HIBERNATION] = "hibernation", + [LOCKDOWN_PCI_ACCESS] = "direct PCI access", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From 96c4f67293e4cd8b3394adce5a8041a2784e68a3 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:17:48 -0700 Subject: [PATCH 052/334] x86: Lock down IO port access when the kernel is locked down IO port access would permit users to gain access to PCI configuration registers, which in turn (on a lot of hardware) give access to MMIO register space. This would potentially permit root to trigger arbitrary DMA, so lock it down by default. This also implicitly locks down the KDADDIO, KDDELIO, KDENABIO and KDDISABIO console ioctls. Signed-off-by: Matthew Garrett Signed-off-by: David Howells Reviewed-by: Kees Cook cc: x86@kernel.org Signed-off-by: James Morris --- arch/x86/kernel/ioport.c | 7 +++++-- include/linux/security.h | 1 + security/lockdown/lockdown.c | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 0fe1c8782208..61a89d3c0382 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -31,7 +32,8 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) return -EINVAL; - if (turn_on && !capable(CAP_SYS_RAWIO)) + if (turn_on && (!capable(CAP_SYS_RAWIO) || + security_locked_down(LOCKDOWN_IOPORT))) return -EPERM; /* @@ -126,7 +128,8 @@ SYSCALL_DEFINE1(iopl, unsigned int, level) return -EINVAL; /* Trying to gain more privileges? */ if (level > old) { - if (!capable(CAP_SYS_RAWIO)) + if (!capable(CAP_SYS_RAWIO) || + security_locked_down(LOCKDOWN_IOPORT)) return -EPERM; } regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | diff --git a/include/linux/security.h b/include/linux/security.h index 2b763f0ee352..cd93fa5d3c6d 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -108,6 +108,7 @@ enum lockdown_reason { LOCKDOWN_KEXEC, LOCKDOWN_HIBERNATION, LOCKDOWN_PCI_ACCESS, + LOCKDOWN_IOPORT, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 410e90eda848..8b7d65dbb086 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -23,6 +23,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_KEXEC] = "kexec of unsigned images", [LOCKDOWN_HIBERNATION] = "hibernation", [LOCKDOWN_PCI_ACCESS] = "direct PCI access", + [LOCKDOWN_IOPORT] = "raw io port access", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From 95f5e95f41dff31b2a4566c5a8975c08a49ae4e3 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:17:49 -0700 Subject: [PATCH 053/334] x86/msr: Restrict MSR access when the kernel is locked down Writing to MSRs should not be allowed if the kernel is locked down, since it could lead to execution of arbitrary code in kernel mode. 
Based on a patch by Kees Cook. Signed-off-by: Matthew Garrett Signed-off-by: David Howells Acked-by: Kees Cook Reviewed-by: Thomas Gleixner cc: x86@kernel.org Signed-off-by: James Morris --- arch/x86/kernel/msr.c | 8 ++++++++ include/linux/security.h | 1 + security/lockdown/lockdown.c | 1 + 3 files changed, 10 insertions(+) diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 3db2252b958d..1547be359d7f 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -79,6 +80,10 @@ static ssize_t msr_write(struct file *file, const char __user *buf, int err = 0; ssize_t bytes = 0; + err = security_locked_down(LOCKDOWN_MSR); + if (err) + return err; + if (count % 8) return -EINVAL; /* Invalid chunk size */ @@ -130,6 +135,9 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) err = -EFAULT; break; } + err = security_locked_down(LOCKDOWN_MSR); + if (err) + break; err = wrmsr_safe_regs_on_cpu(cpu, regs); if (err) break; diff --git a/include/linux/security.h b/include/linux/security.h index cd93fa5d3c6d..010637a79eac 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -109,6 +109,7 @@ enum lockdown_reason { LOCKDOWN_HIBERNATION, LOCKDOWN_PCI_ACCESS, LOCKDOWN_IOPORT, + LOCKDOWN_MSR, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 8b7d65dbb086..b1c1c72440d5 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -24,6 +24,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_HIBERNATION] = "hibernation", [LOCKDOWN_PCI_ACCESS] = "direct PCI access", [LOCKDOWN_IOPORT] = "raw io port access", + [LOCKDOWN_MSR] = "raw MSR access", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From f474e1486b78ac15322f8a1cda48a32a1deff9d3 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:17:50 -0700 Subject: [PATCH 054/334] ACPI: Limit access to custom_method when the kernel is locked down custom_method effectively allows arbitrary access to system memory, making it possible for an attacker to circumvent restrictions on module loading. Disable it if the kernel is locked down. 
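To see the effect (an illustrative sketch; it assumes debugfs mounted at the usual location and CONFIG_ACPI_CUSTOM_METHOD enabled), a write to the custom_method file should now fail with EPERM on a locked-down kernel:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Standard debugfs path for the custom_method interface */
	const char *path = "/sys/kernel/debug/acpi/custom_method";
	char buf[4] = { 0 };	/* dummy payload, not a valid AML method */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, buf, sizeof(buf)) < 0)
		printf("write: %s\n", strerror(errno)); /* EPERM when locked down */
	close(fd);
	return 0;
}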
Signed-off-by: Matthew Garrett Signed-off-by: David Howells Reviewed-by: Kees Cook cc: linux-acpi@vger.kernel.org Signed-off-by: James Morris --- drivers/acpi/custom_method.c | 6 ++++++ include/linux/security.h | 1 + security/lockdown/lockdown.c | 1 + 3 files changed, 8 insertions(+) diff --git a/drivers/acpi/custom_method.c b/drivers/acpi/custom_method.c index b2ef4c2ec955..7031307becd7 100644 --- a/drivers/acpi/custom_method.c +++ b/drivers/acpi/custom_method.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "internal.h" @@ -29,6 +30,11 @@ static ssize_t cm_write(struct file *file, const char __user * user_buf, struct acpi_table_header table; acpi_status status; + int ret; + + ret = security_locked_down(LOCKDOWN_ACPI_TABLES); + if (ret) + return ret; if (!(*ppos)) { /* parse the table header to get the table length */ diff --git a/include/linux/security.h b/include/linux/security.h index 010637a79eac..390e39395112 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -110,6 +110,7 @@ enum lockdown_reason { LOCKDOWN_PCI_ACCESS, LOCKDOWN_IOPORT, LOCKDOWN_MSR, + LOCKDOWN_ACPI_TABLES, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index b1c1c72440d5..6d44db0ddffa 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -25,6 +25,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_PCI_ACCESS] = "direct PCI access", [LOCKDOWN_IOPORT] = "raw io port access", [LOCKDOWN_MSR] = "raw MSR access", + [LOCKDOWN_ACPI_TABLES] = "modifying ACPI tables", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From 41fa1ee9c6d687afb05760dd349f361855f1d7f5 Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Mon, 19 Aug 2019 17:17:51 -0700 Subject: [PATCH 055/334] acpi: Ignore acpi_rsdp kernel param when the kernel has been locked down This option allows userspace to pass the RSDP address to the kernel, which makes it possible for a user to modify the workings of hardware. Reject the option when the kernel is locked down. This requires some reworking of the existing RSDP command line logic, since the early boot code also makes use of a command-line passed RSDP when locating the SRAT table before the lockdown code has been initialised. This is achieved by separating the command line RSDP path in the early boot code from the generic RSDP path, and then copying the command line RSDP into boot params in the kernel proper if lockdown is not enabled. If lockdown is enabled and an RSDP is provided on the command line, this will only be used when parsing SRAT (which shouldn't permit kernel code execution) and will be ignored in the rest of the kernel. 
(Modified by Matthew Garrett in order to handle the early boot RSDP environment) Signed-off-by: Josh Boyer Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook cc: Dave Young cc: linux-acpi@vger.kernel.org Signed-off-by: James Morris --- arch/x86/boot/compressed/acpi.c | 19 +++++++++++++------ arch/x86/include/asm/acpi.h | 9 +++++++++ arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/acpi/boot.c | 5 +++++ arch/x86/kernel/x86_init.c | 1 + drivers/acpi/osl.c | 14 +++++++++++++- include/linux/acpi.h | 6 ++++++ 7 files changed, 49 insertions(+), 7 deletions(-) diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index ad84239e595e..e726e9b44bb1 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -26,7 +26,7 @@ struct mem_vector immovable_mem[MAX_NUMNODES*2]; */ #define MAX_ADDR_LEN 19 -static acpi_physical_address get_acpi_rsdp(void) +static acpi_physical_address get_cmdline_acpi_rsdp(void) { acpi_physical_address addr = 0; @@ -215,10 +215,7 @@ acpi_physical_address get_rsdp_addr(void) { acpi_physical_address pa; - pa = get_acpi_rsdp(); - - if (!pa) - pa = boot_params->acpi_rsdp_addr; + pa = boot_params->acpi_rsdp_addr; if (!pa) pa = efi_get_rsdp_addr(); @@ -240,7 +237,17 @@ static unsigned long get_acpi_srat_table(void) char arg[10]; u8 *entry; - rsdp = (struct acpi_table_rsdp *)(long)boot_params->acpi_rsdp_addr; + /* + * Check whether we were given an RSDP on the command line. We don't + * stash this in boot params because the kernel itself may have + * different ideas about whether to trust a command-line parameter. + */ + rsdp = (struct acpi_table_rsdp *)get_cmdline_acpi_rsdp(); + + if (!rsdp) + rsdp = (struct acpi_table_rsdp *)(long) + boot_params->acpi_rsdp_addr; + if (!rsdp) return 0; diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index aac686e1e005..bc9693c9107e 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -117,6 +117,12 @@ static inline bool acpi_has_cpu_in_madt(void) return !!acpi_lapic; } +#define ACPI_HAVE_ARCH_SET_ROOT_POINTER +static inline void acpi_arch_set_root_pointer(u64 addr) +{ + x86_init.acpi.set_root_pointer(addr); +} + #define ACPI_HAVE_ARCH_GET_ROOT_POINTER static inline u64 acpi_arch_get_root_pointer(void) { @@ -125,6 +131,7 @@ static inline u64 acpi_arch_get_root_pointer(void) void acpi_generic_reduced_hw_init(void); +void x86_default_set_root_pointer(u64 addr); u64 x86_default_get_root_pointer(void); #else /* !CONFIG_ACPI */ @@ -138,6 +145,8 @@ static inline void disable_acpi(void) { } static inline void acpi_generic_reduced_hw_init(void) { } +static inline void x86_default_set_root_pointer(u64 addr) { } + static inline u64 x86_default_get_root_pointer(void) { return 0; diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index b85a7c54c6a1..d584128435cb 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -134,10 +134,12 @@ struct x86_hyper_init { /** * struct x86_init_acpi - x86 ACPI init functions + * @set_root_pointer: set RSDP address * @get_root_pointer: get RSDP address * @reduced_hw_early_init: hardware reduced platform early init */ struct x86_init_acpi { + void (*set_root_pointer)(u64 addr); u64 (*get_root_pointer)(void); void (*reduced_hw_early_init)(void); }; diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 17b33ef604f3..04205ce127a1 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1760,6
+1760,11 @@ void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size) e820__update_table_print(); } +void x86_default_set_root_pointer(u64 addr) +{ + boot_params.acpi_rsdp_addr = addr; +} + u64 x86_default_get_root_pointer(void) { return boot_params.acpi_rsdp_addr; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 50a2b492fdd6..d0b8f5585a73 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -95,6 +95,7 @@ struct x86_init_ops x86_init __initdata = { }, .acpi = { + .set_root_pointer = x86_default_set_root_pointer, .get_root_pointer = x86_default_get_root_pointer, .reduced_hw_early_init = acpi_generic_reduced_hw_init, }, diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index cc7507091dec..b7c3aeb175dd 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -26,6 +26,7 @@ #include #include #include +#include <linux/security.h> #include #include @@ -180,8 +181,19 @@ acpi_physical_address __init acpi_os_get_root_pointer(void) acpi_physical_address pa; #ifdef CONFIG_KEXEC - if (acpi_rsdp) + /* + * We may have been provided with an RSDP on the command line, + * but if a malicious user has done so they may be pointing us + * at modified ACPI tables that could alter kernel behaviour - + * so, we check the lockdown status before making use of + * it. If we trust it then also stash it in an architecture + * specific location (if appropriate) so it can be carried + * over further kexec()s. + */ + if (acpi_rsdp && !security_locked_down(LOCKDOWN_ACPI_TABLES)) { + acpi_arch_set_root_pointer(acpi_rsdp); return acpi_rsdp; + } #endif pa = acpi_arch_get_root_pointer(); if (pa) diff --git a/include/linux/acpi.h b/include/linux/acpi.h index d315d86844e4..268a4d91f54c 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -632,6 +632,12 @@ bool acpi_gtdt_c3stop(int type); int acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem, int *timer_count); #endif +#ifndef ACPI_HAVE_ARCH_SET_ROOT_POINTER +static inline void acpi_arch_set_root_pointer(u64 addr) +{ +} +#endif + #ifndef ACPI_HAVE_ARCH_GET_ROOT_POINTER static inline u64 acpi_arch_get_root_pointer(void) { From 6ea0e815fc5e18597724169caa6e4d46dd8e693d Mon Sep 17 00:00:00 2001 From: Linn Crosetto Date: Mon, 19 Aug 2019 17:17:52 -0700 Subject: [PATCH 056/334] acpi: Disable ACPI table override if the kernel is locked down From the kernel documentation (initrd_table_override.txt): If the ACPI_INITRD_TABLE_OVERRIDE compile option is true, it is possible to override nearly any ACPI table provided by the BIOS with an instrumented, modified one. When lockdown is enabled, the kernel should disallow any unauthenticated changes to kernel space. ACPI tables contain code invoked by the kernel, so do not allow ACPI tables to be overridden if the kernel is locked down.
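Whether any of these restrictions apply can be checked from userspace before attempting an override; the lockdown LSM exposes its state through securityfs (a sketch, assuming securityfs is mounted at the usual /sys/kernel/security):

#include <stdio.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/sys/kernel/security/lockdown", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Prints e.g. "none [integrity] confidentiality"; the brackets
	 * mark the currently active mode. */
	if (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}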
Signed-off-by: Linn Crosetto Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook cc: linux-acpi@vger.kernel.org Signed-off-by: James Morris --- drivers/acpi/tables.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index de974322a197..b7c29a11c0c1 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "internal.h" #ifdef CONFIG_ACPI_CUSTOM_DSDT @@ -577,6 +578,11 @@ void __init acpi_table_upgrade(void) if (table_nr == 0) return; + if (security_locked_down(LOCKDOWN_ACPI_TABLES)) { + pr_notice("kernel is locked down, ignoring table override\n"); + return; + } + acpi_tables_addr = memblock_find_in_range(0, ACPI_TABLE_UPGRADE_MAX_PHYS, all_tables_size, PAGE_SIZE); From 3f19cad3fa0d0fff18ee126f03a80420ae7bcbc9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:17:53 -0700 Subject: [PATCH 057/334] lockdown: Prohibit PCMCIA CIS storage when the kernel is locked down Prohibit replacement of the PCMCIA Card Information Structure when the kernel is locked down. Suggested-by: Dominik Brodowski Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook Signed-off-by: James Morris --- drivers/pcmcia/cistpl.c | 5 +++++ include/linux/security.h | 1 + security/lockdown/lockdown.c | 1 + 3 files changed, 7 insertions(+) diff --git a/drivers/pcmcia/cistpl.c b/drivers/pcmcia/cistpl.c index abd029945cc8..629359fe3513 100644 --- a/drivers/pcmcia/cistpl.c +++ b/drivers/pcmcia/cistpl.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -1575,6 +1576,10 @@ static ssize_t pccard_store_cis(struct file *filp, struct kobject *kobj, struct pcmcia_socket *s; int error; + error = security_locked_down(LOCKDOWN_PCMCIA_CIS); + if (error) + return error; + s = to_socket(container_of(kobj, struct device, kobj)); if (off) diff --git a/include/linux/security.h b/include/linux/security.h index 390e39395112..683f0607e6f2 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -111,6 +111,7 @@ enum lockdown_reason { LOCKDOWN_IOPORT, LOCKDOWN_MSR, LOCKDOWN_ACPI_TABLES, + LOCKDOWN_PCMCIA_CIS, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 6d44db0ddffa..db3477585972 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -26,6 +26,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_IOPORT] = "raw io port access", [LOCKDOWN_MSR] = "raw MSR access", [LOCKDOWN_ACPI_TABLES] = "modifying ACPI tables", + [LOCKDOWN_PCMCIA_CIS] = "direct PCMCIA CIS storage", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From 794edf30ee6cd088d5f4079b1d4a4cfe5371203e Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:17:54 -0700 Subject: [PATCH 058/334] lockdown: Lock down TIOCSSERIAL Lock down TIOCSSERIAL as that can be used to change the ioport and irq settings on a serial port. This only appears to be an issue for the serial drivers that use the core serial code. All other drivers seem to either ignore attempts to change port/irq or give an error. 
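For illustration, a TIOCSSERIAL call that changes the irq should now be refused while other fields remain settable (a sketch; /dev/ttyS0 is an example device, and the driver may reject the new irq for its own reasons as well):

#include <errno.h>
#include <fcntl.h>
#include <linux/serial.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	struct serial_struct ss;
	int fd = open("/dev/ttyS0", O_RDWR | O_NONBLOCK);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, TIOCGSERIAL, &ss) < 0) {
		perror("TIOCGSERIAL");
		return 1;
	}
	ss.irq++;	/* request an irq change, which lockdown forbids */
	if (ioctl(fd, TIOCSSERIAL, &ss) < 0)
		printf("TIOCSSERIAL: %s\n", strerror(errno)); /* EPERM when locked down */
	close(fd);
	return 0;
}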
Reported-by: Greg Kroah-Hartman Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook cc: Jiri Slaby Cc: linux-serial@vger.kernel.org Signed-off-by: James Morris --- drivers/tty/serial/serial_core.c | 5 +++++ include/linux/security.h | 1 + security/lockdown/lockdown.c | 1 + 3 files changed, 7 insertions(+) diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 83f4dd0bfd74..bbad407557b9 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -22,6 +22,7 @@ #include #include #include +#include <linux/security.h> #include #include @@ -862,6 +863,10 @@ static int uart_set_info(struct tty_struct *tty, struct tty_port *port, goto check_and_exit; } + retval = security_locked_down(LOCKDOWN_TIOCSSERIAL); + if (retval && (change_irq || change_port)) + goto exit; + /* * Ask the low level driver to verify the settings. */ diff --git a/include/linux/security.h b/include/linux/security.h index 683f0607e6f2..b4a85badb03a 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -112,6 +112,7 @@ enum lockdown_reason { LOCKDOWN_MSR, LOCKDOWN_ACPI_TABLES, LOCKDOWN_PCMCIA_CIS, + LOCKDOWN_TIOCSSERIAL, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index db3477585972..771c77f9c04a 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -27,6 +27,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_MSR] = "raw MSR access", [LOCKDOWN_ACPI_TABLES] = "modifying ACPI tables", [LOCKDOWN_PCMCIA_CIS] = "direct PCMCIA CIS storage", + [LOCKDOWN_TIOCSSERIAL] = "reconfiguration of serial port IO", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From 20657f66ef52e5005369e4ef539d4cbf01eab10d Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:17:55 -0700 Subject: [PATCH 059/334] lockdown: Lock down module params that specify hardware parameters (eg. ioport) Provide an annotation for module parameters that specify hardware parameters (such as io ports, iomem addresses, irqs, dma channels, fixed dma buffers and other types).
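A minimal sketch of how a driver would use the annotation (assuming the existing module_param_hw() helper from <linux/moduleparam.h>; the parameter name and default are hypothetical):

#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleparam.h>

static unsigned long io = 0x2f8;	/* hypothetical default */
/* 'ioport' marks this as a hardware parameter, so setting it is
 * refused when the kernel is locked down. */
module_param_hw(io, ulong, ioport, 0444);
MODULE_PARM_DESC(io, "I/O port base of the (hypothetical) device");

static int __init hwparam_demo_init(void)
{
	pr_info("hwparam_demo: io=0x%lx\n", io);
	return 0;
}

static void __exit hwparam_demo_exit(void)
{
}

module_init(hwparam_demo_init);
module_exit(hwparam_demo_exit);
MODULE_LICENSE("GPL");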
Suggested-by: Alan Cox Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook Cc: Jessica Yu Signed-off-by: James Morris --- include/linux/security.h | 1 + kernel/params.c | 21 ++++++++++++++++----- security/lockdown/lockdown.c | 1 + 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/include/linux/security.h b/include/linux/security.h index b4a85badb03a..1a3404f9c060 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -113,6 +113,7 @@ enum lockdown_reason { LOCKDOWN_ACPI_TABLES, LOCKDOWN_PCMCIA_CIS, LOCKDOWN_TIOCSSERIAL, + LOCKDOWN_MODULE_PARAMETERS, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/kernel/params.c b/kernel/params.c index cf448785d058..8e56f8b12d8f 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_SYSFS /* Protects all built-in parameters, modules use their own param_lock */ @@ -96,13 +97,19 @@ bool parameq(const char *a, const char *b) return parameqn(a, b, strlen(a)+1); } -static void param_check_unsafe(const struct kernel_param *kp) +static bool param_check_unsafe(const struct kernel_param *kp) { + if (kp->flags & KERNEL_PARAM_FL_HWPARAM && + security_locked_down(LOCKDOWN_MODULE_PARAMETERS)) + return false; + if (kp->flags & KERNEL_PARAM_FL_UNSAFE) { pr_notice("Setting dangerous option %s - tainting kernel\n", kp->name); add_taint(TAINT_USER, LOCKDEP_STILL_OK); } + + return true; } static int parse_one(char *param, @@ -132,8 +139,10 @@ static int parse_one(char *param, pr_debug("handling %s with %p\n", param, params[i].ops->set); kernel_param_lock(params[i].mod); - param_check_unsafe(¶ms[i]); - err = params[i].ops->set(val, ¶ms[i]); + if (param_check_unsafe(¶ms[i])) + err = params[i].ops->set(val, ¶ms[i]); + else + err = -EPERM; kernel_param_unlock(params[i].mod); return err; } @@ -553,8 +562,10 @@ static ssize_t param_attr_store(struct module_attribute *mattr, return -EPERM; kernel_param_lock(mk->mod); - param_check_unsafe(attribute->param); - err = attribute->param->ops->set(buf, attribute->param); + if (param_check_unsafe(attribute->param)) + err = attribute->param->ops->set(buf, attribute->param); + else + err = -EPERM; kernel_param_unlock(mk->mod); if (!err) return len; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 771c77f9c04a..0fa434294667 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -28,6 +28,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_ACPI_TABLES] = "modifying ACPI tables", [LOCKDOWN_PCMCIA_CIS] = "direct PCMCIA CIS storage", [LOCKDOWN_TIOCSSERIAL] = "reconfiguration of serial port IO", + [LOCKDOWN_MODULE_PARAMETERS] = "unsafe module parameters", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From 906357f77a077508d160e729f917c5f0a4304f25 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:17:56 -0700 Subject: [PATCH 060/334] x86/mmiotrace: Lock down the testmmiotrace module The testmmiotrace module shouldn't be permitted when the kernel is locked down as it can be used to arbitrarily read and write MMIO space. This is a runtime check rather than buildtime in order to allow configurations where the same kernel may be run in both locked down or permissive modes depending on local policy. 
Suggested-by: Thomas Gleixner Signed-off-by: David Howells Acked-by: Steven Rostedt (VMware) Reviewed-by: Kees Cook cc: Thomas Gleixner cc: Steven Rostedt cc: Ingo Molnar cc: "H. Peter Anvin" cc: x86@kernel.org Signed-off-by: James Morris --- arch/x86/mm/testmmiotrace.c | 5 +++++ include/linux/security.h | 1 + security/lockdown/lockdown.c | 1 + 3 files changed, 7 insertions(+) diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index 0881e1ff1e58..a8bd952e136d 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -8,6 +8,7 @@ #include #include #include +#include static unsigned long mmio_address; module_param_hw(mmio_address, ulong, iomem, 0); @@ -115,6 +116,10 @@ static void do_test_bulk_ioremapping(void) static int __init init(void) { unsigned long size = (read_far) ? (8 << 20) : (16 << 10); + int ret = security_locked_down(LOCKDOWN_MMIOTRACE); + + if (ret) + return ret; if (mmio_address == 0) { pr_err("you have to use the module argument mmio_address.\n"); diff --git a/include/linux/security.h b/include/linux/security.h index 1a3404f9c060..d8db7ea4c4bf 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -114,6 +114,7 @@ enum lockdown_reason { LOCKDOWN_PCMCIA_CIS, LOCKDOWN_TIOCSSERIAL, LOCKDOWN_MODULE_PARAMETERS, + LOCKDOWN_MMIOTRACE, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 0fa434294667..2eadbe0667e7 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -29,6 +29,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_PCMCIA_CIS] = "direct PCMCIA CIS storage", [LOCKDOWN_TIOCSSERIAL] = "reconfiguration of serial port IO", [LOCKDOWN_MODULE_PARAMETERS] = "unsafe module parameters", + [LOCKDOWN_MMIOTRACE] = "unsafe mmio", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From 02e935bf5b34edcc4cb0dc532dd0e1a1bfb33b51 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:17:57 -0700 Subject: [PATCH 061/334] lockdown: Lock down /proc/kcore Disallow access to /proc/kcore when the kernel is locked down to prevent access to cryptographic data. This is limited to lockdown confidentiality mode and is still permitted in integrity mode. 
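The userspace-visible effect is simply that the open fails (a sketch; per the commit message, EPERM is expected only in confidentiality mode, and a plain integrity-mode kernel still allows the open for CAP_SYS_RAWIO):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/kcore", O_RDONLY);

	if (fd < 0) {
		printf("open /proc/kcore: %s\n", strerror(errno));
		return 1;
	}
	printf("open /proc/kcore: ok\n");
	close(fd);
	return 0;
}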
Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook Signed-off-by: James Morris --- fs/proc/kcore.c | 5 +++++ include/linux/security.h | 1 + security/lockdown/lockdown.c | 1 + 3 files changed, 7 insertions(+) diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index f5834488b67d..ee2c576cc94e 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include "internal.h" @@ -545,6 +546,10 @@ out: static int open_kcore(struct inode *inode, struct file *filp) { + int ret = security_locked_down(LOCKDOWN_KCORE); + + if (ret) + return ret; if (!capable(CAP_SYS_RAWIO)) return -EPERM; diff --git a/include/linux/security.h b/include/linux/security.h index d8db7ea4c4bf..669e8de5299d 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -116,6 +116,7 @@ enum lockdown_reason { LOCKDOWN_MODULE_PARAMETERS, LOCKDOWN_MMIOTRACE, LOCKDOWN_INTEGRITY_MAX, + LOCKDOWN_KCORE, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 2eadbe0667e7..403b30357f75 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -31,6 +31,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_MODULE_PARAMETERS] = "unsafe module parameters", [LOCKDOWN_MMIOTRACE] = "unsafe mmio", [LOCKDOWN_INTEGRITY_MAX] = "integrity", + [LOCKDOWN_KCORE] = "/proc/kcore access", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From a94549dd87f5ea4ca50fee493df08a2dc6256b53 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:17:58 -0700 Subject: [PATCH 062/334] lockdown: Lock down tracing and perf kprobes when in confidentiality mode Disallow the creation of perf and ftrace kprobes when the kernel is locked down in confidentiality mode by preventing their registration. This prevents kprobes from being used to access kernel memory to steal crypto data, but continues to allow the use of kprobes from signed modules. Reported-by: Alexei Starovoitov Signed-off-by: David Howells Signed-off-by: Matthew Garrett Acked-by: Masami Hiramatsu Reviewed-by: Kees Cook Cc: Naveen N. 
Rao Cc: Anil S Keshavamurthy Cc: davem@davemloft.net Cc: Masami Hiramatsu Signed-off-by: James Morris --- include/linux/security.h | 1 + kernel/trace/trace_kprobe.c | 5 +++++ security/lockdown/lockdown.c | 1 + 3 files changed, 7 insertions(+) diff --git a/include/linux/security.h b/include/linux/security.h index 669e8de5299d..0b2529dbf0f4 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -117,6 +117,7 @@ enum lockdown_reason { LOCKDOWN_MMIOTRACE, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_KCORE, + LOCKDOWN_KPROBES, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7d736248a070..fcb28b0702b2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "trace_dynevent.h" #include "trace_kprobe_selftest.h" @@ -415,6 +416,10 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) { int i, ret; + ret = security_locked_down(LOCKDOWN_KPROBES); + if (ret) + return ret; + if (trace_probe_is_registered(&tk->tp)) return -EINVAL; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 403b30357f75..27b2cf51e443 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -32,6 +32,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_MMIOTRACE] = "unsafe mmio", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_KCORE] = "/proc/kcore access", + [LOCKDOWN_KPROBES] = "use of kprobes", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From 9d1f8be5cf42b497a3bddf1d523f2bb142e9318c Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:17:59 -0700 Subject: [PATCH 063/334] bpf: Restrict bpf when kernel lockdown is in confidentiality mode bpf_read() and bpf_read_str() could potentially be abused to (eg) allow private keys in kernel memory to be leaked. Disable them if the kernel has been locked down in confidentiality mode. 
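The contract the patch gives these helpers can be sketched in plain C (illustrative stand-ins, not the kernel implementation): when the read is refused, the destination buffer is zeroed so no stale data reaches the caller:

#include <errno.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

static bool confidentiality_locked = true; /* stand-in for security_locked_down() */

static int guarded_read(void *dst, size_t size, const void *src)
{
	if (confidentiality_locked) {
		memset(dst, 0, size);	/* never leak previous buffer contents */
		return -EPERM;
	}
	memcpy(dst, src, size);		/* stand-in for probe_kernel_read() */
	return 0;
}

int main(void)
{
	char buf[8] = "XXXXXXX";
	int ret = guarded_read(buf, sizeof(buf), "secret!");

	printf("ret=%d first byte=%d\n", ret, buf[0]); /* ret=-1, byte=0 */
	return 0;
}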
Suggested-by: Alexei Starovoitov Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook cc: netdev@vger.kernel.org cc: Chun-Yi Lee cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: James Morris --- include/linux/security.h | 1 + kernel/trace/bpf_trace.c | 10 ++++++++++ security/lockdown/lockdown.c | 1 + 3 files changed, 12 insertions(+) diff --git a/include/linux/security.h b/include/linux/security.h index 0b2529dbf0f4..e604f4c67f03 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -118,6 +118,7 @@ enum lockdown_reason { LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_KCORE, LOCKDOWN_KPROBES, + LOCKDOWN_BPF_READ, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1c9a4745e596..33a954c367f3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -139,8 +139,13 @@ BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { int ret; + ret = security_locked_down(LOCKDOWN_BPF_READ); + if (ret < 0) + goto out; + ret = probe_kernel_read(dst, unsafe_ptr, size); if (unlikely(ret < 0)) +out: memset(dst, 0, size); return ret; @@ -566,6 +571,10 @@ BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size, { int ret; + ret = security_locked_down(LOCKDOWN_BPF_READ); + if (ret < 0) + goto out; + /* * The strncpy_from_unsafe() call will likely not fill the entire * buffer, but that's okay in this circumstance as we're probing @@ -577,6 +586,7 @@ BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size, */ ret = strncpy_from_unsafe(dst, unsafe_ptr, size); if (unlikely(ret < 0)) +out: memset(dst, 0, size); return ret; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 27b2cf51e443..2397772c56bd 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -33,6 +33,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_KCORE] = "/proc/kcore access", [LOCKDOWN_KPROBES] = "use of kprobes", + [LOCKDOWN_BPF_READ] = "use of bpf to read kernel RAM", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From b0c8fdc7fdb77586c3d1937050925b960743306e Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:18:00 -0700 Subject: [PATCH 064/334] lockdown: Lock down perf when in confidentiality mode Disallow the use of certain perf facilities that might allow userspace to access kernel data. 
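Concretely, the patch rejects perf_event_open() only when the event requests interrupt-time register sampling; a probe like the following (a sketch; the register mask is architecture specific) should get EPERM on a locked-down kernel:

#include <errno.h>
#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_CPU_CLOCK;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_REGS_INTR;
	attr.sample_regs_intr = 1;	/* request one register (arch-specific mask) */

	fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		printf("perf_event_open: %s\n", strerror(errno)); /* EPERM when locked down */
	else
		close(fd);
	return 0;
}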
Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Signed-off-by: James Morris --- include/linux/security.h | 1 + kernel/events/core.c | 7 +++++++ security/lockdown/lockdown.c | 1 + 3 files changed, 9 insertions(+) diff --git a/include/linux/security.h b/include/linux/security.h index e604f4c67f03..b94f1e697537 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -119,6 +119,7 @@ enum lockdown_reason { LOCKDOWN_KCORE, LOCKDOWN_KPROBES, LOCKDOWN_BPF_READ, + LOCKDOWN_PERF, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/kernel/events/core.c b/kernel/events/core.c index f85929ce13be..8732f980a4fc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10798,6 +10798,13 @@ SYSCALL_DEFINE5(perf_event_open, perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) return -EACCES; + err = security_locked_down(LOCKDOWN_PERF); + if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR)) + /* REGS_INTR can leak data, lockdown must prevent this */ + return err; + + err = 0; + /* * In cgroup mode, the pid argument is used to pass the fd * opened to the cgroup directory in cgroupfs. The cpu argument diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 2397772c56bd..3d7b1039457b 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -34,6 +34,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_KCORE] = "/proc/kcore access", [LOCKDOWN_KPROBES] = "use of kprobes", [LOCKDOWN_BPF_READ] = "use of bpf to read kernel RAM", + [LOCKDOWN_PERF] = "unsafe use of perf", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From 29d3c1c8dfe752c01b7115ecd5a3142b232a38e1 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:18:01 -0700 Subject: [PATCH 065/334] kexec: Allow kexec_file() with appropriate IMA policy when locked down Systems in lockdown mode should block the kexec of untrusted kernels. For x86 and ARM we can ensure that a kernel is trustworthy by validating a PE signature, but this isn't possible on other architectures. On those platforms we can use IMA digital signatures instead. Add a function to determine whether IMA has or will verify signatures for a given event type, and if so permit kexec_file() even if the kernel is otherwise locked down. This is restricted to cases where CONFIG_INTEGRITY_TRUSTED_KEYRING is set in order to prevent an attacker from loading additional keys at runtime. 
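The policy shape this depends on matches what the kexec selftest later in this series greps for; a representative rule (illustrative only, real policies are system-specific):

appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig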
Signed-off-by: Matthew Garrett Acked-by: Mimi Zohar Cc: Dmitry Kasatkin Cc: linux-integrity@vger.kernel.org Signed-off-by: James Morris --- include/linux/ima.h | 9 ++++++ kernel/kexec_file.c | 10 +++++- security/integrity/ima/ima.h | 2 ++ security/integrity/ima/ima_main.c | 2 +- security/integrity/ima/ima_policy.c | 50 +++++++++++++++++++++++++++++ 5 files changed, 71 insertions(+), 2 deletions(-) diff --git a/include/linux/ima.h b/include/linux/ima.h index 00036d2f57c3..8e2f324fb901 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -129,4 +129,13 @@ static inline int ima_inode_removexattr(struct dentry *dentry, return 0; } #endif /* CONFIG_IMA_APPRAISE */ + +#if defined(CONFIG_IMA_APPRAISE) && defined(CONFIG_INTEGRITY_TRUSTED_KEYRING) +extern bool ima_appraise_signature(enum kernel_read_file_id func); +#else +static inline bool ima_appraise_signature(enum kernel_read_file_id func) +{ + return false; +} +#endif /* CONFIG_IMA_APPRAISE && CONFIG_INTEGRITY_TRUSTED_KEYRING */ #endif /* _LINUX_IMA_H */ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 43109ef4d6bf..7f4a618fc8c1 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -208,7 +208,15 @@ kimage_validate_signature(struct kimage *image) return ret; } - return security_locked_down(LOCKDOWN_KEXEC); + /* If IMA is guaranteed to appraise a signature on the kexec + * image, permit it even if the kernel is otherwise locked + * down. + */ + if (!ima_appraise_signature(READING_KEXEC_IMAGE) && + security_locked_down(LOCKDOWN_KEXEC)) + return -EPERM; + + return 0; /* All other errors are fatal, including nomem, unparseable * signatures and signature check failures - even if signatures diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index ca10917b5f89..874bd77d3b91 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -111,6 +111,8 @@ struct ima_kexec_hdr { u64 count; }; +extern const int read_idmap[]; + #ifdef CONFIG_HAVE_IMA_KEXEC void ima_load_kexec_buffer(void); #else diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 1cffda4412b7..1747bc7bcb60 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -469,7 +469,7 @@ int ima_read_file(struct file *file, enum kernel_read_file_id read_id) return 0; } -static const int read_idmap[READING_MAX_ID] = { +const int read_idmap[READING_MAX_ID] = { [READING_FIRMWARE] = FIRMWARE_CHECK, [READING_FIRMWARE_PREALLOC_BUFFER] = FIRMWARE_CHECK, [READING_MODULE] = MODULE_CHECK, diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 7b53f2ca58e2..b8773f05f9da 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -1339,3 +1339,53 @@ int ima_policy_show(struct seq_file *m, void *v) return 0; } #endif /* CONFIG_IMA_READ_POLICY */ + +#if defined(CONFIG_IMA_APPRAISE) && defined(CONFIG_INTEGRITY_TRUSTED_KEYRING) +/* + * ima_appraise_signature: whether IMA will appraise a given function using + * an IMA digital signature. This is restricted to cases where the kernel + * has a set of built-in trusted keys in order to avoid an attacker simply + * loading additional keys. 
+ */ +bool ima_appraise_signature(enum kernel_read_file_id id) +{ + struct ima_rule_entry *entry; + bool found = false; + enum ima_hooks func; + + if (id >= READING_MAX_ID) + return false; + + func = read_idmap[id] ?: FILE_CHECK; + + rcu_read_lock(); + list_for_each_entry_rcu(entry, ima_rules, list) { + if (entry->action != APPRAISE) + continue; + + /* + * A generic entry will match, but otherwise require that it + * match the func we're looking for + */ + if (entry->func && entry->func != func) + continue; + + /* + * We require this to be a digital signature, not a raw IMA + * hash. + */ + if (entry->flags & IMA_DIGSIG_REQUIRED) + found = true; + + /* + * We've found a rule that matches, so break now even if it + * didn't require a digital signature - a later rule that does + * won't override it, so would be a false positive. + */ + break; + } + + rcu_read_unlock(); + return found; +} +#endif /* CONFIG_IMA_APPRAISE && CONFIG_INTEGRITY_TRUSTED_KEYRING */ From 5496197f9b084f086cb410dd566648b0896fcc74 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:18:02 -0700 Subject: [PATCH 066/334] debugfs: Restrict debugfs when the kernel is locked down Disallow opening of debugfs files that might be used to muck around when the kernel is locked down as various drivers give raw access to hardware through debugfs. Given the effort of auditing all 2000 or so files and manually fixing each one as necessary, I've chosen to apply a heuristic instead. The following changes are made: (1) chmod and chown are disallowed on debugfs objects (though the root dir can be modified by mount and remount, but I'm not worried about that). (2) When the kernel is locked down, only files with the following criteria are permitted to be opened: - The file must have mode 00444 - The file must not have ioctl methods - The file must not have mmap (3) When the kernel is locked down, files may only be opened for reading. Normal device interaction should be done through configfs, sysfs or a miscdev, not debugfs. Note that this makes it unnecessary to specifically lock down show_dsts(), show_devs() and show_call() in the asus-wmi driver. I would actually prefer to lock down all files by default and have the the files unlocked by the creator. This is tricky to manage correctly, though, as there are 19 creation functions and ~1600 call sites (some of them in loops scanning tables). Signed-off-by: David Howells cc: Andy Shevchenko cc: acpi4asus-user@lists.sourceforge.net cc: platform-driver-x86@vger.kernel.org cc: Matthew Garrett cc: Thomas Gleixner Cc: Greg KH Cc: Rafael J. Wysocki Signed-off-by: Matthew Garrett Signed-off-by: James Morris --- fs/debugfs/file.c | 30 ++++++++++++++++++++++++++++++ fs/debugfs/inode.c | 32 ++++++++++++++++++++++++++++++-- include/linux/security.h | 1 + security/lockdown/lockdown.c | 1 + 4 files changed, 62 insertions(+), 2 deletions(-) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index ddd708b09fa1..5d3e449b5988 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "internal.h" @@ -136,6 +137,25 @@ void debugfs_file_put(struct dentry *dentry) } EXPORT_SYMBOL_GPL(debugfs_file_put); +/* + * Only permit access to world-readable files when the kernel is locked down. + * We also need to exclude any file that has ways to write or alter it as root + * can bypass the permissions check. 
+ */ +static bool debugfs_is_locked_down(struct inode *inode, + struct file *filp, + const struct file_operations *real_fops) +{ + if ((inode->i_mode & 07777) == 0444 && + !(filp->f_mode & FMODE_WRITE) && + !real_fops->unlocked_ioctl && + !real_fops->compat_ioctl && + !real_fops->mmap) + return false; + + return security_locked_down(LOCKDOWN_DEBUGFS); +} + static int open_proxy_open(struct inode *inode, struct file *filp) { struct dentry *dentry = F_DENTRY(filp); @@ -147,6 +167,11 @@ static int open_proxy_open(struct inode *inode, struct file *filp) return r == -EIO ? -ENOENT : r; real_fops = debugfs_real_fops(filp); + + r = debugfs_is_locked_down(inode, filp, real_fops); + if (r) + goto out; + real_fops = fops_get(real_fops); if (!real_fops) { /* Huh? Module did not clean up after itself at exit? */ @@ -272,6 +297,11 @@ static int full_proxy_open(struct inode *inode, struct file *filp) return r == -EIO ? -ENOENT : r; real_fops = debugfs_real_fops(filp); + + r = debugfs_is_locked_down(inode, filp, real_fops); + if (r) + goto out; + real_fops = fops_get(real_fops); if (!real_fops) { /* Huh? Module did not cleanup after itself at exit? */ diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index acef14ad53db..c8613bcad252 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "internal.h" @@ -32,6 +33,32 @@ static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; +/* + * Don't allow access attributes to be changed whilst the kernel is locked down + * so that we can use the file mode as part of a heuristic to determine whether + * to lock down individual files. + */ +static int debugfs_setattr(struct dentry *dentry, struct iattr *ia) +{ + int ret = security_locked_down(LOCKDOWN_DEBUGFS); + + if (ret && (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))) + return ret; + return simple_setattr(dentry, ia); +} + +static const struct inode_operations debugfs_file_inode_operations = { + .setattr = debugfs_setattr, +}; +static const struct inode_operations debugfs_dir_inode_operations = { + .lookup = simple_lookup, + .setattr = debugfs_setattr, +}; +static const struct inode_operations debugfs_symlink_inode_operations = { + .get_link = simple_get_link, + .setattr = debugfs_setattr, +}; + static struct inode *debugfs_get_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); @@ -355,6 +382,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, inode->i_mode = mode; inode->i_private = data; + inode->i_op = &debugfs_file_inode_operations; inode->i_fop = proxy_fops; dentry->d_fsdata = (void *)((unsigned long)real_fops | DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); @@ -515,7 +543,7 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) return failed_creating(dentry); inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; - inode->i_op = &simple_dir_inode_operations; + inode->i_op = &debugfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." 
entry) */ @@ -610,7 +638,7 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, return failed_creating(dentry); } inode->i_mode = S_IFLNK | S_IRWXUGO; - inode->i_op = &simple_symlink_inode_operations; + inode->i_op = &debugfs_symlink_inode_operations; inode->i_link = link; d_instantiate(dentry, inode); return end_creating(dentry); diff --git a/include/linux/security.h b/include/linux/security.h index b94f1e697537..152824b6f456 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -115,6 +115,7 @@ enum lockdown_reason { LOCKDOWN_TIOCSSERIAL, LOCKDOWN_MODULE_PARAMETERS, LOCKDOWN_MMIOTRACE, + LOCKDOWN_DEBUGFS, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_KCORE, LOCKDOWN_KPROBES, diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 3d7b1039457b..edd1fff0147d 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -30,6 +30,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_TIOCSSERIAL] = "reconfiguration of serial port IO", [LOCKDOWN_MODULE_PARAMETERS] = "unsafe module parameters", [LOCKDOWN_MMIOTRACE] = "unsafe mmio", + [LOCKDOWN_DEBUGFS] = "debugfs access", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_KCORE] = "/proc/kcore access", [LOCKDOWN_KPROBES] = "use of kprobes", From ccbd54ff54e8b1880456b81c4aea352ebe208843 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:18:03 -0700 Subject: [PATCH 067/334] tracefs: Restrict tracefs when the kernel is locked down Tracefs may release more information about the kernel than desirable, so restrict it when the kernel is locked down in confidentiality mode by preventing open(). (Fixed by Ben Hutchings to avoid a null dereference in default_file_open()) Signed-off-by: Matthew Garrett Reviewed-by: Steven Rostedt (VMware) Cc: Ben Hutchings Signed-off-by: James Morris --- fs/tracefs/inode.c | 42 +++++++++++++++++++++++++++++++++++- include/linux/security.h | 1 + security/lockdown/lockdown.c | 1 + 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index a5bab190a297..761af8ce4015 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -20,6 +20,7 @@ #include #include #include +#include #define TRACEFS_DEFAULT_MODE 0700 @@ -27,6 +28,25 @@ static struct vfsmount *tracefs_mount; static int tracefs_mount_count; static bool tracefs_registered; +static int default_open_file(struct inode *inode, struct file *filp) +{ + struct dentry *dentry = filp->f_path.dentry; + struct file_operations *real_fops; + int ret; + + if (!dentry) + return -EINVAL; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + real_fops = dentry->d_fsdata; + if (!real_fops->open) + return 0; + return real_fops->open(inode, filp); +} + static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -221,6 +241,12 @@ static int tracefs_apply_options(struct super_block *sb) return 0; } +static void tracefs_destroy_inode(struct inode *inode) +{ + if (S_ISREG(inode->i_mode)) + kfree(inode->i_fop); +} + static int tracefs_remount(struct super_block *sb, int *flags, char *data) { int err; @@ -257,6 +283,7 @@ static int tracefs_show_options(struct seq_file *m, struct dentry *root) static const struct super_operations tracefs_super_operations = { .statfs = simple_statfs, .remount_fs = tracefs_remount, + .destroy_inode = tracefs_destroy_inode, .show_options = tracefs_show_options, }; @@ -387,6 +414,7 @@ struct dentry *tracefs_create_file(const char 
*name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { + struct file_operations *proxy_fops; struct dentry *dentry; struct inode *inode; @@ -402,8 +430,20 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, if (unlikely(!inode)) return failed_creating(dentry); + proxy_fops = kzalloc(sizeof(struct file_operations), GFP_KERNEL); + if (unlikely(!proxy_fops)) { + iput(inode); + return failed_creating(dentry); + } + + if (!fops) + fops = &tracefs_file_operations; + + dentry->d_fsdata = (void *)fops; + memcpy(proxy_fops, fops, sizeof(*proxy_fops)); + proxy_fops->open = default_open_file; inode->i_mode = mode; - inode->i_fop = fops ? fops : &tracefs_file_operations; + inode->i_fop = proxy_fops; inode->i_private = data; d_instantiate(dentry, inode); fsnotify_create(dentry->d_parent->d_inode, dentry); diff --git a/include/linux/security.h b/include/linux/security.h index 152824b6f456..429f9f03372b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -121,6 +121,7 @@ enum lockdown_reason { LOCKDOWN_KPROBES, LOCKDOWN_BPF_READ, LOCKDOWN_PERF, + LOCKDOWN_TRACEFS, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index edd1fff0147d..84df03b1f5a7 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -36,6 +36,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_KPROBES] = "use of kprobes", [LOCKDOWN_BPF_READ] = "use of bpf to read kernel RAM", [LOCKDOWN_PERF] = "unsafe use of perf", + [LOCKDOWN_TRACEFS] = "use of tracefs", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From 1957a85b0032a81e6482ca4aab883643b8dae06e Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:18:04 -0700 Subject: [PATCH 068/334] efi: Restrict efivar_ssdt_load when the kernel is locked down efivar_ssdt_load allows the kernel to import arbitrary ACPI code from an EFI variable, which gives arbitrary code execution in ring 0. Prevent that when the kernel is locked down. Signed-off-by: Matthew Garrett Acked-by: Ard Biesheuvel Reviewed-by: Kees Cook Cc: Ard Biesheuvel Cc: linux-efi@vger.kernel.org Signed-off-by: James Morris --- drivers/firmware/efi/efi.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 4b7cf7bc0ded..5f98374f77f4 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -241,6 +242,11 @@ static void generic_ops_unregister(void) static char efivar_ssdt[EFIVAR_SSDT_NAME_MAX] __initdata; static int __init efivar_ssdt_setup(char *str) { + int ret = security_locked_down(LOCKDOWN_ACPI_TABLES); + + if (ret) + return ret; + if (strlen(str) < sizeof(efivar_ssdt)) memcpy(efivar_ssdt, str, strlen(str)); else From b602614a81078bf29c82b2671bb96a63488f68d6 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:18:05 -0700 Subject: [PATCH 069/334] lockdown: Print current->comm in restriction messages Print the content of current->comm in messages generated by lockdown to indicate a restriction that was hit. This makes it a bit easier to find out what caused the message. 
The message is now patterned something like: Lockdown: <comm>: <what> is restricted; see man kernel_lockdown.7 Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook Signed-off-by: James Morris --- fs/proc/kcore.c | 5 +++-- security/lockdown/lockdown.c | 8 ++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index ee2c576cc94e..e2ed8e08cc7a 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -548,11 +548,12 @@ static int open_kcore(struct inode *inode, struct file *filp) { int ret = security_locked_down(LOCKDOWN_KCORE); - if (ret) - return ret; if (!capable(CAP_SYS_RAWIO)) return -EPERM; + if (ret) + return ret; + filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!filp->private_data) return -ENOMEM; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 84df03b1f5a7..0068cec77c05 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -81,10 +81,14 @@ early_param("lockdown", lockdown_param); */ static int lockdown_is_locked_down(enum lockdown_reason what) { + if (WARN(what >= LOCKDOWN_CONFIDENTIALITY_MAX, + "Invalid lockdown reason")) + return -EPERM; + if (kernel_locked_down >= what) { if (lockdown_reasons[what]) - pr_notice("Lockdown: %s is restricted; see man kernel_lockdown.7\n", - lockdown_reasons[what]); + pr_notice("Lockdown: %s: %s is restricted; see man kernel_lockdown.7\n", + current->comm, lockdown_reasons[what]); return -EPERM; } From be819aa6f11145de32dab8690ec6055348488c18 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 20 Aug 2019 15:31:29 +0800 Subject: [PATCH 070/334] csky: Fixup arch_get_unmapped_area() implementation The current arch_get_unmapped_area() of abiv1 doesn't use the standard kernel API. After referring to the implementation of arch/arm, we implement it with vm_unmapped_area() from linux/mm.h. Signed-off-by: Guo Ren Cc: Arnd Bergmann --- arch/csky/abiv1/inc/abi/page.h | 5 ++- arch/csky/abiv1/mmap.c | 77 ++++++++++++++++++---------------- 2 files changed, 44 insertions(+), 38 deletions(-) diff --git a/arch/csky/abiv1/inc/abi/page.h b/arch/csky/abiv1/inc/abi/page.h index 6336e92a103a..c864519117c7 100644 --- a/arch/csky/abiv1/inc/abi/page.h +++ b/arch/csky/abiv1/inc/abi/page.h @@ -1,13 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. -extern unsigned long shm_align_mask; +#include + extern void flush_dcache_page(struct page *page); static inline unsigned long pages_do_alias(unsigned long addr1, unsigned long addr2) { - return (addr1 ^ addr2) & shm_align_mask; + return (addr1 ^ addr2) & (SHMLBA-1); } static inline void clear_user_page(void *addr, unsigned long vaddr, diff --git a/arch/csky/abiv1/mmap.c b/arch/csky/abiv1/mmap.c index b462fd50b23a..6792aca49999 100644 --- a/arch/csky/abiv1/mmap.c +++ b/arch/csky/abiv1/mmap.c @@ -9,58 +9,63 @@ #include #include -unsigned long shm_align_mask = (0x4000 >> 1) - 1; /* Sane caches */ +#define COLOUR_ALIGN(addr,pgoff) \ + ((((addr)+SHMLBA-1)&~(SHMLBA-1)) + \ + (((pgoff)<<PAGE_SHIFT) & (SHMLBA-1))) unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int do_align = 0; + struct vm_unmapped_area_info info; + /* + * We only need to do colour alignment if either the I or D + * caches alias. + */ + do_align = filp || (flags & MAP_SHARED); + + /* + * We enforce the MAP_FIXED case. + */ if (flags & MAP_FIXED) { - /* - * We do not accept a shared mapping if it would violate - * cache aliasing constraints.
- */ - if ((flags & MAP_SHARED) && - ((addr - (pgoff << PAGE_SHIFT)) & shm_align_mask)) + if (flags & MAP_SHARED && + (addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1)) return -EINVAL; return addr; } if (len > TASK_SIZE) return -ENOMEM; - do_color_align = 0; - if (filp || (flags & MAP_SHARED)) - do_color_align = 1; + if (addr) { - if (do_color_align) + if (do_align) addr = COLOUR_ALIGN(addr, pgoff); else addr = PAGE_ALIGN(addr); - vmm = find_vma(current->mm, addr); - if (TASK_SIZE - len >= addr && - (!vmm || addr + len <= vmm->vm_start)) - return addr; - } - addr = TASK_UNMAPPED_BASE; - if (do_color_align) - addr = COLOUR_ALIGN(addr, pgoff); - else - addr = PAGE_ALIGN(addr); - for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) { - /* At this point: (!vmm || addr < vmm->vm_end). */ - if (TASK_SIZE - len < addr) - return -ENOMEM; - if (!vmm || addr + len <= vmm->vm_start) + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vm_start_gap(vma))) return addr; - addr = vmm->vm_end; - if (do_color_align) - addr = COLOUR_ALIGN(addr, pgoff); } + + info.flags = 0; + info.length = len; + info.low_limit = mm->mmap_base; + info.high_limit = TASK_SIZE; + info.align_mask = do_align ? (PAGE_MASK & (SHMLBA - 1)) : 0; + info.align_offset = pgoff << PAGE_SHIFT; + return vm_unmapped_area(&info); } From dc140045c0cace809af872e3799e8fbe1b7d7f86 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 20 Aug 2019 12:47:24 +0800 Subject: [PATCH 071/334] csky: Fixup defer cache flush for 610 We use defer cache flush mechanism to improve the performance of 610, but the implementation is wrong. We fix it up now and update the mechanism: - Zero page needn't be flushed. - If page is file mapping & non-touched in user space, defer flush. - If page is anon mapping or dirty file mapping, flush immediately. - In update_mmu_cache finish the defer flush by flush_dcache_page(). For 610 we need take care the dcache aliasing issue: - VIPT cache with 8K-bytes size per way in 4K page granularity. Signed-off-by: Guo Ren Cc: Arnd Bergmann --- arch/csky/abiv1/cacheflush.c | 50 +++++++++++++++------------- arch/csky/abiv1/inc/abi/cacheflush.h | 4 +-- 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/arch/csky/abiv1/cacheflush.c b/arch/csky/abiv1/cacheflush.c index 10af8b6fe322..fee99fc6612f 100644 --- a/arch/csky/abiv1/cacheflush.c +++ b/arch/csky/abiv1/cacheflush.c @@ -11,42 +11,46 @@ #include #include +#define PG_dcache_clean PG_arch_1 + void flush_dcache_page(struct page *page) { - struct address_space *mapping = page_mapping(page); - unsigned long addr; + struct address_space *mapping; - if (mapping && !mapping_mapped(mapping)) { - set_bit(PG_arch_1, &(page)->flags); + if (page == ZERO_PAGE(0)) return; + + mapping = page_mapping_file(page); + + if (mapping && !page_mapcount(page)) + clear_bit(PG_dcache_clean, &page->flags); + else { + dcache_wbinv_all(); + if (mapping) + icache_inv_all(); + set_bit(PG_dcache_clean, &page->flags); } - - /* - * We could delay the flush for the !page_mapping case too. But that - * case is for exec env/arg pages and those are %99 certainly going to - * get faulted into the tlb (and thus flushed) anyways. 
- */ - addr = (unsigned long) page_address(page); - dcache_wb_range(addr, addr + PAGE_SIZE); } +EXPORT_SYMBOL(flush_dcache_page); -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, - pte_t *pte) +void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep) { - unsigned long addr; + unsigned long pfn = pte_pfn(*ptep); struct page *page; - unsigned long pfn; - pfn = pte_pfn(*pte); - if (unlikely(!pfn_valid(pfn))) + if (!pfn_valid(pfn)) return; page = pfn_to_page(pfn); - addr = (unsigned long) page_address(page); + if (page == ZERO_PAGE(0)) + return; - if (vma->vm_flags & VM_EXEC || - pages_do_alias(addr, address & PAGE_MASK)) - cache_wbinv_all(); + if (!test_and_set_bit(PG_dcache_clean, &page->flags)) + dcache_wbinv_all(); - clear_bit(PG_arch_1, &(page)->flags); + if (page_mapping_file(page)) { + if (vma->vm_flags & VM_EXEC) + icache_inv_all(); + } } diff --git a/arch/csky/abiv1/inc/abi/cacheflush.h b/arch/csky/abiv1/inc/abi/cacheflush.h index 5f663aef9b1b..fce5604cef40 100644 --- a/arch/csky/abiv1/inc/abi/cacheflush.h +++ b/arch/csky/abiv1/inc/abi/cacheflush.h @@ -26,8 +26,8 @@ extern void flush_dcache_page(struct page *); #define flush_icache_page(vma, page) cache_wbinv_all() #define flush_icache_range(start, end) cache_wbinv_range(start, end) -#define flush_icache_user_range(vma, pg, adr, len) \ - cache_wbinv_range(adr, adr + len) +#define flush_icache_user_range(vma,page,addr,len) \ + flush_dcache_page(page) #define copy_from_user_page(vma, page, vaddr, dst, src, len) \ do { \ From c7e6f0e99227b3dcdc5e62f789119e000887ff79 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 20 Aug 2019 20:15:44 +0800 Subject: [PATCH 072/334] csky: Support kernel non-aligned access We prohibit non-aligned access in kernel mode, but some special NIC driver needs to support kernel-state unaligned access. For example, when the bus does not support unaligned access, IP header parsing will cause non-aligned access and driver does not recopy the skb buffer to dma for performance reasons. Added kernel_enable & user_enable to control unaligned access and added kernel_count & user_count for statistical unaligned access. 
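For reference, a hypothetical usage sketch of the four new sysctl knobs (<dir> stands in for the parent directory under /proc/sys where alignment_tbl is registered; the registration is outside this hunk):

echo 0 > /proc/sys/<dir>/kernel_enable	# forbid kernel-mode fixups
echo 1 > /proc/sys/<dir>/user_enable	# keep user-mode fixups enabled
cat /proc/sys/<dir>/kernel_count	# fixups performed in kernel mode
cat /proc/sys/<dir>/user_count		# fixups performed for user mode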
Signed-off-by: Guo Ren Cc: Arnd Bergmann --- arch/csky/abiv1/alignment.c | 62 +++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/arch/csky/abiv1/alignment.c b/arch/csky/abiv1/alignment.c index 27ef5b2c43ab..cb2a0d94a144 100644 --- a/arch/csky/abiv1/alignment.c +++ b/arch/csky/abiv1/alignment.c @@ -5,8 +5,10 @@ #include #include -static int align_enable = 1; -static int align_count; +static int align_kern_enable = 1; +static int align_usr_enable = 1; +static int align_kern_count = 0; +static int align_usr_count = 0; static inline uint32_t get_ptreg(struct pt_regs *regs, uint32_t rx) { @@ -32,9 +34,6 @@ static int ldb_asm(uint32_t addr, uint32_t *valp) uint32_t val; int err; - if (!access_ok((void *)addr, 1)) - return 1; - asm volatile ( "movi %0, 0\n" "1:\n" @@ -67,9 +66,6 @@ static int stb_asm(uint32_t addr, uint32_t val) { int err; - if (!access_ok((void *)addr, 1)) - return 1; - asm volatile ( "movi %0, 0\n" "1:\n" @@ -203,8 +199,6 @@ static int stw_c(struct pt_regs *regs, uint32_t rz, uint32_t addr) if (stb_asm(addr, byte3)) return 1; - align_count++; - return 0; } @@ -226,7 +220,14 @@ void csky_alignment(struct pt_regs *regs) uint32_t addr = 0; if (!user_mode(regs)) + goto kernel_area; + + if (!align_usr_enable) { + pr_err("%s user disabled.\n", __func__); goto bad_area; + } + + align_usr_count++; ret = get_user(tmp, (uint16_t *)instruction_pointer(regs)); if (ret) { @@ -234,6 +235,19 @@ void csky_alignment(struct pt_regs *regs) goto bad_area; } + goto good_area; + +kernel_area: + if (!align_kern_enable) { + pr_err("%s kernel disabled.\n", __func__); + goto bad_area; + } + + align_kern_count++; + + tmp = *(uint16_t *)instruction_pointer(regs); + +good_area: opcode = (uint32_t)tmp; rx = opcode & 0xf; @@ -286,18 +300,32 @@ bad_area: force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)addr); } -static struct ctl_table alignment_tbl[4] = { +static struct ctl_table alignment_tbl[5] = { { - .procname = "enable", - .data = &align_enable, - .maxlen = sizeof(align_enable), + .procname = "kernel_enable", + .data = &align_kern_enable, + .maxlen = sizeof(align_kern_enable), .mode = 0666, .proc_handler = &proc_dointvec }, { - .procname = "count", - .data = &align_count, - .maxlen = sizeof(align_count), + .procname = "user_enable", + .data = &align_usr_enable, + .maxlen = sizeof(align_usr_enable), + .mode = 0666, + .proc_handler = &proc_dointvec + }, + { + .procname = "kernel_count", + .data = &align_kern_count, + .maxlen = sizeof(align_kern_count), + .mode = 0666, + .proc_handler = &proc_dointvec + }, + { + .procname = "user_count", + .data = &align_usr_count, + .maxlen = sizeof(align_usr_count), .mode = 0666, .proc_handler = &proc_dointvec }, From bb13f35b96f4c83dc4015d71fa2b543c665fbdae Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 20 Aug 2019 01:32:43 +0000 Subject: [PATCH 073/334] nfsd: remove duplicated include from filecache.c Remove duplicated include. Signed-off-by: YueHaibing Signed-off-by: J. 
Bruce Fields --- fs/nfsd/filecache.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index a2fcb251d2f6..2e1a972231e5 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -6,7 +6,6 @@ #include #include -#include #include #include #include From 4ad35c1f56386c8e7019c921bba1af109fde9693 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Wed, 21 Aug 2019 19:15:52 +0800 Subject: [PATCH 074/334] csky: Fixup 610 vipt cache flush mechanism 610 has vipt aliasing issue, so we need to finish the cache flush apis mentioned in cachetlb.rst to avoid data corruption. Here is the list of modified apis in the patch: - flush_kernel_dcache_page (new add) - flush_dcache_mmap_lock (new add) - flush_dcache_mmap_unlock (new add) - flush_kernel_vmap_range (new add) - invalidate_kernel_vmap_range (new add) - flush_anon_page (new add) - flush_cache_range (new add) - flush_cache_vmap (flush all) - flush_cache_vunmap (flush all) - flush_cache_mm (only dcache flush) - flush_icache_page (just nop) - copy_from_user_page (remove no need flush) - copy_to_user_page (remove no need flush) Change to V2: - Fixup compile error with xa_lock*(&mapping->i_pages) Signed-off-by: Guo Ren Cc: Arnd Bergmann Cc: Christoph Hellwig --- arch/csky/abiv1/cacheflush.c | 20 ++++++++++++++ arch/csky/abiv1/inc/abi/cacheflush.h | 41 ++++++++++++++++++++-------- 2 files changed, 49 insertions(+), 12 deletions(-) diff --git a/arch/csky/abiv1/cacheflush.c b/arch/csky/abiv1/cacheflush.c index fee99fc6612f..9f1fe80cc847 100644 --- a/arch/csky/abiv1/cacheflush.c +++ b/arch/csky/abiv1/cacheflush.c @@ -54,3 +54,23 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, icache_inv_all(); } } + +void flush_kernel_dcache_page(struct page *page) +{ + struct address_space *mapping; + + mapping = page_mapping_file(page); + + if (!mapping || mapping_mapped(mapping)) + dcache_wbinv_all(); +} +EXPORT_SYMBOL(flush_kernel_dcache_page); + +void flush_cache_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) +{ + dcache_wbinv_all(); + + if (vma->vm_flags & VM_EXEC) + icache_inv_all(); +} diff --git a/arch/csky/abiv1/inc/abi/cacheflush.h b/arch/csky/abiv1/inc/abi/cacheflush.h index fce5604cef40..79ef9e8c1afd 100644 --- a/arch/csky/abiv1/inc/abi/cacheflush.h +++ b/arch/csky/abiv1/inc/abi/cacheflush.h @@ -4,26 +4,49 @@ #ifndef __ABI_CSKY_CACHEFLUSH_H #define __ABI_CSKY_CACHEFLUSH_H -#include +#include #include #include #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *); -#define flush_cache_mm(mm) cache_wbinv_all() +#define flush_cache_mm(mm) dcache_wbinv_all() #define flush_cache_page(vma, page, pfn) cache_wbinv_all() #define flush_cache_dup_mm(mm) cache_wbinv_all() +#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE +extern void flush_kernel_dcache_page(struct page *); + +#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) +#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) + +static inline void flush_kernel_vmap_range(void *addr, int size) +{ + dcache_wbinv_all(); +} +static inline void invalidate_kernel_vmap_range(void *addr, int size) +{ + dcache_wbinv_all(); +} + +#define ARCH_HAS_FLUSH_ANON_PAGE +static inline void flush_anon_page(struct vm_area_struct *vma, + struct page *page, unsigned long vmaddr) +{ + if (PageAnon(page)) + cache_wbinv_all(); +} + /* * if (current_mm != vma->mm) cache_wbinv_range(start, end) will be broken. * Use cache_wbinv_all() here and need to be improved in future. 
*/ -#define flush_cache_range(vma, start, end) cache_wbinv_all() -#define flush_cache_vmap(start, end) cache_wbinv_range(start, end) -#define flush_cache_vunmap(start, end) cache_wbinv_range(start, end) +extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); +#define flush_cache_vmap(start, end) cache_wbinv_all() +#define flush_cache_vunmap(start, end) cache_wbinv_all() -#define flush_icache_page(vma, page) cache_wbinv_all() +#define flush_icache_page(vma, page) do {} while (0); #define flush_icache_range(start, end) cache_wbinv_range(start, end) #define flush_icache_user_range(vma,page,addr,len) \ @@ -31,19 +54,13 @@ extern void flush_dcache_page(struct page *); #define copy_from_user_page(vma, page, vaddr, dst, src, len) \ do { \ - cache_wbinv_all(); \ memcpy(dst, src, len); \ - cache_wbinv_all(); \ } while (0) #define copy_to_user_page(vma, page, vaddr, dst, src, len) \ do { \ - cache_wbinv_all(); \ memcpy(dst, src, len); \ cache_wbinv_all(); \ } while (0) -#define flush_dcache_mmap_lock(mapping) do {} while (0) -#define flush_dcache_mmap_unlock(mapping) do {} while (0) - #endif /* __ABI_CSKY_CACHEFLUSH_H */ From 9d60d93198c62ac3b3e59e8bba7079646c972a4c Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 26 Aug 2019 10:28:58 -0400 Subject: [PATCH 075/334] Deprecate nfsd fault injection This is only useful for client testing. I haven't really maintained it, and reference counting and locking are wrong at this point. You can get some of the same functionality now from nfsd/clients/. It was a good idea but I think its time has passed. In the unlikely event of users, hopefully the BROKEN dependency will prompt them to speak up. Otherwise I expect to remove it soon. Reported-by: Alex Lyakas Signed-off-by: J. Bruce Fields --- fs/nfsd/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index bff8456220e0..10cefb0c07c7 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -148,7 +148,7 @@ config NFSD_V4_SECURITY_LABEL config NFSD_FAULT_INJECTION bool "NFS server manual fault injection" - depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS + depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS && BROKEN help This option enables support for manually injecting faults into the NFS server. This is intended to be used for From 556d971bdae643de4cd7e2976e14f70ca2a3061d Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Wed, 7 Aug 2019 21:43:18 -0300 Subject: [PATCH 076/334] ima: Fix use after free in ima_read_modsig() If we can't parse the PKCS7 in the appended modsig, we will free the modsig structure and then access one of its members to determine the error value. Fixes: 39b07096364a ("ima: Implement support for module-style appended signatures") Reported-by: kbuild test robot Reported-by: Julia Lawall Reported-by: Dan Carpenter Signed-off-by: Thiago Jung Bauermann Reviewed-by: Gustavo A. R. 
Silva Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_modsig.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/security/integrity/ima/ima_modsig.c b/security/integrity/ima/ima_modsig.c index c412e31d1714..d106885cc495 100644 --- a/security/integrity/ima/ima_modsig.c +++ b/security/integrity/ima/ima_modsig.c @@ -91,8 +91,9 @@ int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len, hdr->pkcs7_msg = pkcs7_parse_message(buf + buf_len, sig_len); if (IS_ERR(hdr->pkcs7_msg)) { + rc = PTR_ERR(hdr->pkcs7_msg); kfree(hdr); - return PTR_ERR(hdr->pkcs7_msg); + return rc; } memcpy(hdr->raw_pkcs7, buf + buf_len, sig_len); From 2b86e3aaf993a3ea6c73dfcf86143061a40c62e6 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 28 Aug 2019 21:13:45 -0400 Subject: [PATCH 077/334] nfsd: eliminate an unnecessary acl size limit We're unnecessarily limiting the size of an ACL to less than what most filesystems will support. Some users do hit the limit and it's confusing and unnecessary. It still seems prudent to impose some limit on the number of ACEs the client gives us before passing it straight to kmalloc(). So, let's just limit it to the maximum number that would be possible given the amount of data left in the argument buffer. That will still leave one limit beyond whatever the filesystem imposes: the client and server negotiate a limit on the size of a request, which we have to respect. But we're no longer imposing any additional arbitrary limit. struct nfs4_ace is 20 bytes on my system and the maximum call size we'll negotiate is about a megabyte, so in practice this is limiting the allocation here to about a megabyte. Reported-by: "de Vandiere, Louis" Signed-off-by: J. Bruce Fields --- fs/nfsd/acl.h | 8 -------- fs/nfsd/nfs4xdr.c | 14 +++++++++++++- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h index 4cd7c69a6cb9..ba14d2f4b64f 100644 --- a/fs/nfsd/acl.h +++ b/fs/nfsd/acl.h @@ -39,14 +39,6 @@ struct nfs4_acl; struct svc_fh; struct svc_rqst; -/* - * Maximum ACL we'll accept from a client; chosen (somewhat - * arbitrarily) so that kmalloc'ing the ACL shouldn't require a - * high-order allocation. 
This allows 204 ACEs on x86_64: - */ -#define NFS4_ACL_MAX ((PAGE_SIZE - sizeof(struct nfs4_acl)) \ - / sizeof(struct nfs4_ace)) - int nfs4_acl_bytes(int entries); int nfs4_acl_get_whotype(char *, u32); __be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 565d2169902c..c1fc2641e3e7 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -204,6 +204,13 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) return p; } +static unsigned int compoundargs_bytes_left(struct nfsd4_compoundargs *argp) +{ + unsigned int this = (char *)argp->end - (char *)argp->p; + + return this + argp->pagelen; +} + static int zero_clientid(clientid_t *clid) { return (clid->cl_boot == 0) && (clid->cl_id == 0); @@ -348,7 +355,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, READ_BUF(4); len += 4; nace = be32_to_cpup(p++); - if (nace > NFS4_ACL_MAX) + if (nace > compoundargs_bytes_left(argp)/20) + /* + * Even with 4-byte names there wouldn't be + * space for that many aces; something fishy is + * going on: + */ return nfserr_fbig; *acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(nace)); From cbc0425d3dd370a6f0bf23589dc7b6955a53a9ce Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Sun, 25 Aug 2019 09:58:16 -0400 Subject: [PATCH 078/334] sefltest/ima: support appended signatures (modsig) In addition to the PE/COFF and IMA xattr signatures, the kexec kernel image can be signed with an appended signature, using the same scripts/sign-file tool that is used to sign kernel modules. This patch adds support for detecting a kernel image signed with an appended signature and updates the existing test messages appropriately. Reviewed-by: Petr Vorel Acked-by: Shuah Khan Reviewed-by: Thiago Jung Bauermann Reviewed-by: Jordan Hand (x86_64 QEMU) Tested-by: Jordan Hand (x86_64 QEMU) Signed-off-by: Mimi Zohar --- .../selftests/kexec/test_kexec_file_load.sh | 38 +++++++++++++++++-- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/kexec/test_kexec_file_load.sh b/tools/testing/selftests/kexec/test_kexec_file_load.sh index fa7c24e8eefb..2ff600388c30 100755 --- a/tools/testing/selftests/kexec/test_kexec_file_load.sh +++ b/tools/testing/selftests/kexec/test_kexec_file_load.sh @@ -37,11 +37,20 @@ is_ima_sig_required() # sequentially. As a result, a policy rule may be defined, but # might not necessarily be used. This test assumes if a policy # rule is specified, that is the intent. + + # First check for appended signature (modsig), then xattr if [ $ima_read_policy -eq 1 ]; then check_ima_policy "appraise" "func=KEXEC_KERNEL_CHECK" \ - "appraise_type=imasig" + "appraise_type=imasig|modsig" ret=$? - [ $ret -eq 1 ] && log_info "IMA signature required"; + if [ $ret -eq 1 ]; then + log_info "IMA or appended(modsig) signature required" + else + check_ima_policy "appraise" "func=KEXEC_KERNEL_CHECK" \ + "appraise_type=imasig" + ret=$? + [ $ret -eq 1 ] && log_info "IMA signature required"; + fi fi return $ret } @@ -84,6 +93,22 @@ check_for_imasig() return $ret } +# Return 1 for appended signature (modsig) found and 0 for not found. 
+check_for_modsig() +{ + local module_sig_string="~Module signature appended~" + local sig="$(tail --bytes $((${#module_sig_string} + 1)) $KERNEL_IMAGE)" + local ret=0 + + if [ "$sig" == "$module_sig_string" ]; then + ret=1 + log_info "kexec kernel image modsig signed" + else + log_info "kexec kernel image not modsig signed" + fi + return $ret +} + kexec_file_load_test() { local succeed_msg="kexec_file_load succeeded" @@ -98,7 +123,8 @@ kexec_file_load_test() # In secureboot mode with an architecture specific # policy, make sure either an IMA or PE signature exists. if [ $secureboot -eq 1 ] && [ $arch_policy -eq 1 ] && \ - [ $ima_signed -eq 0 ] && [ $pe_signed -eq 0 ]; then + [ $ima_signed -eq 0 ] && [ $pe_signed -eq 0 ] \ + && [ $ima_modsig -eq 0 ]; then log_fail "$succeed_msg (missing sig)" fi @@ -107,7 +133,8 @@ kexec_file_load_test() log_fail "$succeed_msg (missing PE sig)" fi - if [ $ima_sig_required -eq 1 ] && [ $ima_signed -eq 0 ]; then + if [ $ima_sig_required -eq 1 ] && [ $ima_signed -eq 0 ] \ + && [ $ima_modsig -eq 0 ]; then log_fail "$succeed_msg (missing IMA sig)" fi @@ -204,5 +231,8 @@ pe_signed=$? check_for_imasig ima_signed=$? +check_for_modsig +ima_modsig=$? + # Test loading the kernel image via kexec_file_load syscall kexec_file_load_test From fa5b57175364431245b006c2afcbf94dc2b15400 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 29 May 2019 11:53:43 -0500 Subject: [PATCH 079/334] ima: use struct_size() in kzalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct foo { int stuff; struct boo entry[]; }; instance = kzalloc(sizeof(struct foo) + count * sizeof(struct boo), GFP_KERNEL); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper: instance = kzalloc(struct_size(instance, entry, count), GFP_KERNEL); This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_template.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/security/integrity/ima/ima_template.c b/security/integrity/ima/ima_template.c index f5b950e0a955..6aa6408603e3 100644 --- a/security/integrity/ima/ima_template.c +++ b/security/integrity/ima/ima_template.c @@ -306,9 +306,8 @@ static int ima_restore_template_data(struct ima_template_desc *template_desc, int ret = 0; int i; - *entry = kzalloc(sizeof(**entry) + - template_desc->num_fields * sizeof(struct ima_field_data), - GFP_NOFS); + *entry = kzalloc(struct_size(*entry, template_data, + template_desc->num_fields), GFP_NOFS); if (!*entry) return -ENOMEM; From 2a7f0e53daf29ca6dc9fbe2a27158f13474ec1b5 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 29 Aug 2019 12:29:16 -0500 Subject: [PATCH 080/334] ima: ima_api: Use struct_size() in kzalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct ima_template_entry { ... 
struct ima_field_data template_data[0]; /* template related data */ }; instance = kzalloc(sizeof(struct ima_template_entry) + count * sizeof(struct ima_field_data), GFP_NOFS); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper: instance = kzalloc(struct_size(instance, entry, count), GFP_NOFS); This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c index 65224474675b..610759fe63b8 100644 --- a/security/integrity/ima/ima_api.c +++ b/security/integrity/ima/ima_api.c @@ -45,8 +45,8 @@ int ima_alloc_init_template(struct ima_event_data *event_data, else template_desc = ima_template_desc_current(); - *entry = kzalloc(sizeof(**entry) + template_desc->num_fields * - sizeof(struct ima_field_data), GFP_NOFS); + *entry = kzalloc(struct_size(*entry, template_data, + template_desc->num_fields), GFP_NOFS); if (!*entry) return -ENOMEM; From d098913a10f8ef8e6043765d7f2fa552527d9c42 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Thu, 5 Sep 2019 07:37:22 -0700 Subject: [PATCH 081/334] bus: ti-sysc: Fix clock handling for no-idle quirks NFSroot can fail on dra7 when cpsw is probed using ti-sysc interconnect target module driver as reported by Keerthy. Device clocks and the interconnect target module may or may not be enabled by the bootloader on init, but we currently assume the clocks and module are on from the bootloader for "ti,no-idle" and "ti,no-idle-on-init" quirks as reported by Grygorii Strashko. Let's fix the issue by always enabling clocks init, and never disable them for "ti,no-idle" quirk. For "ti,no-idle-on-init" quirk, we must decrement the usage count later on to allow PM runtime to idle the module if requested. Fixes: 1a5cd7c23cc5 ("bus: ti-sysc: Enable all clocks directly during init to read revision") Cc: Keerthy Cc: Vignesh Raghavendra Reported-by: Keerthy Reported-by: Grygorii Strashko Reviewed-by: Grygorii Strashko Signed-off-by: Tony Lindgren --- drivers/bus/ti-sysc.c | 48 +++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index 2db474ab4c6b..da88de487792 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -1630,17 +1630,19 @@ static int sysc_init_module(struct sysc *ddata) if (error) return error; - if (manage_clocks) { - sysc_clkdm_deny_idle(ddata); + sysc_clkdm_deny_idle(ddata); - error = sysc_enable_opt_clocks(ddata); - if (error) - return error; + /* + * Always enable clocks. The bootloader may or may not have enabled + * the related clocks. 
+ */ + error = sysc_enable_opt_clocks(ddata); + if (error) + return error; - error = sysc_enable_main_clocks(ddata); - if (error) - goto err_opt_clocks; - } + error = sysc_enable_main_clocks(ddata); + if (error) + goto err_opt_clocks; if (!(ddata->cfg.quirks & SYSC_QUIRK_NO_RESET_ON_INIT)) { error = sysc_rstctrl_reset_deassert(ddata, true); @@ -1658,7 +1660,7 @@ static int sysc_init_module(struct sysc *ddata) goto err_main_clocks; } - if (!ddata->legacy_mode && manage_clocks) { + if (!ddata->legacy_mode) { error = sysc_enable_module(ddata->dev); if (error) goto err_main_clocks; @@ -1675,6 +1677,7 @@ err_main_clocks: if (manage_clocks) sysc_disable_main_clocks(ddata); err_opt_clocks: + /* No re-enable of clockdomain autoidle to prevent module autoidle */ if (manage_clocks) { sysc_disable_opt_clocks(ddata); sysc_clkdm_allow_idle(ddata); @@ -2355,6 +2358,28 @@ static void ti_sysc_idle(struct work_struct *work) ddata = container_of(work, struct sysc, idle_work.work); + /* + * One time decrement of clock usage counts if left on from init. + * Note that we disable opt clocks unconditionally in this case + * as they are enabled unconditionally during init without + * considering sysc_opt_clks_needed() at that point. + */ + if (ddata->cfg.quirks & (SYSC_QUIRK_NO_IDLE | + SYSC_QUIRK_NO_IDLE_ON_INIT)) { + sysc_clkdm_deny_idle(ddata); + sysc_disable_main_clocks(ddata); + sysc_disable_opt_clocks(ddata); + sysc_clkdm_allow_idle(ddata); + } + + /* Keep permanent PM runtime usage count for SYSC_QUIRK_NO_IDLE */ + if (ddata->cfg.quirks & SYSC_QUIRK_NO_IDLE) + return; + + /* + * Decrement PM runtime usage count for SYSC_QUIRK_NO_IDLE_ON_INIT + * and SYSC_QUIRK_NO_RESET_ON_INIT + */ if (pm_runtime_active(ddata->dev)) pm_runtime_put_sync(ddata->dev); } @@ -2439,7 +2464,8 @@ static int sysc_probe(struct platform_device *pdev) INIT_DELAYED_WORK(&ddata->idle_work, ti_sysc_idle); /* At least earlycon won't survive without deferred idle */ - if (ddata->cfg.quirks & (SYSC_QUIRK_NO_IDLE_ON_INIT | + if (ddata->cfg.quirks & (SYSC_QUIRK_NO_IDLE | + SYSC_QUIRK_NO_IDLE_ON_INIT | SYSC_QUIRK_NO_RESET_ON_INIT)) { schedule_delayed_work(&ddata->idle_work, 3000); } else { From 2783d0638a51e8dba6319d9f4a2b445995a4fad1 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Thu, 5 Sep 2019 13:01:29 -0700 Subject: [PATCH 082/334] bus: ti-sysc: Fix handling of invalid clocks We can currently get "Unable to handle kernel paging request at virtual address" for invalid clocks that have a dts node but no driver: (__clk_get_hw) from [] (ti_sysc_find_one_clockdomain+0x18/0x34) (ti_sysc_find_one_clockdomain) from [] (ti_sysc_clkdm_init+0x34/0xdc) (ti_sysc_clkdm_init) from [] (sysc_probe+0xa50/0x10e8) (sysc_probe) from [] (platform_drv_probe+0x58/0xa8) Let's add IS_ERR checks to ti_sysc_clkdm_init(). And let's start treating clk_get() with -ENOENT as a proper error. If the clock name is specified in device tree we must succeed with clk_get() to continue. For modules with no clock names specified in device tree we will just ignore the clocks.
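The convention the fix relies on, as a minimal sketch (example_get_fck and the "fck" name are illustrative, not from the patch): clock getters return either a usable pointer or an ERR_PTR-encoded error, so IS_ERR() is the only correct validity test, and a lookup failure for an explicitly named clock is now fatal:

static int example_get_fck(struct device *dev, struct clk **out)
{
	struct clk *fck = devm_clk_get(dev, "fck");

	if (IS_ERR(fck))		/* also catches -ENOENT */
		return PTR_ERR(fck);	/* a named clock must resolve */

	*out = fck;
	return 0;
}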
Fixes: 2b2f7def058a ("bus: ti-sysc: Add support for missing clockdomain handling") Acked-by: Roger Quadros Tested-by: Keerthy Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/pdata-quirks.c | 4 ++-- drivers/bus/ti-sysc.c | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/arm/mach-omap2/pdata-quirks.c b/arch/arm/mach-omap2/pdata-quirks.c index 6c6f8fce854e..d942a3357090 100644 --- a/arch/arm/mach-omap2/pdata-quirks.c +++ b/arch/arm/mach-omap2/pdata-quirks.c @@ -491,11 +491,11 @@ static int ti_sysc_clkdm_init(struct device *dev, struct clk *fck, struct clk *ick, struct ti_sysc_cookie *cookie) { - if (fck) + if (!IS_ERR(fck)) cookie->clkdm = ti_sysc_find_one_clockdomain(fck); if (cookie->clkdm) return 0; - if (ick) + if (!IS_ERR(ick)) cookie->clkdm = ti_sysc_find_one_clockdomain(ick); if (cookie->clkdm) return 0; diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index da88de487792..24583d82b584 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -280,9 +280,6 @@ static int sysc_get_one_clock(struct sysc *ddata, const char *name) ddata->clocks[index] = devm_clk_get(ddata->dev, name); if (IS_ERR(ddata->clocks[index])) { - if (PTR_ERR(ddata->clocks[index]) == -ENOENT) - return 0; - dev_err(ddata->dev, "clock get error for %s: %li\n", name, PTR_ERR(ddata->clocks[index])); @@ -357,7 +354,7 @@ static int sysc_get_clocks(struct sysc *ddata) continue; error = sysc_get_one_clock(ddata, name); - if (error && error != -ENOENT) + if (error) return error; } From 4957eccf979b025286b39388fd1a60cde601a10a Mon Sep 17 00:00:00 2001 From: Adam Ford Date: Wed, 28 Aug 2019 13:33:49 -0500 Subject: [PATCH 083/334] ARM: omap2plus_defconfig: Fix missing video When the panel-dpi driver was removed, the simple-panels driver was never enabled, so anyone who used the panel-dpi driver lost video, and those who used it inconjunction with simple-panels would have to manually enable CONFIG_DRM_PANEL_SIMPLE. This patch makes CONFIG_DRM_PANEL_SIMPLE a module in the same way the deprecated panel-dpi was. Fixes: 8bf4b1621178 ("drm/omap: Remove panel-dpi driver") Signed-off-by: Adam Ford Signed-off-by: Tony Lindgren --- arch/arm/configs/omap2plus_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index c7bf9c493646..64eb896907bf 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -363,6 +363,7 @@ CONFIG_DRM_OMAP_PANEL_TPO_TD028TTEC1=m CONFIG_DRM_OMAP_PANEL_TPO_TD043MTEA1=m CONFIG_DRM_OMAP_PANEL_NEC_NL8048HL11=m CONFIG_DRM_TILCDC=m +CONFIG_DRM_PANEL_SIMPLE=m CONFIG_FB=y CONFIG_FIRMWARE_EDID=y CONFIG_FB_MODE_HELPERS=y From f9f5518a38684e031d913f40482721ff553f5ba2 Mon Sep 17 00:00:00 2001 From: Adam Ford Date: Wed, 28 Aug 2019 13:33:50 -0500 Subject: [PATCH 084/334] ARM: dts: logicpd-torpedo-baseboard: Fix missing video A previous commit removed the panel-dpi driver, which made the Torpedo video stop working because it relied on the dpi driver for setting video timings. Now that the simple-panel driver is available in omap2plus, this patch migrates the Torpedo dev kits to use a similar panel and remove the manual timing requirements. 
Fixes: 8bf4b1621178 ("drm/omap: Remove panel-dpi driver") Signed-off-by: Adam Ford Signed-off-by: Tony Lindgren --- .../boot/dts/logicpd-torpedo-baseboard.dtsi | 39 ++++--------------- 1 file changed, 7 insertions(+), 32 deletions(-) diff --git a/arch/arm/boot/dts/logicpd-torpedo-baseboard.dtsi b/arch/arm/boot/dts/logicpd-torpedo-baseboard.dtsi index 642e809e757a..449cc7616da6 100644 --- a/arch/arm/boot/dts/logicpd-torpedo-baseboard.dtsi +++ b/arch/arm/boot/dts/logicpd-torpedo-baseboard.dtsi @@ -108,7 +108,6 @@ &dss { status = "ok"; vdds_dsi-supply = <&vpll2>; - vdda_video-supply = <&video_reg>; pinctrl-names = "default"; pinctrl-0 = <&dss_dpi_pins1>; port { @@ -124,44 +123,20 @@ display0 = &lcd0; }; - video_reg: video_reg { + lcd0: display { + /* This isn't the exact LCD, but the timings meet spec */ + /* To make it work, set CONFIG_OMAP2_DSS_MIN_FCK_PER_PCK=4 */ + compatible = "newhaven,nhd-4.3-480272ef-atxl"; + label = "15"; pinctrl-names = "default"; pinctrl-0 = <&panel_pwr_pins>; - compatible = "regulator-fixed"; - regulator-name = "fixed-supply"; - regulator-min-microvolt = <3300000>; - regulator-max-microvolt = <3300000>; - gpio = <&gpio5 27 GPIO_ACTIVE_HIGH>; /* gpio155, lcd INI */ - }; - - lcd0: display { - compatible = "panel-dpi"; - label = "15"; - status = "okay"; - /* default-on; */ - pinctrl-names = "default"; - + backlight = <&bl>; + enable-gpios = <&gpio5 27 GPIO_ACTIVE_HIGH>; port { lcd_in: endpoint { remote-endpoint = <&dpi_out>; }; }; - - panel-timing { - clock-frequency = <9000000>; - hactive = <480>; - vactive = <272>; - hfront-porch = <3>; - hback-porch = <2>; - hsync-len = <42>; - vback-porch = <3>; - vfront-porch = <4>; - vsync-len = <11>; - hsync-active = <0>; - vsync-active = <0>; - de-active = <1>; - pixelclk-active = <1>; - }; }; bl: backlight { From 24cf23276a54dd2825d3e3965c1b1b453e2a113d Mon Sep 17 00:00:00 2001 From: Adam Ford Date: Wed, 28 Aug 2019 13:33:51 -0500 Subject: [PATCH 085/334] ARM: dts: am3517-evm: Fix missing video A previous commit removed the panel-dpi driver, which made the video on the AM3517-evm stop working because it relied on the dpi driver for setting video timings. Now that the simple-panel driver is available in omap2plus, this patch migrates the am3517-evm to use a similar panel and remove the manual timing requirements. 
Fixes: 8bf4b1621178 ("drm/omap: Remove panel-dpi driver") Signed-off-by: Adam Ford Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/am3517-evm.dts | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/arch/arm/boot/dts/am3517-evm.dts b/arch/arm/boot/dts/am3517-evm.dts index ebfe28c2f544..a1fd3e63e86e 100644 --- a/arch/arm/boot/dts/am3517-evm.dts +++ b/arch/arm/boot/dts/am3517-evm.dts @@ -124,10 +124,11 @@ }; lcd0: display@0 { - compatible = "panel-dpi"; + /* This isn't the exact LCD, but the timings meet spec */ + /* To make it work, set CONFIG_OMAP2_DSS_MIN_FCK_PER_PCK=4 */ + compatible = "newhaven,nhd-4.3-480272ef-atxl"; label = "15"; - status = "okay"; - pinctrl-names = "default"; + backlight = <&bl>; enable-gpios = <&gpio6 16 GPIO_ACTIVE_HIGH>; /* gpio176, lcd INI */ vcc-supply = <&vdd_io_reg>; @@ -136,22 +137,6 @@ remote-endpoint = <&dpi_out>; }; }; - - panel-timing { - clock-frequency = <9000000>; - hactive = <480>; - vactive = <272>; - hfront-porch = <3>; - hback-porch = <2>; - hsync-len = <42>; - vback-porch = <3>; - vfront-porch = <4>; - vsync-len = <11>; - hsync-active = <0>; - vsync-active = <0>; - de-active = <1>; - pixelclk-active = <1>; - }; }; bl: backlight { From a932b77b4d1939ad173f18be87da409427fb705c Mon Sep 17 00:00:00 2001 From: Adam Ford Date: Tue, 20 Aug 2019 07:17:27 -0500 Subject: [PATCH 086/334] ARM: dts: logicpd-som-lv: Fix i2c2 and i2c3 Pin mux When the pinmux configuration was added, it was accidentally placed into the omap3_pmx_wkup node when it should have been placed into the omap3_pmx_core. This error was accidentally propagated to stable by me when I blindly requested the pull after seeing I2C issues without actually reviewing the content of the pinout. Since the bootloader previously muxed these pins correctly, the error remained hidden. This patch moves the i2c2_pins and i2c3_pins to the correct node, which should eliminate i2c bus errors and timeouts due to the fact that the bootloader uses the same device tree that no longer properly assigns these pins. 
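One way to see why node placement matters: the IOPAD macros encode only an offset, and pinctrl-single applies that offset relative to the register base of whichever pinmux node the entry lives under. A rough sketch of the mechanism, assuming definitions along the lines of include/dt-bindings/pinctrl/omap.h (the base offsets below are illustrative assumptions, not verified values):

/* offset is computed relative to a particular controller's base */
#define OMAP_IOPAD_OFFSET(pa, offset)	(((pa) & 0xffff) - (offset))
#define OMAP3_CORE1_IOPAD(pa, val)	OMAP_IOPAD_OFFSET((pa), 0x2030) (val)
#define OMAP3_WKUP_IOPAD(pa, val)	OMAP_IOPAD_OFFSET((pa), 0x2a00) (val)

/*
 * Pad 0x21be resolves to a small offset from the core pinmux base.
 * Listed under omap3_pmx_wkup, that same offset is applied against the
 * wkup controller's base and lands on an unrelated register, which is
 * how the i2c2/i2c3 lines ended up unmuxed.
 */
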
Fixes: 5fe3c0fa0d54 ("ARM: dts: Add pinmuxing for i2c2 and i2c3 for LogicPD SOM-LV") #4.9+ Signed-off-by: Adam Ford Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/logicpd-som-lv.dtsi | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/arm/boot/dts/logicpd-som-lv.dtsi b/arch/arm/boot/dts/logicpd-som-lv.dtsi index 5563ee54c960..b56524cc7fe2 100644 --- a/arch/arm/boot/dts/logicpd-som-lv.dtsi +++ b/arch/arm/boot/dts/logicpd-som-lv.dtsi @@ -228,6 +228,20 @@ >; }; + i2c2_pins: pinmux_i2c2_pins { + pinctrl-single,pins = < + OMAP3_CORE1_IOPAD(0x21be, PIN_INPUT | MUX_MODE0) /* i2c2_scl */ + OMAP3_CORE1_IOPAD(0x21c0, PIN_INPUT | MUX_MODE0) /* i2c2_sda */ + >; + }; + + i2c3_pins: pinmux_i2c3_pins { + pinctrl-single,pins = < + OMAP3_CORE1_IOPAD(0x21c2, PIN_INPUT | MUX_MODE0) /* i2c3_scl */ + OMAP3_CORE1_IOPAD(0x21c4, PIN_INPUT | MUX_MODE0) /* i2c3_sda */ + >; + }; + tsc2004_pins: pinmux_tsc2004_pins { pinctrl-single,pins = < OMAP3_CORE1_IOPAD(0x2186, PIN_INPUT | MUX_MODE4) /* mcbsp4_dr.gpio_153 */ @@ -249,18 +263,6 @@ OMAP3_WKUP_IOPAD(0x2a0c, PIN_OUTPUT | MUX_MODE4) /* sys_boot1.gpio_3 */ >; }; - i2c2_pins: pinmux_i2c2_pins { - pinctrl-single,pins = < - OMAP3_CORE1_IOPAD(0x21be, PIN_INPUT | MUX_MODE0) /* i2c2_scl */ - OMAP3_CORE1_IOPAD(0x21c0, PIN_INPUT | MUX_MODE0) /* i2c2_sda */ - >; - }; - i2c3_pins: pinmux_i2c3_pins { - pinctrl-single,pins = < - OMAP3_CORE1_IOPAD(0x21c2, PIN_INPUT | MUX_MODE0) /* i2c3_scl */ - OMAP3_CORE1_IOPAD(0x21c4, PIN_INPUT | MUX_MODE0) /* i2c3_sda */ - >; - }; }; &omap3_pmx_core2 { From a4c8723a162e6244fb01944fbf446750575dba59 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Fri, 6 Sep 2019 12:57:46 -0700 Subject: [PATCH 087/334] bus: ti-sysc: Remove unpaired sysc_clkdm_deny_idle() Commit d098913a10f8 ("bus: ti-sysc: Fix clock handling for no-idle quirks") fixed handling for no-idle quirk modules that are not enabled by the bootloader. But it also introduced unpaired clockdomain calls that prevent the system from idling. That's because clkdm_allow_idle_nolock() and clkdm_deny_idle_nolock() maintain a usage count in clkdm->forcewake_count. Let's drop the unpaired sysc_clkdm_deny_idle() call to fix idling of devices. Fixes: d098913a10f8 ("bus: ti-sysc: Fix clock handling for no-idle quirks") Cc: Keerthy Cc: Vignesh Raghavendra Signed-off-by: Tony Lindgren --- drivers/bus/ti-sysc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index 24583d82b584..364ee498feb3 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -2363,7 +2363,6 @@ static void ti_sysc_idle(struct work_struct *work) */ if (ddata->cfg.quirks & (SYSC_QUIRK_NO_IDLE | SYSC_QUIRK_NO_IDLE_ON_INIT)) { - sysc_clkdm_deny_idle(ddata); sysc_disable_main_clocks(ddata); sysc_disable_opt_clocks(ddata); sysc_clkdm_allow_idle(ddata); From f8a9bc623a6d178f7ecd40fb7db37eb954b6929c Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Tue, 10 Sep 2019 03:03:17 -0700 Subject: [PATCH 088/334] security: constify some arrays in lockdown LSM No reason for these not to be const. 
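As background for the constification, a standalone C illustration of the difference (not kernel code): `const char *` protects only the strings pointed to, while `const char *const` also makes the pointer table itself read-only, so the compiler can place the whole array in .rodata.

static const char *mutable_table[] = { "none", "integrity" };
static const char *const locked_table[] = { "none", "integrity" };

void demo(void)
{
	mutable_table[0] = "oops";	/* allowed: pointer slots are writable */
	/* locked_table[0] = "oops"; */	/* rejected at compile time */
}
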
Signed-off-by: Matthew Garrett Suggested-by: David Howells Signed-off-by: James Morris --- security/lockdown/lockdown.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 0068cec77c05..8a10b43daf74 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -16,7 +16,7 @@ static enum lockdown_reason kernel_locked_down; -static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { +static const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_NONE] = "none", [LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading", [LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port", @@ -40,7 +40,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; -static enum lockdown_reason lockdown_levels[] = {LOCKDOWN_NONE, +static const enum lockdown_reason lockdown_levels[] = {LOCKDOWN_NONE, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX}; From 45893a0abee6b5fd52994a3a1095735aeaec472b Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Tue, 10 Sep 2019 03:03:18 -0700 Subject: [PATCH 089/334] kexec: Fix file verification on S390 I accidentally typoed this #ifdef, so verification would always be disabled. Signed-off-by: Matthew Garrett Reported-by: Philipp Rudo Signed-off-by: James Morris --- arch/s390/kernel/kexec_elf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c index 9b4f37a4edf1..9da6fa30c447 100644 --- a/arch/s390/kernel/kexec_elf.c +++ b/arch/s390/kernel/kexec_elf.c @@ -130,7 +130,7 @@ static int s390_elf_probe(const char *buf, unsigned long len) const struct kexec_file_ops s390_kexec_elf_ops = { .probe = s390_elf_probe, .load = s390_elf_load, -#ifdef CONFIG_KEXEC__SIG +#ifdef CONFIG_KEXEC_SIG .verify_sig = s390_verify_sig, #endif /* CONFIG_KEXEC_SIG */ }; From 5e113224c17e2fb156b785ddbbc48a0209fddb0c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Sep 2019 13:02:55 -0400 Subject: [PATCH 090/334] nfsd: nfsd_file cache entries should be per net namespace Ensure that we can safely clear out the file cache entries when the nfs server is shut down on a container. Otherwise, the file cache may end up pinning the mounts. Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/export.c | 2 +- fs/nfsd/filecache.c | 33 +++++++++++++++++++++------------ fs/nfsd/filecache.h | 3 ++- fs/nfsd/nfssvc.c | 1 + 4 files changed, 25 insertions(+), 14 deletions(-) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 052fac64b578..15422c951fd1 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -240,7 +240,7 @@ static void expkey_flush(void) * destroyed while we're in the middle of flushing. 
*/ mutex_lock(&nfsd_mutex); - nfsd_file_cache_purge(); + nfsd_file_cache_purge(current->nsproxy->net_ns); mutex_unlock(&nfsd_mutex); } diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 2e1a972231e5..da9e790a055e 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -16,6 +16,7 @@ #include "vfs.h" #include "nfsd.h" #include "nfsfh.h" +#include "netns.h" #include "filecache.h" #include "trace.h" @@ -167,7 +168,8 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf) } static struct nfsd_file * -nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval) +nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval, + struct net *net) { struct nfsd_file *nf; @@ -177,6 +179,7 @@ nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval) INIT_LIST_HEAD(&nf->nf_lru); nf->nf_file = NULL; nf->nf_cred = get_current_cred(); + nf->nf_net = net; nf->nf_flags = 0; nf->nf_inode = inode; nf->nf_hashval = hashval; @@ -607,10 +610,11 @@ out_err: * Note this can deadlock with nfsd_file_lru_cb. */ void -nfsd_file_cache_purge(void) +nfsd_file_cache_purge(struct net *net) { unsigned int i; struct nfsd_file *nf; + struct hlist_node *next; LIST_HEAD(dispose); bool del; @@ -618,10 +622,12 @@ nfsd_file_cache_purge(void) return; for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) { - spin_lock(&nfsd_file_hashtbl[i].nfb_lock); - while(!hlist_empty(&nfsd_file_hashtbl[i].nfb_head)) { - nf = hlist_entry(nfsd_file_hashtbl[i].nfb_head.first, - struct nfsd_file, nf_node); + struct nfsd_fcache_bucket *nfb = &nfsd_file_hashtbl[i]; + + spin_lock(&nfb->nfb_lock); + hlist_for_each_entry_safe(nf, next, &nfb->nfb_head, nf_node) { + if (net && nf->nf_net != net) + continue; del = nfsd_file_unhash_and_release_locked(nf, &dispose); /* @@ -630,7 +636,7 @@ nfsd_file_cache_purge(void) */ WARN_ON_ONCE(!del); } - spin_unlock(&nfsd_file_hashtbl[i].nfb_lock); + spin_unlock(&nfb->nfb_lock); nfsd_file_dispose_list(&dispose); } } @@ -649,7 +655,7 @@ nfsd_file_cache_shutdown(void) * calling nfsd_file_cache_purge */ cancel_delayed_work_sync(&nfsd_filecache_laundrette); - nfsd_file_cache_purge(); + nfsd_file_cache_purge(NULL); list_lru_destroy(&nfsd_file_lru); rcu_barrier(); fsnotify_put_group(nfsd_file_fsnotify_group); @@ -685,7 +691,7 @@ nfsd_match_cred(const struct cred *c1, const struct cred *c2) static struct nfsd_file * nfsd_file_find_locked(struct inode *inode, unsigned int may_flags, - unsigned int hashval) + unsigned int hashval, struct net *net) { struct nfsd_file *nf; unsigned char need = may_flags & NFSD_FILE_MAY_MASK; @@ -696,6 +702,8 @@ nfsd_file_find_locked(struct inode *inode, unsigned int may_flags, continue; if (nf->nf_inode != inode) continue; + if (nf->nf_net != net) + continue; if (!nfsd_match_cred(nf->nf_cred, current_cred())) continue; if (nfsd_file_get(nf) != NULL) @@ -738,6 +746,7 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **pnf) { __be32 status; + struct net *net = SVC_NET(rqstp); struct nfsd_file *nf, *new; struct inode *inode; unsigned int hashval; @@ -752,12 +761,12 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS); retry: rcu_read_lock(); - nf = nfsd_file_find_locked(inode, may_flags, hashval); + nf = nfsd_file_find_locked(inode, may_flags, hashval, net); rcu_read_unlock(); if (nf) goto wait_for_construction; - new = nfsd_file_alloc(inode, may_flags, hashval); + new = nfsd_file_alloc(inode, may_flags, hashval, 
net); if (!new) { trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags, NULL, nfserr_jukebox); @@ -765,7 +774,7 @@ retry: } spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); - nf = nfsd_file_find_locked(inode, may_flags, hashval); + nf = nfsd_file_find_locked(inode, may_flags, hashval, net); if (nf == NULL) goto open_file; spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 0c0c67166b87..851d9abf54c2 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -34,6 +34,7 @@ struct nfsd_file { struct rcu_head nf_rcu; struct file *nf_file; const struct cred *nf_cred; + struct net *nf_net; #define NFSD_FILE_HASHED (0) #define NFSD_FILE_PENDING (1) #define NFSD_FILE_BREAK_READ (2) @@ -48,7 +49,7 @@ struct nfsd_file { }; int nfsd_file_cache_init(void); -void nfsd_file_cache_purge(void); +void nfsd_file_cache_purge(struct net *); void nfsd_file_cache_shutdown(void); void nfsd_file_put(struct nfsd_file *nf); struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index d02712ca2685..b944553c6927 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -387,6 +387,7 @@ static void nfsd_shutdown_net(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); + nfsd_file_cache_purge(net); nfs4_state_shutdown_net(net); if (nn->lockd_up) { lockd_down(net); From 27c438f53e79b81dc8805a81f6cd74824ba57290 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Sep 2019 13:02:56 -0400 Subject: [PATCH 091/334] nfsd: Support the server resetting the boot verifier Add support to allow the server to reset the boot verifier in order to force clients to resend I/O after a timeout failure. Signed-off-by: Trond Myklebust Signed-off-by: Lance Shelton Signed-off-by: J. Bruce Fields --- fs/nfsd/netns.h | 4 ++++ fs/nfsd/nfs3xdr.c | 13 +++++++++---- fs/nfsd/nfs4proc.c | 14 ++++---------- fs/nfsd/nfsctl.c | 1 + fs/nfsd/nfssvc.c | 31 ++++++++++++++++++++++++++++++- 5 files changed, 48 insertions(+), 15 deletions(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index bdfe5bcb3dcd..9a4ef815fb8c 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -104,6 +104,7 @@ struct nfsd_net { /* Time of server startup */ struct timespec64 nfssvc_boot; + seqlock_t boot_lock; /* * Max number of connections this nfsd container will allow. 
Defaults @@ -179,4 +180,7 @@ struct nfsd_net { extern void nfsd_netns_free_versions(struct nfsd_net *nn); extern unsigned int nfsd_net_id; + +void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn); +void nfsd_reset_boot_verifier(struct nfsd_net *nn); #endif /* __NFSD_NETNS_H__ */ diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index fcf31822c74c..86e5658651f1 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -27,6 +27,7 @@ static u32 nfs3_ftypes[] = { NF3SOCK, NF3BAD, NF3LNK, NF3BAD, }; + /* * XDR functions for basic NFS types */ @@ -751,14 +752,16 @@ nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_writeres *resp = rqstp->rq_resp; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + __be32 verf[2]; p = encode_wcc_data(rqstp, p, &resp->fh); if (resp->status == 0) { *p++ = htonl(resp->count); *p++ = htonl(resp->committed); /* unique identifier, y2038 overflow can be ignored */ - *p++ = htonl((u32)nn->nfssvc_boot.tv_sec); - *p++ = htonl(nn->nfssvc_boot.tv_nsec); + nfsd_copy_boot_verifier(verf, nn); + *p++ = verf[0]; + *p++ = verf[1]; } return xdr_ressize_check(rqstp, p); } @@ -1125,13 +1128,15 @@ nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_commitres *resp = rqstp->rq_resp; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + __be32 verf[2]; p = encode_wcc_data(rqstp, p, &resp->fh); /* Write verifier */ if (resp->status == 0) { /* unique identifier, y2038 overflow can be ignored */ - *p++ = htonl((u32)nn->nfssvc_boot.tv_sec); - *p++ = htonl(nn->nfssvc_boot.tv_nsec); + nfsd_copy_boot_verifier(verf, nn); + *p++ = verf[0]; + *p++ = verf[1]; } return xdr_ressize_check(rqstp, p); } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index cb51893ec1cd..4e3e77b76411 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -568,17 +568,11 @@ nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net) { - __be32 verf[2]; - struct nfsd_net *nn = net_generic(net, nfsd_net_id); + __be32 *verf = (__be32 *)verifier->data; - /* - * This is opaque to client, so no need to byte-swap. Use - * __force to keep sparse happy. y2038 time_t overflow is - * irrelevant in this usage. - */ - verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec; - verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec; - memcpy(verifier->data, verf, sizeof(verifier->data)); + BUILD_BUG_ON(2*sizeof(*verf) != sizeof(verifier->data)); + + nfsd_copy_boot_verifier(verf, net_generic(net, nfsd_net_id)); } static __be32 diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 3cf4f6aa48d6..33cbe2e5d937 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1477,6 +1477,7 @@ static __net_init int nfsd_init_net(struct net *net) atomic_set(&nn->ntf_refcnt, 0); init_waitqueue_head(&nn->ntf_wq); + seqlock_init(&nn->boot_lock); mnt = vfs_kern_mount(&nfsd_fs_type, SB_KERNMOUNT, "nfsd", NULL); if (IS_ERR(mnt)) { diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index b944553c6927..3caaf5675259 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -344,6 +344,35 @@ static bool nfsd_needs_lockd(struct nfsd_net *nn) return nfsd_vers(nn, 2, NFSD_TEST) || nfsd_vers(nn, 3, NFSD_TEST); } +void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn) +{ + int seq = 0; + + do { + read_seqbegin_or_lock(&nn->boot_lock, &seq); + /* + * This is opaque to client, so no need to byte-swap. Use + * __force to keep sparse happy. 
y2038 time_t overflow is + * irrelevant in this usage + */ + verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec; + verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec; + } while (need_seqretry(&nn->boot_lock, seq)); + done_seqretry(&nn->boot_lock, seq); +} + +void nfsd_reset_boot_verifier_locked(struct nfsd_net *nn) +{ + ktime_get_real_ts64(&nn->nfssvc_boot); +} + +void nfsd_reset_boot_verifier(struct nfsd_net *nn) +{ + write_seqlock(&nn->boot_lock); + nfsd_reset_boot_verifier_locked(nn); + write_sequnlock(&nn->boot_lock); +} + static int nfsd_startup_net(int nrservs, struct net *net, const struct cred *cred) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -596,7 +625,7 @@ int nfsd_create_serv(struct net *net) #endif } atomic_inc(&nn->ntf_refcnt); - ktime_get_real_ts64(&nn->nfssvc_boot); /* record boot time */ + nfsd_reset_boot_verifier(nn); return 0; } From 055b24a8f23090043b2ce0c30f03e12c5f2c9e72 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Sep 2019 13:02:57 -0400 Subject: [PATCH 092/334] nfsd: Don't garbage collect files that might contain write errors If a file may contain unstable writes that can error out, then we want to avoid garbage collecting the struct nfsd_file that may be tracking those errors. So in the garbage collector, we try to avoid collecting files that aren't clean. Furthermore, we avoid immediately kicking off the garbage collector in the case where the reference drops to zero for the case where there is a write error that is being tracked. If the file is unhashed while an error is pending, then declare a reboot, to ensure the client resends any unstable writes. Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/filecache.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index da9e790a055e..ef55e9b1cd4e 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -215,6 +215,36 @@ nfsd_file_free(struct nfsd_file *nf) return flush; } +static bool +nfsd_file_check_writeback(struct nfsd_file *nf) +{ + struct file *file = nf->nf_file; + struct address_space *mapping; + + if (!file || !(file->f_mode & FMODE_WRITE)) + return false; + mapping = file->f_mapping; + return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) || + mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); +} + +static int +nfsd_file_check_write_error(struct nfsd_file *nf) +{ + struct file *file = nf->nf_file; + + if (!file || !(file->f_mode & FMODE_WRITE)) + return 0; + return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err)); +} + +static bool +nfsd_file_in_use(struct nfsd_file *nf) +{ + return nfsd_file_check_writeback(nf) || + nfsd_file_check_write_error(nf); +} + static void nfsd_file_do_unhash(struct nfsd_file *nf) { @@ -222,6 +252,8 @@ nfsd_file_do_unhash(struct nfsd_file *nf) trace_nfsd_file_unhash(nf); + if (nfsd_file_check_write_error(nf)) + nfsd_reset_boot_verifier(net_generic(nf->nf_net, nfsd_net_id)); --nfsd_file_hashtbl[nf->nf_hashval].nfb_count; hlist_del_rcu(&nf->nf_node); if (!list_empty(&nf->nf_lru)) @@ -276,9 +308,10 @@ void nfsd_file_put(struct nfsd_file *nf) { bool is_hashed = test_bit(NFSD_FILE_HASHED, &nf->nf_flags) != 0; + bool unused = !nfsd_file_in_use(nf); set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); - if (nfsd_file_put_noref(nf) == 1 && is_hashed) + if (nfsd_file_put_noref(nf) == 1 && is_hashed && unused) nfsd_file_schedule_laundrette(NFSD_FILE_LAUNDRETTE_MAY_FLUSH); } @@ -344,6 +377,14 @@ nfsd_file_lru_cb(struct list_head 
*item, struct list_lru_one *lru, */ if (atomic_read(&nf->nf_ref) > 1) goto out_skip; + + /* + * Don't throw out files that are still undergoing I/O or + * that have uncleared errors pending. + */ + if (nfsd_file_check_writeback(nf)) + goto out_skip; + if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) goto out_rescan; From bbf2f098838aa86cee240a7a11486e0601d56a3f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Sep 2019 13:02:58 -0400 Subject: [PATCH 093/334] nfsd: Reset the boot verifier on all write I/O errors If multiple clients are writing to the same file, then due to the fact we share a single file descriptor between all NFSv3 clients writing to the file, we have a situation where clients can miss the fact that their file data was not persisted. While this should be rare, it could cause silent data loss in situations where multiple clients are using NLM locking or O_DIRECT to write to the same file. Unfortunately, the stateless nature of NFSv3 and the fact that we can only identify clients by their IP address means that we cannot trivially cache errors; we would not know when it is safe to release them from the cache. So the solution is to declare a reboot. We understand that this should be a rare occurrence, since disks are usually stable. The most frequent occurrence is likely to be ENOSPC, at which point all writes to the given filesystem are likely to fail anyway. So the expectation is that clients will be forced to retry their writes until they hit the fatal error. Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 84e87772c2b8..0867d5319fdb 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -958,8 +958,12 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, nfsdstats.io_write += *cnt; fsnotify_modify(file); - if (stable && use_wgather) + if (stable && use_wgather) { host_err = wait_for_concurrent_writes(file); + if (host_err < 0) + nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp), + nfsd_net_id)); + } out_nfserr: if (host_err >= 0) { @@ -1063,10 +1067,17 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, if (EX_ISSYNC(fhp->fh_export)) { int err2 = vfs_fsync_range(nf->nf_file, offset, end, 0); - if (err2 != -EINVAL) - err = nfserrno(err2); - else + switch (err2) { + case 0: + break; + case -EINVAL: err = nfserr_notsupp; + break; + default: + err = nfserrno(err2); + nfsd_reset_boot_verifier(net_generic(nf->nf_net, + nfsd_net_id)); + } } nfsd_file_put(nf); From 11a60d159259dbadf9188534b508e5003769af61 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Mon, 9 Sep 2019 16:10:30 -0400 Subject: [PATCH 094/334] nfsd: add a "GetVersion" upcall for nfsdcld Add a "GetVersion" upcall to allow nfsd to determine the maximum upcall version that the nfsdcld userspace daemon supports. If the daemon responds with -EOPNOTSUPP, then we know it only supports v1. Signed-off-by: Scott Mayhew Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4recover.c | 167 ++++++++++++++++++++++++---------- include/uapi/linux/nfsd/cld.h | 11 ++- 2 files changed, 127 insertions(+), 51 deletions(-) diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 87679557d0d6..58a61339d40c 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -59,8 +59,12 @@ struct nfsd4_client_tracking_ops { void (*remove)(struct nfs4_client *); int (*check)(struct nfs4_client *); void (*grace_done)(struct nfsd_net *); + uint8_t version; + size_t msglen; }; +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops; + /* Globals */ static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; @@ -718,6 +722,8 @@ static const struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = { .remove = nfsd4_remove_clid_dir, .check = nfsd4_check_legacy_client, .grace_done = nfsd4_recdir_purge_old, + .version = 1, + .msglen = 0, }; /* Globals */ @@ -737,19 +743,24 @@ struct cld_upcall { struct list_head cu_list; struct cld_net *cu_net; struct completion cu_done; - struct cld_msg cu_msg; + union { + struct cld_msg_hdr cu_hdr; + struct cld_msg cu_msg; + } cu_u; }; static int -__cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg) +__cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg) { int ret; struct rpc_pipe_msg msg; - struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_msg); + struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_u); + struct nfsd_net *nn = net_generic(pipe->dentry->d_sb->s_fs_info, + nfsd_net_id); memset(&msg, 0, sizeof(msg)); msg.data = cmsg; - msg.len = sizeof(*cmsg); + msg.len = nn->client_tracking_ops->msglen; ret = rpc_queue_upcall(pipe, &msg); if (ret < 0) { @@ -765,7 +776,7 @@ out: } static int -cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg) +cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg) { int ret; @@ -809,7 +820,7 @@ __cld_pipe_inprogress_downcall(const struct cld_msg __user *cmsg, kfree(name.data); return -EFAULT; } - return sizeof(*cmsg); + return nn->client_tracking_ops->msglen; } return -EFAULT; } @@ -818,6 +829,7 @@ static ssize_t cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { struct cld_upcall *tmp, *cup; + struct cld_msg_hdr __user *hdr = (struct cld_msg_hdr __user *)src; struct cld_msg __user *cmsg = (struct cld_msg __user *)src; uint32_t xid; struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info, @@ -825,14 +837,14 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) struct cld_net *cn = nn->cld_net; int16_t status; - if (mlen != sizeof(*cmsg)) { + if (mlen != nn->client_tracking_ops->msglen) { dprintk("%s: got %zu bytes, expected %zu\n", __func__, mlen, - sizeof(*cmsg)); + nn->client_tracking_ops->msglen); return -EINVAL; } /* copy just the xid so we can try to find that */ - if (copy_from_user(&xid, &cmsg->cm_xid, sizeof(xid)) != 0) { + if (copy_from_user(&xid, &hdr->cm_xid, sizeof(xid)) != 0) { dprintk("%s: error when copying xid from userspace", __func__); return -EFAULT; } @@ -842,7 +854,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) * list (for -EINPROGRESS, we just want to make sure the xid is * valid, not remove the upcall from the list) */ - if (get_user(status, &cmsg->cm_status)) { + if (get_user(status, &hdr->cm_status)) { dprintk("%s: error when copying status from userspace", __func__); return -EFAULT; } @@ -851,7 +863,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) cup = NULL; 
spin_lock(&cn->cn_lock); list_for_each_entry(tmp, &cn->cn_list, cu_list) { - if (get_unaligned(&tmp->cu_msg.cm_xid) == xid) { + if (get_unaligned(&tmp->cu_u.cu_hdr.cm_xid) == xid) { cup = tmp; if (status != -EINPROGRESS) list_del_init(&cup->cu_list); @@ -869,7 +881,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) if (status == -EINPROGRESS) return __cld_pipe_inprogress_downcall(cmsg, nn); - if (copy_from_user(&cup->cu_msg, src, mlen) != 0) + if (copy_from_user(&cup->cu_u.cu_msg, src, mlen) != 0) return -EFAULT; complete(&cup->cu_done); @@ -881,7 +893,7 @@ cld_pipe_destroy_msg(struct rpc_pipe_msg *msg) { struct cld_msg *cmsg = msg->data; struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, - cu_msg); + cu_u.cu_msg); /* errno >= 0 means we got a downcall */ if (msg->errno >= 0) @@ -1012,9 +1024,10 @@ nfsd4_remove_cld_pipe(struct net *net) } static struct cld_upcall * -alloc_cld_upcall(struct cld_net *cn) +alloc_cld_upcall(struct nfsd_net *nn) { struct cld_upcall *new, *tmp; + struct cld_net *cn = nn->cld_net; new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) @@ -1024,20 +1037,20 @@ alloc_cld_upcall(struct cld_net *cn) restart_search: spin_lock(&cn->cn_lock); list_for_each_entry(tmp, &cn->cn_list, cu_list) { - if (tmp->cu_msg.cm_xid == cn->cn_xid) { + if (tmp->cu_u.cu_msg.cm_xid == cn->cn_xid) { cn->cn_xid++; spin_unlock(&cn->cn_lock); goto restart_search; } } init_completion(&new->cu_done); - new->cu_msg.cm_vers = CLD_UPCALL_VERSION; - put_unaligned(cn->cn_xid++, &new->cu_msg.cm_xid); + new->cu_u.cu_msg.cm_vers = nn->client_tracking_ops->version; + put_unaligned(cn->cn_xid++, &new->cu_u.cu_msg.cm_xid); new->cu_net = cn; list_add(&new->cu_list, &cn->cn_list); spin_unlock(&cn->cn_lock); - dprintk("%s: allocated xid %u\n", __func__, new->cu_msg.cm_xid); + dprintk("%s: allocated xid %u\n", __func__, new->cu_u.cu_msg.cm_xid); return new; } @@ -1066,20 +1079,20 @@ nfsd4_cld_create(struct nfs4_client *clp) if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } - cup->cu_msg.cm_cmd = Cld_Create; - cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; - memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, + cup->cu_u.cu_msg.cm_cmd = Cld_Create; + cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; + memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); if (!ret) { - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } @@ -1103,20 +1116,20 @@ nfsd4_cld_remove(struct nfs4_client *clp) if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } - cup->cu_msg.cm_cmd = Cld_Remove; - cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; - memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, + cup->cu_u.cu_msg.cm_cmd = Cld_Remove; + cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; + memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); if (!ret) { - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } @@ -1145,21 +1158,21 @@ nfsd4_cld_check_v0(struct 
nfs4_client *clp) if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return 0; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { printk(KERN_ERR "NFSD: Unable to check client record on " "stable storage: %d\n", -ENOMEM); return -ENOMEM; } - cup->cu_msg.cm_cmd = Cld_Check; - cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; - memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, + cup->cu_u.cu_msg.cm_cmd = Cld_Check; + cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; + memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); if (!ret) { - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } @@ -1223,16 +1236,16 @@ nfsd4_cld_grace_start(struct nfsd_net *nn) struct cld_upcall *cup; struct cld_net *cn = nn->cld_net; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } - cup->cu_msg.cm_cmd = Cld_GraceStart; - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + cup->cu_u.cu_msg.cm_cmd = Cld_GraceStart; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); if (!ret) - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; free_cld_upcall(cup); out_err: @@ -1250,17 +1263,17 @@ nfsd4_cld_grace_done_v0(struct nfsd_net *nn) struct cld_upcall *cup; struct cld_net *cn = nn->cld_net; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } - cup->cu_msg.cm_cmd = Cld_GraceDone; - cup->cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time; - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone; + cup->cu_u.cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); if (!ret) - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; free_cld_upcall(cup); out_err: @@ -1279,16 +1292,16 @@ nfsd4_cld_grace_done(struct nfsd_net *nn) struct cld_upcall *cup; struct cld_net *cn = nn->cld_net; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } - cup->cu_msg.cm_cmd = Cld_GraceDone; - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); if (!ret) - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; free_cld_upcall(cup); out_err: @@ -1336,6 +1349,50 @@ cld_running(struct nfsd_net *nn) return pipe->nreaders || pipe->nwriters; } +static int +nfsd4_cld_get_version(struct nfsd_net *nn) +{ + int ret = 0; + struct cld_upcall *cup; + struct cld_net *cn = nn->cld_net; + uint8_t version; + + cup = alloc_cld_upcall(nn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + cup->cu_u.cu_msg.cm_cmd = Cld_GetVersion; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); + if (!ret) { + ret = cup->cu_u.cu_msg.cm_status; + if (ret) + goto out_free; + version = cup->cu_u.cu_msg.cm_u.cm_version; + dprintk("%s: userspace returned version %u\n", + __func__, version); + if (version < 1) + version = 1; + else if (version > CLD_UPCALL_VERSION) + version = CLD_UPCALL_VERSION; + + switch (version) { + case 1: + nn->client_tracking_ops = &nfsd4_cld_tracking_ops; + break; + default: + break; + } + } +out_free: + free_cld_upcall(cup); +out_err: + if (ret) + dprintk("%s: Unable to get version from userspace: %d\n", + __func__, ret); + return ret; +} + static 
int nfsd4_cld_tracking_init(struct net *net) { @@ -1368,10 +1425,14 @@ nfsd4_cld_tracking_init(struct net *net) goto err_remove; } + status = nfsd4_cld_get_version(nn); + if (status == -EOPNOTSUPP) + pr_warn("NFSD: nfsdcld GetVersion upcall failed. Please upgrade nfsdcld.\n"); + status = nfsd4_cld_grace_start(nn); if (status) { if (status == -EOPNOTSUPP) - printk(KERN_WARNING "NFSD: Please upgrade nfsdcld.\n"); + pr_warn("NFSD: nfsdcld GraceStart upcall failed. Please upgrade nfsdcld.\n"); nfs4_release_reclaim(nn); goto err_remove; } else @@ -1403,6 +1464,8 @@ static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v0 = { .remove = nfsd4_cld_remove, .check = nfsd4_cld_check_v0, .grace_done = nfsd4_cld_grace_done_v0, + .version = 1, + .msglen = sizeof(struct cld_msg), }; /* For newer nfsdcld's */ @@ -1413,6 +1476,8 @@ static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = { .remove = nfsd4_cld_remove, .check = nfsd4_cld_check, .grace_done = nfsd4_cld_grace_done, + .version = 1, + .msglen = sizeof(struct cld_msg), }; /* upcall via usermodehelper */ @@ -1760,6 +1825,8 @@ static const struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = { .remove = nfsd4_umh_cltrack_remove, .check = nfsd4_umh_cltrack_check, .grace_done = nfsd4_umh_cltrack_grace_done, + .version = 1, + .msglen = 0, }; int diff --git a/include/uapi/linux/nfsd/cld.h b/include/uapi/linux/nfsd/cld.h index b1e9de4f07d5..c5aad16d10c0 100644 --- a/include/uapi/linux/nfsd/cld.h +++ b/include/uapi/linux/nfsd/cld.h @@ -36,7 +36,8 @@ enum cld_command { Cld_Remove, /* remove record of this cm_id */ Cld_Check, /* is this cm_id allowed? */ Cld_GraceDone, /* grace period is complete */ - Cld_GraceStart, + Cld_GraceStart, /* grace start (upload client records) */ + Cld_GetVersion, /* query max supported upcall version */ }; /* representation of long-form NFSv4 client ID */ @@ -54,7 +55,15 @@ struct cld_msg { union { __s64 cm_gracetime; /* grace period start time */ struct cld_name cm_name; + __u8 cm_version; /* for getting max version */ } __attribute__((packed)) cm_u; } __attribute__((packed)); +struct cld_msg_hdr { + __u8 cm_vers; /* upcall version */ + __u8 cm_cmd; /* upcall command */ + __s16 cm_status; /* return code */ + __u32 cm_xid; /* transaction id */ +} __attribute__((packed)); + #endif /* !_NFSD_CLD_H */ From 6ee95d1c899186c0798cafd25998d436bcdb9618 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Mon, 9 Sep 2019 16:10:31 -0400 Subject: [PATCH 095/334] nfsd: add support for upcall version 2 Version 2 upcalls will allow the nfsd to include a hash of the kerberos principal string in the Cld_Create upcall. If a principal is present in the svc_cred, then the hash will be included in the Cld_Create upcall. We attempt to use the svc_cred.cr_raw_principal (which is returned by gssproxy) first, and then fall back to using the svc_cred.cr_principal (which is returned by both gssproxy and rpc.svcgssd). Upon a subsequent restart, the hash will be returned in the Cld_Gracestart downcall and stored in the reclaim_str_hashtbl so it can be used when handling reclaim opens. Signed-off-by: Scott Mayhew Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4recover.c | 223 +++++++++++++++++++++++++++++++--- fs/nfsd/nfs4state.c | 6 +- fs/nfsd/state.h | 3 +- include/uapi/linux/nfsd/cld.h | 30 ++++- 4 files changed, 245 insertions(+), 17 deletions(-) diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 58a61339d40c..cdc75ad4438b 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -64,6 +64,7 @@ struct nfsd4_client_tracking_ops { }; static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops; +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2; /* Globals */ static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; @@ -177,6 +178,7 @@ __nfsd4_create_reclaim_record_grace(struct nfs4_client *clp, const char *dname, int len, struct nfsd_net *nn) { struct xdr_netobj name; + struct xdr_netobj princhash = { .len = 0, .data = NULL }; struct nfs4_client_reclaim *crp; name.data = kmemdup(dname, len, GFP_KERNEL); @@ -186,7 +188,7 @@ __nfsd4_create_reclaim_record_grace(struct nfs4_client *clp, return; } name.len = len; - crp = nfs4_client_to_reclaim(name, nn); + crp = nfs4_client_to_reclaim(name, princhash, nn); if (!crp) { kfree(name.data); return; @@ -486,6 +488,7 @@ static int load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) { struct xdr_netobj name; + struct xdr_netobj princhash = { .len = 0, .data = NULL }; if (child->d_name.len != HEXDIR_LEN - 1) { printk("%s: illegal name %pd in recovery directory\n", @@ -500,7 +503,7 @@ load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) goto out; } name.len = HEXDIR_LEN; - if (!nfs4_client_to_reclaim(name, nn)) + if (!nfs4_client_to_reclaim(name, princhash, nn)) kfree(name.data); out: return 0; @@ -737,6 +740,7 @@ struct cld_net { struct list_head cn_list; unsigned int cn_xid; bool cn_has_legacy; + struct crypto_shash *cn_tfm; }; struct cld_upcall { @@ -746,6 +750,7 @@ struct cld_upcall { union { struct cld_msg_hdr cu_hdr; struct cld_msg cu_msg; + struct cld_msg_v2 cu_msg_v2; } cu_u; }; @@ -792,11 +797,11 @@ cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg) } static ssize_t -__cld_pipe_inprogress_downcall(const struct cld_msg __user *cmsg, +__cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg, struct nfsd_net *nn) { - uint8_t cmd; - struct xdr_netobj name; + uint8_t cmd, princhashlen; + struct xdr_netobj name, princhash = { .len = 0, .data = NULL }; uint16_t namelen; struct cld_net *cn = nn->cld_net; @@ -805,19 +810,45 @@ __cld_pipe_inprogress_downcall(const struct cld_msg __user *cmsg, return -EFAULT; } if (cmd == Cld_GraceStart) { - if (get_user(namelen, &cmsg->cm_u.cm_name.cn_len)) - return -EFAULT; - name.data = memdup_user(&cmsg->cm_u.cm_name.cn_id, namelen); - if (IS_ERR_OR_NULL(name.data)) - return -EFAULT; - name.len = namelen; + if (nn->client_tracking_ops->version >= 2) { + const struct cld_clntinfo __user *ci; + + ci = &cmsg->cm_u.cm_clntinfo; + if (get_user(namelen, &ci->cc_name.cn_len)) + return -EFAULT; + name.data = memdup_user(&ci->cc_name.cn_id, namelen); + if (IS_ERR_OR_NULL(name.data)) + return -EFAULT; + name.len = namelen; + get_user(princhashlen, &ci->cc_princhash.cp_len); + if (princhashlen > 0) { + princhash.data = memdup_user( + &ci->cc_princhash.cp_data, + princhashlen); + if (IS_ERR_OR_NULL(princhash.data)) + return -EFAULT; + princhash.len = princhashlen; + } else + princhash.len = 0; + } else { + const struct cld_name __user *cnm; + + cnm = &cmsg->cm_u.cm_name; + if (get_user(namelen, &cnm->cn_len)) + return -EFAULT; + 
name.data = memdup_user(&cnm->cn_id, namelen); + if (IS_ERR_OR_NULL(name.data)) + return -EFAULT; + name.len = namelen; + } if (name.len > 5 && memcmp(name.data, "hash:", 5) == 0) { name.len = name.len - 5; memmove(name.data, name.data + 5, name.len); cn->cn_has_legacy = true; } - if (!nfs4_client_to_reclaim(name, nn)) { + if (!nfs4_client_to_reclaim(name, princhash, nn)) { kfree(name.data); + kfree(princhash.data); return -EFAULT; } return nn->client_tracking_ops->msglen; @@ -830,7 +861,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { struct cld_upcall *tmp, *cup; struct cld_msg_hdr __user *hdr = (struct cld_msg_hdr __user *)src; - struct cld_msg __user *cmsg = (struct cld_msg __user *)src; + struct cld_msg_v2 __user *cmsg = (struct cld_msg_v2 __user *)src; uint32_t xid; struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info, nfsd_net_id); @@ -881,7 +912,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) if (status == -EINPROGRESS) return __cld_pipe_inprogress_downcall(cmsg, nn); - if (copy_from_user(&cup->cu_u.cu_msg, src, mlen) != 0) + if (copy_from_user(&cup->cu_u.cu_msg_v2, src, mlen) != 0) return -EFAULT; complete(&cup->cu_done); @@ -1019,6 +1050,8 @@ nfsd4_remove_cld_pipe(struct net *net) nfsd4_cld_unregister_net(net, cn->cn_pipe); rpc_destroy_pipe_data(cn->cn_pipe); + if (cn->cn_tfm) + crypto_free_shash(cn->cn_tfm); kfree(nn->cld_net); nn->cld_net = NULL; } @@ -1103,6 +1136,75 @@ out_err: "record on stable storage: %d\n", ret); } +/* Ask daemon to create a new record */ +static void +nfsd4_cld_create_v2(struct nfs4_client *clp) +{ + int ret; + struct cld_upcall *cup; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + struct cld_msg_v2 *cmsg; + struct crypto_shash *tfm = cn->cn_tfm; + struct xdr_netobj cksum; + char *principal = NULL; + SHASH_DESC_ON_STACK(desc, tfm); + + /* Don't upcall if it's already stored */ + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return; + + cup = alloc_cld_upcall(nn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + + cmsg = &cup->cu_u.cu_msg_v2; + cmsg->cm_cmd = Cld_Create; + cmsg->cm_u.cm_clntinfo.cc_name.cn_len = clp->cl_name.len; + memcpy(cmsg->cm_u.cm_clntinfo.cc_name.cn_id, clp->cl_name.data, + clp->cl_name.len); + if (clp->cl_cred.cr_raw_principal) + principal = clp->cl_cred.cr_raw_principal; + else if (clp->cl_cred.cr_principal) + principal = clp->cl_cred.cr_principal; + if (principal) { + desc->tfm = tfm; + cksum.len = crypto_shash_digestsize(tfm); + cksum.data = kmalloc(cksum.len, GFP_KERNEL); + if (cksum.data == NULL) { + ret = -ENOMEM; + goto out; + } + ret = crypto_shash_digest(desc, principal, strlen(principal), + cksum.data); + shash_desc_zero(desc); + if (ret) { + kfree(cksum.data); + goto out; + } + cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = cksum.len; + memcpy(cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data, + cksum.data, cksum.len); + kfree(cksum.data); + } else + cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = 0; + + ret = cld_pipe_upcall(cn->cn_pipe, cmsg); + if (!ret) { + ret = cmsg->cm_status; + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + } + +out: + free_cld_upcall(cup); +out_err: + if (ret) + pr_err("NFSD: Unable to create client record on stable storage: %d\n", + ret); +} + /* Ask daemon to create a new record */ static void nfsd4_cld_remove(struct nfs4_client *clp) @@ -1229,6 +1331,79 @@ found: return 0; } +static int +nfsd4_cld_check_v2(struct nfs4_client *clp) +{ + struct nfs4_client_reclaim 
*crp; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + int status; + char dname[HEXDIR_LEN]; + struct xdr_netobj name; + struct crypto_shash *tfm = cn->cn_tfm; + struct xdr_netobj cksum; + char *principal = NULL; + SHASH_DESC_ON_STACK(desc, tfm); + + /* did we already find that this client is stable? */ + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return 0; + + /* look for it in the reclaim hashtable otherwise */ + crp = nfsd4_find_reclaim_client(clp->cl_name, nn); + if (crp) + goto found; + + if (cn->cn_has_legacy) { + status = nfs4_make_rec_clidname(dname, &clp->cl_name); + if (status) + return -ENOENT; + + name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL); + if (!name.data) { + dprintk("%s: failed to allocate memory for name.data\n", + __func__); + return -ENOENT; + } + name.len = HEXDIR_LEN; + crp = nfsd4_find_reclaim_client(name, nn); + kfree(name.data); + if (crp) + goto found; + + } + return -ENOENT; +found: + if (crp->cr_princhash.len) { + if (clp->cl_cred.cr_raw_principal) + principal = clp->cl_cred.cr_raw_principal; + else if (clp->cl_cred.cr_principal) + principal = clp->cl_cred.cr_principal; + if (principal == NULL) + return -ENOENT; + desc->tfm = tfm; + cksum.len = crypto_shash_digestsize(tfm); + cksum.data = kmalloc(cksum.len, GFP_KERNEL); + if (cksum.data == NULL) + return -ENOENT; + status = crypto_shash_digest(desc, principal, strlen(principal), + cksum.data); + shash_desc_zero(desc); + if (status) { + kfree(cksum.data); + return -ENOENT; + } + if (memcmp(crp->cr_princhash.data, cksum.data, + crp->cr_princhash.len)) { + kfree(cksum.data); + return -ENOENT; + } + kfree(cksum.data); + } + crp->cr_clp = clp; + return 0; +} + static int nfsd4_cld_grace_start(struct nfsd_net *nn) { @@ -1380,6 +1555,9 @@ nfsd4_cld_get_version(struct nfsd_net *nn) case 1: nn->client_tracking_ops = &nfsd4_cld_tracking_ops; break; + case 2: + nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v2; + break; default: break; } @@ -1408,6 +1586,11 @@ nfsd4_cld_tracking_init(struct net *net) status = __nfsd4_init_cld_pipe(net); if (status) goto err_shutdown; + nn->cld_net->cn_tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(nn->cld_net->cn_tfm)) { + status = PTR_ERR(nn->cld_net->cn_tfm); + goto err_remove; + } /* * rpc pipe upcalls take 30 seconds to time out, so we don't want to @@ -1480,6 +1663,18 @@ static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = { .msglen = sizeof(struct cld_msg), }; +/* v2 create/check ops include the principal, if available */ +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2 = { + .init = nfsd4_cld_tracking_init, + .exit = nfsd4_cld_tracking_exit, + .create = nfsd4_cld_create_v2, + .remove = nfsd4_cld_remove, + .check = nfsd4_cld_check_v2, + .grace_done = nfsd4_cld_grace_done, + .version = 2, + .msglen = sizeof(struct cld_msg_v2), +}; + /* upcall via usermodehelper */ static char cltrack_prog[PATH_MAX] = "/sbin/nfsdcltrack"; module_param_string(cltrack_prog, cltrack_prog, sizeof(cltrack_prog), diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 906e214dcce8..da1093dd5016 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6887,7 +6887,8 @@ nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn) * will be freed in nfs4_remove_reclaim_record in the normal case). 
*/ struct nfs4_client_reclaim * -nfs4_client_to_reclaim(struct xdr_netobj name, struct nfsd_net *nn) +nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash, + struct nfsd_net *nn) { unsigned int strhashval; struct nfs4_client_reclaim *crp; @@ -6900,6 +6901,8 @@ nfs4_client_to_reclaim(struct xdr_netobj name, struct nfsd_net *nn) list_add(&crp->cr_strhash, &nn->reclaim_str_hashtbl[strhashval]); crp->cr_name.data = name.data; crp->cr_name.len = name.len; + crp->cr_princhash.data = princhash.data; + crp->cr_princhash.len = princhash.len; crp->cr_clp = NULL; nn->reclaim_str_hashtbl_size++; } @@ -6911,6 +6914,7 @@ nfs4_remove_reclaim_record(struct nfs4_client_reclaim *crp, struct nfsd_net *nn) { list_del(&crp->cr_strhash); kfree(crp->cr_name.data); + kfree(crp->cr_princhash.data); kfree(crp); nn->reclaim_str_hashtbl_size--; } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index d00c86d05beb..46f56afb6cb8 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -378,6 +378,7 @@ struct nfs4_client_reclaim { struct list_head cr_strhash; /* hash by cr_name */ struct nfs4_client *cr_clp; /* pointer to associated clp */ struct xdr_netobj cr_name; /* recovery dir name */ + struct xdr_netobj cr_princhash; }; /* A reasonable value for REPLAY_ISIZE was estimated as follows: @@ -645,7 +646,7 @@ extern void nfsd4_shutdown_callback(struct nfs4_client *); extern void nfsd4_shutdown_copy(struct nfs4_client *clp); extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp); extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name, - struct nfsd_net *nn); + struct xdr_netobj princhash, struct nfsd_net *nn); extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn); struct nfs4_file *find_file(struct knfsd_fh *fh); diff --git a/include/uapi/linux/nfsd/cld.h b/include/uapi/linux/nfsd/cld.h index c5aad16d10c0..a519313af953 100644 --- a/include/uapi/linux/nfsd/cld.h +++ b/include/uapi/linux/nfsd/cld.h @@ -26,11 +26,15 @@ #include /* latest upcall version available */ -#define CLD_UPCALL_VERSION 1 +#define CLD_UPCALL_VERSION 2 /* defined by RFC3530 */ #define NFS4_OPAQUE_LIMIT 1024 +#ifndef SHA256_DIGEST_SIZE +#define SHA256_DIGEST_SIZE 32 +#endif + enum cld_command { Cld_Create, /* create a record for this cm_id */ Cld_Remove, /* remove record of this cm_id */ @@ -46,6 +50,17 @@ struct cld_name { unsigned char cn_id[NFS4_OPAQUE_LIMIT]; /* client-provided */ } __attribute__((packed)); +/* sha256 hash of the kerberos principal */ +struct cld_princhash { + __u8 cp_len; /* length of cp_data */ + unsigned char cp_data[SHA256_DIGEST_SIZE]; /* hash of principal */ +} __attribute__((packed)); + +struct cld_clntinfo { + struct cld_name cc_name; + struct cld_princhash cc_princhash; +} __attribute__((packed)); + /* message struct for communication with userspace */ struct cld_msg { __u8 cm_vers; /* upcall version */ @@ -59,6 +74,19 @@ struct cld_msg { } __attribute__((packed)) cm_u; } __attribute__((packed)); +/* version 2 message can include hash of kerberos principal */ +struct cld_msg_v2 { + __u8 cm_vers; /* upcall version */ + __u8 cm_cmd; /* upcall command */ + __s16 cm_status; /* return code */ + __u32 cm_xid; /* transaction id */ + union { + struct cld_name cm_name; + __u8 cm_version; /* for getting max version */ + struct cld_clntinfo cm_clntinfo; /* name & princ hash */ + } __attribute__((packed)) cm_u; +} __attribute__((packed)); + struct cld_msg_hdr { __u8 cm_vers; /* upcall version */ __u8 cm_cmd; /* upcall command */ From 
38c7a30a9d5ff8dd0c7a9ab1441aa5a96cf60afa Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 10 Sep 2019 10:26:46 -0700 Subject: [PATCH 096/334] Documentation/process: Volunteer as the ambassador for Intel Cc: Jonathan Corbet Cc: Greg Kroah-Hartman Cc: Sasha Levin Cc: Ben Hutchings Cc: Thomas Gleixner Cc: Laura Abbott Cc: Andrew Cooper Cc: Trilok Soni Cc: Kees Cook Cc: Tony Luck Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: Dan Williams Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Link: https://lore.kernel.org/r/20190910172646.25BFCE7B@viggo.jf.intel.com Signed-off-by: Greg Kroah-Hartman --- Documentation/process/embargoed-hardware-issues.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/process/embargoed-hardware-issues.rst b/Documentation/process/embargoed-hardware-issues.rst index 402636356fbe..e57b9f39c69f 100644 --- a/Documentation/process/embargoed-hardware-issues.rst +++ b/Documentation/process/embargoed-hardware-issues.rst @@ -216,7 +216,7 @@ an involved disclosed party. The current ambassadors list: ARM AMD IBM - Intel + Intel Tony Luck Qualcomm Trilok Soni Microsoft Sasha Levin From 473ef57ad8edc25efd083a583a5f6604b47d3822 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 15 Sep 2019 12:19:48 -0400 Subject: [PATCH 097/334] afs dynroot: switch to simple_dir_operations no point reinventing it (with wrong ->read(), BTW). Signed-off-by: Al Viro --- fs/afs/dynroot.c | 7 ------- fs/afs/inode.c | 2 +- fs/afs/internal.h | 1 - 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index bcd1bafb0278..4150280509ff 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -10,13 +10,6 @@ #include #include "internal.h" -const struct file_operations afs_dynroot_file_operations = { - .open = dcache_dir_open, - .release = dcache_dir_close, - .iterate_shared = dcache_readdir, - .llseek = dcache_dir_lseek, -}; - /* * Probe to see if a cell may exist. This prevents positive dentries from * being created unnecessarily. diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 7b1c18c32f48..46d2d7cb461d 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -443,7 +443,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; if (root) { inode->i_op = &afs_dynroot_inode_operations; - inode->i_fop = &afs_dynroot_file_operations; + inode->i_fop = &simple_dir_operations; } else { inode->i_op = &afs_autocell_inode_operations; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index f66a3be12fd6..9b2a2a21e3dc 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -910,7 +910,6 @@ extern int afs_silly_iput(struct dentry *, struct inode *); /* * dynroot.c */ -extern const struct file_operations afs_dynroot_file_operations; extern const struct inode_operations afs_dynroot_inode_operations; extern const struct dentry_operations afs_dynroot_dentry_operations; From dac9f027b1096c5f03ca583e787aac0f852e8f78 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 16 Sep 2019 17:19:35 -0400 Subject: [PATCH 098/334] sched/fair: Remove unused cfs_rq_clock_task() function cfs_rq_clock_task() was first introduced and used in: f1b17280efbd ("sched: Maintain runnable averages across throttled periods") Over time its use has been gradually removed by the following commits: d31b1a66cbe0 ("sched/fair: Factorize PELT update") 23127296889f ("sched/fair: Update scale invariance of PELT") Today, there is no single user left, so it can be safely removed. 
Found via the -Wunused-function build warning. Signed-off-by: Qian Cai Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vincent Guittot Link: https://lkml.kernel.org/r/1568668775-2127-1-git-send-email-cai@lca.pw [ Rewrote the changelog. ] Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d4bbf68c3161..3101c662426d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -749,7 +749,6 @@ void init_entity_runnable_average(struct sched_entity *se) /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); static void attach_entity_cfs_rq(struct sched_entity *se); /* @@ -4376,15 +4375,6 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return &tg->cfs_bandwidth; } -/* rq->task_clock normalized against any time this cfs_rq has spent throttled */ -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) -{ - if (unlikely(cfs_rq->throttle_count)) - return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; - - return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; -} - /* returns 0 on failure to allocate runtime */ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { @@ -4476,7 +4466,6 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttle_count--; if (!cfs_rq->throttle_count) { - /* adjust cfs_rq_clock_task() */ cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task; @@ -5080,11 +5069,6 @@ static inline bool cfs_bandwidth_used(void) return false; } -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) -{ - return rq_clock_task(rq_of(cfs_rq)); -} - static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} From 42fd8baab31f53bed2952485fcf0e92f244c5e55 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 17 Sep 2019 10:34:54 -0400 Subject: [PATCH 099/334] sched/core: Convert vcpu_is_preempted() from macro to an inline function Clang reports this warning: kernel/locking/osq_lock.c:25:19: warning: unused function 'node_cpu' [-Wunused-function] due to osq_lock() calling vcpu_is_preempted(node_cpu(node->prev))), but vcpu_is_preempted() is compiled away. Fix it by converting the dummy vcpu_is_preempted() from a macro to a proper static inline function. Signed-off-by: Qian Cai Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: juri.lelli@redhat.com Cc: rostedt@goodmis.org Cc: vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/1568730894-10483-1-git-send-email-cai@lca.pw Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index f0edee94834a..e2e91960d79f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1856,7 +1856,10 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) * running or not. 
*/ #ifndef vcpu_is_preempted -# define vcpu_is_preempted(cpu) false +static inline bool vcpu_is_preempted(int cpu) +{ + return false; +} #endif extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); From 11ed5cf064beac3ca345bb75fdf2429e03ac106a Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 9 Sep 2019 16:21:30 +0100 Subject: [PATCH 100/334] firmware: arm_scmi: reset: fix reset_state assignment in scmi_domain_reset Fix the copy-paste typo that incorrectly assigns domain_id with the passed 'state' parameter instead of reset_state. Fixes: 95a15d80aa0d ("firmware: arm_scmi: Add RESET protocol in SCMI v2.0") Reported-by: Etienne Carriere Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/reset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/arm_scmi/reset.c b/drivers/firmware/arm_scmi/reset.c index 64cc81915581..ab42c21c5517 100644 --- a/drivers/firmware/arm_scmi/reset.c +++ b/drivers/firmware/arm_scmi/reset.c @@ -150,7 +150,7 @@ static int scmi_domain_reset(const struct scmi_handle *handle, u32 domain, dom = t->tx.buf; dom->domain_id = cpu_to_le32(domain); dom->flags = cpu_to_le32(flags); - dom->domain_id = cpu_to_le32(state); + dom->reset_state = cpu_to_le32(state); if (rdom->async_reset) ret = scmi_do_xfer_with_response(handle, t); From 61423712dbb86e02af4aa5de65b9041493c92cac Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 9 Sep 2019 16:21:07 +0100 Subject: [PATCH 101/334] reset: reset-scmi: add missing handle initialisation scmi_reset_data->handle needs to be initialised at probe, so that it can later be used to access the SCMI reset protocol APIs. Since the change was tested with a module that obtained the handle elsewhere, the omission was easy to miss. Add the missing scmi_reset_data->handle initialisation to fix the issue. Fixes: c8ae9c2da1cc ("reset: Add support for resets provided by SCMI") Acked-by: Philipp Zabel Reported-by: Etienne Carriere Signed-off-by: Sudeep Holla --- drivers/reset/reset-scmi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/reset/reset-scmi.c b/drivers/reset/reset-scmi.c index c6d3c8427f14..b46df80ec6c3 100644 --- a/drivers/reset/reset-scmi.c +++ b/drivers/reset/reset-scmi.c @@ -102,6 +102,7 @@ static int scmi_reset_probe(struct scmi_device *sdev) data->rcdev.owner = THIS_MODULE; data->rcdev.of_node = np; data->rcdev.nr_resets = handle->reset_ops->num_domains_get(handle); + data->handle = handle; return devm_reset_controller_register(dev, &data->rcdev); } From 280ceaed79f18db930c0cc8bb21f6493490bf29c Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Thu, 19 Sep 2019 10:23:08 +0200 Subject: [PATCH 102/334] usbnet: sanity checking of packet sizes and device mtu After a reset, packet sizes and the device MTU can change and need to be reevaluated to calculate queue sizes. Malicious devices can set these to zero, and we divide by them. Introduce sanity checking. Reported-and-tested-by: syzbot+6102c120be558c885f04@syzkaller.appspotmail.com Signed-off-by: Oliver Neukum Signed-off-by: David S.
Miller --- drivers/net/usb/usbnet.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 58952a79b05f..e44849499b89 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -339,6 +339,8 @@ void usbnet_update_max_qlen(struct usbnet *dev) { enum usb_device_speed speed = dev->udev->speed; + if (!dev->rx_urb_size || !dev->hard_mtu) + goto insanity; switch (speed) { case USB_SPEED_HIGH: dev->rx_qlen = MAX_QUEUE_MEMORY / dev->rx_urb_size; @@ -355,6 +357,7 @@ void usbnet_update_max_qlen(struct usbnet *dev) dev->tx_qlen = 5 * MAX_QUEUE_MEMORY / dev->hard_mtu; break; default: +insanity: dev->rx_qlen = dev->tx_qlen = 4; } } From e0973a421c6e9d268db2157bcb8756e7ab4b4313 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 16 Sep 2019 14:33:42 +0200 Subject: [PATCH 103/334] libbpf: Remove getsockopt() check for XDP_OPTIONS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xsk_socket__create() function fails and returns an error if it cannot get the XDP_OPTIONS through getsockopt(). However, support for XDP_OPTIONS was not added until kernel 5.3, so this means that creating XSK sockets always fails on older kernels. Since the option is just used to set the zero-copy flag in the xsk struct, and that flag is not really used for anything yet, just remove the getsockopt() call until a proper use for it is introduced. Suggested-by: Yonghong Song Signed-off-by: Toke Høiland-Jørgensen Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- tools/lib/bpf/xsk.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index 842c4fd55859..24fa313524fb 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -65,7 +65,6 @@ struct xsk_socket { int xsks_map_fd; __u32 queue_id; char ifname[IFNAMSIZ]; - bool zc; }; struct xsk_nl_info { @@ -491,7 +490,6 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname, void *rx_map = NULL, *tx_map = NULL; struct sockaddr_xdp sxdp = {}; struct xdp_mmap_offsets off; - struct xdp_options opts; struct xsk_socket *xsk; socklen_t optlen; int err; @@ -611,15 +609,6 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname, xsk->prog_fd = -1; - optlen = sizeof(opts); - err = getsockopt(xsk->fd, SOL_XDP, XDP_OPTIONS, &opts, &optlen); - if (err) { - err = -errno; - goto out_mmap_tx; - } - - xsk->zc = opts.flags & XDP_OPTIONS_ZEROCOPY; - if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) { err = xsk_setup_xdp_prog(xsk); if (err) From 9eea984979513d6ee137e545e26c5877d46039dd Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 17 Sep 2019 10:45:37 -0700 Subject: [PATCH 104/334] bpf: fix BTF verification of enums vmlinux BTF has enums that are 8 bytes and 1 byte in size. A 2-byte enum is a valid construct as well. Fix BTF enum verification to accept those sizes.
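For reference, a minimal userspace sketch (illustrative only, not part of this patch; the packed attribute and the out-of-int-range enumerator are GCC/Clang behaviours) of how a compiler legitimately produces 1-, 2-, and 8-byte enums of the kind vmlinux BTF contains:

    #include <stdio.h>

    /* The packed attribute lets the compiler shrink the underlying
     * type to the smallest integer that fits all enumerators. */
    enum tiny { TINY = 1 } __attribute__((packed));        /* 1 byte  */
    enum small { SMALL = 1000 } __attribute__((packed));   /* 2 bytes */
    /* An enumerator wider than int widens the enum (GCC extension). */
    enum wide { WIDE = 0x100000000ULL };                   /* 8 bytes */

    int main(void)
    {
            printf("%zu %zu %zu\n", sizeof(enum tiny),
                   sizeof(enum small), sizeof(enum wide));
            return 0;
    }

All three sizes are powers of two no larger than 8, which is exactly the condition the verifier checks after this change.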
Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index adb3adcebe3c..722d38e543e9 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2377,9 +2377,8 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (t->size != sizeof(int)) { - btf_verifier_log_type(env, t, "Expected size:%zu", - sizeof(int)); + if (t->size > 8 || !is_power_of_2(t->size)) { + btf_verifier_log_type(env, t, "Unexpected size"); return -EINVAL; } From a0791f0df7d212c245761538b17a9ea93607b667 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 17 Sep 2019 10:45:38 -0700 Subject: [PATCH 105/334] bpf: fix BTF limits vmlinux BTF has more than 64k types. Its string section is also at an offset larger than 64k. Adjust both limits so the in-kernel BTF verifier can successfully parse the in-kernel BTF. Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/uapi/linux/btf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 63ae4a39e58b..c02dec97e1ce 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -22,9 +22,9 @@ struct btf_header { }; /* Max # of type identifier */ -#define BTF_MAX_TYPE 0x0000ffff +#define BTF_MAX_TYPE 0x000fffff /* Max offset into the string section */ -#define BTF_MAX_NAME_OFFSET 0x0000ffff +#define BTF_MAX_NAME_OFFSET 0x00ffffff /* Max # of struct/union/enum members or func args */ #define BTF_MAX_VLEN 0xffff From 733ef7f056a5e23b66e8e7bb3508ca882db388f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 18 Sep 2019 09:57:39 +0200 Subject: [PATCH 106/334] xsk: relax UMEM headroom alignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch removes the 64B alignment of the UMEM headroom. There is really no reason for it, and having a headroom less than 64B should be valid. Fixes: c0c77d8fb787 ("xsk: add user memory registration support sockopt") Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xdp_umem.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 947b8ff0227e..cdaef54d48be 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -383,8 +383,6 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) return -EINVAL; } - headroom = ALIGN(headroom, 64); - size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM; if (size_chk < 0) return -EINVAL; From 2d88b2cf2f002417cd7436f0fd34716e8c288fb1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 18 Sep 2019 16:49:03 +0300 Subject: [PATCH 107/334] iwlwifi: mvm: fix build w/o CONFIG_THERMAL Without CONFIG_THERMAL, the driver fails to link as it calls iwl_mvm_send_temp_report_ths_cmd() unconditionally. Fix this by making that function available but having it do almost nothing except send the empty firmware command that enables CT-kill reporting. While at it, also fix that function itself to not error out when the thermal zone hasn't been initialized, but instead just send the empty firmware command in this case as well.
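Reduced to a compilable sketch (all names below are made-up stand-ins, not the actual iwlwifi structures or functions), the shape of the fix is one function that stays visible in both configurations, with only the thermal-zone-dependent body compiled out:

    /* Hypothetical stand-ins for the real driver types and calls. */
    struct ths_cmd { int trips[8]; int n; };
    struct dev_cfg { void *tzone; };

    static int send_cmd(struct dev_cfg *dev, struct ths_cmd *cmd)
    {
            (void)dev; (void)cmd;
            return 0;                /* stand-in for the firmware call */
    }

    int send_temp_thresholds(struct dev_cfg *dev)
    {
            struct ths_cmd cmd = {0};        /* empty command is valid */
    #ifdef CONFIG_THERMAL
            if (dev->tzone)
                    cmd.n = 0;       /* stand-in: sort/compress trips */
    #endif
            /* Even the empty command enables CT-kill reporting. */
            return send_cmd(dev, &cmd);
    }
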
Fixes: 242d9c8b9a93 ("iwlwifi: mvm: use FW thermal monitoring regardless of CONFIG_THERMAL") Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Signed-off-by: Kalle Valo --- drivers/net/wireless/intel/iwlwifi/mvm/tt.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tt.c b/drivers/net/wireless/intel/iwlwifi/mvm/tt.c index 32a708301cfc..f0c539b37ea7 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/tt.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/tt.c @@ -555,16 +555,19 @@ static int compare_temps(const void *a, const void *b) return ((s16)le16_to_cpu(*(__le16 *)a) - (s16)le16_to_cpu(*(__le16 *)b)); } +#endif int iwl_mvm_send_temp_report_ths_cmd(struct iwl_mvm *mvm) { struct temp_report_ths_cmd cmd = {0}; - int ret, i, j, idx = 0; + int ret; +#ifdef CONFIG_THERMAL + int i, j, idx = 0; lockdep_assert_held(&mvm->mutex); if (!mvm->tz_device.tzone) - return -EINVAL; + goto send; /* The driver holds array of temperature trips that are unsorted * and uncompressed, the FW should get it compressed and sorted @@ -597,6 +600,7 @@ int iwl_mvm_send_temp_report_ths_cmd(struct iwl_mvm *mvm) } send: +#endif ret = iwl_mvm_send_cmd_pdu(mvm, WIDE_ID(PHY_OPS_GROUP, TEMP_REPORTING_THRESHOLDS_CMD), 0, sizeof(cmd), &cmd); @@ -607,6 +611,7 @@ send: return ret; } +#ifdef CONFIG_THERMAL static int iwl_mvm_tzone_get_temp(struct thermal_zone_device *device, int *temperature) { From b47bea2b5c3bf17270defe8529774dad114bcee5 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 17 Sep 2019 16:26:16 -0700 Subject: [PATCH 108/334] ionic: Remove unnecessary ternary operator in ionic_debugfs_add_ident clang warns: ../drivers/net/ethernet/pensando/ionic/ionic_debugfs.c:60:37: warning: expression result unused [-Wunused-value] ionic, &identity_fops) ? 0 : -EOPNOTSUPP; ^~~~~~~~~~~ 1 warning generated. The return value of debugfs_create_file() does not need to be checked [1], and the function returns void, so get rid of the ternary operator; it is unnecessary. [1]: https://lore.kernel.org/linux-mm/20150815160730.GB25186@kroah.com/ Fixes: fbfb8031533c ("ionic: Add hardware init and device commands") Link: https://github.com/ClangBuiltLinux/linux/issues/658 Signed-off-by: Nathan Chancellor Acked-by: Shannon Nelson Reviewed-by: Greg Kroah-Hartman --- drivers/net/ethernet/pensando/ionic/ionic_debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_debugfs.c b/drivers/net/ethernet/pensando/ionic/ionic_debugfs.c index 7afc4a365b75..bc03cecf80cc 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_debugfs.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_debugfs.c @@ -57,7 +57,7 @@ DEFINE_SHOW_ATTRIBUTE(identity); void ionic_debugfs_add_ident(struct ionic *ionic) { debugfs_create_file("identity", 0400, ionic->dentry, - ionic, &identity_fops) ? 0 : -EOPNOTSUPP; + ionic, &identity_fops); } void ionic_debugfs_add_sizes(struct ionic *ionic) From dd0f9d896d167ab37732dd83986adc3a017a13b4 Mon Sep 17 00:00:00 2001 From: Murilo Fossa Vicentini Date: Mon, 16 Sep 2019 11:50:37 -0300 Subject: [PATCH 109/334] ibmvnic: Warn unknown speed message only when carrier is present With commit 0655f9943df2 ("net/ibmvnic: Update carrier state after link state change") we are now able to detect when the carrier is actually present in the device, so only report an unexpected unknown speed when the carrier is detected. An unknown speed is expected when the backing device has no link detected.
Reported-by: Abdul Haleem Tested-by: Abdul Haleem Signed-off-by: Murilo Fossa Vicentini Reviewed-by: Thomas Falcon --- drivers/net/ethernet/ibm/ibmvnic.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 2e5172f61564..3816fff75bb5 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -4312,13 +4312,14 @@ static int handle_query_phys_parms_rsp(union ibmvnic_crq *crq, { struct net_device *netdev = adapter->netdev; int rc; + __be32 rspeed = cpu_to_be32(crq->query_phys_parms_rsp.speed); rc = crq->query_phys_parms_rsp.rc.code; if (rc) { netdev_err(netdev, "Error %d in QUERY_PHYS_PARMS\n", rc); return rc; } - switch (cpu_to_be32(crq->query_phys_parms_rsp.speed)) { + switch (rspeed) { case IBMVNIC_10MBPS: adapter->speed = SPEED_10; break; @@ -4344,8 +4345,8 @@ static int handle_query_phys_parms_rsp(union ibmvnic_crq *crq, adapter->speed = SPEED_100000; break; default: - netdev_warn(netdev, "Unknown speed 0x%08x\n", - cpu_to_be32(crq->query_phys_parms_rsp.speed)); + if (netif_carrier_ok(netdev)) + netdev_warn(netdev, "Unknown speed 0x%08x\n", rspeed); adapter->speed = SPEED_UNKNOWN; } if (crq->query_phys_parms_rsp.flags1 & IBMVNIC_FULL_DUPLEX) From cf0eba334268563152e4a8bc9ab865d0037a7948 Mon Sep 17 00:00:00 2001 From: Vijay Khemka Date: Thu, 12 Sep 2019 12:04:50 -0700 Subject: [PATCH 110/334] net/ncsi: Disable global multicast filter Disable global multicast filtering in NCSI if the device supports it, since NCSI should not filter out any multicast packets. In the current code, the multicast filter is enabled, and filtering is disabled only for the optional fields the device supports. If the goal is to not filter IPv6 packets, then filtering should be disabled for every other packet type as well: with the filter enabled we see LLDP breakage in addition to the IPv6 issues. Disabling the global multicast filter completely makes both IPv6 and LLDP work.
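For background (well-known protocol facts rather than anything taken from this patch): both affected protocols depend on fixed multicast destination MACs, which is why a global multicast filter in front of the management controller breaks them:

    /* Illustrative constants only. LLDP uses the fixed group address
     * 01:80:c2:00:00:0e; IPv6 neighbour discovery uses 33:33:xx:xx:xx:xx
     * solicited-node addresses (an example is shown). */
    #define ETH_ALEN 6
    static const unsigned char lldp_dst[ETH_ALEN] = {
            0x01, 0x80, 0xc2, 0x00, 0x00, 0x0e
    };
    static const unsigned char ipv6_sn_dst[ETH_ALEN] = {
            0x33, 0x33, 0xff, 0x00, 0x00, 0x01
    };
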
Signed-off-by: Vijay Khemka Acked-by: Samuel Mendoza-Jonas Signed-off-by: Jakub Kicinski --- net/ncsi/internal.h | 7 +-- net/ncsi/ncsi-manage.c | 104 ++++++----------------------------------- 2 files changed, 15 insertions(+), 96 deletions(-) diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 0b3f0673e1a2..ad3fd7f1da75 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -264,9 +264,7 @@ enum { ncsi_dev_state_config_ev, ncsi_dev_state_config_sma, ncsi_dev_state_config_ebf, -#if IS_ENABLED(CONFIG_IPV6) - ncsi_dev_state_config_egmf, -#endif + ncsi_dev_state_config_dgmf, ncsi_dev_state_config_ecnt, ncsi_dev_state_config_ec, ncsi_dev_state_config_ae, @@ -295,9 +293,6 @@ struct ncsi_dev_priv { #define NCSI_DEV_RESET 8 /* Reset state of NC */ unsigned int gma_flag; /* OEM GMA flag */ spinlock_t lock; /* Protect the NCSI device */ -#if IS_ENABLED(CONFIG_IPV6) - unsigned int inet6_addr_num; /* Number of IPv6 addresses */ -#endif unsigned int package_probe_id;/* Current ID during probe */ unsigned int package_num; /* Number of packages */ struct list_head packages; /* List of packages */ diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 755aab66dcab..70fe02697544 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include "internal.h" @@ -978,9 +977,7 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) case ncsi_dev_state_config_ev: case ncsi_dev_state_config_sma: case ncsi_dev_state_config_ebf: -#if IS_ENABLED(CONFIG_IPV6) - case ncsi_dev_state_config_egmf: -#endif + case ncsi_dev_state_config_dgmf: case ncsi_dev_state_config_ecnt: case ncsi_dev_state_config_ec: case ncsi_dev_state_config_ae: @@ -1033,23 +1030,23 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) } else if (nd->state == ncsi_dev_state_config_ebf) { nca.type = NCSI_PKT_CMD_EBF; nca.dwords[0] = nc->caps[NCSI_CAP_BC].cap; + /* if multicast global filtering is supported then + * disable it so that all multicast packet will be + * forwarded to management controller + */ + if (nc->caps[NCSI_CAP_GENERIC].cap & + NCSI_CAP_GENERIC_MC) + nd->state = ncsi_dev_state_config_dgmf; + else if (ncsi_channel_is_tx(ndp, nc)) + nd->state = ncsi_dev_state_config_ecnt; + else + nd->state = ncsi_dev_state_config_ec; + } else if (nd->state == ncsi_dev_state_config_dgmf) { + nca.type = NCSI_PKT_CMD_DGMF; if (ncsi_channel_is_tx(ndp, nc)) nd->state = ncsi_dev_state_config_ecnt; else nd->state = ncsi_dev_state_config_ec; -#if IS_ENABLED(CONFIG_IPV6) - if (ndp->inet6_addr_num > 0 && - (nc->caps[NCSI_CAP_GENERIC].cap & - NCSI_CAP_GENERIC_MC)) - nd->state = ncsi_dev_state_config_egmf; - } else if (nd->state == ncsi_dev_state_config_egmf) { - nca.type = NCSI_PKT_CMD_EGMF; - nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap; - if (ncsi_channel_is_tx(ndp, nc)) - nd->state = ncsi_dev_state_config_ecnt; - else - nd->state = ncsi_dev_state_config_ec; -#endif /* CONFIG_IPV6 */ } else if (nd->state == ncsi_dev_state_config_ecnt) { if (np->preferred_channel && nc != np->preferred_channel) @@ -1483,70 +1480,6 @@ out: return -ENODEV; } -#if IS_ENABLED(CONFIG_IPV6) -static int ncsi_inet6addr_event(struct notifier_block *this, - unsigned long event, void *data) -{ - struct inet6_ifaddr *ifa = data; - struct net_device *dev = ifa->idev->dev; - struct ncsi_dev *nd = ncsi_find_dev(dev); - struct ncsi_dev_priv *ndp = nd ? 
TO_NCSI_DEV_PRIV(nd) : NULL; - struct ncsi_package *np; - struct ncsi_channel *nc; - struct ncsi_cmd_arg nca; - bool action; - int ret; - - if (!ndp || (ipv6_addr_type(&ifa->addr) & - (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK))) - return NOTIFY_OK; - - switch (event) { - case NETDEV_UP: - action = (++ndp->inet6_addr_num) == 1; - nca.type = NCSI_PKT_CMD_EGMF; - break; - case NETDEV_DOWN: - action = (--ndp->inet6_addr_num == 0); - nca.type = NCSI_PKT_CMD_DGMF; - break; - default: - return NOTIFY_OK; - } - - /* We might not have active channel or packages. The IPv6 - * required multicast will be enabled when active channel - * or packages are chosen. - */ - np = ndp->active_package; - nc = ndp->active_channel; - if (!action || !np || !nc) - return NOTIFY_OK; - - /* We needn't enable or disable it if the function isn't supported */ - if (!(nc->caps[NCSI_CAP_GENERIC].cap & NCSI_CAP_GENERIC_MC)) - return NOTIFY_OK; - - nca.ndp = ndp; - nca.req_flags = 0; - nca.package = np->id; - nca.channel = nc->id; - nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap; - ret = ncsi_xmit_cmd(&nca); - if (ret) { - netdev_warn(dev, "Fail to %s global multicast filter (%d)\n", - (event == NETDEV_UP) ? "enable" : "disable", ret); - return NOTIFY_DONE; - } - - return NOTIFY_OK; -} - -static struct notifier_block ncsi_inet6addr_notifier = { - .notifier_call = ncsi_inet6addr_event, -}; -#endif /* CONFIG_IPV6 */ - static int ncsi_kick_channels(struct ncsi_dev_priv *ndp) { struct ncsi_dev *nd = &ndp->ndev; @@ -1725,11 +1658,6 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, } spin_lock_irqsave(&ncsi_dev_lock, flags); -#if IS_ENABLED(CONFIG_IPV6) - ndp->inet6_addr_num = 0; - if (list_empty(&ncsi_dev_list)) - register_inet6addr_notifier(&ncsi_inet6addr_notifier); -#endif list_add_tail_rcu(&ndp->node, &ncsi_dev_list); spin_unlock_irqrestore(&ncsi_dev_lock, flags); @@ -1896,10 +1824,6 @@ void ncsi_unregister_dev(struct ncsi_dev *nd) spin_lock_irqsave(&ncsi_dev_lock, flags); list_del_rcu(&ndp->node); -#if IS_ENABLED(CONFIG_IPV6) - if (list_empty(&ncsi_dev_list)) - unregister_inet6addr_notifier(&ncsi_inet6addr_notifier); -#endif spin_unlock_irqrestore(&ncsi_dev_lock, flags); ncsi_unregister_netlink(nd->dev); From 20b3f7d700130326db555d724e7507ce4606219a Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 13 Sep 2019 16:46:35 +0000 Subject: [PATCH 111/334] dt-bindings: net: Correct the documentation of KSZ9021 skew values The documentation of skew values for the KSZ9021 PHY was misleading because the driver implementation followed the erroneous information given in the original KSZ9021 datasheet before it was corrected in revision 1.2 (Feb 2014). It is probably too late to correct the driver now because of the many existing device trees, so instead this just corrects the documentation to explain that what you actually get is not what you might think when looking at the device tree. Signed-off-by: James Byrne Signed-off-by: Jakub Kicinski --- .../bindings/net/micrel-ksz90x1.txt | 32 +++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt b/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt index 5100358177c9..b921731cd970 100644 --- a/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt +++ b/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt @@ -12,8 +12,36 @@ and therefore may overwrite them. KSZ9021: All skew control options are specified in picoseconds. 
The minimum - value is 0, the maximum value is 3000, and it is incremented by 200ps - steps. + value is 0, the maximum value is 3000, and it can be specified in 200ps + steps, *but* these values are not in fact what you get because this chip's + skew values actually increase in 120ps steps, starting from -840ps. The + incorrect values came from an error in the original KSZ9021 datasheet + before it was corrected in revision 1.2 (Feb 2014), but it is too late to + change the driver now because of the many existing device trees that have + been created using values that go up in increments of 200. + + The following table shows the actual skew delay you will get for each of the + possible devicetree values, and the number that will be programmed into the + corresponding pad skew register: + + Device Tree Value Delay Pad Skew Register Value + ----------------------------------------------------- + 0 -840ps 0000 + 200 -720ps 0001 + 400 -600ps 0010 + 600 -480ps 0011 + 800 -360ps 0100 + 1000 -240ps 0101 + 1200 -120ps 0110 + 1400 0ps 0111 + 1600 120ps 1000 + 1800 240ps 1001 + 2000 360ps 1010 + 2200 480ps 1011 + 2400 600ps 1100 + 2600 720ps 1101 + 2800 840ps 1110 + 3000 960ps 1111 Optional properties: From 864668bfc374dfbf4851ec828b9049e08f9057b1 Mon Sep 17 00:00:00 2001 From: Donald Sharp Date: Mon, 16 Sep 2019 08:26:50 -0400 Subject: [PATCH 112/334] selftests: Add test cases for `ip nexthop flush proto XX` Add some test cases to allow the fib_nexthops.sh test code to test the flushing of nexthops based on the proto passed in when the nexthop group is created. Signed-off-by: Donald Sharp Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_nexthops.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index f9ebeac1e6f2..796670ebc65b 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -940,6 +940,20 @@ basic() run_cmd "$IP nexthop add id 104 group 1 dev veth1" log_test $? 2 "Nexthop group and device" + # Tests to ensure that flushing works as expected. + run_cmd "$IP nexthop add id 105 blackhole proto 99" + run_cmd "$IP nexthop add id 106 blackhole proto 100" + run_cmd "$IP nexthop add id 107 blackhole proto 99" + run_cmd "$IP nexthop flush proto 99" + check_nexthop "id 105" "" + check_nexthop "id 106" "id 106 blackhole proto 100" + check_nexthop "id 107" "" + run_cmd "$IP nexthop flush proto 100" + check_nexthop "id 106" "" + + run_cmd "$IP nexthop flush proto 100" + log_test $? 0 "Test proto flush" + run_cmd "$IP nexthop add id 104 group 1 blackhole" log_test $? 2 "Nexthop group and blackhole" From b74d957f6317e60924d0d3c0c01750814c24611b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 29 Aug 2019 15:43:49 +0200 Subject: [PATCH 113/334] ARM: aspeed: ast2500 is ARMv6K Linux supports both the original ARMv6 level (early ARM1136) and ARMv6K (later ARM1136, ARM1176 and ARM11mpcore). ast2500 falls into the second category, being based on arm1176jzf-s. This is enabled by default when using ARCH_MULTI_V6, so we should not 'select CPU_V6'. Removing this will lead to more efficient use of atomic instructions.
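For background, the more efficient atomics come from the exclusive-access instructions that ARMv6K adds (ldrexb/strexb, ldrexh/strexh, ldrexd/strexd). A hypothetical sketch, not taken from this patch, of a one-byte atomic exchange that only ARMv6K and later can express directly:

    /* Hypothetical illustration: a native 8-bit exchange using the
     * ARMv6K-only ldrexb/strexb pair; plain ARMv6 has only word-sized
     * ldrex/strex, forcing wider read-modify-write sequences. */
    static inline unsigned char xchg_u8(volatile unsigned char *p,
                                        unsigned char val)
    {
            unsigned char old;
            unsigned int tmp;

            asm volatile(
            "1:     ldrexb  %0, [%2]\n"
            "       strexb  %1, %3, [%2]\n"
            "       teq     %1, #0\n"
            "       bne     1b"
            : "=&r" (old), "=&r" (tmp)
            : "r" (p), "r" (val)
            : "memory", "cc");

            return old;
    }
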
Fixes: 8c2ed9bcfbeb ("arm: Add Aspeed machine") Signed-off-by: Arnd Bergmann Reviewed-by: Joel Stanley --- arch/arm/mach-aspeed/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm/mach-aspeed/Kconfig b/arch/arm/mach-aspeed/Kconfig index a15c3a291386..44ee6bac255e 100644 --- a/arch/arm/mach-aspeed/Kconfig +++ b/arch/arm/mach-aspeed/Kconfig @@ -26,7 +26,6 @@ config MACH_ASPEED_G4 config MACH_ASPEED_G5 bool "Aspeed SoC 5th Generation" depends on ARCH_MULTI_V6 - select CPU_V6 select PINCTRL_ASPEED_G5 help Say yes if you intend to run on an Aspeed ast2500 or similar From ad652f3811d8644d547506154ec9a9c22c8771cd Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 16 Sep 2019 18:33:08 +0200 Subject: [PATCH 114/334] netfilter: nf_tables: add NFT_CHAIN_POLICY_UNSET and use it The default policy is defined as an unsigned 8-bit field; do not use a negative value to leave it unset. Use the new NFT_CHAIN_POLICY_UNSET instead. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 2655e03dbe1b..a26d64056fc8 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -889,6 +889,8 @@ enum nft_chain_flags { NFT_CHAIN_HW_OFFLOAD = 0x2, }; +#define NFT_CHAIN_POLICY_UNSET U8_MAX + /** * struct nft_chain - nf_tables chain * diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index e4a68dc42694..4a5d6ef2b706 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1715,7 +1715,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, goto err2; } - nft_trans_chain_policy(trans) = -1; + nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET; if (nft_is_base_chain(chain)) nft_trans_chain_policy(trans) = policy; From ff175d0b0eab99f512b9afdb571f0ed18b63f533 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 16 Sep 2019 18:33:09 +0200 Subject: [PATCH 115/334] netfilter: nf_tables_offload: fix always true policy is unset check New smatch warnings: net/netfilter/nf_tables_offload.c:316 nft_flow_offload_chain() warn: always true condition '(policy != -1) => (0-255 != (-1))' Reported-by: kbuild test robot Fixes: c9626a2cbdb2 ("netfilter: nf_tables: add hardware offload support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 21bb772cb4b7..e546f759b7a7 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -313,7 +313,7 @@ static int nft_flow_offload_chain(struct nft_chain *chain, policy = ppolicy ? *ppolicy : basechain->policy; /* Only default policy to accept is supported for now. */ - if (cmd == FLOW_BLOCK_BIND && policy != -1 && policy != NF_ACCEPT) + if (cmd == FLOW_BLOCK_BIND && policy == NF_DROP) return -EOPNOTSUPP; if (dev->netdev_ops->ndo_setup_tc) From acab713177377d9e0889c46bac7ff0cfb9a90c4d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 19 Sep 2019 16:56:44 +0200 Subject: [PATCH 116/334] netfilter: nf_tables: allow lookups in dynamic sets This un-breaks lookups in sets that have the 'dynamic' flag set.
Given this active example configuration: table filter { set set1 { type ipv4_addr size 64 flags dynamic,timeout timeout 1m } chain input { type filter hook input priority 0; policy accept; } } ... this works: nft add rule ip filter input add @set1 { ip saddr } -> whenever rule is triggered, the source ip address is inserted into the set (if it did not exist). This won't work: nft add rule ip filter input ip saddr @set1 counter Error: Could not process rule: Operation not supported In other words, we can add entries to the set, but then can't make matching decision based on that set. That is just wrong -- all set backends support lookups (else they would not be very useful). The failure comes from an explicit rejection in nft_lookup.c. Looking at the history, it seems like NFT_SET_EVAL used to mean 'set contains expressions' (aka. "is a meter"), for instance something like nft add rule ip filter input meter example { ip saddr limit rate 10/second } or nft add rule ip filter input meter example { ip saddr counter } The actual meaning of NFT_SET_EVAL however, is 'set can be updated from the packet path'. 'meters' and packet-path insertions into sets, such as 'add @set { ip saddr }' use exactly the same kernel code (nft_dynset.c) and thus require a set backend that provides the ->update() function. The only set that provides this also is the only one that has the NFT_SET_EVAL feature flag. Removing the wrong check makes the above example work. While at it, also fix the flag check during set instantiation to allow supported combinations only. Fixes: 8aeff920dcc9b3f ("netfilter: nf_tables: add stateful object reference to set elements") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 7 +++++-- net/netfilter/nft_lookup.c | 3 --- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4a5d6ef2b706..6dc46f9b5f7b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3562,8 +3562,11 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, NFT_SET_OBJECT)) return -EINVAL; /* Only one of these operations is supported */ - if ((flags & (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT)) == - (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT)) + if ((flags & (NFT_SET_MAP | NFT_SET_OBJECT)) == + (NFT_SET_MAP | NFT_SET_OBJECT)) + return -EOPNOTSUPP; + if ((flags & (NFT_SET_EVAL | NFT_SET_OBJECT)) == + (NFT_SET_EVAL | NFT_SET_OBJECT)) return -EOPNOTSUPP; } diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index c0560bf3c31b..660bad688e2b 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -73,9 +73,6 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (IS_ERR(set)) return PTR_ERR(set); - if (set->flags & NFT_SET_EVAL) - return -EOPNOTSUPP; - priv->sreg = nft_parse_register(tb[NFTA_LOOKUP_SREG]); err = nft_validate_register_load(priv->sreg, set->klen); if (err < 0) From 7f49fd5d7acd82163685559045d04d49433581cc Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 20 Sep 2019 16:33:16 +1000 Subject: [PATCH 117/334] nfsd: handle drc over-allocation gracefully. Currently, if there are more clients than allowed for by the space allocation in set_max_drc(), we fail a SESSION_CREATE request with NFS4ERR_DELAY. This means that the client retries indefinitely, which isn't a user-friendly response. 
The RFC requires NFS4ERR_NOSPC, but that would at best result in a clean failure on the client, which is not much more friendly. The current space allocation is a best-guess and doesn't provide any guarantees; we could still run out of space when trying to allocate DRC space. So fail more gracefully - always give out at least one slot. If all clients used all the space in all slots, we might start getting memory pressure, but that is possible anyway. So ensure 'num' is always at least 1, and remove the test for it being zero. Signed-off-by: NeilBrown Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index da1093dd5016..8622d6dd45c0 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1575,14 +1575,25 @@ static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca) unsigned long avail, total_avail; spin_lock(&nfsd_drc_lock); - total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used; + if (nfsd_drc_max_mem > nfsd_drc_mem_used) + total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used; + else + /* We have handed out more space than we chose in + * set_max_drc() to allow. That isn't really a + * problem as long as that doesn't make us think we + * have lots more due to integer overflow. + */ + total_avail = 0; avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, total_avail); /* * Never use more than a third of the remaining memory, - * unless it's the only way to give this client a slot: + * unless it's the only way to give this client a slot. + * Give the client one slot even if that would require + * over-allocation--it is better than failure. */ avail = clamp_t(unsigned long, avail, slotsize, total_avail/3); num = min_t(int, num, avail / slotsize); + num = max_t(int, num, 1); nfsd_drc_mem_used += num * slotsize; spin_unlock(&nfsd_drc_lock); @@ -3174,10 +3185,10 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs * performance. When short on memory we therefore prefer to * decrease number of slots instead of their size. Clients that * request larger slots than they need will get poor results: + * Note that we always allow at least one slot, because our + * accounting is soft and provides no guarantees either way. */ ca->maxreqs = nfsd4_get_drc_mem(ca); - if (!ca->maxreqs) - return nfserr_jukebox; return nfs_ok; } From 2030ca560c5f24138c1f0f8c8a89f1ac3b560613 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 20 Sep 2019 16:36:45 +1000 Subject: [PATCH 118/334] nfsd: degrade slot-count more gracefully as allocation nears exhaustion. The original code in nfsd4_get_drc_mem() would hand out 30 slots (approximately NFSD_MAX_MEM_PER_SESSION bytes at slightly over 2K per slot) to each requesting client until it ran out of space, then it would possibly give one last client a reduced allocation, then fail the allocation. Since commit de766e570413 ("nfsd: give out fewer session slots as limit approaches"), the last 90 slots are given out to about 12 clients with quickly reducing slot counts (better than just 3 clients). This still seems unnecessarily hasty. A subsequent patch allows over-allocation so every client gets at least one slot, but that might be a bit restrictive. The requested number of nfsd threads is the best guide we have to the expected number of clients, so use that - if it is at least 8.
256 threads on a 256Meg machine - which is a lot for a tiny machine - would result in nfsd_drc_max_mem being 2Meg, so 8K (3 slots) would be available for the first client, and over 200 clients would get more than 1 slot. So I don't think this change will be too debilitating on poorly configured machines, though it does mean that a sensible configuration is a little more important. Signed-off-by: NeilBrown Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8622d6dd45c0..c65aeaa812d4 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1568,11 +1568,12 @@ static inline u32 slot_bytes(struct nfsd4_channel_attrs *ca) * re-negotiate active sessions and reduce their slot usage to make * room for new connections. For now we just fail the create session. */ -static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca) +static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca, struct nfsd_net *nn) { u32 slotsize = slot_bytes(ca); u32 num = ca->maxreqs; unsigned long avail, total_avail; + unsigned int scale_factor; spin_lock(&nfsd_drc_lock); if (nfsd_drc_max_mem > nfsd_drc_mem_used) @@ -1586,12 +1587,18 @@ static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca) total_avail = 0; avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, total_avail); /* - * Never use more than a third of the remaining memory, + * Never use more than a fraction of the remaining memory, * unless it's the only way to give this client a slot. + * The chosen fraction is either 1/8 or 1/number of threads, + * whichever is smaller. This ensures there are adequate + * slots to support multiple clients per thread. * Give the client one slot even if that would require * over-allocation--it is better than failure. */ - avail = clamp_t(unsigned long, avail, slotsize, total_avail/3); + scale_factor = max_t(unsigned int, 8, nn->nfsd_serv->sv_nrthreads); + + avail = clamp_t(unsigned long, avail, slotsize, + total_avail/scale_factor); num = min_t(int, num, avail / slotsize); num = max_t(int, num, 1); nfsd_drc_mem_used += num * slotsize; @@ -3188,7 +3195,7 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs * Note that we always allow at least one slot, because our * accounting is soft and provides no guarantees either way. */ - ca->maxreqs = nfsd4_get_drc_mem(ca); + ca->maxreqs = nfsd4_get_drc_mem(ca, nn); return nfs_ok; } From a003365cab64b0f7988ac3ccb1da895ce0bece5e Mon Sep 17 00:00:00 2001 From: Gustavo Romero Date: Wed, 4 Sep 2019 00:55:29 -0400 Subject: [PATCH 119/334] powerpc/tm: Add tm-poison test Add TM selftest to check if FP or VEC register values from one process can leak into another process when both run on the same CPU. 
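As a worked check of the timeout constants the test's assembly relies on (assuming the 512 MHz timebase stated in the test's own comments):

    120 s * 512,000,000 ticks/s = 61,440,000,000 ticks
    ((14 << 16) | 19996) << 16 = 937500 << 16 = 61,440,000,000

so the lis/ori/sldi sequence that builds r5 encodes exactly the 2-minute budget.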
Signed-off-by: Gustavo Romero Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190904045529.23002-3-gromero@linux.vnet.ibm.com --- tools/testing/selftests/powerpc/tm/.gitignore | 1 + tools/testing/selftests/powerpc/tm/Makefile | 2 +- .../testing/selftests/powerpc/tm/tm-poison.c | 179 ++++++++++++++++++ 3 files changed, 181 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/tm/tm-poison.c diff --git a/tools/testing/selftests/powerpc/tm/.gitignore b/tools/testing/selftests/powerpc/tm/.gitignore index 951fe855f7cd..98f2708d86cc 100644 --- a/tools/testing/selftests/powerpc/tm/.gitignore +++ b/tools/testing/selftests/powerpc/tm/.gitignore @@ -17,3 +17,4 @@ tm-vmx-unavail tm-unavailable tm-trap tm-sigreturn +tm-poison diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile index c0734ed0ef56..b15a1a325bd0 100644 --- a/tools/testing/selftests/powerpc/tm/Makefile +++ b/tools/testing/selftests/powerpc/tm/Makefile @@ -5,7 +5,7 @@ SIGNAL_CONTEXT_CHK_TESTS := tm-signal-context-chk-gpr tm-signal-context-chk-fpu TEST_GEN_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack \ tm-vmxcopy tm-fork tm-tar tm-tmspr tm-vmx-unavail tm-unavailable tm-trap \ $(SIGNAL_CONTEXT_CHK_TESTS) tm-sigreturn tm-signal-sigreturn-nt \ - tm-signal-context-force-tm + tm-signal-context-force-tm tm-poison top_srcdir = ../../../../.. include ../../lib.mk diff --git a/tools/testing/selftests/powerpc/tm/tm-poison.c b/tools/testing/selftests/powerpc/tm/tm-poison.c new file mode 100644 index 000000000000..977558497c16 --- /dev/null +++ b/tools/testing/selftests/powerpc/tm/tm-poison.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2019, Gustavo Romero, Michael Neuling, IBM Corp. + * + * This test will spawn two processes. Both will be attached to the same + * CPU (CPU 0). The child will be in a loop writing to FP register f31 and + * VMX/VEC/Altivec register vr31 a known value, called poison, calling + * sched_yield syscall after to allow the parent to switch on the CPU. + * Parent will set f31 and vr31 to 1 and in a loop will check if f31 and + * vr31 remain 1 as expected until a given timeout (2m). If the issue is + * present child's poison will leak into parent's f31 or vr31 registers, + * otherwise, poison will never leak into parent's f31 and vr31 registers. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tm.h" + +int tm_poison_test(void) +{ + int pid; + cpu_set_t cpuset; + uint64_t poison = 0xdeadbeefc0dec0fe; + uint64_t unknown = 0; + bool fail_fp = false; + bool fail_vr = false; + + SKIP_IF(!have_htm()); + + /* Attach both Child and Parent to CPU 0 */ + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + sched_setaffinity(0, sizeof(cpuset), &cpuset); + + pid = fork(); + if (!pid) { + /** + * child + */ + while (1) { + sched_yield(); + asm ( + "mtvsrd 31, %[poison];" // f31 = poison + "mtvsrd 63, %[poison];" // vr31 = poison + + : : [poison] "r" (poison) : ); + } + } + + /** + * parent + */ + asm ( + /* + * Set r3, r4, and f31 to known value 1 before entering + * in transaction. They won't be written after that. + */ + " li 3, 0x1 ;" + " li 4, 0x1 ;" + " mtvsrd 31, 4 ;" + + /* + * The Time Base (TB) is a 64-bit counter register that is + * independent of the CPU clock and which is incremented + * at a frequency of 512000000 Hz, so every 1.953125ns. 
+ * So it's necessary 120s/0.000000001953125s = 61440000000 + * increments to get a 2 minutes timeout. Below we set that + * value in r5 and then use r6 to track initial TB value, + * updating TB values in r7 at every iteration and comparing it + * to r6. When r7 (current) - r6 (initial) > 61440000000 we bail + * out since for sure we spent already 2 minutes in the loop. + * SPR 268 is the TB register. + */ + " lis 5, 14 ;" + " ori 5, 5, 19996 ;" + " sldi 5, 5, 16 ;" // r5 = 61440000000 + + " mfspr 6, 268 ;" // r6 (TB initial) + "1: mfspr 7, 268 ;" // r7 (TB current) + " subf 7, 6, 7 ;" // r7 - r6 > 61440000000 ? + " cmpd 7, 5 ;" + " bgt 3f ;" // yes, exit + + /* + * Main loop to check f31 + */ + " tbegin. ;" // no, try again + " beq 1b ;" // restart if no timeout + " mfvsrd 3, 31 ;" // read f31 + " cmpd 3, 4 ;" // f31 == 1 ? + " bne 2f ;" // broken :-( + " tabort. 3 ;" // try another transaction + "2: tend. ;" // commit transaction + "3: mr %[unknown], 3 ;" // record r3 + + : [unknown] "=r" (unknown) + : + : "cr0", "r3", "r4", "r5", "r6", "r7", "vs31" + + ); + + /* + * On leak 'unknown' will contain 'poison' value from child, + * otherwise (no leak) 'unknown' will contain the same value + * as r3 before entering in transactional mode, i.e. 0x1. + */ + fail_fp = unknown != 0x1; + if (fail_fp) + printf("Unknown value %#"PRIx64" leaked into f31!\n", unknown); + else + printf("Good, no poison or leaked value into FP registers\n"); + + asm ( + /* + * Set r3, r4, and vr31 to known value 1 before entering + * in transaction. They won't be written after that. + */ + " li 3, 0x1 ;" + " li 4, 0x1 ;" + " mtvsrd 63, 4 ;" + + " lis 5, 14 ;" + " ori 5, 5, 19996 ;" + " sldi 5, 5, 16 ;" // r5 = 61440000000 + + " mfspr 6, 268 ;" // r6 (TB initial) + "1: mfspr 7, 268 ;" // r7 (TB current) + " subf 7, 6, 7 ;" // r7 - r6 > 61440000000 ? + " cmpd 7, 5 ;" + " bgt 3f ;" // yes, exit + + /* + * Main loop to check vr31 + */ + " tbegin. ;" // no, try again + " beq 1b ;" // restart if no timeout + " mfvsrd 3, 63 ;" // read vr31 + " cmpd 3, 4 ;" // vr31 == 1 ? + " bne 2f ;" // broken :-( + " tabort. 3 ;" // try another transaction + "2: tend. ;" // commit transaction + "3: mr %[unknown], 3 ;" // record r3 + + : [unknown] "=r" (unknown) + : + : "cr0", "r3", "r4", "r5", "r6", "r7", "vs63" + + ); + + /* + * On leak 'unknown' will contain 'poison' value from child, + * otherwise (no leak) 'unknown' will contain the same value + * as r3 before entering in transactional mode, i.e. 0x1. + */ + fail_vr = unknown != 0x1; + if (fail_vr) + printf("Unknown value %#"PRIx64" leaked into vr31!\n", unknown); + else + printf("Good, no poison or leaked value into VEC registers\n"); + + kill(pid, SIGKILL); + + return (fail_fp | fail_vr); +} + +int main(int argc, char *argv[]) +{ + /* Test completes in about 4m */ + test_harness_set_timeout(250); + return test_harness(tm_poison_test, "tm_poison_test"); +} From 7aec584eaf1cc1a527dcbe7d80f2e44e3bfcfe1d Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 18 Sep 2019 19:31:03 +0530 Subject: [PATCH 120/334] powerpc/book3s64/radix: Remove WARN_ON in destroy_context() On failed task initialization due to memory allocation failures, we can call into destroy_context() with process_tb entry already populated. This patch forces the process_tb entry to zero in destroy_context(). With this patch, we lose the ability to track if we are destroying a context without flushing the process table entry. 
WARNING: CPU: 4 PID: 6368 at arch/powerpc/mm/mmu_context_book3s64.c:246 destroy_context+0x58/0x340 NIP [c0000000000875f8] destroy_context+0x58/0x340 LR [c00000000013da18] __mmdrop+0x78/0x270 Call Trace: [c000000f7db77c80] [c00000000013da18] __mmdrop+0x78/0x270 [c000000f7db77cf0] [c0000000004d6a34] __do_execve_file.isra.13+0xbd4/0x1000 [c000000f7db77e00] [c0000000004d7428] sys_execve+0x58/0x70 [c000000f7db77e30] [c00000000000b388] system_call+0x5c/0x70 Reported-by: Priya M.A Signed-off-by: Aneesh Kumar K.V [mpe: Reformat/tweak comment wording] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190918140103.24395-1-aneesh.kumar@linux.ibm.com --- arch/powerpc/mm/book3s64/mmu_context.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c index 2d0cb5ba9a47..0ba30b8b935b 100644 --- a/arch/powerpc/mm/book3s64/mmu_context.c +++ b/arch/powerpc/mm/book3s64/mmu_context.c @@ -256,8 +256,21 @@ void destroy_context(struct mm_struct *mm) #ifdef CONFIG_SPAPR_TCE_IOMMU WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list)); #endif + /* + * For tasks which were successfully initialized we end up calling + * arch_exit_mmap() which clears the process table entry. And + * arch_exit_mmap() is called before the required fullmm TLB flush + * which does a RIC=2 flush. Hence for an initialized task, we do clear + * any cached process table entries. + * + * The condition below handles the error case during task init. We have + * set the process table entry early and if we fail a task + * initialization, we need to ensure the process table entry is zeroed. + * We need not worry about process table entry caches because the task + * never ran with the PID value. + */ if (radix_enabled()) - WARN_ON(process_tb[mm->context.id].prtb0 != 0); + process_tb[mm->context.id].prtb0 = 0; else subpage_prot_free(mm); destroy_contexts(&mm->context); From c6fadabb2868f817299ddb338ac15885e25d12d2 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 17 Sep 2019 10:46:04 +1000 Subject: [PATCH 121/334] powerpc: Fix definition of PCR bits to work with old binutils Commit 388cc6e133132 ("KVM: PPC: Book3S HV: Support POWER6 compatibility mode on POWER7") introduced new macros defining the PCR bits. When used from assembly files these definitions lead to build errors using older versions of binutils that don't support the 'ul' suffix. This fixes the build errors by updating the definitions to use the __MASK() macro which selects the appropriate suffix. Signed-off-by: Alistair Popple Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190917004605.22471-1-alistair@popple.id.au --- arch/powerpc/include/asm/reg.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index ec3714cf0989..931939af95d1 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -475,9 +475,9 @@ #define HMER_DEBUG_TRIG (1ul << (63 - 17)) /* Debug trigger */ #define SPRN_HMEER 0x151 /* Hyp maintenance exception enable reg */ #define SPRN_PCR 0x152 /* Processor compatibility register */ -#define PCR_VEC_DIS (1ul << (63-0)) /* Vec. disable (bit NA since POWER8) */ -#define PCR_VSX_DIS (1ul << (63-1)) /* VSX disable (bit NA since POWER8) */ -#define PCR_TM_DIS (1ul << (63-2)) /* Trans. memory disable (POWER8) */ +#define PCR_VEC_DIS (__MASK(63-0)) /* Vec. 
disable (bit NA since POWER8) */ +#define PCR_VSX_DIS (__MASK(63-1)) /* VSX disable (bit NA since POWER8) */ +#define PCR_TM_DIS (__MASK(63-2)) /* Trans. memory disable (POWER8) */ /* * These bits are used in the function kvmppc_set_arch_compat() to specify and * determine both the compatibility level which we want to emulate and the From 13c7bb3c57dcfe779ea5b4b083f6c47753cc5327 Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Tue, 17 Sep 2019 10:46:05 +1000 Subject: [PATCH 122/334] powerpc/64s: Set reserved PCR bits Currently the reserved bits of the Processor Compatibility Register (PCR) are cleared as per the Programming Note in Section 1.3.3 of version 3.0B of the Power ISA. Because PCR bits must be set to disable a given feature, this makes all new architecture features available when running on newer processors that have new feature-disable bits added to the PCR. For example, to disable new features added as part of Version 2.07 of the ISA, the corresponding bit in the PCR needs to be set. As new processor features generally require explicit kernel support, they should be disabled until such support is implemented. Therefore kernels should set all unknown/reserved bits in the PCR such that any new architecture features which the kernel does not currently know about get disabled. An update is planned to the ISA to clarify that the PCR is an exception to the Programming Note on reserved bits in Section 1.3.3. Signed-off-by: Alistair Popple Signed-off-by: Jordan Niethe Tested-by: Joel Stanley Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190917004605.22471-2-alistair@popple.id.au --- arch/powerpc/include/asm/reg.h | 3 +++ arch/powerpc/kernel/cpu_setup_power.S | 6 ++++++ arch/powerpc/kernel/dt_cpu_ftrs.c | 3 ++- arch/powerpc/kvm/book3s_hv.c | 11 +++++++---- arch/powerpc/kvm/book3s_hv_nested.c | 6 +++--- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 10 ++++++---- 6 files changed, 27 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 931939af95d1..b3cbb1136bce 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -478,6 +478,7 @@ #define PCR_VEC_DIS (__MASK(63-0)) /* Vec. disable (bit NA since POWER8) */ #define PCR_VSX_DIS (__MASK(63-1)) /* VSX disable (bit NA since POWER8) */ #define PCR_TM_DIS (__MASK(63-2)) /* Trans.
memory disable (POWER8) */ +#define PCR_HIGH_BITS (PCR_VEC_DIS | PCR_VSX_DIS | PCR_TM_DIS) /* * These bits are used in the function kvmppc_set_arch_compat() to specify and * determine both the compatibility level which we want to emulate and the @@ -486,6 +487,8 @@ #define PCR_ARCH_207 0x8 /* Architecture 2.07 */ #define PCR_ARCH_206 0x4 /* Architecture 2.06 */ #define PCR_ARCH_205 0x2 /* Architecture 2.05 */ +#define PCR_LOW_BITS (PCR_ARCH_207 | PCR_ARCH_206 | PCR_ARCH_205) +#define PCR_MASK ~(PCR_HIGH_BITS | PCR_LOW_BITS) /* PCR Reserved Bits */ #define SPRN_HEIR 0x153 /* Hypervisor Emulated Instruction Register */ #define SPRN_TLBINDEXR 0x154 /* P7 TLB control register */ #define SPRN_TLBVPNR 0x155 /* P7 TLB control register */ diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S index 3239a9fe6c1c..a460298c7ddb 100644 --- a/arch/powerpc/kernel/cpu_setup_power.S +++ b/arch/powerpc/kernel/cpu_setup_power.S @@ -23,6 +23,7 @@ _GLOBAL(__setup_cpu_power7) beqlr li r0,0 mtspr SPRN_LPID,r0 + LOAD_REG_IMMEDIATE(r0, PCR_MASK) mtspr SPRN_PCR,r0 mfspr r3,SPRN_LPCR li r4,(LPCR_LPES1 >> LPCR_LPES_SH) @@ -37,6 +38,7 @@ _GLOBAL(__restore_cpu_power7) beqlr li r0,0 mtspr SPRN_LPID,r0 + LOAD_REG_IMMEDIATE(r0, PCR_MASK) mtspr SPRN_PCR,r0 mfspr r3,SPRN_LPCR li r4,(LPCR_LPES1 >> LPCR_LPES_SH) @@ -54,6 +56,7 @@ _GLOBAL(__setup_cpu_power8) beqlr li r0,0 mtspr SPRN_LPID,r0 + LOAD_REG_IMMEDIATE(r0, PCR_MASK) mtspr SPRN_PCR,r0 mfspr r3,SPRN_LPCR ori r3, r3, LPCR_PECEDH @@ -76,6 +79,7 @@ _GLOBAL(__restore_cpu_power8) beqlr li r0,0 mtspr SPRN_LPID,r0 + LOAD_REG_IMMEDIATE(r0, PCR_MASK) mtspr SPRN_PCR,r0 mfspr r3,SPRN_LPCR ori r3, r3, LPCR_PECEDH @@ -98,6 +102,7 @@ _GLOBAL(__setup_cpu_power9) mtspr SPRN_PSSCR,r0 mtspr SPRN_LPID,r0 mtspr SPRN_PID,r0 + LOAD_REG_IMMEDIATE(r0, PCR_MASK) mtspr SPRN_PCR,r0 mfspr r3,SPRN_LPCR LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC) @@ -123,6 +128,7 @@ _GLOBAL(__restore_cpu_power9) mtspr SPRN_PSSCR,r0 mtspr SPRN_LPID,r0 mtspr SPRN_PID,r0 + LOAD_REG_IMMEDIATE(r0, PCR_MASK) mtspr SPRN_PCR,r0 mfspr r3,SPRN_LPCR LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC) diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index bd95318d2202..bceee2fde885 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -101,7 +101,7 @@ static void __restore_cpu_cpufeatures(void) if (hv_mode) { mtspr(SPRN_LPID, 0); mtspr(SPRN_HFSCR, system_registers.hfscr); - mtspr(SPRN_PCR, 0); + mtspr(SPRN_PCR, PCR_MASK); } mtspr(SPRN_FSCR, system_registers.fscr); @@ -144,6 +144,7 @@ static void __init cpufeatures_setup_cpu(void) mtspr(SPRN_HFSCR, 0); } mtspr(SPRN_FSCR, 0); + mtspr(SPRN_PCR, PCR_MASK); /* * LPCR does not get cleared, to match behaviour with secondaries diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index efd8f93bc9dc..709cf1fd4cf4 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -401,8 +401,11 @@ static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat) spin_lock(&vc->lock); vc->arch_compat = arch_compat; - /* Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit */ - vc->pcr = host_pcr_bit - guest_pcr_bit; + /* + * Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit + * Also set all reserved PCR bits + */ + vc->pcr = (host_pcr_bit - guest_pcr_bit) | PCR_MASK; spin_unlock(&vc->lock); return 0; @@ -3410,7 +3413,7 @@ static int 
kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, } if (vc->pcr) - mtspr(SPRN_PCR, vc->pcr); + mtspr(SPRN_PCR, vc->pcr | PCR_MASK); mtspr(SPRN_DPDES, vc->dpdes); mtspr(SPRN_VTB, vc->vtb); @@ -3490,7 +3493,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, vc->vtb = mfspr(SPRN_VTB); mtspr(SPRN_DPDES, 0); if (vc->pcr) - mtspr(SPRN_PCR, 0); + mtspr(SPRN_PCR, PCR_MASK); if (vc->tb_offset_applied) { u64 new_tb = mftb() - vc->tb_offset_applied; diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index fff90f2c3de2..cdf30c6eaf54 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -29,7 +29,7 @@ void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) { struct kvmppc_vcore *vc = vcpu->arch.vcore; - hr->pcr = vc->pcr; + hr->pcr = vc->pcr | PCR_MASK; hr->dpdes = vc->dpdes; hr->hfscr = vcpu->arch.hfscr; hr->tb_offset = vc->tb_offset; @@ -65,7 +65,7 @@ static void byteswap_hv_regs(struct hv_guest_state *hr) hr->lpid = swab32(hr->lpid); hr->vcpu_token = swab32(hr->vcpu_token); hr->lpcr = swab64(hr->lpcr); - hr->pcr = swab64(hr->pcr); + hr->pcr = swab64(hr->pcr) | PCR_MASK; hr->amor = swab64(hr->amor); hr->dpdes = swab64(hr->dpdes); hr->hfscr = swab64(hr->hfscr); @@ -148,7 +148,7 @@ static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) { struct kvmppc_vcore *vc = vcpu->arch.vcore; - vc->pcr = hr->pcr; + vc->pcr = hr->pcr | PCR_MASK; vc->dpdes = hr->dpdes; vcpu->arch.hfscr = hr->hfscr; vcpu->arch.dawr = hr->dawr0; diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 9a05b0d932ef..74a9cfe84aee 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -644,8 +644,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) /* Load guest PCR value to select appropriate compat mode */ 37: ld r7, VCORE_PCR(r5) - cmpdi r7, 0 + LOAD_REG_IMMEDIATE(r6, PCR_MASK) + cmpld r7, r6 beq 38f + or r7, r7, r6 mtspr SPRN_PCR, r7 38: @@ -1913,10 +1915,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* Reset PCR */ ld r0, VCORE_PCR(r5) - cmpdi r0, 0 + LOAD_REG_IMMEDIATE(r6, PCR_MASK) + cmpld r0, r6 beq 18f - li r0, 0 - mtspr SPRN_PCR, r0 + mtspr SPRN_PCR, r6 18: /* Signal secondary CPUs to continue */ stb r0,VCORE_IN_GUEST(r5) From 4c0f5d1eb4072871c34530358df45f05ab80edd6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 21 Aug 2019 10:20:00 +0000 Subject: [PATCH 123/334] powerpc/mm: Add a helper to select PAGE_KERNEL_RO or PAGE_READONLY In a couple of places there is a need to select whether read-only protection of shadow pages is performed with PAGE_KERNEL_RO or with PAGE_READONLY. Add a helper to avoid duplicating the choice. 
Signed-off-by: Christophe Leroy Cc: stable@vger.kernel.org Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/9f33f44b9cd741c4a02b3dce7b8ef9438fe2cd2a.1566382750.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/kasan/kasan_init_32.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c index 802387b231ad..e8ab3cc5f6e4 100644 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -12,6 +12,14 @@ #include #include +static pgprot_t kasan_prot_ro(void) +{ + if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) + return PAGE_READONLY; + + return PAGE_KERNEL_RO; +} + static void kasan_populate_pte(pte_t *ptep, pgprot_t prot) { unsigned long va = (unsigned long)kasan_early_shadow_page; @@ -26,6 +34,7 @@ static int __ref kasan_init_shadow_page_tables(unsigned long k_start, unsigned l { pmd_t *pmd; unsigned long k_cur, k_next; + pgprot_t prot = kasan_prot_ro(); pmd = pmd_offset(pud_offset(pgd_offset_k(k_start), k_start), k_start); @@ -43,10 +52,7 @@ static int __ref kasan_init_shadow_page_tables(unsigned long k_start, unsigned l if (!new) return -ENOMEM; - if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) - kasan_populate_pte(new, PAGE_READONLY); - else - kasan_populate_pte(new, PAGE_KERNEL_RO); + kasan_populate_pte(new, prot); smp_wmb(); /* See comment in __pte_alloc */ @@ -103,10 +109,9 @@ static int __ref kasan_init_region(void *start, size_t size) static void __init kasan_remap_early_shadow_ro(void) { - if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) - kasan_populate_pte(kasan_early_shadow_pte, PAGE_READONLY); - else - kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL_RO); + pgprot_t prot = kasan_prot_ro(); + + kasan_populate_pte(kasan_early_shadow_pte, prot); flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END); } From cbd18991e24fea2c31da3bb117c83e4a3538cd11 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 21 Aug 2019 10:20:11 +0000 Subject: [PATCH 124/334] powerpc/mm: Fix an Oops in kasan_mmu_init() Uncompressing Kernel Image ... OK Loading Device Tree to 01ff7000, end 01fff74f ... 
OK [ 0.000000] printk: bootconsole [udbg0] enabled [ 0.000000] BUG: Unable to handle kernel data access at 0xf818c000 [ 0.000000] Faulting instruction address: 0xc0013c7c [ 0.000000] Thread overran stack, or stack corrupted [ 0.000000] Oops: Kernel access of bad area, sig: 11 [#1] [ 0.000000] BE PAGE_SIZE=16K PREEMPT [ 0.000000] Modules linked in: [ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 5.3.0-rc4-s3k-dev-00743-g5abe4a3e8fd3-dirty #2080 [ 0.000000] NIP: c0013c7c LR: c0013310 CTR: 00000000 [ 0.000000] REGS: c0c5ff38 TRAP: 0300 Not tainted (5.3.0-rc4-s3k-dev-00743-g5abe4a3e8fd3-dirty) [ 0.000000] MSR: 00001032 CR: 99033955 XER: 80002100 [ 0.000000] DAR: f818c000 DSISR: 82000000 [ 0.000000] GPR00: c0013310 c0c5fff0 c0ad6ac0 c0c600c0 f818c031 82000000 00000000 ffffffff [ 0.000000] GPR08: 00000000 f1f1f1f1 c0013c2c c0013304 99033955 00400008 00000000 07ff9598 [ 0.000000] GPR16: 00000000 07ffb94c 00000000 00000000 00000000 00000000 00000000 f818cfb2 [ 0.000000] GPR24: 00000000 00000000 00001000 ffffffff 00000000 c07dbf80 00000000 f818c000 [ 0.000000] NIP [c0013c7c] do_page_fault+0x50/0x904 [ 0.000000] LR [c0013310] handle_page_fault+0xc/0x38 [ 0.000000] Call Trace: [ 0.000000] Instruction dump: [ 0.000000] be010080 91410014 553fe8fe 3d40c001 3d20f1f1 7d800026 394a3c2c 3fffe000 [ 0.000000] 6129f1f1 900100c4 9181007c 91410018 <913f0000> 3d2001f4 6129f4f4 913f0004 Don't map the early shadow page read-only yet when creating the new page tables for the real shadow memory; otherwise the memblock allocations that immediately follow to create the real shadow pages that are about to replace the early shadow page trigger a page fault if they fall into the region being worked on at the moment. Signed-off-by: Christophe Leroy Fixes: 2edb16efc899 ("powerpc/32: Add KASAN support") Cc: stable@vger.kernel.org Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/fe86886fb8db44360417cee0dc515ad47ca6ef72.1566382750.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/kasan/kasan_init_32.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c index e8ab3cc5f6e4..0e6ed4413eea 100644 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -34,7 +34,7 @@ static int __ref kasan_init_shadow_page_tables(unsigned long k_start, unsigned l { pmd_t *pmd; unsigned long k_cur, k_next; - pgprot_t prot = kasan_prot_ro(); + pgprot_t prot = slab_is_available() ?
kasan_prot_ro() : PAGE_KERNEL; pmd = pmd_offset(pud_offset(pgd_offset_k(k_start), k_start), k_start); @@ -110,9 +110,22 @@ static int __ref kasan_init_region(void *start, size_t size) static void __init kasan_remap_early_shadow_ro(void) { pgprot_t prot = kasan_prot_ro(); + unsigned long k_start = KASAN_SHADOW_START; + unsigned long k_end = KASAN_SHADOW_END; + unsigned long k_cur; + phys_addr_t pa = __pa(kasan_early_shadow_page); kasan_populate_pte(kasan_early_shadow_pte, prot); + for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) { + pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur); + pte_t *ptep = pte_offset_kernel(pmd, k_cur); + + if ((pte_val(*ptep) & PTE_RPN_MASK) != pa) + continue; + + __set_pte_at(&init_mm, k_cur, ptep, pfn_pte(PHYS_PFN(pa), prot), 0); + } flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END); } From 92974a1d006ad8b30d53047c70974c9e065eb7df Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 17 Sep 2019 11:30:55 +0200 Subject: [PATCH 125/334] net/sched: act_sample: don't push mac header on ip6gre ingress The current 'sample' action doesn't push the mac header of ingress packets if they are received by a layer 3 tunnel (like gre or sit); but it forgot to check for gre over ipv6, so the following script: # tc q a dev $d clsact # tc f a dev $d ingress protocol ip flower ip_proto icmp action sample \ > group 100 rate 1 # psample -v -g 100 dumps everything, including outer header and mac, when $d is a gre tunnel over ipv6. Fix this by adding a missing label for ARPHRD_IP6GRE devices. Fixes: 5c5670fae430 ("net/sched: Introduce sample tc action") Signed-off-by: Davide Caratti Reviewed-by: Yotam Gigi Signed-off-by: Jakub Kicinski --- net/sched/act_sample.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 692c4c9040fd..514456a0b9a8 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -146,6 +146,7 @@ static bool tcf_sample_dev_ok_push(struct net_device *dev) case ARPHRD_TUNNEL6: case ARPHRD_SIT: case ARPHRD_IPGRE: + case ARPHRD_IP6GRE: case ARPHRD_VOID: case ARPHRD_NONE: return false; From 9e5c8d39b88c15f4ec3ab24a73e09c112f209344 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Tue, 17 Sep 2019 13:30:52 +0300 Subject: [PATCH 126/334] dt-bindings: net: dwmac: fix 'mac-mode' type The 'mac-mode' property is similar to 'phy-mode' and 'phy-connection-type', which are enums of mode strings. The 'dwmac' driver supports almost all modes declared in the 'phy-mode' enum (except for 1 or 2). But in general, there may be a case where 'mac-mode' becomes more generic and is moved as part of phylib or netdev. In any case, the 'mac-mode' field should be made an enum, and it also makes sense to just reference the 'phy-connection-type' from 'ethernet-controller.yaml'. That will also make it more future-proof for new modes.
Fixes: 9c15d3597c62 ("dt-bindings: net: dwmac: document 'mac-mode' property") Signed-off-by: Alexandru Ardelean Reviewed-by: Andrew Lunn Reviewed-by: Rob Herring Signed-off-by: Jakub Kicinski --- Documentation/devicetree/bindings/net/snps,dwmac.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/snps,dwmac.yaml b/Documentation/devicetree/bindings/net/snps,dwmac.yaml index ebe4537a7cce..4845e29411e4 100644 --- a/Documentation/devicetree/bindings/net/snps,dwmac.yaml +++ b/Documentation/devicetree/bindings/net/snps,dwmac.yaml @@ -113,7 +113,7 @@ properties: const: stmmaceth mac-mode: - maxItems: 1 + $ref: ethernet-controller.yaml#/properties/phy-connection-type description: The property is identical to 'phy-mode', and assumes that there is mode converter in-between the MAC & PHY (e.g. GMII-to-RGMII). This converter From 0360894a05ed52be268e3c4d40b2df9d94975fa6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Sep 2019 10:30:21 -0700 Subject: [PATCH 127/334] selftests: Update fib_tests to handle missing ping6 Some distributions (e.g., debian buster) do not install ping6. Re-use the hook in pmtu.sh to detect this and fall back to ping. Fixes: a0e11da78f48 ("fib_tests: Add tests for metrics on routes") Signed-off-by: David Ahern Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_tests.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index 4465fc2dae14..cba83a12da82 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -17,6 +17,8 @@ PAUSE=no IP="ip -netns ns1" NS_EXEC="ip netns exec ns1" +which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping) + log_test() { local rc=$1 @@ -1086,7 +1088,7 @@ ipv6_route_metrics_test() log_test $rc 0 "Multipath route with mtu metric" $IP -6 ro add 2001:db8:104::/64 via 2001:db8:101::2 mtu 1300 - run_cmd "ip netns exec ns1 ping6 -w1 -c1 -s 1500 2001:db8:104::1" + run_cmd "ip netns exec ns1 ${ping6} -w1 -c1 -s 1500 2001:db8:104::1" log_test $? 0 "Using route with mtu metric" run_cmd "$IP -6 ro add 2001:db8:114::/64 via 2001:db8:101::2 congctl lock foo" From e84622ce24482f6e9c1bf29d3bdd556eb587ff41 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Sep 2019 10:30:35 -0700 Subject: [PATCH 128/334] selftests: Update fib_nexthop_multiprefix to handle missing ping6 Some distributions (e.g., debian buster) do not install ping6. Re-use the hook in pmtu.sh to detect this and fall back to ping.
Fixes: 735ab2f65dce ("selftests: Add test with multiple prefixes using single nexthop") Signed-off-by: David Ahern Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_nexthop_multiprefix.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/fib_nexthop_multiprefix.sh b/tools/testing/selftests/net/fib_nexthop_multiprefix.sh index e6828732843e..9dc35a16e415 100755 --- a/tools/testing/selftests/net/fib_nexthop_multiprefix.sh +++ b/tools/testing/selftests/net/fib_nexthop_multiprefix.sh @@ -15,6 +15,8 @@ PAUSE_ON_FAIL=no VERBOSE=0 +which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping) + ################################################################################ # helpers @@ -200,7 +202,7 @@ validate_v6_exception() local rc if [ ${ping_sz} != "0" ]; then - run_cmd ip netns exec h0 ping6 -s ${ping_sz} -c5 -w5 ${dst} + run_cmd ip netns exec h0 ${ping6} -s ${ping_sz} -c5 -w5 ${dst} fi if [ "$VERBOSE" = "1" ]; then @@ -243,7 +245,7 @@ do run_cmd taskset -c ${c} ip netns exec h0 ping -c1 -w1 172.16.10${i}.1 [ $? -ne 0 ] && printf "\nERROR: ping to h${i} failed\n" && ret=1 - run_cmd taskset -c ${c} ip netns exec h0 ping6 -c1 -w1 2001:db8:10${i}::1 + run_cmd taskset -c ${c} ip netns exec h0 ${ping6} -c1 -w1 2001:db8:10${i}::1 [ $? -ne 0 ] && printf "\nERROR: ping6 to h${i} failed\n" && ret=1 [ $ret -ne 0 ] && break From 77d5bc7e6a6cf8bbeca31aab7f0c5449a5eee762 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Sep 2019 10:39:49 -0700 Subject: [PATCH 129/334] ipv4: Revert removal of rt_uses_gateway Julian noted that rt_uses_gateway has a more subtle use than 'is gateway set': https://lore.kernel.org/netdev/alpine.LFD.2.21.1909151104060.2546@ja.home.ssi.bg/ Revert that part of the commit referenced in the Fixes tag. Currently, there are no u8 holes in 'struct rtable'. There is a 4-byte hole in the second cacheline which contains the gateway declaration. So move rt_gw_family down to the gateway declarations since they are always used together, and then re-use that u8 for rt_uses_gateway. End result is that rtable size is unchanged. 
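To visualize the layout argument, a minimal self-contained mock-up of the resulting field order (kernel types replaced with fixed-width equivalents so it compiles standalone; the authoritative layout is in the diff that follows):

#include <stdint.h>

struct rtable_sketch {
	unsigned int rt_flags;
	uint16_t rt_type;
	uint8_t  rt_is_input;
	uint8_t  rt_uses_gateway; /* re-uses the u8 slot freed below */

	int      rt_iif;

	uint8_t  rt_gw_family;    /* moved down next to the gateway fields */
	union {
		uint32_t rt_gw4;     /* __be32 in the kernel */
		uint8_t  rt_gw6[16]; /* struct in6_addr in the kernel */
	};
};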
Fixes: 1550c171935d ("ipv4: Prepare rtable for IPv6 gateway") Reported-by: Julian Anastasov Signed-off-by: David Ahern Reviewed-by: Julian Anastasov Signed-off-by: Jakub Kicinski --- drivers/infiniband/core/addr.c | 2 +- include/net/route.h | 3 ++- net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv4/ip_forward.c | 2 +- net/ipv4/ip_output.c | 2 +- net/ipv4/route.c | 34 +++++++++++++++++++-------------- net/ipv4/xfrm4_policy.c | 1 + 7 files changed, 28 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 9b76a8fcdd24..bf539c34ccd3 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -352,7 +352,7 @@ static bool has_gateway(const struct dst_entry *dst, sa_family_t family) if (family == AF_INET) { rt = container_of(dst, struct rtable, dst); - return rt->rt_gw_family == AF_INET; + return rt->rt_uses_gateway; } rt6 = container_of(dst, struct rt6_info, dst); diff --git a/include/net/route.h b/include/net/route.h index dfce19c9fa96..6c516840380d 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -53,10 +53,11 @@ struct rtable { unsigned int rt_flags; __u16 rt_type; __u8 rt_is_input; - u8 rt_gw_family; + __u8 rt_uses_gateway; int rt_iif; + u8 rt_gw_family; /* Info on neighbour */ union { __be32 rt_gw4; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f5c163d4771b..a9183543ca30 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -560,7 +560,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && rt->rt_gw_family) + if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; rcu_read_unlock(); return &rt->dst; @@ -598,7 +598,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && rt->rt_gw_family) + if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; return &rt->dst; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 06f6f280b9ff..00ec819f949b 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -123,7 +123,7 @@ int ip_forward(struct sk_buff *skb) rt = skb_rtable(skb); - if (opt->is_strictroute && rt->rt_gw_family) + if (opt->is_strictroute && rt->rt_uses_gateway) goto sr_failed; IPCB(skb)->flags |= IPSKB_FORWARDED; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 5eb73775c3f7..a77c3a4c24de 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -499,7 +499,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, skb_dst_set_noref(skb, &rt->dst); packet_routed: - if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gw_family) + if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) goto no_route; /* OK, we know where to send it, allocate and build IP header. 
*/ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b6a6f18c3dd1..7dcce724c78b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -635,6 +635,7 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh if (fnhe->fnhe_gw) { rt->rt_flags |= RTCF_REDIRECTED; + rt->rt_uses_gateway = 1; rt->rt_gw_family = AF_INET; rt->rt_gw4 = fnhe->fnhe_gw; } @@ -1313,7 +1314,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = READ_ONCE(dst->dev->mtu); if (unlikely(ip_mtu_locked(dst))) { - if (rt->rt_gw_family && mtu > 576) + if (rt->rt_uses_gateway && mtu > 576) mtu = 576; } @@ -1569,6 +1570,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, struct fib_nh_common *nhc = FIB_RES_NHC(*res); if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) { + rt->rt_uses_gateway = 1; rt->rt_gw_family = nhc->nhc_gw_family; /* only INET and INET6 are supported */ if (likely(nhc->nhc_gw_family == AF_INET)) @@ -1634,6 +1636,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev, rt->rt_iif = 0; rt->rt_pmtu = 0; rt->rt_mtu_locked = 0; + rt->rt_uses_gateway = 0; rt->rt_gw_family = 0; rt->rt_gw4 = 0; INIT_LIST_HEAD(&rt->rt_uncached); @@ -2694,6 +2697,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_genid = rt_genid_ipv4(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; + rt->rt_uses_gateway = ort->rt_uses_gateway; rt->rt_gw_family = ort->rt_gw_family; if (rt->rt_gw_family == AF_INET) rt->rt_gw4 = ort->rt_gw4; @@ -2778,21 +2782,23 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; } - if (rt->rt_gw_family == AF_INET && - nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { - goto nla_put_failure; - } else if (rt->rt_gw_family == AF_INET6) { - int alen = sizeof(struct in6_addr); - struct nlattr *nla; - struct rtvia *via; - - nla = nla_reserve(skb, RTA_VIA, alen + 2); - if (!nla) + if (rt->rt_uses_gateway) { + if (rt->rt_gw_family == AF_INET && + nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { goto nla_put_failure; + } else if (rt->rt_gw_family == AF_INET6) { + int alen = sizeof(struct in6_addr); + struct nlattr *nla; + struct rtvia *via; - via = nla_data(nla); - via->rtvia_family = AF_INET6; - memcpy(via->rtvia_addr, &rt->rt_gw6, alen); + nla = nla_reserve(skb, RTA_VIA, alen + 2); + if (!nla) + goto nla_put_failure; + + via = nla_data(nla); + via->rtvia_family = AF_INET6; + memcpy(via->rtvia_addr, &rt->rt_gw6, alen); + } } expires = rt->dst.expires; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index cdef8f9a3b01..35b84b52b702 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -85,6 +85,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); xdst->u.rt.rt_type = rt->rt_type; + xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; xdst->u.rt.rt_gw_family = rt->rt_gw_family; if (rt->rt_gw_family == AF_INET) xdst->u.rt.rt_gw4 = rt->rt_gw4; From 432264e9dfd10537e3cb66a11739f76754fc89e6 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Wed, 18 Sep 2019 14:14:47 +0300 Subject: [PATCH 130/334] dt-bindings: net: remove un-implemented property The `adi,disable-energy-detect` property was implemented in an initial version of the `adin` driver series, but after a review it was discarded in favor of implementing the ETHTOOL_PHY_EDPD phy-tunable option. 
With the ETHTOOL_PHY_EDPD control, it's possible to disable/enable Energy-Detect-Power-Down for the `adin` PHY, so this device-tree property is not needed. Fixes: 767078132ff9 ("dt-bindings: net: add bindings for ADIN PHY driver") Signed-off-by: Alexandru Ardelean Reviewed-by: Rob Herring Signed-off-by: Jakub Kicinski --- Documentation/devicetree/bindings/net/adi,adin.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/Documentation/devicetree/bindings/net/adi,adin.yaml b/Documentation/devicetree/bindings/net/adi,adin.yaml index 69375cb28e92..d95cc691a65f 100644 --- a/Documentation/devicetree/bindings/net/adi,adin.yaml +++ b/Documentation/devicetree/bindings/net/adi,adin.yaml @@ -36,12 +36,6 @@ properties: enum: [ 4, 8, 12, 16, 20, 24 ] default: 8 - adi,disable-energy-detect: - description: | - Disables Energy Detect Powerdown Mode (default disabled, i.e energy detect - is enabled if this property is unspecified) - type: boolean - examples: - | ethernet { @@ -68,6 +62,5 @@ examples: reg = <1>; adi,fifo-depth-bits = <16>; - adi,disable-energy-detect; }; }; From b41d936b5ecfdb3a4abc525ce6402a6c49cffddc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Sep 2019 08:05:39 -0700 Subject: [PATCH 131/334] sch_netem: fix a divide by zero in tabledist() syzbot managed to crash the kernel in tabledist() when loading an empty distribution table. t = dist->table[rnd % dist->size]; Simply return an error when such a load is attempted. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Jakub Kicinski --- net/sched/sch_netem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index b17f2ed970e2..f5cb35e550f8 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -777,7 +777,7 @@ static int get_dist_table(struct Qdisc *sch, struct disttable **tbl, struct disttable *d; int i; - if (n > NETEM_DIST_MAX) + if (!n || n > NETEM_DIST_MAX) return -EINVAL; d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL); From 7b09c2d052db4b4ad0b27b97918b46a7746966fa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 19 Sep 2019 10:12:36 -0700 Subject: [PATCH 132/334] ipv6: fix a typo in fib6_rule_lookup() Yi Ren reported an issue discovered by syzkaller, and bisected to the cited commit. Many thanks to Yi, this trivial patch does not reflect the patient work that has been done. Fixes: d64a1f574a29 ("ipv6: honor RT6_LOOKUP_F_DST_NOREF in rule lookup logic") Signed-off-by: Eric Dumazet Acked-by: Wei Wang Bisected-and-reported-by: Yi Ren Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_fib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 87f47bc55c5e..6e2af411cd9c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -318,7 +318,7 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, if (rt->dst.error == -EAGAIN) { ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; - if (!(flags | RT6_LOOKUP_F_DST_NOREF)) + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) dst_hold(&rt->dst); } From dc579ca5cfea3b9652db73009b394b9a3f46ae29 Mon Sep 17 00:00:00 2001 From: Yan-Hsuan Chuang Date: Mon, 16 Sep 2019 15:03:34 +0800 Subject: [PATCH 133/334] rtw88: pci: extract skbs free routine for trx rings These skb free routines can be used when the driver wants to stop the PCI bus, because some of the skbs remaining in the queue may not have been returned via the DMA interrupt.
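As a rough sketch, the extracted routine is the standard 'unlink, unmap, free' walk over a pending-skb queue (kernel-style sketch; get_tx_dma() is a hypothetical stand-in for the driver's per-skb DMA bookkeeping):

#include <linux/skbuff.h>
#include <linux/pci.h>

static void drain_tx_queue(struct pci_dev *pdev, struct sk_buff_head *queue)
{
	struct sk_buff *skb, *tmp;

	/* free every skb remaining in the tx list */
	skb_queue_walk_safe(queue, skb, tmp) {
		__skb_unlink(skb, queue);
		/* get_tx_dma(): hypothetical accessor for the DMA address
		 * stored alongside the skb */
		pci_unmap_single(pdev, get_tx_dma(skb), skb->len,
				 PCI_DMA_TODEVICE);
		dev_kfree_skb_any(skb);
	}
}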
Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo --- drivers/net/wireless/realtek/rtw88/pci.c | 36 +++++++++++++++++------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/pci.c b/drivers/net/wireless/realtek/rtw88/pci.c index 3fdb52a5789a..bc3a36402e56 100644 --- a/drivers/net/wireless/realtek/rtw88/pci.c +++ b/drivers/net/wireless/realtek/rtw88/pci.c @@ -90,16 +90,13 @@ static inline void *rtw_pci_get_tx_desc(struct rtw_pci_tx_ring *tx_ring, u8 idx) return tx_ring->r.head + offset; } -static void rtw_pci_free_tx_ring(struct rtw_dev *rtwdev, - struct rtw_pci_tx_ring *tx_ring) +static void rtw_pci_free_tx_ring_skbs(struct rtw_dev *rtwdev, + struct rtw_pci_tx_ring *tx_ring) { struct pci_dev *pdev = to_pci_dev(rtwdev->dev); struct rtw_pci_tx_data *tx_data; struct sk_buff *skb, *tmp; dma_addr_t dma; - u8 *head = tx_ring->r.head; - u32 len = tx_ring->r.len; - int ring_sz = len * tx_ring->r.desc_size; /* free every skb remained in tx list */ skb_queue_walk_safe(&tx_ring->queue, skb, tmp) { @@ -110,21 +107,30 @@ static void rtw_pci_free_tx_ring(struct rtw_dev *rtwdev, pci_unmap_single(pdev, dma, skb->len, PCI_DMA_TODEVICE); dev_kfree_skb_any(skb); } +} + +static void rtw_pci_free_tx_ring(struct rtw_dev *rtwdev, + struct rtw_pci_tx_ring *tx_ring) +{ + struct pci_dev *pdev = to_pci_dev(rtwdev->dev); + u8 *head = tx_ring->r.head; + u32 len = tx_ring->r.len; + int ring_sz = len * tx_ring->r.desc_size; + + rtw_pci_free_tx_ring_skbs(rtwdev, tx_ring); /* free the ring itself */ pci_free_consistent(pdev, ring_sz, head, tx_ring->r.dma); tx_ring->r.head = NULL; } -static void rtw_pci_free_rx_ring(struct rtw_dev *rtwdev, - struct rtw_pci_rx_ring *rx_ring) +static void rtw_pci_free_rx_ring_skbs(struct rtw_dev *rtwdev, + struct rtw_pci_rx_ring *rx_ring) { struct pci_dev *pdev = to_pci_dev(rtwdev->dev); struct sk_buff *skb; - dma_addr_t dma; - u8 *head = rx_ring->r.head; int buf_sz = RTK_PCI_RX_BUF_SIZE; - int ring_sz = rx_ring->r.desc_size * rx_ring->r.len; + dma_addr_t dma; int i; for (i = 0; i < rx_ring->r.len; i++) { @@ -137,6 +143,16 @@ static void rtw_pci_free_rx_ring(struct rtw_dev *rtwdev, dev_kfree_skb(skb); rx_ring->buf[i] = NULL; } +} + +static void rtw_pci_free_rx_ring(struct rtw_dev *rtwdev, + struct rtw_pci_rx_ring *rx_ring) +{ + struct pci_dev *pdev = to_pci_dev(rtwdev->dev); + u8 *head = rx_ring->r.head; + int ring_sz = rx_ring->r.desc_size * rx_ring->r.len; + + rtw_pci_free_rx_ring_skbs(rtwdev, rx_ring); pci_free_consistent(pdev, ring_sz, head, rx_ring->r.dma); } From 0e41edcdfe86435fef709b7de8397e8a5a0e1b2f Mon Sep 17 00:00:00 2001 From: Yan-Hsuan Chuang Date: Mon, 16 Sep 2019 15:03:35 +0800 Subject: [PATCH 134/334] rtw88: pci: release tx skbs DMAed when stop Interrupts are disabled to stop PCI, which means the skbs queued for each TX ring will not be released via the DMA interrupt. To avoid leaving those skbs in the skb queue until the PCI device has been removed, the driver needs to release the skbs itself.
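The ordering in the stop path is the point of the patch: mask interrupts first so no further DMA completions can arrive, then release whatever is still queued, all under the same irq_lock (condensed from the diff that follows):

	spin_lock_irqsave(&rtwpci->irq_lock, flags);
	rtw_pci_disable_interrupt(rtwdev, rtwpci); /* no more completion IRQs */
	rtw_pci_dma_release(rtwdev, rtwpci);       /* now safe to free queued skbs */
	spin_unlock_irqrestore(&rtwpci->irq_lock, flags);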
Signed-off-by: Yan-Hsuan Chuang Reviewed-by: Brian Norris Tested-by: Brian Norris Signed-off-by: Kalle Valo --- drivers/net/wireless/realtek/rtw88/pci.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/net/wireless/realtek/rtw88/pci.c b/drivers/net/wireless/realtek/rtw88/pci.c index bc3a36402e56..d90928be663b 100644 --- a/drivers/net/wireless/realtek/rtw88/pci.c +++ b/drivers/net/wireless/realtek/rtw88/pci.c @@ -500,6 +500,17 @@ static void rtw_pci_dma_reset(struct rtw_dev *rtwdev, struct rtw_pci *rtwpci) rtwpci->rx_tag = 0; } +static void rtw_pci_dma_release(struct rtw_dev *rtwdev, struct rtw_pci *rtwpci) +{ + struct rtw_pci_tx_ring *tx_ring; + u8 queue; + + for (queue = 0; queue < RTK_MAX_TX_QUEUE_NUM; queue++) { + tx_ring = &rtwpci->tx_rings[queue]; + rtw_pci_free_tx_ring_skbs(rtwdev, tx_ring); + } +} + static int rtw_pci_start(struct rtw_dev *rtwdev) { struct rtw_pci *rtwpci = (struct rtw_pci *)rtwdev->priv; @@ -521,6 +532,7 @@ static void rtw_pci_stop(struct rtw_dev *rtwdev) spin_lock_irqsave(&rtwpci->irq_lock, flags); rtw_pci_disable_interrupt(rtwdev, rtwpci); + rtw_pci_dma_release(rtwdev, rtwpci); spin_unlock_irqrestore(&rtwpci->irq_lock, flags); } From 6355592e6b55be8c568b62efba5fe5a1c919a2db Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 19 Sep 2019 11:15:32 +0200 Subject: [PATCH 135/334] zd1211rw: zd_usb: Use "%zu" to format size_t MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On 32-bit: drivers/net/wireless/zydas/zd1211rw/zd_usb.c: In function ‘check_read_regs’: drivers/net/wireless/zydas/zd1211rw/zd_def.h:18:25: warning: format ‘%ld’ expects argument of type ‘long int’, but argument 6 has type ‘size_t’ {aka ‘unsigned int’} [-Wformat=] dev_printk(level, dev, "%s() " fmt, __func__, ##args) ^~~~~~~ drivers/net/wireless/zydas/zd1211rw/zd_def.h:22:4: note: in expansion of macro ‘dev_printk_f’ dev_printk_f(KERN_DEBUG, dev, fmt, ## args) ^~~~~~~~~~~~ drivers/net/wireless/zydas/zd1211rw/zd_usb.c:1635:3: note: in expansion of macro ‘dev_dbg_f’ dev_dbg_f(zd_usb_dev(usb), ^~~~~~~~~ drivers/net/wireless/zydas/zd1211rw/zd_usb.c:1636:51: note: format string is defined here "error: actual length %d less than expected %ld\n", ~~^ %d Fixes: 84b0b66352470e64 ("zd1211rw: zd_usb: Use struct_size() helper") Signed-off-by: Geert Uytterhoeven Signed-off-by: Kalle Valo --- drivers/net/wireless/zydas/zd1211rw/zd_usb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/zydas/zd1211rw/zd_usb.c b/drivers/net/wireless/zydas/zd1211rw/zd_usb.c index 4e44ea8c652d..7b5c2fe5bd4d 100644 --- a/drivers/net/wireless/zydas/zd1211rw/zd_usb.c +++ b/drivers/net/wireless/zydas/zd1211rw/zd_usb.c @@ -1633,7 +1633,7 @@ static bool check_read_regs(struct zd_usb *usb, struct usb_req_read_regs *req, */ if (rr->length < struct_size(regs, regs, count)) { dev_dbg_f(zd_usb_dev(usb), - "error: actual length %d less than expected %ld\n", + "error: actual length %d less than expected %zu\n", rr->length, struct_size(regs, regs, count)); return false; } From 7e7db86c7e1088e768438fe6c894d748b0c32abe Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 19 Sep 2019 04:00:55 -0500 Subject: [PATCH 136/334] smb3: allow decryption keys to be dumped by admin for debugging In order to debug certain problems it is important to be able to decrypt network traces (e.g. wireshark) but to do this we need to be able to dump out the encryption/decryption keys. 
Dumping them to an ioctl is safer than dumping them to dmesg (and better than showing all keys in a pseudofile). Restrict this to root (CAP_SYS_ADMIN), and only for a mount that this admin has access to. Sample smbinfo output: SMB3.0 encryption Session Id: 0x82d2ec52 Session Key: a5 6d 81 d0 e c1 ca e1 d8 13 aa 20 e8 f2 cc 71 Server Encryption Key: 1a c3 be ba 3d fc dc 3c e bc 93 9e 50 9e 19 c1 Server Decryption Key: e0 d4 d9 43 1b a2 1b e3 d8 76 77 49 56 f7 20 88 Reviewed-by: Aurelien Aptel Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifs_ioctl.h | 9 +++++++++ fs/cifs/ioctl.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index 6c3bd07868d7..0f0dc1c1fe41 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -57,9 +57,18 @@ struct smb_query_info { /* char buffer[]; */ } __packed; +struct smb3_key_debug_info { + __u64 Suid; + __u16 cipher_type; + __u8 auth_key[16]; /* SMB2_NTLMV2_SESSKEY_SIZE */ + __u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE]; + __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE]; +} __packed; + #define CIFS_IOCTL_MAGIC 0xCF #define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) #define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4) #define CIFS_IOC_GET_MNT_INFO _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info) #define CIFS_ENUMERATE_SNAPSHOTS _IOR(CIFS_IOCTL_MAGIC, 6, struct smb_snapshot_array) #define CIFS_QUERY_INFO _IOWR(CIFS_IOCTL_MAGIC, 7, struct smb_query_info) +#define CIFS_DUMP_KEY _IOWR(CIFS_IOCTL_MAGIC, 8, struct smb3_key_debug_info) diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 76ddd98b6298..1a01e108d75e 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -164,6 +164,7 @@ static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon, long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) { struct inode *inode = file_inode(filep); + struct smb3_key_debug_info pkey_inf; int rc = -ENOTTY; /* strange error - but the precedent */ unsigned int xid; struct cifsFileInfo *pSMBFile = filep->private_data; @@ -270,6 +271,34 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) else rc = -EOPNOTSUPP; break; + case CIFS_DUMP_KEY: + if (pSMBFile == NULL) + break; + if (!capable(CAP_SYS_ADMIN)) { + rc = -EACCES; + break; + } + + tcon = tlink_tcon(pSMBFile->tlink); + if (!smb3_encryption_required(tcon)) { + rc = -EOPNOTSUPP; + break; + } + pkey_inf.cipher_type = + le16_to_cpu(tcon->ses->server->cipher_type); + pkey_inf.Suid = tcon->ses->Suid; + memcpy(pkey_inf.auth_key, tcon->ses->auth_key.response, + 16 /* SMB2_NTLMV2_SESSKEY_SIZE */); + memcpy(pkey_inf.smb3decryptionkey, + tcon->ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE); + memcpy(pkey_inf.smb3encryptionkey, + tcon->ses->smb3encryptionkey, SMB3_SIGN_KEY_SIZE); + if (copy_to_user((void __user *)arg, &pkey_inf, + sizeof(struct smb3_key_debug_info))) + rc = -EFAULT; + else + rc = 0; + break; default: cifs_dbg(FYI, "unsupported ioctl\n"); break; From 3fe4b3351301660653a2bc73f2226da0ebd2b95e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Wed, 18 Sep 2019 14:01:46 +0200 Subject: [PATCH 137/334] cdc_ncm: fix divide-by-zero caused by invalid wMaxPacketSize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Endpoints with zero wMaxPacketSize are not usable for transferring data.
Ignore such endpoints when looking for valid in, out and status pipes, to make the driver more robust against invalid and meaningless descriptors. The wMaxPacketSize of the out pipe is used as a divisor. So this change fixes a divide-by-zero bug. Reported-by: syzbot+ce366e2b8296e25d84f5@syzkaller.appspotmail.com Signed-off-by: Bjørn Mork Signed-off-by: Jakub Kicinski --- drivers/net/usb/cdc_ncm.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 50c05d0f44cb..00cab3f43a4c 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -681,8 +681,12 @@ cdc_ncm_find_endpoints(struct usbnet *dev, struct usb_interface *intf) u8 ep; for (ep = 0; ep < intf->cur_altsetting->desc.bNumEndpoints; ep++) { - e = intf->cur_altsetting->endpoint + ep; + + /* ignore endpoints which cannot transfer data */ + if (!usb_endpoint_maxp(&e->desc)) + continue; + switch (e->desc.bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) { case USB_ENDPOINT_XFER_INT: if (usb_endpoint_dir_in(&e->desc)) { From 8d3d7c2029c1b360f1a6b0a2fca470b57eb575c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Wed, 18 Sep 2019 14:17:38 +0200 Subject: [PATCH 138/334] usbnet: ignore endpoints with invalid wMaxPacketSize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Endpoints with zero wMaxPacketSize are not usable for transferring data. Ignore such endpoints when looking for valid in, out and status pipes, to make the drivers more robust against invalid and meaningless descriptors. The wMaxPacketSize values of these endpoints are used for memory allocations and as divisors in many usbnet minidrivers. Avoiding zero is therefore critical. Signed-off-by: Bjørn Mork Signed-off-by: Jakub Kicinski --- drivers/net/usb/usbnet.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index e44849499b89..dde05e2fdc3e 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -100,6 +100,11 @@ int usbnet_get_endpoints(struct usbnet *dev, struct usb_interface *intf) int intr = 0; e = alt->endpoint + ep; + + /* ignore endpoints which cannot transfer data */ + if (!usb_endpoint_maxp(&e->desc)) + continue; + switch (e->desc.bmAttributes) { case USB_ENDPOINT_XFER_INT: if (!usb_endpoint_dir_in(&e->desc)) From e47488b2df7f9cb405789c7f5d4c27909fc597ae Mon Sep 17 00:00:00 2001 From: Peter Mamonov Date: Wed, 18 Sep 2019 19:27:55 +0300 Subject: [PATCH 139/334] net/phy: fix DP83865 10 Mbps HDX loopback disable function According to the DP83865 datasheet "the 10 Mbps HDX loopback can be disabled in the expanded memory register 0x1C0.1". The driver erroneously used bit 0 instead of bit 1.
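The fix amounts to replacing the magic constants (1 / 0xfffe, i.e. bit 0) with a named mask for bit 1; a self-contained sketch of the bit manipulation (register access stubbed out):

#include <stdint.h>

#define LB_DIS (1u << 1) /* expanded register 0x1C0, bit 1: loopback disable */

/* reg holds the current value of expanded register 0x1C0 */
static uint16_t set_10m_hdx_loopback(uint16_t reg, int disable)
{
	return disable ? (reg | LB_DIS) : (reg & (uint16_t)~LB_DIS);
}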
Fixes: 4621bf129856 ("phy: Add file missed in previous commit.") Signed-off-by: Peter Mamonov Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- drivers/net/phy/national.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/national.c b/drivers/net/phy/national.c index a221dd552c3c..a5bf0874c7d8 100644 --- a/drivers/net/phy/national.c +++ b/drivers/net/phy/national.c @@ -105,14 +105,17 @@ static void ns_giga_speed_fallback(struct phy_device *phydev, int mode) static void ns_10_base_t_hdx_loopack(struct phy_device *phydev, int disable) { + u16 lb_dis = BIT(1); + if (disable) - ns_exp_write(phydev, 0x1c0, ns_exp_read(phydev, 0x1c0) | 1); + ns_exp_write(phydev, 0x1c0, + ns_exp_read(phydev, 0x1c0) | lb_dis); else ns_exp_write(phydev, 0x1c0, - ns_exp_read(phydev, 0x1c0) & 0xfffe); + ns_exp_read(phydev, 0x1c0) & ~lb_dis); pr_debug("10BASE-T HDX loopback %s\n", - (ns_exp_read(phydev, 0x1c0) & 0x0001) ? "off" : "on"); + (ns_exp_read(phydev, 0x1c0) & lb_dis) ? "off" : "on"); } static int ns_config_init(struct phy_device *phydev) From 73f0c11d11329a0d6d205d4312b6e5d2512af7c5 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Wed, 18 Sep 2019 10:21:17 -0700 Subject: [PATCH 140/334] net: qrtr: Stop rx_worker before freeing node As the endpoint is unregistered there might still be work pending to handle incoming messages, which will result in a use after free scenario. The plan is to remove the rx_worker, but until then (and for stable@) ensure that the work is stopped before the node is freed. Fixes: bdabad3e363d ("net: Add Qualcomm IPC router") Cc: stable@vger.kernel.org Signed-off-by: Bjorn Andersson Signed-off-by: Jakub Kicinski --- net/qrtr/qrtr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 6c8b0f6d28f9..88f98f27ad88 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -150,6 +150,7 @@ static void __qrtr_node_release(struct kref *kref) list_del(&node->item); mutex_unlock(&qrtr_node_lock); + cancel_work_sync(&node->work); skb_queue_purge(&node->rx_queue); kfree(node); } From b0e1ee435aba68c4080e3cb67adf6573aa5bcc6d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 18 Sep 2019 22:21:24 +0200 Subject: [PATCH 141/334] net: remove netx ethernet driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ARM netx platform got removed in 5.3, so this driver is now useless. Reported-by: Uwe Kleine-König Cc: Sascha Hauer Signed-off-by: Arnd Bergmann Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/Kconfig | 11 - drivers/net/ethernet/Makefile | 1 - drivers/net/ethernet/netx-eth.c | 497 ------------------------- include/linux/platform_data/eth-netx.h | 13 - 4 files changed, 522 deletions(-) delete mode 100644 drivers/net/ethernet/netx-eth.c delete mode 100644 include/linux/platform_data/eth-netx.h diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig index 1e2de9d062bf..e8e9c166185d 100644 --- a/drivers/net/ethernet/Kconfig +++ b/drivers/net/ethernet/Kconfig @@ -140,17 +140,6 @@ source "drivers/net/ethernet/neterion/Kconfig" source "drivers/net/ethernet/netronome/Kconfig" source "drivers/net/ethernet/ni/Kconfig" source "drivers/net/ethernet/8390/Kconfig" - -config NET_NETX - tristate "NetX Ethernet support" - select MII - depends on ARCH_NETX - ---help--- - This is support for the Hilscher netX builtin Ethernet ports - - To compile this driver as a module, choose M here. The module - will be called netx-eth. 
- source "drivers/net/ethernet/nvidia/Kconfig" source "drivers/net/ethernet/nxp/Kconfig" source "drivers/net/ethernet/oki-semi/Kconfig" diff --git a/drivers/net/ethernet/Makefile b/drivers/net/ethernet/Makefile index 77f9838a76c9..05abebc17804 100644 --- a/drivers/net/ethernet/Makefile +++ b/drivers/net/ethernet/Makefile @@ -64,7 +64,6 @@ obj-$(CONFIG_NET_VENDOR_NATSEMI) += natsemi/ obj-$(CONFIG_NET_VENDOR_NETERION) += neterion/ obj-$(CONFIG_NET_VENDOR_NETRONOME) += netronome/ obj-$(CONFIG_NET_VENDOR_NI) += ni/ -obj-$(CONFIG_NET_NETX) += netx-eth.o obj-$(CONFIG_NET_VENDOR_NVIDIA) += nvidia/ obj-$(CONFIG_LPC_ENET) += nxp/ obj-$(CONFIG_NET_VENDOR_OKI) += oki-semi/ diff --git a/drivers/net/ethernet/netx-eth.c b/drivers/net/ethernet/netx-eth.c deleted file mode 100644 index cf6e7eb1b1e1..000000000000 --- a/drivers/net/ethernet/netx-eth.c +++ /dev/null @@ -1,497 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * drivers/net/ethernet/netx-eth.c - * - * Copyright (c) 2005 Sascha Hauer , Pengutronix - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -/* XC Fifo Offsets */ -#define EMPTY_PTR_FIFO(xcno) (0 + ((xcno) << 3)) /* Index of the empty pointer FIFO */ -#define IND_FIFO_PORT_HI(xcno) (1 + ((xcno) << 3)) /* Index of the FIFO where received */ - /* Data packages are indicated by XC */ -#define IND_FIFO_PORT_LO(xcno) (2 + ((xcno) << 3)) /* Index of the FIFO where received */ - /* Data packages are indicated by XC */ -#define REQ_FIFO_PORT_HI(xcno) (3 + ((xcno) << 3)) /* Index of the FIFO where Data packages */ - /* have to be indicated by ARM which */ - /* shall be sent */ -#define REQ_FIFO_PORT_LO(xcno) (4 + ((xcno) << 3)) /* Index of the FIFO where Data packages */ - /* have to be indicated by ARM which shall */ - /* be sent */ -#define CON_FIFO_PORT_HI(xcno) (5 + ((xcno) << 3)) /* Index of the FIFO where sent Data packages */ - /* are confirmed */ -#define CON_FIFO_PORT_LO(xcno) (6 + ((xcno) << 3)) /* Index of the FIFO where sent Data */ - /* packages are confirmed */ -#define PFIFO_MASK(xcno) (0x7f << (xcno*8)) - -#define FIFO_PTR_FRAMELEN_SHIFT 0 -#define FIFO_PTR_FRAMELEN_MASK (0x7ff << 0) -#define FIFO_PTR_FRAMELEN(len) (((len) << 0) & FIFO_PTR_FRAMELEN_MASK) -#define FIFO_PTR_TIMETRIG (1<<11) -#define FIFO_PTR_MULTI_REQ -#define FIFO_PTR_ORIGIN (1<<14) -#define FIFO_PTR_VLAN (1<<15) -#define FIFO_PTR_FRAMENO_SHIFT 16 -#define FIFO_PTR_FRAMENO_MASK (0x3f << 16) -#define FIFO_PTR_FRAMENO(no) (((no) << 16) & FIFO_PTR_FRAMENO_MASK) -#define FIFO_PTR_SEGMENT_SHIFT 22 -#define FIFO_PTR_SEGMENT_MASK (0xf << 22) -#define FIFO_PTR_SEGMENT(seg) (((seg) & 0xf) << 22) -#define FIFO_PTR_ERROR_SHIFT 28 -#define FIFO_PTR_ERROR_MASK (0xf << 28) - -#define ISR_LINK_STATUS_CHANGE (1<<4) -#define ISR_IND_LO (1<<3) -#define ISR_CON_LO (1<<2) -#define ISR_IND_HI (1<<1) -#define ISR_CON_HI (1<<0) - -#define ETH_MAC_LOCAL_CONFIG 0x1560 -#define ETH_MAC_4321 0x1564 -#define ETH_MAC_65 0x1568 - -#define MAC_TRAFFIC_CLASS_ARRANGEMENT_SHIFT 16 -#define MAC_TRAFFIC_CLASS_ARRANGEMENT_MASK (0xf<data; - unsigned int len = skb->len; - - spin_lock_irq(&priv->lock); - memcpy_toio(priv->sram_base + 1560, (void *)buf, len); - if (len < 60) { - memset_io(priv->sram_base + 1560 + len, 0, 60 - len); - len = 60; - } - - pfifo_push(REQ_FIFO_PORT_LO(priv->id), - FIFO_PTR_SEGMENT(priv->id) | - FIFO_PTR_FRAMENO(1) | - FIFO_PTR_FRAMELEN(len)); - - ndev->stats.tx_packets++; - ndev->stats.tx_bytes += 
skb->len; - - netif_stop_queue(ndev); - spin_unlock_irq(&priv->lock); - dev_kfree_skb(skb); - - return NETDEV_TX_OK; -} - -static void netx_eth_receive(struct net_device *ndev) -{ - struct netx_eth_priv *priv = netdev_priv(ndev); - unsigned int val, frameno, seg, len; - unsigned char *data; - struct sk_buff *skb; - - val = pfifo_pop(IND_FIFO_PORT_LO(priv->id)); - - frameno = (val & FIFO_PTR_FRAMENO_MASK) >> FIFO_PTR_FRAMENO_SHIFT; - seg = (val & FIFO_PTR_SEGMENT_MASK) >> FIFO_PTR_SEGMENT_SHIFT; - len = (val & FIFO_PTR_FRAMELEN_MASK) >> FIFO_PTR_FRAMELEN_SHIFT; - - skb = netdev_alloc_skb(ndev, len); - if (unlikely(skb == NULL)) { - ndev->stats.rx_dropped++; - return; - } - - data = skb_put(skb, len); - - memcpy_fromio(data, priv->sram_base + frameno * 1560, len); - - pfifo_push(EMPTY_PTR_FIFO(priv->id), - FIFO_PTR_SEGMENT(seg) | FIFO_PTR_FRAMENO(frameno)); - - skb->protocol = eth_type_trans(skb, ndev); - netif_rx(skb); - ndev->stats.rx_packets++; - ndev->stats.rx_bytes += len; -} - -static irqreturn_t -netx_eth_interrupt(int irq, void *dev_id) -{ - struct net_device *ndev = dev_id; - struct netx_eth_priv *priv = netdev_priv(ndev); - int status; - unsigned long flags; - - spin_lock_irqsave(&priv->lock, flags); - - status = readl(NETX_PFIFO_XPEC_ISR(priv->id)); - while (status) { - int fill_level; - writel(status, NETX_PFIFO_XPEC_ISR(priv->id)); - - if ((status & ISR_CON_HI) || (status & ISR_IND_HI)) - printk("%s: unexpected status: 0x%08x\n", - __func__, status); - - fill_level = - readl(NETX_PFIFO_FILL_LEVEL(IND_FIFO_PORT_LO(priv->id))); - while (fill_level--) - netx_eth_receive(ndev); - - if (status & ISR_CON_LO) - netif_wake_queue(ndev); - - if (status & ISR_LINK_STATUS_CHANGE) - mii_check_media(&priv->mii, netif_msg_link(priv), 1); - - status = readl(NETX_PFIFO_XPEC_ISR(priv->id)); - } - spin_unlock_irqrestore(&priv->lock, flags); - return IRQ_HANDLED; -} - -static int netx_eth_open(struct net_device *ndev) -{ - struct netx_eth_priv *priv = netdev_priv(ndev); - - if (request_irq - (ndev->irq, netx_eth_interrupt, IRQF_SHARED, ndev->name, ndev)) - return -EAGAIN; - - writel(ndev->dev_addr[0] | - ndev->dev_addr[1]<<8 | - ndev->dev_addr[2]<<16 | - ndev->dev_addr[3]<<24, - priv->xpec_base + NETX_XPEC_RAM_START_OFS + ETH_MAC_4321); - writel(ndev->dev_addr[4] | - ndev->dev_addr[5]<<8, - priv->xpec_base + NETX_XPEC_RAM_START_OFS + ETH_MAC_65); - - writel(LOCAL_CONFIG_LINK_STATUS_IRQ_EN | - LOCAL_CONFIG_CON_LO_IRQ_EN | - LOCAL_CONFIG_CON_HI_IRQ_EN | - LOCAL_CONFIG_IND_LO_IRQ_EN | - LOCAL_CONFIG_IND_HI_IRQ_EN, - priv->xpec_base + NETX_XPEC_RAM_START_OFS + - ETH_MAC_LOCAL_CONFIG); - - mii_check_media(&priv->mii, netif_msg_link(priv), 1); - netif_start_queue(ndev); - - return 0; -} - -static int netx_eth_close(struct net_device *ndev) -{ - struct netx_eth_priv *priv = netdev_priv(ndev); - - netif_stop_queue(ndev); - - writel(0, - priv->xpec_base + NETX_XPEC_RAM_START_OFS + ETH_MAC_LOCAL_CONFIG); - - free_irq(ndev->irq, ndev); - - return 0; -} - -static void netx_eth_timeout(struct net_device *ndev) -{ - struct netx_eth_priv *priv = netdev_priv(ndev); - int i; - - printk(KERN_ERR "%s: transmit timed out, resetting\n", ndev->name); - - spin_lock_irq(&priv->lock); - - xc_reset(priv->xc); - xc_start(priv->xc); - - for (i=2; i<=18; i++) - pfifo_push(EMPTY_PTR_FIFO(priv->id), - FIFO_PTR_FRAMENO(i) | FIFO_PTR_SEGMENT(priv->id)); - - spin_unlock_irq(&priv->lock); - - netif_wake_queue(ndev); -} - -static int -netx_eth_phy_read(struct net_device *ndev, int phy_id, int reg) -{ - unsigned int val; - - val = 
MIIMU_SNRDY | MIIMU_PREAMBLE | MIIMU_PHYADDR(phy_id) | - MIIMU_REGADDR(reg) | MIIMU_PHY_NRES; - - writel(val, NETX_MIIMU); - while (readl(NETX_MIIMU) & MIIMU_SNRDY); - - return readl(NETX_MIIMU) >> 16; - -} - -static void -netx_eth_phy_write(struct net_device *ndev, int phy_id, int reg, int value) -{ - unsigned int val; - - val = MIIMU_SNRDY | MIIMU_PREAMBLE | MIIMU_PHYADDR(phy_id) | - MIIMU_REGADDR(reg) | MIIMU_PHY_NRES | MIIMU_OPMODE_WRITE | - MIIMU_DATA(value); - - writel(val, NETX_MIIMU); - while (readl(NETX_MIIMU) & MIIMU_SNRDY); -} - -static const struct net_device_ops netx_eth_netdev_ops = { - .ndo_open = netx_eth_open, - .ndo_stop = netx_eth_close, - .ndo_start_xmit = netx_eth_hard_start_xmit, - .ndo_tx_timeout = netx_eth_timeout, - .ndo_set_rx_mode = netx_eth_set_multicast_list, - .ndo_validate_addr = eth_validate_addr, - .ndo_set_mac_address = eth_mac_addr, -}; - -static int netx_eth_enable(struct net_device *ndev) -{ - struct netx_eth_priv *priv = netdev_priv(ndev); - unsigned int mac4321, mac65; - int running, i, ret; - bool inv_mac_addr = false; - - ndev->netdev_ops = &netx_eth_netdev_ops; - ndev->watchdog_timeo = msecs_to_jiffies(5000); - - priv->msg_enable = NETIF_MSG_LINK; - priv->mii.phy_id_mask = 0x1f; - priv->mii.reg_num_mask = 0x1f; - priv->mii.force_media = 0; - priv->mii.full_duplex = 0; - priv->mii.dev = ndev; - priv->mii.mdio_read = netx_eth_phy_read; - priv->mii.mdio_write = netx_eth_phy_write; - priv->mii.phy_id = INTERNAL_PHY_ADR + priv->id; - - running = xc_running(priv->xc); - xc_stop(priv->xc); - - /* if the xc engine is already running, assume the bootloader has - * loaded the firmware for us - */ - if (running) { - /* get Node Address from hardware */ - mac4321 = readl(priv->xpec_base + - NETX_XPEC_RAM_START_OFS + ETH_MAC_4321); - mac65 = readl(priv->xpec_base + - NETX_XPEC_RAM_START_OFS + ETH_MAC_65); - - ndev->dev_addr[0] = mac4321 & 0xff; - ndev->dev_addr[1] = (mac4321 >> 8) & 0xff; - ndev->dev_addr[2] = (mac4321 >> 16) & 0xff; - ndev->dev_addr[3] = (mac4321 >> 24) & 0xff; - ndev->dev_addr[4] = mac65 & 0xff; - ndev->dev_addr[5] = (mac65 >> 8) & 0xff; - } else { - if (xc_request_firmware(priv->xc)) { - printk(CARDNAME ": requesting firmware failed\n"); - return -ENODEV; - } - } - - xc_reset(priv->xc); - xc_start(priv->xc); - - if (!is_valid_ether_addr(ndev->dev_addr)) - inv_mac_addr = true; - - for (i=2; i<=18; i++) - pfifo_push(EMPTY_PTR_FIFO(priv->id), - FIFO_PTR_FRAMENO(i) | FIFO_PTR_SEGMENT(priv->id)); - - ret = register_netdev(ndev); - if (inv_mac_addr) - printk("%s: Invalid ethernet MAC address. 
Please set using ip\n", - ndev->name); - - return ret; -} - -static int netx_eth_drv_probe(struct platform_device *pdev) -{ - struct netx_eth_priv *priv; - struct net_device *ndev; - struct netxeth_platform_data *pdata; - int ret; - - ndev = alloc_etherdev(sizeof (struct netx_eth_priv)); - if (!ndev) { - ret = -ENOMEM; - goto exit; - } - SET_NETDEV_DEV(ndev, &pdev->dev); - - platform_set_drvdata(pdev, ndev); - - priv = netdev_priv(ndev); - - pdata = dev_get_platdata(&pdev->dev); - priv->xc = request_xc(pdata->xcno, &pdev->dev); - if (!priv->xc) { - dev_err(&pdev->dev, "unable to request xc engine\n"); - ret = -ENODEV; - goto exit_free_netdev; - } - - ndev->irq = priv->xc->irq; - priv->id = pdev->id; - priv->xpec_base = priv->xc->xpec_base; - priv->xmac_base = priv->xc->xmac_base; - priv->sram_base = priv->xc->sram_base; - - spin_lock_init(&priv->lock); - - ret = pfifo_request(PFIFO_MASK(priv->id)); - if (ret) { - printk("unable to request PFIFO\n"); - goto exit_free_xc; - } - - ret = netx_eth_enable(ndev); - if (ret) - goto exit_free_pfifo; - - return 0; -exit_free_pfifo: - pfifo_free(PFIFO_MASK(priv->id)); -exit_free_xc: - free_xc(priv->xc); -exit_free_netdev: - free_netdev(ndev); -exit: - return ret; -} - -static int netx_eth_drv_remove(struct platform_device *pdev) -{ - struct net_device *ndev = platform_get_drvdata(pdev); - struct netx_eth_priv *priv = netdev_priv(ndev); - - unregister_netdev(ndev); - xc_stop(priv->xc); - free_xc(priv->xc); - free_netdev(ndev); - pfifo_free(PFIFO_MASK(priv->id)); - - return 0; -} - -static int netx_eth_drv_suspend(struct platform_device *pdev, pm_message_t state) -{ - dev_err(&pdev->dev, "suspend not implemented\n"); - return 0; -} - -static int netx_eth_drv_resume(struct platform_device *pdev) -{ - dev_err(&pdev->dev, "resume not implemented\n"); - return 0; -} - -static struct platform_driver netx_eth_driver = { - .probe = netx_eth_drv_probe, - .remove = netx_eth_drv_remove, - .suspend = netx_eth_drv_suspend, - .resume = netx_eth_drv_resume, - .driver = { - .name = CARDNAME, - }, -}; - -static int __init netx_eth_init(void) -{ - unsigned int phy_control, val; - - printk("NetX Ethernet driver\n"); - - phy_control = PHY_CONTROL_PHY_ADDRESS(INTERNAL_PHY_ADR>>1) | - PHY_CONTROL_PHY1_MODE(PHY_MODE_ALL) | - PHY_CONTROL_PHY1_AUTOMDIX | - PHY_CONTROL_PHY1_EN | - PHY_CONTROL_PHY0_MODE(PHY_MODE_ALL) | - PHY_CONTROL_PHY0_AUTOMDIX | - PHY_CONTROL_PHY0_EN | - PHY_CONTROL_CLK_XLATIN; - - val = readl(NETX_SYSTEM_IOC_ACCESS_KEY); - writel(val, NETX_SYSTEM_IOC_ACCESS_KEY); - - writel(phy_control | PHY_CONTROL_RESET, NETX_SYSTEM_PHY_CONTROL); - udelay(100); - - val = readl(NETX_SYSTEM_IOC_ACCESS_KEY); - writel(val, NETX_SYSTEM_IOC_ACCESS_KEY); - - writel(phy_control, NETX_SYSTEM_PHY_CONTROL); - - return platform_driver_register(&netx_eth_driver); -} - -static void __exit netx_eth_cleanup(void) -{ - platform_driver_unregister(&netx_eth_driver); -} - -module_init(netx_eth_init); -module_exit(netx_eth_cleanup); - -MODULE_AUTHOR("Sascha Hauer, Pengutronix"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform:" CARDNAME); -MODULE_FIRMWARE("xc0.bin"); -MODULE_FIRMWARE("xc1.bin"); -MODULE_FIRMWARE("xc2.bin"); diff --git a/include/linux/platform_data/eth-netx.h b/include/linux/platform_data/eth-netx.h deleted file mode 100644 index a3a6322668d8..000000000000 --- a/include/linux/platform_data/eth-netx.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (c) 2005 Sascha Hauer , Pengutronix - */ - -#ifndef __ETH_NETX_H -#define 
__ETH_NETX_H - -struct netxeth_platform_data { - unsigned int xcno; /* number of xmac/xpec engine this eth uses */ -}; - -#endif From 62794fc4fbf52f2209dc094ea255eaef760e7d01 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 18 Sep 2019 16:24:12 -0700 Subject: [PATCH 142/334] net_sched: add max len check for TCA_KIND The TCA_KIND attribute is of type NLA_STRING, which does not check for a terminating NUL char. KMSAN reported an uninit-value of TCA_KIND which is likely caused by the lack of a NUL terminator. Change it to NLA_NUL_STRING and add a max len too. Fixes: 8b4c3cdd9dd8 ("net: sched: Add policy validation for tc attributes") Reported-and-tested-by: syzbot+618aacd49e8c8b8486bd@syzkaller.appspotmail.com Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Reviewed-by: David Ahern Acked-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- net/sched/sch_api.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 1047825d9f48..81d58b280612 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1390,7 +1390,8 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) } const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = { - [TCA_KIND] = { .type = NLA_STRING }, + [TCA_KIND] = { .type = NLA_NUL_STRING, + .len = IFNAMSIZ - 1 }, [TCA_RATE] = { .type = NLA_BINARY, .len = sizeof(struct tc_estimator) }, [TCA_STAB] = { .type = NLA_NESTED }, From 199ce850ce112315cfc68d42b694bcaa27b097b7 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 18 Sep 2019 18:44:43 -0700 Subject: [PATCH 143/334] net_sched: add policy validation for action attributes Similar to commit 8b4c3cdd9dd8 ("net: sched: Add policy validation for tc attributes"), we need to add proper policy validation for TC action attributes too. Cc: David Ahern Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- net/sched/act_api.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 339712296164..2558f00f6b3e 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -831,6 +831,15 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) return c; } +static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { + [TCA_ACT_KIND] = { .type = NLA_NUL_STRING, + .len = IFNAMSIZ - 1 }, + [TCA_ACT_INDEX] = { .type = NLA_U32 }, + [TCA_ACT_COOKIE] = { .type = NLA_BINARY, + .len = TC_COOKIE_MAX_SIZE }, + [TCA_ACT_OPTIONS] = { .type = NLA_NESTED }, +}; + struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, @@ -846,8 +855,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, int err; if (name == NULL) { - err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, NULL, - extack); + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, + tcf_action_policy, extack); if (err < 0) goto err_out; err = -EINVAL; @@ -856,18 +865,9 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, NL_SET_ERR_MSG(extack, "TC action kind must be specified"); goto err_out; } - if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) { - NL_SET_ERR_MSG(extack, "TC action name too long"); - goto err_out; - } + nla_strlcpy(act_name, kind, IFNAMSIZ); + if (tb[TCA_ACT_COOKIE]) { - int cklen = nla_len(tb[TCA_ACT_COOKIE]); - - if (cklen > TC_COOKIE_MAX_SIZE) { - NL_SET_ERR_MSG(extack, "TC cookie size above the maximum"); - goto err_out; - }
- cookie = nla_memdup_cookie(tb); if (!cookie) { NL_SET_ERR_MSG(extack, "No memory to generate TC cookie"); @@ -1098,7 +1098,8 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, int index; int err; - err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, NULL, extack); + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, + tcf_action_policy, extack); if (err < 0) goto err_out; @@ -1152,7 +1153,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, b = skb_tail_pointer(skb); - err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, NULL, extack); + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, + tcf_action_policy, extack); if (err < 0) goto err_out; @@ -1440,7 +1442,7 @@ static struct nlattr *find_dump_kind(struct nlattr **nla) if (tb[1] == NULL) return NULL; - if (nla_parse_nested_deprecated(tb2, TCA_ACT_MAX, tb[1], NULL, NULL) < 0) + if (nla_parse_nested_deprecated(tb2, TCA_ACT_MAX, tb[1], tcf_action_policy, NULL) < 0) return NULL; kind = tb2[TCA_ACT_KIND]; From a8d570de0cc6acd6dc994515f163ccbc5e54ab60 Mon Sep 17 00:00:00 2001 From: Mao Wenan Date: Thu, 19 Sep 2019 14:38:19 +0800 Subject: [PATCH 144/334] net: dsa: sja1105: Add dependency for NET_DSA_SJA1105_TAS If CONFIG_NET_DSA_SJA1105_TAS=y and CONFIG_NET_SCH_TAPRIO=n, the errors below can be found: drivers/net/dsa/sja1105/sja1105_tas.o: In function `sja1105_setup_tc_taprio': sja1105_tas.c:(.text+0x318): undefined reference to `taprio_offload_free' sja1105_tas.c:(.text+0x590): undefined reference to `taprio_offload_get' drivers/net/dsa/sja1105/sja1105_tas.o: In function `sja1105_tas_teardown': sja1105_tas.c:(.text+0x610): undefined reference to `taprio_offload_free' make: *** [vmlinux] Error 1 sja1105_tas needs tc-taprio, so this patch adds the dependency for it. Fixes: 317ab5b86c8e ("net: dsa: sja1105: Configure the Time-Aware Scheduler via tc-taprio offload") Signed-off-by: Mao Wenan Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- drivers/net/dsa/sja1105/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/dsa/sja1105/Kconfig b/drivers/net/dsa/sja1105/Kconfig index 55424f39cb0d..f40b248f0b23 100644 --- a/drivers/net/dsa/sja1105/Kconfig +++ b/drivers/net/dsa/sja1105/Kconfig @@ -27,6 +27,7 @@ config NET_DSA_SJA1105_PTP config NET_DSA_SJA1105_TAS bool "Support for the Time-Aware Scheduler on NXP SJA1105" depends on NET_DSA_SJA1105 + depends on NET_SCH_TAPRIO help This enables support for the TTEthernet-based egress scheduling engine in the SJA1105 DSA driver, which is controlled using a From b6b6cc9acd7b99df216c007e9565aebdc2c77469 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Sep 2019 14:33:43 +0200 Subject: [PATCH 145/334] net: stmmac: selftest: avoid large stack usage Putting a struct stmmac_rss object on the stack is a bad idea, as it exceeds the warning limit for a stack frame on 32-bit architectures: drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c:1221:12: error: stack frame size of 1208 bytes in function '__stmmac_test_l3filt' [-Werror,-Wframe-larger-than=] drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c:1338:12: error: stack frame size of 1208 bytes in function '__stmmac_test_l4filt' [-Werror,-Wframe-larger-than=] As the object is only needed for the trivial empty case, change the called function to accept a NULL pointer to mean the same thing and remove the large variable in the two callers.
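The 'NULL config means disable' convention adopted here is a common way to avoid large on-stack dummy objects; a generic, self-contained sketch (names hypothetical, sizes illustrative):

#include <stdbool.h>
#include <stddef.h>

struct rss_cfg {
	bool enable;
	unsigned char key[1024]; /* large enough to blow a 32-bit stack-frame budget */
};

/* A NULL cfg now means "disable", so callers no longer need a ~1 KB
 * zeroed object on the stack just to turn the feature off. */
static int rss_configure(const struct rss_cfg *cfg)
{
	if (!cfg || !cfg->enable)
		return 0; /* disable path: nothing to program */
	/* ... program the hardware from cfg->key ... */
	return 0;
}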
Fixes: 4647e021193d ("net: stmmac: selftests: Add selftest for L3/L4 Filters") Signed-off-by: Arnd Bergmann Acked-by: Jose Abreu Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/dwxgmac2_core.c | 5 ++--- .../net/ethernet/stmicro/stmmac/stmmac_selftests.c | 14 ++++---------- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c index d5173dd02a71..2b277b2c586b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c @@ -523,19 +523,18 @@ static int dwxgmac2_rss_configure(struct mac_device_info *hw, struct stmmac_rss *cfg, u32 num_rxq) { void __iomem *ioaddr = hw->pcsr; - u32 *key = (u32 *)cfg->key; int i, ret; u32 value; value = readl(ioaddr + XGMAC_RSS_CTRL); - if (!cfg->enable) { + if (!cfg || !cfg->enable) { value &= ~XGMAC_RSSE; writel(value, ioaddr + XGMAC_RSS_CTRL); return 0; } for (i = 0; i < (sizeof(cfg->key) / sizeof(u32)); i++) { - ret = dwxgmac2_rss_write_reg(ioaddr, true, i, *key++); + ret = dwxgmac2_rss_write_reg(ioaddr, true, i, cfg->key[i]); if (ret) return ret; } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c index c56e89e1ae56..9c8d210b2d6a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c @@ -1233,12 +1233,9 @@ static int __stmmac_test_l3filt(struct stmmac_priv *priv, u32 dst, u32 src, return -EOPNOTSUPP; if (!priv->dma_cap.l3l4fnum) return -EOPNOTSUPP; - if (priv->rss.enable) { - struct stmmac_rss rss = { .enable = false, }; - - stmmac_rss_configure(priv, priv->hw, &rss, + if (priv->rss.enable) + stmmac_rss_configure(priv, priv->hw, NULL, priv->plat->rx_queues_to_use); - } dissector = kzalloc(sizeof(*dissector), GFP_KERNEL); if (!dissector) { @@ -1357,12 +1354,9 @@ static int __stmmac_test_l4filt(struct stmmac_priv *priv, u32 dst, u32 src, return -EOPNOTSUPP; if (!priv->dma_cap.l3l4fnum) return -EOPNOTSUPP; - if (priv->rss.enable) { - struct stmmac_rss rss = { .enable = false, }; - - stmmac_rss_configure(priv, priv->hw, &rss, + if (priv->rss.enable) + stmmac_rss_configure(priv, priv->hw, NULL, priv->plat->rx_queues_to_use); - } dissector = kzalloc(sizeof(*dissector), GFP_KERNEL); if (!dissector) { From 24ccb0ab95bf14e414cf2ba65af5458bc5a2e865 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 20 Sep 2019 06:56:56 +0200 Subject: [PATCH 146/334] qede: qede_fp: simplify a bit 'qede_rx_build_skb()' Use 'skb_put_data()' instead of rewriting it. This improves readability.
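The two forms are equivalent; skb_put_data() is a small helper that wraps (roughly) exactly this skb_put()+memcpy() sequence:

	/* open-coded */
	memcpy(skb_put(skb, len), page_address(bd->data) + offset, len);

	/* helper: the same thing, with the length mentioned only once */
	skb_put_data(skb, page_address(bd->data) + offset, len);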
Signed-off-by: Christophe JAILLET Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qede/qede_fp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index 0ae28f0d2523..004c0bfec41d 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -779,8 +779,7 @@ qede_rx_build_skb(struct qede_dev *edev, return NULL; skb_reserve(skb, pad); - memcpy(skb_put(skb, len), - page_address(bd->data) + offset, len); + skb_put_data(skb, page_address(bd->data) + offset, len); qede_reuse_page(rxq, bd); goto out; } From dd89d82e751473d13da0daea1bfb15d5634071c4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 15 May 2019 12:34:21 +0300 Subject: [PATCH 147/334] thermal: thermal_mmio: remove some dead code The platform_get_resource() function doesn't return error pointers, it returns NULL. The way this is normally done is that we pass the NULL resource to devm_ioremap_resource() and then check for errors from that. See the comment in front of devm_ioremap_resource() for more details. Signed-off-by: Dan Carpenter Acked-by: Talel Shenhar Signed-off-by: Eduardo Valentin --- drivers/thermal/thermal_mmio.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/thermal/thermal_mmio.c b/drivers/thermal/thermal_mmio.c index de3cceea23bc..40524fa13533 100644 --- a/drivers/thermal/thermal_mmio.c +++ b/drivers/thermal/thermal_mmio.c @@ -53,13 +53,6 @@ static int thermal_mmio_probe(struct platform_device *pdev) return -ENOMEM; resource = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (IS_ERR(resource)) { - dev_err(&pdev->dev, - "fail to get platform memory resource (%ld)\n", - PTR_ERR(resource)); - return PTR_ERR(resource); - } - sensor->mmio_base = devm_ioremap_resource(&pdev->dev, resource); if (IS_ERR(sensor->mmio_base)) { dev_err(&pdev->dev, "failed to ioremap memory (%ld)\n", From ff04cfbaa23644562f369eeca0b44ef66e185c9e Mon Sep 17 00:00:00 2001 From: Mao Wenan Date: Sun, 22 Sep 2019 13:38:08 +0800 Subject: [PATCH 148/334] net: ena: Select DIMLIB for ENA_ETHERNET If CONFIG_ENA_ETHERNET=y and CONFIG_DIMLIB=n, the following errors can be seen: drivers/net/ethernet/amazon/ena/ena_netdev.o: In function `ena_dim_work': ena_netdev.c:(.text+0x21cc): undefined reference to `net_dim_get_rx_moderation' ena_netdev.c:(.text+0x21cc): relocation truncated to fit: R_AARCH64_CALL26 against undefined symbol `net_dim_get_rx_moderation' drivers/net/ethernet/amazon/ena/ena_netdev.o: In function `ena_io_poll': ena_netdev.c:(.text+0x7bd4): undefined reference to `net_dim' ena_netdev.c:(.text+0x7bd4): relocation truncated to fit: R_AARCH64_CALL26 against undefined symbol `net_dim' Commit 282faf61a053 ("net: ena: switch to dim algorithm for rx adaptive interrupt moderation") introduced the dim algorithm, which is gated by CONFIG_DIMLIB. So, this patch selects DIMLIB for ENA_ETHERNET.
Fixes: 282faf61a053 ("net: ena: switch to dim algorithm for rx adaptive interrupt moderation") Signed-off-by: Mao Wenan Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amazon/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/amazon/Kconfig b/drivers/net/ethernet/amazon/Kconfig index 69ca99d8ac26..cca72a75f551 100644 --- a/drivers/net/ethernet/amazon/Kconfig +++ b/drivers/net/ethernet/amazon/Kconfig @@ -19,6 +19,7 @@ if NET_VENDOR_AMAZON config ENA_ETHERNET tristate "Elastic Network Adapter (ENA) support" depends on PCI_MSI && !CPU_BIG_ENDIAN + select DIMLIB ---help--- This driver supports Elastic Network Adapter (ENA)" From 73a63ee9955494e18a6f7d9ba396f5e78e3ce307 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 21 Sep 2019 08:59:26 +0300 Subject: [PATCH 149/334] ionic: Fix an error code in ionic_lif_alloc() We need to set the error code on this path. Otherwise it probably results in a NULL dereference down the line. Fixes: aa3198819bea ("ionic: Add RSS support") Signed-off-by: Dan Carpenter Acked-by: Shannon Nelson Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/pensando/ionic/ionic_lif.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index db7c82742828..72107a0627a9 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -1704,6 +1704,7 @@ static struct ionic_lif *ionic_lif_alloc(struct ionic *ionic, unsigned int index GFP_KERNEL); if (!lif->rss_ind_tbl) { + err = -ENOMEM; dev_err(dev, "Failed to allocate rss indirection table, aborting\n"); goto err_out_free_qcqs; } From 938e4d49c26eee7b6cb39f2d516126ac39391511 Mon Sep 17 00:00:00 2001 From: Nishad Kamdar Date: Sat, 21 Sep 2019 19:00:16 +0530 Subject: [PATCH 150/334] net: dsa: b53: Use the correct style for SPDX License Identifier This patch corrects the SPDX License Identifier style in header file for Broadcom BCM53xx managed switch driver. For C header files Documentation/process/license-rules.rst mandates C-like comments (as opposed to C source files, where C++ style should be used). Changes made by using a script provided by Joe Perches here: https://lkml.org/lkml/2019/2/7/46. Suggested-by: Joe Perches Signed-off-by: Nishad Kamdar Reviewed-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/b53/b53_serdes.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/b53/b53_serdes.h b/drivers/net/dsa/b53/b53_serdes.h index 3bb4f91aec9e..55d280fe38e4 100644 --- a/drivers/net/dsa/b53/b53_serdes.h +++ b/drivers/net/dsa/b53/b53_serdes.h @@ -1,5 +1,5 @@ -/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause - * +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* * Northstar Plus switch SerDes/SGMII PHY definitions * * Copyright (C) 2018 Florian Fainelli From 34b4688425d9841a19a15fa6ae2bfc12a372650f Mon Sep 17 00:00:00 2001 From: Nishad Kamdar Date: Sat, 21 Sep 2019 19:15:25 +0530 Subject: [PATCH 151/334] net: dsa: Use the correct style for SPDX License Identifier This patch corrects the SPDX License Identifier style in header file for Distributed Switch Architecture drivers. For C header files Documentation/process/license-rules.rst mandates C-like comments (as opposed to C source files, where C++ style should be used). Changes made by using a script provided by Joe Perches here: https://lkml.org/lkml/2019/2/7/46.
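The two styles the rule distinguishes, in short (per license-rules.rst; GPL-2.0 used here only as a placeholder expression):

	// SPDX-License-Identifier: GPL-2.0       <- C source files (.c)

	/* SPDX-License-Identifier: GPL-2.0 */    <- C header files (.h)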
Suggested-by: Joe Perches Signed-off-by: Nishad Kamdar Reviewed-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/lantiq_pce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/lantiq_pce.h b/drivers/net/dsa/lantiq_pce.h index 180663138e75..e2be31f3672a 100644 --- a/drivers/net/dsa/lantiq_pce.h +++ b/drivers/net/dsa/lantiq_pce.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * PCE microcode extracted from UGW 7.1.1 switch api * From 65643f4c8217edb1f40f7fb05f996c3a4798442a Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Mon, 23 Sep 2019 13:58:59 +0800 Subject: [PATCH 152/334] nfsd: Make nfsd_reset_boot_verifier_locked static Fix sparse warning: fs/nfsd/nfssvc.c:364:6: warning: symbol 'nfsd_reset_boot_verifier_locked' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: J. Bruce Fields --- fs/nfsd/nfssvc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 3caaf5675259..fdf7ed4bd5dd 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -361,7 +361,7 @@ void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn) done_seqretry(&nn->boot_lock, seq); } -void nfsd_reset_boot_verifier_locked(struct nfsd_net *nn) +static void nfsd_reset_boot_verifier_locked(struct nfsd_net *nn) { ktime_get_real_ts64(&nn->nfssvc_boot); } From ca14c996afe7228ff9b480cf225211cc17212688 Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Mon, 23 Sep 2019 13:17:54 -0400 Subject: [PATCH 153/334] x86/purgatory: Disable the stackleak GCC plugin for the purgatory Since commit: b059f801a937 ("x86/purgatory: Use CFLAGS_REMOVE rather than reset KBUILD_CFLAGS") kexec breaks if GCC_PLUGIN_STACKLEAK=y is enabled, as the purgatory contains undefined references to stackleak_track_stack. Attempting to load a kexec kernel results in this failure: kexec: Undefined symbol: stackleak_track_stack kexec-bzImage64: Loading purgatory failed Fix this by disabling the stackleak plugin for the purgatory. Signed-off-by: Arvind Sankar Reviewed-by: Nick Desaulniers Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: b059f801a937 ("x86/purgatory: Use CFLAGS_REMOVE rather than reset KBUILD_CFLAGS") Link: https://lkml.kernel.org/r/20190923171753.GA2252517@rani.riverdale.lan Signed-off-by: Ingo Molnar --- arch/x86/purgatory/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 10fb42da0007..b81b5172cf99 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -23,6 +23,7 @@ KCOV_INSTRUMENT := n PURGATORY_CFLAGS_REMOVE := -mcmodel=kernel PURGATORY_CFLAGS := -mcmodel=large -ffreestanding -fno-zero-initialized-in-bss +PURGATORY_CFLAGS += $(DISABLE_STACKLEAK_PLUGIN) # Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That # in turn leaves some undefined symbols like __fentry__ in purgatory and not From 83a63072c815e8a042c60fa964dcbde2a6df0e87 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 26 Aug 2019 13:03:11 -0400 Subject: [PATCH 154/334] nfsd: fix nfs read eof detection Currently, the knfsd server assumes that a short read indicates an end of file. That assumption is incorrect. The short read means that either we've hit the end of file, or we've hit a read error. 
In the case of a read error, the client may want to retry (as per the implementation recommendations in RFC1813 and RFC7530), but currently it is being told that it hit an eof. Move the code to detect eof from version specific code into the generic nfsd read. Report eof only in the two following cases: 1) read() returns a zero length short read with no error. 2) the offset+length of the read is >= the file size. Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs3proc.c | 9 ++------- fs/nfsd/nfs4xdr.c | 11 +++-------- fs/nfsd/nfsproc.c | 4 +++- fs/nfsd/vfs.c | 37 ++++++++++++++++++++++++++----------- fs/nfsd/vfs.h | 28 ++++++---------------------- fs/nfsd/xdr3.h | 2 +- 6 files changed, 41 insertions(+), 50 deletions(-) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 9bc32af4e2da..cea68d8411ac 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -172,13 +172,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp) nfserr = nfsd_read(rqstp, &resp->fh, argp->offset, rqstp->rq_vec, argp->vlen, - &resp->count); - if (nfserr == 0) { - struct inode *inode = d_inode(resp->fh.fh_dentry); - resp->eof = nfsd_eof_on_read(cnt, resp->count, argp->offset, - inode->i_size); - } - + &resp->count, + &resp->eof); RETURN_STATUS(nfserr); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c1fc2641e3e7..533d0fc3c96b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3472,7 +3472,7 @@ static __be32 nfsd4_encode_splice_read( len = maxcount; nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp, - file, read->rd_offset, &maxcount); + file, read->rd_offset, &maxcount, &eof); read->rd_length = maxcount; if (nfserr) { /* @@ -3484,9 +3484,6 @@ static __be32 nfsd4_encode_splice_read( return nfserr; } - eof = nfsd_eof_on_read(len, maxcount, read->rd_offset, - d_inode(read->rd_fhp->fh_dentry)->i_size); - *(p++) = htonl(eof); *(p++) = htonl(maxcount); @@ -3557,15 +3554,13 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, len = maxcount; nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, - resp->rqstp->rq_vec, read->rd_vlen, &maxcount); + resp->rqstp->rq_vec, read->rd_vlen, &maxcount, + &eof); read->rd_length = maxcount; if (nfserr) return nfserr; xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3)); - eof = nfsd_eof_on_read(len, maxcount, read->rd_offset, - d_inode(read->rd_fhp->fh_dentry)->i_size); - tmp = htonl(eof); write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4); tmp = htonl(maxcount); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 0d20fd161225..c83ddac22f38 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -172,6 +172,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) struct nfsd_readargs *argp = rqstp->rq_argp; struct nfsd_readres *resp = rqstp->rq_resp; __be32 nfserr; + u32 eof; dprintk("nfsd: READ %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), @@ -195,7 +196,8 @@ nfsd_proc_read(struct svc_rqst *rqstp) nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset, rqstp->rq_vec, argp->vlen, - &resp->count); + &resp->count, + &eof); if (nfserr) return nfserr; return fh_getattr(&resp->fh, &resp->stat); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 0867d5319fdb..bd0a385df3fc 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -834,12 +834,23 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, return __splice_from_pipe(pipe, sd, nfsd_splice_actor); } +static u32 nfsd_eof_on_read(struct file *file, loff_t offset, ssize_t len, + size_t expected) +{ + if (expected != 0 && len == 0) + 
return 1; + if (offset+len >= i_size_read(file_inode(file))) + return 1; + return 0; +} + static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, - unsigned long *count, int host_err) + unsigned long *count, u32 *eof, ssize_t host_err) { if (host_err >= 0) { nfsdstats.io_read += host_err; + *eof = nfsd_eof_on_read(file, offset, host_err, *count); *count = host_err; fsnotify_access(file); trace_nfsd_read_io_done(rqstp, fhp, offset, *count); @@ -851,7 +862,8 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, } __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct file *file, loff_t offset, unsigned long *count) + struct file *file, loff_t offset, unsigned long *count, + u32 *eof) { struct splice_desc sd = { .len = 0, @@ -859,25 +871,27 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, .pos = offset, .u.data = rqstp, }; - int host_err; + ssize_t host_err; trace_nfsd_read_splice(rqstp, fhp, offset, *count); rqstp->rq_next_page = rqstp->rq_respages + 1; host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); - return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err); + return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } __be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, - struct kvec *vec, int vlen, unsigned long *count) + struct kvec *vec, int vlen, unsigned long *count, + u32 *eof) { struct iov_iter iter; - int host_err; + loff_t ppos = offset; + ssize_t host_err; trace_nfsd_read_vector(rqstp, fhp, offset, *count); iov_iter_kvec(&iter, READ, vec, vlen, *count); - host_err = vfs_iter_read(file, &iter, &offset, 0); - return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err); + host_err = vfs_iter_read(file, &iter, &ppos, 0); + return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } /* @@ -984,7 +998,8 @@ out_nfserr: * N.B. 
After this call fhp needs an fh_put */ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, - loff_t offset, struct kvec *vec, int vlen, unsigned long *count) + loff_t offset, struct kvec *vec, int vlen, unsigned long *count, + u32 *eof) { struct nfsd_file *nf; struct file *file; @@ -997,9 +1012,9 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, file = nf->nf_file; if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) - err = nfsd_splice_read(rqstp, fhp, file, offset, count); + err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof); else - err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count); + err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count, eof); nfsd_file_put(nf); diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index e0f7792165a6..a13fd9d7e1f5 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -80,13 +80,16 @@ __be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, - unsigned long *count); + unsigned long *count, + u32 *eof); __be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, - unsigned long *count); + unsigned long *count, + u32 *eof); __be32 nfsd_read(struct svc_rqst *, struct svc_fh *, - loff_t, struct kvec *, int, unsigned long *); + loff_t, struct kvec *, int, unsigned long *, + u32 *eof); __be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t, struct kvec *, int, unsigned long *, int); __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, @@ -149,23 +152,4 @@ static inline int nfsd_create_is_exclusive(int createmode) || createmode == NFS4_CREATE_EXCLUSIVE4_1; } -static inline bool nfsd_eof_on_read(long requested, long read, - loff_t offset, loff_t size) -{ - /* We assume a short read means eof: */ - if (requested > read) - return true; - /* - * A non-short read might also reach end of file. The spec - * still requires us to set eof in that case. - * - * Further operations may have modified the file size since - * the read, so the following check is not atomic with the read. - * We've only seen that cause a problem for a client in the case - * where the read returned a count of 0 without setting eof. - * That case was fixed by the addition of the above check. - */ - return (offset + read >= size); -} - #endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 2cb29e961a76..99ff9f403ff1 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -151,7 +151,7 @@ struct nfsd3_readres { __be32 status; struct svc_fh fh; unsigned long count; - int eof; + __u32 eof; }; struct nfsd3_writeres { From 3a83f677a6eeff65751b29e3648d7c69c3be83f3 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 11 Sep 2019 17:31:55 -0500 Subject: [PATCH 155/334] KVM: PPC: Book3S HV: use smp_mb() when setting/clearing host_ipi flag On a 2-socket Power9 system with 32 cores/128 threads (SMT4) and 1TB of memory running the following guest configs: guest A: - 224GB of memory - 56 VCPUs (sockets=1,cores=28,threads=2), where: VCPUs 0-1 are pinned to CPUs 0-3, VCPUs 2-3 are pinned to CPUs 4-7, ... 
VCPUs 54-55 are pinned to CPUs 108-111 guest B: - 4GB of memory - 4 VCPUs (sockets=1,cores=4,threads=1) with the following workloads (with KSM and THP enabled in all): guest A: stress --cpu 40 --io 20 --vm 20 --vm-bytes 512M guest B: stress --cpu 4 --io 4 --vm 4 --vm-bytes 512M host: stress --cpu 4 --io 4 --vm 2 --vm-bytes 256M the below soft-lockup traces were observed after an hour or so and persisted until the host was reset (this was found to be reliably reproducible for this configuration, for kernels 4.15, 4.18, 5.0, and 5.3-rc5): [ 1253.183290] rcu: INFO: rcu_sched self-detected stall on CPU [ 1253.183319] rcu: 124-....: (5250 ticks this GP) idle=10a/1/0x4000000000000002 softirq=5408/5408 fqs=1941 [ 1256.287426] watchdog: BUG: soft lockup - CPU#105 stuck for 23s! [CPU 52/KVM:19709] [ 1264.075773] watchdog: BUG: soft lockup - CPU#24 stuck for 23s! [worker:19913] [ 1264.079769] watchdog: BUG: soft lockup - CPU#31 stuck for 23s! [worker:20331] [ 1264.095770] watchdog: BUG: soft lockup - CPU#45 stuck for 23s! [worker:20338] [ 1264.131773] watchdog: BUG: soft lockup - CPU#64 stuck for 23s! [avocado:19525] [ 1280.408480] watchdog: BUG: soft lockup - CPU#124 stuck for 22s! [ksmd:791] [ 1316.198012] rcu: INFO: rcu_sched self-detected stall on CPU [ 1316.198032] rcu: 124-....: (21003 ticks this GP) idle=10a/1/0x4000000000000002 softirq=5408/5408 fqs=8243 [ 1340.411024] watchdog: BUG: soft lockup - CPU#124 stuck for 22s! [ksmd:791] [ 1379.212609] rcu: INFO: rcu_sched self-detected stall on CPU [ 1379.212629] rcu: 124-....: (36756 ticks this GP) idle=10a/1/0x4000000000000002 softirq=5408/5408 fqs=14714 [ 1404.413615] watchdog: BUG: soft lockup - CPU#124 stuck for 22s! [ksmd:791] [ 1442.227095] rcu: INFO: rcu_sched self-detected stall on CPU [ 1442.227115] rcu: 124-....: (52509 ticks this GP) idle=10a/1/0x4000000000000002 softirq=5408/5408 fqs=21403 [ 1455.111787] INFO: task worker:19907 blocked for more than 120 seconds. [ 1455.111822] Tainted: G L 5.3.0-rc5-mdr-vanilla+ #1 [ 1455.111833] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1455.111884] INFO: task worker:19908 blocked for more than 120 seconds. [ 1455.111905] Tainted: G L 5.3.0-rc5-mdr-vanilla+ #1 [ 1455.111925] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1455.111966] INFO: task worker:20328 blocked for more than 120 seconds. [ 1455.111986] Tainted: G L 5.3.0-rc5-mdr-vanilla+ #1 [ 1455.111998] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1455.112048] INFO: task worker:20330 blocked for more than 120 seconds. [ 1455.112068] Tainted: G L 5.3.0-rc5-mdr-vanilla+ #1 [ 1455.112097] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1455.112138] INFO: task worker:20332 blocked for more than 120 seconds. [ 1455.112159] Tainted: G L 5.3.0-rc5-mdr-vanilla+ #1 [ 1455.112179] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1455.112210] INFO: task worker:20333 blocked for more than 120 seconds. [ 1455.112231] Tainted: G L 5.3.0-rc5-mdr-vanilla+ #1 [ 1455.112242] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1455.112282] INFO: task worker:20335 blocked for more than 120 seconds. [ 1455.112303] Tainted: G L 5.3.0-rc5-mdr-vanilla+ #1 [ 1455.112332] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1455.112372] INFO: task worker:20336 blocked for more than 120 seconds. 
[ 1455.112392] Tainted: G L 5.3.0-rc5-mdr-vanilla+ #1 CPUs 45, 24, and 124 are stuck on spin locks, likely held by CPUs 105 and 31. CPUs 105 and 31 are stuck in smp_call_function_many(), waiting on target CPU 42. For instance: # CPU 105 registers (via xmon) R00 = c00000000020b20c R16 = 00007d1bcd800000 R01 = c00000363eaa7970 R17 = 0000000000000001 R02 = c0000000019b3a00 R18 = 000000000000006b R03 = 000000000000002a R19 = 00007d537d7aecf0 R04 = 000000000000002a R20 = 60000000000000e0 R05 = 000000000000002a R21 = 0801000000000080 R06 = c0002073fb0caa08 R22 = 0000000000000d60 R07 = c0000000019ddd78 R23 = 0000000000000001 R08 = 000000000000002a R24 = c00000000147a700 R09 = 0000000000000001 R25 = c0002073fb0ca908 R10 = c000008ffeb4e660 R26 = 0000000000000000 R11 = c0002073fb0ca900 R27 = c0000000019e2464 R12 = c000000000050790 R28 = c0000000000812b0 R13 = c000207fff623e00 R29 = c0002073fb0ca808 R14 = 00007d1bbee00000 R30 = c0002073fb0ca800 R15 = 00007d1bcd600000 R31 = 0000000000000800 pc = c00000000020b260 smp_call_function_many+0x3d0/0x460 cfar= c00000000020b270 smp_call_function_many+0x3e0/0x460 lr = c00000000020b20c smp_call_function_many+0x37c/0x460 msr = 900000010288b033 cr = 44024824 ctr = c000000000050790 xer = 0000000000000000 trap = 100 CPU 42 is running normally, doing VCPU work: # CPU 42 stack trace (via xmon) [link register ] c00800001be17188 kvmppc_book3s_radix_page_fault+0x90/0x2b0 [kvm_hv] [c000008ed3343820] c000008ed3343850 (unreliable) [c000008ed33438d0] c00800001be11b6c kvmppc_book3s_hv_page_fault+0x264/0xe30 [kvm_hv] [c000008ed33439d0] c00800001be0d7b4 kvmppc_vcpu_run_hv+0x8dc/0xb50 [kvm_hv] [c000008ed3343ae0] c00800001c10891c kvmppc_vcpu_run+0x34/0x48 [kvm] [c000008ed3343b00] c00800001c10475c kvm_arch_vcpu_ioctl_run+0x244/0x420 [kvm] [c000008ed3343b90] c00800001c0f5a78 kvm_vcpu_ioctl+0x470/0x7c8 [kvm] [c000008ed3343d00] c000000000475450 do_vfs_ioctl+0xe0/0xc70 [c000008ed3343db0] c0000000004760e4 ksys_ioctl+0x104/0x120 [c000008ed3343e00] c000000000476128 sys_ioctl+0x28/0x80 [c000008ed3343e20] c00000000000b388 system_call+0x5c/0x70 --- Exception: c00 (System Call) at 00007d545cfd7694 SP (7d53ff7edf50) is in userspace It was subsequently found that ipi_message[PPC_MSG_CALL_FUNCTION] was set for CPU 42 by at least 1 of the CPUs waiting in smp_call_function_many(), but somehow the corresponding call_single_queue entries were never processed by CPU 42, causing the callers to spin in csd_lock_wait() indefinitely. 
Nick Piggin suggested something similar to the following sequence as a possible explanation (interleaving of CALL_FUNCTION/RESCHEDULE IPI messages seems to be most common, but any mix of CALL_FUNCTION and !CALL_FUNCTION messages could trigger it): CPU X: smp_muxed_ipi_set_message(): X: smp_mb() X: message[RESCHEDULE] = 1 X: doorbell_global_ipi(42): X: kvmppc_set_host_ipi(42, 1) X: ppc_msgsnd_sync()/smp_mb() X: ppc_msgsnd() -> 42 42: doorbell_exception(): // from CPU X 42: ppc_msgsync() 105: smp_muxed_ipi_set_message(): 105: smp_mb() // STORE DEFERRED DUE TO RE-ORDERING --105: message[CALL_FUNCTION] = 1 | 105: doorbell_global_ipi(42): | 105: kvmppc_set_host_ipi(42, 1) | 42: kvmppc_set_host_ipi(42, 0) | 42: smp_ipi_demux_relaxed() | 42: // returns to executing guest | // RE-ORDERED STORE COMPLETES ->105: message[CALL_FUNCTION] = 1 105: ppc_msgsnd_sync()/smp_mb() 105: ppc_msgsnd() -> 42 42: local_paca->kvm_hstate.host_ipi == 0 // IPI ignored 105: // hangs waiting on 42 to process messages/call_single_queue This can be prevented with an smp_mb() at the beginning of kvmppc_set_host_ipi(), such that stores to message[] (or other state indicated by the host_ipi flag) are ordered vs. the store to host_ipi. However, doing so might still allow for the following scenario (not yet observed): CPU X: smp_muxed_ipi_set_message(): X: smp_mb() X: message[RESCHEDULE] = 1 X: doorbell_global_ipi(42): X: kvmppc_set_host_ipi(42, 1) X: ppc_msgsnd_sync()/smp_mb() X: ppc_msgsnd() -> 42 42: doorbell_exception(): // from CPU X 42: ppc_msgsync() // STORE DEFERRED DUE TO RE-ORDERING -- 42: kvmppc_set_host_ipi(42, 0) | 42: smp_ipi_demux_relaxed() | 105: smp_muxed_ipi_set_message(): | 105: smp_mb() | 105: message[CALL_FUNCTION] = 1 | 105: doorbell_global_ipi(42): | 105: kvmppc_set_host_ipi(42, 1) | // RE-ORDERED STORE COMPLETES -> 42: kvmppc_set_host_ipi(42, 0) 42: // returns to executing guest 105: ppc_msgsnd_sync()/smp_mb() 105: ppc_msgsnd() -> 42 42: local_paca->kvm_hstate.host_ipi == 0 // IPI ignored 105: // hangs waiting on 42 to process messages/call_single_queue Fixing this scenario would require an smp_mb() *after* clearing the host_ipi flag in kvmppc_set_host_ipi() to order the store vs. subsequent processing of IPI messages. To handle both cases, this patch splits kvmppc_set_host_ipi() into separate set/clear functions, where we execute smp_mb() prior to setting the host_ipi flag, and after clearing it. These functions pair with each other to synchronize the sender and receiver sides. With that change in place the above workload ran for 20 hours without triggering any lock-ups.
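Condensed, the new pairing looks like this (the full versions, with the documentation comment, are in the kvm_ppc.h hunk below):

	static inline void kvmppc_set_host_ipi(int cpu)
	{
		smp_mb();	/* order message[] stores before setting the flag */
		paca_ptrs[cpu]->kvm_hstate.host_ipi = 1;
	}

	static inline void kvmppc_clear_host_ipi(int cpu)
	{
		paca_ptrs[cpu]->kvm_hstate.host_ipi = 0;
		smp_mb();	/* order the flag clear before reading message[] */
	}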
Fixes: 755563bc79c7 ("powerpc/powernv: Fixes for hypervisor doorbell handling") # v4.0 Signed-off-by: Michael Roth Acked-by: Paul Mackerras Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190911223155.16045-1-mdroth@linux.vnet.ibm.com --- arch/powerpc/include/asm/kvm_ppc.h | 100 +++++++++++++++++++++++++- arch/powerpc/kernel/dbell.c | 6 +- arch/powerpc/kvm/book3s_hv_rm_xics.c | 2 +- arch/powerpc/platforms/powernv/smp.c | 2 +- arch/powerpc/sysdev/xics/icp-native.c | 6 +- arch/powerpc/sysdev/xics/icp-opal.c | 6 +- 6 files changed, 108 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 8e8514efb124..ee62776e5433 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -452,9 +452,100 @@ static inline u32 kvmppc_get_xics_latch(void) return xirr; } -static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi) +/* + * To avoid the need to unnecessarily exit fully to the host kernel, an IPI to + * a CPU thread that's running/napping inside of a guest is by default regarded + * as a request to wake the CPU (if needed) and continue execution within the + * guest, potentially to process new state like externally-generated + * interrupts or IPIs sent from within the guest itself (e.g. H_PROD/H_IPI). + * + * To force an exit to the host kernel, kvmppc_set_host_ipi() must be called + * prior to issuing the IPI to set the corresponding 'host_ipi' flag in the + * target CPU's PACA. To avoid unnecessary exits to the host, this flag should + * be immediately cleared via kvmppc_clear_host_ipi() by the IPI handler on + * the receiving side prior to processing the IPI work. + * + * NOTE: + * + * We currently issue an smp_mb() at the beginning of kvmppc_set_host_ipi(). + * This is to guard against sequences such as the following: + * + * CPU + * X: smp_muxed_ipi_set_message(): + * X: smp_mb() + * X: message[RESCHEDULE] = 1 + * X: doorbell_global_ipi(42): + * X: kvmppc_set_host_ipi(42) + * X: ppc_msgsnd_sync()/smp_mb() + * X: ppc_msgsnd() -> 42 + * 42: doorbell_exception(): // from CPU X + * 42: ppc_msgsync() + * 105: smp_muxed_ipi_set_message(): + * 105: smb_mb() + * // STORE DEFERRED DUE TO RE-ORDERING + * --105: message[CALL_FUNCTION] = 1 + * | 105: doorbell_global_ipi(42): + * | 105: kvmppc_set_host_ipi(42) + * | 42: kvmppc_clear_host_ipi(42) + * | 42: smp_ipi_demux_relaxed() + * | 42: // returns to executing guest + * | // RE-ORDERED STORE COMPLETES + * ->105: message[CALL_FUNCTION] = 1 + * 105: ppc_msgsnd_sync()/smp_mb() + * 105: ppc_msgsnd() -> 42 + * 42: local_paca->kvm_hstate.host_ipi == 0 // IPI ignored + * 105: // hangs waiting on 42 to process messages/call_single_queue + * + * We also issue an smp_mb() at the end of kvmppc_clear_host_ipi(). 
This is + * to guard against sequences such as the following (as well as to create + * a read-side pairing with the barrier in kvmppc_set_host_ipi()): + * + * CPU + * X: smp_muxed_ipi_set_message(): + * X: smp_mb() + * X: message[RESCHEDULE] = 1 + * X: doorbell_global_ipi(42): + * X: kvmppc_set_host_ipi(42) + * X: ppc_msgsnd_sync()/smp_mb() + * X: ppc_msgsnd() -> 42 + * 42: doorbell_exception(): // from CPU X + * 42: ppc_msgsync() + * // STORE DEFERRED DUE TO RE-ORDERING + * -- 42: kvmppc_clear_host_ipi(42) + * | 42: smp_ipi_demux_relaxed() + * | 105: smp_muxed_ipi_set_message(): + * | 105: smb_mb() + * | 105: message[CALL_FUNCTION] = 1 + * | 105: doorbell_global_ipi(42): + * | 105: kvmppc_set_host_ipi(42) + * | // RE-ORDERED STORE COMPLETES + * -> 42: kvmppc_clear_host_ipi(42) + * 42: // returns to executing guest + * 105: ppc_msgsnd_sync()/smp_mb() + * 105: ppc_msgsnd() -> 42 + * 42: local_paca->kvm_hstate.host_ipi == 0 // IPI ignored + * 105: // hangs waiting on 42 to process messages/call_single_queue + */ +static inline void kvmppc_set_host_ipi(int cpu) { - paca_ptrs[cpu]->kvm_hstate.host_ipi = host_ipi; + /* + * order stores of IPI messages vs. setting of host_ipi flag + * + * pairs with the barrier in kvmppc_clear_host_ipi() + */ + smp_mb(); + paca_ptrs[cpu]->kvm_hstate.host_ipi = 1; +} + +static inline void kvmppc_clear_host_ipi(int cpu) +{ + paca_ptrs[cpu]->kvm_hstate.host_ipi = 0; + /* + * order clearing of host_ipi flag vs. processing of IPI messages + * + * pairs with the barrier in kvmppc_set_host_ipi() + */ + smp_mb(); } static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu) @@ -486,7 +577,10 @@ static inline u32 kvmppc_get_xics_latch(void) return 0; } -static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi) +static inline void kvmppc_set_host_ipi(int cpu) +{} + +static inline void kvmppc_clear_host_ipi(int cpu) {} static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c index 804b1a6196fa..f17ff1200eaa 100644 --- a/arch/powerpc/kernel/dbell.c +++ b/arch/powerpc/kernel/dbell.c @@ -33,7 +33,7 @@ void doorbell_global_ipi(int cpu) { u32 tag = get_hard_smp_processor_id(cpu); - kvmppc_set_host_ipi(cpu, 1); + kvmppc_set_host_ipi(cpu); /* Order previous accesses vs. msgsnd, which is treated as a store */ ppc_msgsnd_sync(); ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag); @@ -48,7 +48,7 @@ void doorbell_core_ipi(int cpu) { u32 tag = cpu_thread_in_core(cpu); - kvmppc_set_host_ipi(cpu, 1); + kvmppc_set_host_ipi(cpu); /* Order previous accesses vs. 
msgsnd, which is treated as a store */ ppc_msgsnd_sync(); ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag); @@ -84,7 +84,7 @@ void doorbell_exception(struct pt_regs *regs) may_hard_irq_enable(); - kvmppc_set_host_ipi(smp_processor_id(), 0); + kvmppc_clear_host_ipi(smp_processor_id()); __this_cpu_inc(irq_stat.doorbell_irqs); smp_ipi_demux_relaxed(); /* already performed the barrier */ diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 4d2ec77d806c..287d5911df0f 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -58,7 +58,7 @@ static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) hcpu = hcore << threads_shift; kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu; smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION); - kvmppc_set_host_ipi(hcpu, 1); + kvmppc_set_host_ipi(hcpu); smp_mb(); kvmhv_rm_send_ipi(hcpu); } diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 94cd96b9b7bb..fbd6e6b7bbf2 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -193,7 +193,7 @@ static void pnv_smp_cpu_kill_self(void) * for coming online, which are handled via * generic_check_cpu_restart() calls. */ - kvmppc_set_host_ipi(cpu, 0); + kvmppc_clear_host_ipi(cpu); srr1 = pnv_cpu_offline(cpu); diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c index 485569ff7ef1..7d13d2ef5a90 100644 --- a/arch/powerpc/sysdev/xics/icp-native.c +++ b/arch/powerpc/sysdev/xics/icp-native.c @@ -140,7 +140,7 @@ static unsigned int icp_native_get_irq(void) static void icp_native_cause_ipi(int cpu) { - kvmppc_set_host_ipi(cpu, 1); + kvmppc_set_host_ipi(cpu); icp_native_set_qirr(cpu, IPI_PRIORITY); } @@ -179,7 +179,7 @@ void icp_native_flush_interrupt(void) if (vec == XICS_IPI) { /* Clear pending IPI */ int cpu = smp_processor_id(); - kvmppc_set_host_ipi(cpu, 0); + kvmppc_clear_host_ipi(cpu); icp_native_set_qirr(cpu, 0xff); } else { pr_err("XICS: hw interrupt 0x%x to offline cpu, disabling\n", @@ -200,7 +200,7 @@ static irqreturn_t icp_native_ipi_action(int irq, void *dev_id) { int cpu = smp_processor_id(); - kvmppc_set_host_ipi(cpu, 0); + kvmppc_clear_host_ipi(cpu); icp_native_set_qirr(cpu, 0xff); return smp_ipi_demux(); diff --git a/arch/powerpc/sysdev/xics/icp-opal.c b/arch/powerpc/sysdev/xics/icp-opal.c index 8bb8dd7dd6ad..68fd2540b093 100644 --- a/arch/powerpc/sysdev/xics/icp-opal.c +++ b/arch/powerpc/sysdev/xics/icp-opal.c @@ -126,7 +126,7 @@ static void icp_opal_cause_ipi(int cpu) { int hw_cpu = get_hard_smp_processor_id(cpu); - kvmppc_set_host_ipi(cpu, 1); + kvmppc_set_host_ipi(cpu); opal_int_set_mfrr(hw_cpu, IPI_PRIORITY); } @@ -134,7 +134,7 @@ static irqreturn_t icp_opal_ipi_action(int irq, void *dev_id) { int cpu = smp_processor_id(); - kvmppc_set_host_ipi(cpu, 0); + kvmppc_clear_host_ipi(cpu); opal_int_set_mfrr(get_hard_smp_processor_id(cpu), 0xff); return smp_ipi_demux(); @@ -157,7 +157,7 @@ void icp_opal_flush_interrupt(void) if (vec == XICS_IPI) { /* Clear pending IPI */ int cpu = smp_processor_id(); - kvmppc_set_host_ipi(cpu, 0); + kvmppc_clear_host_ipi(cpu); opal_int_set_mfrr(get_hard_smp_processor_id(cpu), 0xff); } else { pr_err("XICS: hw interrupt 0x%x to offline cpu, " From d2f15428d6a0ebfc0edc364094d7c4a2de7037ed Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 22 Sep 2019 00:55:46 -0500 Subject: [PATCH 156/334] smb3: fix leak in "open on server" perf counter We were not bumping up the "open on 
server" (num_remote_opens) counter (in some cases) on opens of the share root so could end up showing as a negative value. CC: Stable Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/smb2ops.c | 5 +++++ fs/cifs/smb2pdu.c | 1 + 2 files changed, 6 insertions(+) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index eaed18061314..3010066a8709 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -751,6 +751,8 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) goto oshr_exit; } + atomic_inc(&tcon->num_remote_opens); + o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; oparms.fid->persistent_fid = o_rsp->PersistentFileId; oparms.fid->volatile_fid = o_rsp->VolatileFileId; @@ -1176,6 +1178,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, rc = compound_send_recv(xid, ses, flags, 3, rqst, resp_buftype, rsp_iov); + /* no need to bump num_remote_opens because handle immediately closed */ sea_exit: kfree(ea); @@ -1518,6 +1521,8 @@ smb2_ioctl_query_info(const unsigned int xid, resp_buftype, rsp_iov); if (rc) goto iqinf_exit; + + /* No need to bump num_remote_opens since handle immediately closed */ if (qi.flags & PASSTHRU_FSCTL) { pqi = (struct smb_query_info __user *)arg; io_rsp = (struct smb2_ioctl_rsp *)rsp_iov[1].iov_base; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 87066f1af12c..5f2491efd950 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2352,6 +2352,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, rqst.rq_iov = iov; rqst.rq_nvec = n_iov; + /* no need to inc num_remote_opens because we close it just below */ trace_smb3_posix_mkdir_enter(xid, tcon->tid, ses->Suid, CREATE_NOT_FILE, FILE_WRITE_ATTRIBUTES); /* resource #4: response buffer */ From 388962e8e9ce8b253e91cbaed94e78a07dc92d84 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Mon, 23 Sep 2019 15:06:18 +0800 Subject: [PATCH 157/334] fs/cifs/smb2pdu.c: Make SMB2_notify_init static Fix sparse warnings: fs/cifs/smb2pdu.c:3200:1: warning: symbol 'SMB2_notify_init' was not declared. Should it be static? 
Reported-by: Hulk Robot Signed-off-by: zhengbin Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 5f2491efd950..ea08e6159481 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3181,7 +3181,7 @@ SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon, * See MS-SMB2 2.2.35 and 2.2.36 */ -int +static int SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 completion_filter, bool watch_tree) From 8559ad8e89185ca55d8c9ef1e0b20f58578a4055 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Tue, 24 Sep 2019 10:13:47 +0800 Subject: [PATCH 158/334] fs/cifs/sess.c: Remove set but not used variable 'capabilities' Fixes gcc '-Wunused-but-set-variable' warning: fs/cifs/sess.c: In function sess_auth_lanman: fs/cifs/sess.c:910:8: warning: variable capabilities set but not used [-Wunused-but-set-variable] Reported-by: Hulk Robot Signed-off-by: zhengbin Signed-off-by: Steve French --- fs/cifs/sess.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 4c764ff7edd2..85bd644f9773 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -698,7 +698,6 @@ sess_auth_lanman(struct sess_data *sess_data) char *bcc_ptr; struct cifs_ses *ses = sess_data->ses; char lnm_session_key[CIFS_AUTH_RESP_SIZE]; - __u32 capabilities; __u16 bytes_remaining; /* lanman 2 style sessionsetup */ @@ -709,7 +708,7 @@ sess_auth_lanman(struct sess_data *sess_data) pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; bcc_ptr = sess_data->iov[2].iov_base; - capabilities = cifs_ssetup_hdr(ses, pSMB); + (void)cifs_ssetup_hdr(ses, pSMB); pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; From 63d37fb4ce5ae7bf1e58f906d1bf25f036fe79b2 Mon Sep 17 00:00:00 2001 From: Murphy Zhou Date: Sat, 21 Sep 2019 19:26:00 +0800 Subject: [PATCH 159/334] CIFS: fix max ea value size It should not be larger than the slab max buf size. If the user specifies a larger size, it passes this check and goes straight to SMB2_set_info_init(), performing an insecure memcpy. Signed-off-by: Murphy Zhou Reviewed-by: Aurelien Aptel CC: Stable Signed-off-by: Steve French --- fs/cifs/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 9076150758d8..db4ba8f6077e 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -31,7 +31,7 @@ #include "cifs_fs_sb.h" #include "cifs_unicode.h" -#define MAX_EA_VALUE_SIZE 65535 +#define MAX_EA_VALUE_SIZE CIFSMaxBufSize #define CIFS_XATTR_CIFS_ACL "system.cifs_acl" #define CIFS_XATTR_ATTRIB "cifs.dosattrib" /* full name: user.cifs.dosattrib */ #define CIFS_XATTR_CREATETIME "cifs.creationtime" /* user.cifs.creationtime */ From 0b8dc6abbdb9f6696b1a79c42976e506645e5c2c Mon Sep 17 00:00:00 2001 From: Yan-Hsuan Chuang Date: Mon, 23 Sep 2019 10:47:03 +0800 Subject: [PATCH 160/334] rtw88: configure firmware after HCI started After the firmware has been downloaded, the driver should send some information to it through H2C commands. Those H2C commands are transmitted through the TX path. But before the HCI has been started, the TX path is not fully working. On PCI interfaces, for example, the interrupts are not enabled, so no TX interrupt will be issued after an H2C skb has been DMAed to the device, and the H2C skbs will not be released until the device is powered off.
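The resulting power-on ordering, sketched (simplified; start_hci is a hypothetical stand-in for the actual HCI start step):

	static int power_on(struct rtw_dev *rtwdev)
	{
		int ret;

		ret = rtw_download_firmware(rtwdev, &rtwdev->fw);	/* no H2C in here any more */
		if (ret)
			return ret;

		ret = start_hci(rtwdev);	/* brings up interrupts and the DMA/TX path */
		if (ret)
			return ret;

		/* H2C commands are only safe once the TX path is live */
		rtw_fw_send_general_info(rtwdev);
		rtw_fw_send_phydm_info(rtwdev);
		return 0;
	}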
Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo --- drivers/net/wireless/realtek/rtw88/mac.c | 3 --- drivers/net/wireless/realtek/rtw88/main.c | 4 ++++ 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/mac.c b/drivers/net/wireless/realtek/rtw88/mac.c index fc14b37d927d..b61b073031e5 100644 --- a/drivers/net/wireless/realtek/rtw88/mac.c +++ b/drivers/net/wireless/realtek/rtw88/mac.c @@ -707,9 +707,6 @@ int rtw_download_firmware(struct rtw_dev *rtwdev, struct rtw_fw_state *fw) rtwdev->h2c.last_box_num = 0; rtwdev->h2c.seq = 0; - rtw_fw_send_general_info(rtwdev); - rtw_fw_send_phydm_info(rtwdev); - rtw_flag_set(rtwdev, RTW_FLAG_FW_RUNNING); return 0; diff --git a/drivers/net/wireless/realtek/rtw88/main.c b/drivers/net/wireless/realtek/rtw88/main.c index fc8f6213fc8f..6dd457741b15 100644 --- a/drivers/net/wireless/realtek/rtw88/main.c +++ b/drivers/net/wireless/realtek/rtw88/main.c @@ -704,6 +704,10 @@ static int rtw_power_on(struct rtw_dev *rtwdev) goto err_off; } + /* send H2C after HCI has started */ + rtw_fw_send_general_info(rtwdev); + rtw_fw_send_phydm_info(rtwdev); + wifi_only = !rtwdev->efuse.btcoex; rtw_coex_power_on_setting(rtwdev); rtw_coex_init_hw_config(rtwdev, wifi_only); From 34c0989c05314b0288b058a4d1f6e58e2d320929 Mon Sep 17 00:00:00 2001 From: Andrei Dulea Date: Fri, 13 Sep 2019 16:42:28 +0200 Subject: [PATCH 161/334] iommu/amd: Fix pages leak in free_pagetable() Take into account the gathered freelist in free_sub_pt(), otherwise we end up leaking all those pages. Fixes: 409afa44f9ba ("iommu/amd: Introduce free_sub_pt() function") Signed-off-by: Andrei Dulea --- drivers/iommu/amd_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 1ed3b98324ba..138547446345 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1425,7 +1425,7 @@ static void free_pagetable(struct protection_domain *domain) BUG_ON(domain->mode < PAGE_MODE_NONE || domain->mode > PAGE_MODE_6_LEVEL); - free_sub_pt(root, domain->mode, freelist); + freelist = free_sub_pt(root, domain->mode, freelist); free_page_list(freelist); } From 6ccb72f8374e17d60b58a7bfd5570496332c54e2 Mon Sep 17 00:00:00 2001 From: Andrei Dulea Date: Fri, 13 Sep 2019 16:42:29 +0200 Subject: [PATCH 162/334] iommu/amd: Fix downgrading default page-sizes in alloc_pte() Downgrading an existing large mapping to a mapping using smaller page-sizes works only for the mappings created with page-mode 7 (i.e. non-default page size). Treat large mappings created with page-mode 0 (i.e. default page size) like a non-present mapping and allow it to be overwritten in alloc_pte(). While at it, make sure that we flush the TLB only if we change an existing mapping, otherwise we might end up acting on garbage PTEs.
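Condensed from the hunk below (eliding the page-table page setup), the alloc_pte() decision becomes:

	if (!IOMMU_PTE_PRESENT(__pte) ||
	    pte_level == PAGE_MODE_NONE ||	/* default-size large page: overwrite */
	    pte_level == PAGE_MODE_7_LEVEL) {
		page = (u64 *)get_zeroed_page(gfp);
		/* ... build __npte from page ... */
		if (cmpxchg64(pte, __pte, __npte) != __pte)
			free_page((unsigned long)page);
		else if (IOMMU_PTE_PRESENT(__pte))	/* flush only if we replaced something */
			domain->updated = true;
	}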
Fixes: 6d568ef9a622 ("iommu/amd: Allow downgrading page-sizes in alloc_pte()") Signed-off-by: Andrei Dulea --- drivers/iommu/amd_iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 138547446345..c7e28a8d25d1 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1490,6 +1490,7 @@ static u64 *alloc_pte(struct protection_domain *domain, pte_level = PM_PTE_LEVEL(__pte); if (!IOMMU_PTE_PRESENT(__pte) || + pte_level == PAGE_MODE_NONE || pte_level == PAGE_MODE_7_LEVEL) { page = (u64 *)get_zeroed_page(gfp); if (!page) @@ -1500,7 +1501,7 @@ static u64 *alloc_pte(struct protection_domain *domain, /* pte could have been changed somewhere. */ if (cmpxchg64(pte, __pte, __npte) != __pte) free_page((unsigned long)page); - else if (pte_level == PAGE_MODE_7_LEVEL) + else if (IOMMU_PTE_PRESENT(__pte)) domain->updated = true; continue; From 7f1f1683c1e27c280556766cfd2c219957cebe4f Mon Sep 17 00:00:00 2001 From: Andrei Dulea Date: Fri, 13 Sep 2019 16:42:30 +0200 Subject: [PATCH 163/334] iommu/amd: Introduce first_pte_l7() helper Given an arbitrary pte that is part of a large mapping, this function returns the first pte of the series (and optionally the mapped size and number of PTEs). It will be re-used in a subsequent patch to replace an existing L7 mapping. Fixes: 6d568ef9a622 ("iommu/amd: Allow downgrading page-sizes in alloc_pte()") Signed-off-by: Andrei Dulea --- drivers/iommu/amd_iommu.c | 40 ++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index c7e28a8d25d1..a227e7a9b8b7 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -501,6 +501,29 @@ static void iommu_uninit_device(struct device *dev) */ } +/* + * Helper function to get the first pte of a large mapping + */ +static u64 *first_pte_l7(u64 *pte, unsigned long *page_size, + unsigned long *count) +{ + unsigned long pte_mask, pg_size, cnt; + u64 *fpte; + + pg_size = PTE_PAGE_SIZE(*pte); + cnt = PAGE_SIZE_PTE_COUNT(pg_size); + pte_mask = ~((cnt << 3) - 1); + fpte = (u64 *)(((unsigned long)pte) & pte_mask); + + if (page_size) + *page_size = pg_size; + + if (count) + *count = cnt; + + return fpte; +} + /**************************************************************************** * * Interrupt handling functions @@ -1567,17 +1590,12 @@ static u64 *fetch_pte(struct protection_domain *domain, *page_size = PTE_LEVEL_PAGE_SIZE(level); } - if (PM_PTE_LEVEL(*pte) == 0x07) { - unsigned long pte_mask; - - /* - * If we have a series of large PTEs, make - * sure to return a pointer to the first one. - */ - *page_size = pte_mask = PTE_PAGE_SIZE(*pte); - pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1); - pte = (u64 *)(((unsigned long)pte) & pte_mask); - } + /* + * If we have a series of large PTEs, make + * sure to return a pointer to the first one. + */ + if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL) + pte = first_pte_l7(pte, page_size, NULL); return pte; } From cc449541f2a8a646ad5ac45cb1c7b59c3ed6b310 Mon Sep 17 00:00:00 2001 From: Andrei Dulea Date: Fri, 13 Sep 2019 16:42:31 +0200 Subject: [PATCH 164/334] iommu/amd: Unmap all L7 PTEs when downgrading page-sizes When replacing a large mapping created with page-mode 7 (i.e. non-default page size), tear down the entire series of replicated PTEs.
Besides leaving the old mapping accessible, this issue can also bite on the fetch_pte() code path, which may return a PDE entry of the newly re-mapped range. While at it, make sure that we flush the TLB in case alloc_pte() fails and returns NULL at a lower level. Fixes: 6d568ef9a622 ("iommu/amd: Allow downgrading page-sizes in alloc_pte()") Signed-off-by: Andrei Dulea --- drivers/iommu/amd_iommu.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index a227e7a9b8b7..fda9923542c9 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1512,10 +1512,32 @@ static u64 *alloc_pte(struct protection_domain *domain, __pte = *pte; pte_level = PM_PTE_LEVEL(__pte); - if (!IOMMU_PTE_PRESENT(__pte) || - pte_level == PAGE_MODE_NONE || + /* + * If we replace a series of large PTEs, we need + * to tear down all of them. + */ + if (IOMMU_PTE_PRESENT(__pte) && pte_level == PAGE_MODE_7_LEVEL) { + unsigned long count, i; + u64 *lpte; + + lpte = first_pte_l7(pte, NULL, &count); + + /* + * Unmap the replicated PTEs that still match the + * original large mapping + */ + for (i = 0; i < count; ++i) + cmpxchg64(&lpte[i], __pte, 0ULL); + + domain->updated = true; + continue; + } + + if (!IOMMU_PTE_PRESENT(__pte) || + pte_level == PAGE_MODE_NONE) { page = (u64 *)get_zeroed_page(gfp); + if (!page) return NULL; @@ -1646,8 +1668,10 @@ static int iommu_map_page(struct protection_domain *dom, count = PAGE_SIZE_PTE_COUNT(page_size); pte = alloc_pte(dom, bus_addr, page_size, NULL, gfp); - if (!pte) + if (!pte) { + update_domain(dom); return -ENOMEM; + } for (i = 0; i < count; ++i) freelist = free_clear_pte(&pte[i], pte[i], freelist); From d32d7c52e08a25d8d4b9f1a7b56400f35b8f72fa Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sun, 1 Sep 2019 16:28:28 +0300 Subject: [PATCH 165/334] net/mlx5: DR, Fix SW steering HW bits and definitions Fix wrong reserved bits offsets.
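The mlx5_ifc convention encodes each reserved field's starting bit offset in its name: reserved_0/reserved_1 style names hide mistakes, while reserved_at_<hex offset> can be checked against the field's actual position. An example drawn from one of the structs in the hunk below:

	struct mlx5_ifc_query_other_hca_cap_out_bits {
		u8 status[0x8];          /* bits 0x00..0x07 */
		u8 reserved_at_8[0x18];  /* starts at bit 0x8, hence the name */
		u8 syndrome[0x20];       /* bits 0x20..0x3f */
		u8 reserved_at_40[0x40]; /* starts at bit 0x40 */
		/* ... */
	};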
Fixes: 97b5484ed608 ("net/mlx5: Add HW bits and definitions required for SW steering") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index a487b681b516..138c50d5a353 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -282,7 +282,6 @@ enum { MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT = 0x940, MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT = 0x941, MLX5_CMD_OP_QUERY_MODIFY_HEADER_CONTEXT = 0x942, - MLX5_CMD_OP_SYNC_STEERING = 0xb00, MLX5_CMD_OP_FPGA_CREATE_QP = 0x960, MLX5_CMD_OP_FPGA_MODIFY_QP = 0x961, MLX5_CMD_OP_FPGA_QUERY_QP = 0x962, @@ -296,6 +295,7 @@ enum { MLX5_CMD_OP_DESTROY_UCTX = 0xa06, MLX5_CMD_OP_CREATE_UMEM = 0xa08, MLX5_CMD_OP_DESTROY_UMEM = 0xa0a, + MLX5_CMD_OP_SYNC_STEERING = 0xb00, MLX5_CMD_OP_MAX }; @@ -487,7 +487,7 @@ union mlx5_ifc_gre_key_bits { struct mlx5_ifc_fte_match_set_misc_bits { u8 gre_c_present[0x1]; - u8 reserved_auto1[0x1]; + u8 reserved_at_1[0x1]; u8 gre_k_present[0x1]; u8 gre_s_present[0x1]; u8 source_vhca_port[0x4]; @@ -5054,50 +5054,50 @@ struct mlx5_ifc_query_hca_cap_in_bits { struct mlx5_ifc_other_hca_cap_bits { u8 roce[0x1]; - u8 reserved_0[0x27f]; + u8 reserved_at_1[0x27f]; }; struct mlx5_ifc_query_other_hca_cap_out_bits { u8 status[0x8]; - u8 reserved_0[0x18]; + u8 reserved_at_8[0x18]; u8 syndrome[0x20]; - u8 reserved_1[0x40]; + u8 reserved_at_40[0x40]; struct mlx5_ifc_other_hca_cap_bits other_capability; }; struct mlx5_ifc_query_other_hca_cap_in_bits { u8 opcode[0x10]; - u8 reserved_0[0x10]; + u8 reserved_at_10[0x10]; - u8 reserved_1[0x10]; + u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 reserved_2[0x10]; + u8 reserved_at_40[0x10]; u8 function_id[0x10]; - u8 reserved_3[0x20]; + u8 reserved_at_60[0x20]; }; struct mlx5_ifc_modify_other_hca_cap_out_bits { u8 status[0x8]; - u8 reserved_0[0x18]; + u8 reserved_at_8[0x18]; u8 syndrome[0x20]; - u8 reserved_1[0x40]; + u8 reserved_at_40[0x40]; }; struct mlx5_ifc_modify_other_hca_cap_in_bits { u8 opcode[0x10]; - u8 reserved_0[0x10]; + u8 reserved_at_10[0x10]; - u8 reserved_1[0x10]; + u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 reserved_2[0x10]; + u8 reserved_at_40[0x10]; u8 function_id[0x10]; u8 field_select[0x20]; From cc5fd15fc5578e5a3da13f6b14b42ebef5ad42c2 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 12 Sep 2019 11:38:20 +0300 Subject: [PATCH 166/334] net/mlx5: DR, Remove redundant vport number from action The vport number is part of the vport_cap, there is no reason to store in a separate variable on the vport. Fixes: 9db810ed2d37 ("net/mlx5: DR, Expose steering action functionality") Signed-off-by: Alex Vesker Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c index 7d81a7735de5..b74b7d0f6590 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c @@ -615,7 +615,7 @@ static int dr_action_handle_cs_recalc(struct mlx5dr_domain *dmn, * that recalculates the CS and forwards to the vport. 
*/ ret = mlx5dr_domain_cache_get_recalc_cs_ft_addr(dest_action->vport.dmn, - dest_action->vport.num, + dest_action->vport.caps->num, final_icm_addr); if (ret) { mlx5dr_err(dmn, "Failed to get FW cs recalc flow table\n"); @@ -744,7 +744,7 @@ int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher, dest_action = action; if (rx_rule) { /* Loopback on WIRE vport is not supported */ - if (action->vport.num == WIRE_PORT) + if (action->vport.caps->num == WIRE_PORT) goto out_invalid_arg; attr.final_icm_addr = action->vport.caps->icm_address_rx; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h index a37ee6359be2..78f899fb3305 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h @@ -745,7 +745,6 @@ struct mlx5dr_action { struct { struct mlx5dr_domain *dmn; struct mlx5dr_cmd_vport_cap *caps; - u32 num; } vport; struct { u32 vlan_hdr; /* tpid_pcp_dei_vid */ From 48cbde4bd2c7c028a7205cb83386bb345c315adc Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 19 Sep 2019 11:24:19 +0300 Subject: [PATCH 167/334] net/mlx5: DR, Fix getting incorrect prev node in ste_free When we free an STE and the STE is in the middle of collision list, the prev_ste was obtained incorrectly from the list. To avoid such issues list_entry calls replaced with standard list API. Fixes: 26d688e33f88 ("net/mlx5: DR, Add Steering entry (STE) utilities") Signed-off-by: Alex Vesker Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/steering/dr_matcher.c | 10 ++++------ .../ethernet/mellanox/mlx5/core/steering/dr_rule.c | 2 +- .../ethernet/mellanox/mlx5/core/steering/dr_ste.c | 14 +++++--------- 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c index 01008cd66f75..9c2c25356dd0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c @@ -458,13 +458,11 @@ static int dr_matcher_add_to_tbl(struct mlx5dr_matcher *matcher) prev_matcher = NULL; if (next_matcher && !first) - prev_matcher = list_entry(next_matcher->matcher_list.prev, - struct mlx5dr_matcher, - matcher_list); + prev_matcher = list_prev_entry(next_matcher, matcher_list); else if (!first) - prev_matcher = list_entry(tbl->matcher_list.prev, - struct mlx5dr_matcher, - matcher_list); + prev_matcher = list_last_entry(&tbl->matcher_list, + struct mlx5dr_matcher, + matcher_list); if (dmn->type == MLX5DR_DOMAIN_TYPE_FDB || dmn->type == MLX5DR_DOMAIN_TYPE_NIC_RX) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c index 3bc3f66b8fa8..4187f2b112b8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c @@ -18,7 +18,7 @@ static int dr_rule_append_to_miss_list(struct mlx5dr_ste *new_last_ste, struct mlx5dr_ste *last_ste; /* The new entry will be inserted after the last */ - last_ste = list_entry(miss_list->prev, struct mlx5dr_ste, miss_list_node); + last_ste = list_last_entry(miss_list, struct mlx5dr_ste, miss_list_node); WARN_ON(!last_ste); ste_info_last = kzalloc(sizeof(*ste_info_last), GFP_KERNEL); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c 
b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c index 6b0af64536d8..95b7221f5730 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c @@ -429,12 +429,9 @@ static void dr_ste_remove_middle_ste(struct mlx5dr_ste *ste, struct mlx5dr_ste *prev_ste; u64 miss_addr; - prev_ste = list_entry(mlx5dr_ste_get_miss_list(ste)->prev, struct mlx5dr_ste, - miss_list_node); - if (!prev_ste) { - WARN_ON(true); + prev_ste = list_prev_entry(ste, miss_list_node); + if (WARN_ON(!prev_ste)) return; - } miss_addr = mlx5dr_ste_get_miss_addr(ste->hw_ste); mlx5dr_ste_set_miss_addr(prev_ste->hw_ste, miss_addr); @@ -461,8 +458,8 @@ void mlx5dr_ste_free(struct mlx5dr_ste *ste, struct mlx5dr_ste_htbl *stats_tbl; LIST_HEAD(send_ste_list); - first_ste = list_entry(mlx5dr_ste_get_miss_list(ste)->next, - struct mlx5dr_ste, miss_list_node); + first_ste = list_first_entry(mlx5dr_ste_get_miss_list(ste), + struct mlx5dr_ste, miss_list_node); stats_tbl = first_ste->htbl; /* Two options: @@ -479,8 +476,7 @@ void mlx5dr_ste_free(struct mlx5dr_ste *ste, if (last_ste == first_ste) next_ste = NULL; else - next_ste = list_entry(ste->miss_list_node.next, - struct mlx5dr_ste, miss_list_node); + next_ste = list_next_entry(ste, miss_list_node); if (!next_ste) { /* One and only entry in the list */ From 640bdb1fdb4ee42fa469c7842e0fac7b0ada7b9d Mon Sep 17 00:00:00 2001 From: Alaa Hleihel Date: Wed, 18 Sep 2019 12:23:10 +0300 Subject: [PATCH 168/334] net/mlx5: DR, Allow matching on vport based on vhca_id In case source_eswitch_owner_vhca_id is given as a match, the source_vport (vhca_id) will be set in case vhca_id_valid. This will allow matching on peer vports, vports that belong to the other pf. Fixes: 26d688e33f88 ("net/mlx5: DR, Add Steering entry (STE) utilities") Signed-off-by: Alaa Hleihel Signed-off-by: Alex Vesker Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/steering/dr_matcher.c | 3 +- .../mellanox/mlx5/core/steering/dr_ste.c | 36 ++++++++++++++++--- .../mellanox/mlx5/core/steering/dr_types.h | 6 ++-- 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c index 9c2c25356dd0..67dea7698fc9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c @@ -230,8 +230,7 @@ static int dr_matcher_set_ste_builders(struct mlx5dr_matcher *matcher, (dmn->type == MLX5DR_DOMAIN_TYPE_FDB || dmn->type == MLX5DR_DOMAIN_TYPE_NIC_RX)) { ret = mlx5dr_ste_build_src_gvmi_qpn(&sb[idx++], &mask, - &dmn->info.caps, - inner, rx); + dmn, inner, rx); if (ret) return ret; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c index 95b7221f5730..4efe1b0be4a8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c @@ -837,6 +837,8 @@ static void dr_ste_copy_mask_misc(char *mask, struct mlx5dr_match_misc *spec) spec->source_sqn = MLX5_GET(fte_match_set_misc, mask, source_sqn); spec->source_port = MLX5_GET(fte_match_set_misc, mask, source_port); + spec->source_eswitch_owner_vhca_id = MLX5_GET(fte_match_set_misc, mask, + source_eswitch_owner_vhca_id); spec->outer_second_prio = MLX5_GET(fte_match_set_misc, mask, outer_second_prio); spec->outer_second_cfi = MLX5_GET(fte_match_set_misc, mask, 
outer_second_cfi); @@ -2250,11 +2252,18 @@ static int dr_ste_build_src_gvmi_qpn_bit_mask(struct mlx5dr_match_param *value, { struct mlx5dr_match_misc *misc_mask = &value->misc; - if (misc_mask->source_port != 0xffff) + /* Partial misc source_port is not supported */ + if (misc_mask->source_port && misc_mask->source_port != 0xffff) + return -EINVAL; + + /* Partial misc source_eswitch_owner_vhca_id is not supported */ + if (misc_mask->source_eswitch_owner_vhca_id && + misc_mask->source_eswitch_owner_vhca_id != 0xffff) return -EINVAL; DR_STE_SET_MASK(src_gvmi_qp, bit_mask, source_gvmi, misc_mask, source_port); DR_STE_SET_MASK(src_gvmi_qp, bit_mask, source_qp, misc_mask, source_sqn); + misc_mask->source_eswitch_owner_vhca_id = 0; return 0; } @@ -2266,17 +2275,33 @@ static int dr_ste_build_src_gvmi_qpn_tag(struct mlx5dr_match_param *value, struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; struct mlx5dr_match_misc *misc = &value->misc; struct mlx5dr_cmd_vport_cap *vport_cap; + struct mlx5dr_domain *dmn = sb->dmn; + struct mlx5dr_cmd_caps *caps; u8 *tag = hw_ste->tag; DR_STE_SET_TAG(src_gvmi_qp, tag, source_qp, misc, source_sqn); - vport_cap = mlx5dr_get_vport_cap(sb->caps, misc->source_port); + if (sb->vhca_id_valid) { + /* Find port GVMI based on the eswitch_owner_vhca_id */ + if (misc->source_eswitch_owner_vhca_id == dmn->info.caps.gvmi) + caps = &dmn->info.caps; + else if (dmn->peer_dmn && (misc->source_eswitch_owner_vhca_id == + dmn->peer_dmn->info.caps.gvmi)) + caps = &dmn->peer_dmn->info.caps; + else + return -EINVAL; + } else { + caps = &dmn->info.caps; + } + + vport_cap = mlx5dr_get_vport_cap(caps, misc->source_port); if (!vport_cap) return -EINVAL; if (vport_cap->vport_gvmi) MLX5_SET(ste_src_gvmi_qp, tag, source_gvmi, vport_cap->vport_gvmi); + misc->source_eswitch_owner_vhca_id = 0; misc->source_port = 0; return 0; @@ -2284,17 +2309,20 @@ static int dr_ste_build_src_gvmi_qpn_tag(struct mlx5dr_match_param *value, int mlx5dr_ste_build_src_gvmi_qpn(struct mlx5dr_ste_build *sb, struct mlx5dr_match_param *mask, - struct mlx5dr_cmd_caps *caps, + struct mlx5dr_domain *dmn, bool inner, bool rx) { int ret; + /* Set vhca_id_valid before we reset source_eswitch_owner_vhca_id */ + sb->vhca_id_valid = mask->misc.source_eswitch_owner_vhca_id; + ret = dr_ste_build_src_gvmi_qpn_bit_mask(mask, sb->bit_mask); if (ret) return ret; sb->rx = rx; - sb->caps = caps; + sb->dmn = dmn; sb->inner = inner; sb->lu_type = MLX5DR_STE_LU_TYPE_SRC_GVMI_AND_QP; sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h index 78f899fb3305..1cb3769d4e3c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h @@ -180,6 +180,8 @@ void mlx5dr_send_fill_and_append_ste_send_info(struct mlx5dr_ste *ste, u16 size, struct mlx5dr_ste_build { u8 inner:1; u8 rx:1; + u8 vhca_id_valid:1; + struct mlx5dr_domain *dmn; struct mlx5dr_cmd_caps *caps; u8 lu_type; u16 byte_mask; @@ -331,7 +333,7 @@ void mlx5dr_ste_build_register_1(struct mlx5dr_ste_build *sb, bool inner, bool rx); int mlx5dr_ste_build_src_gvmi_qpn(struct mlx5dr_ste_build *sb, struct mlx5dr_match_param *mask, - struct mlx5dr_cmd_caps *caps, + struct mlx5dr_domain *dmn, bool inner, bool rx); void mlx5dr_ste_build_empty_always_hit(struct mlx5dr_ste_build *sb, bool rx); @@ -453,7 +455,7 @@ struct mlx5dr_match_misc { u32 gre_c_present:1; /* 
Source port.;0xffff determines wire port */ u32 source_port:16; - u32 reserved_auto2:16; + u32 source_eswitch_owner_vhca_id:16; /* VLAN ID of first VLAN tag the inner header of the incoming packet. * Valid only when inner_second_cvlan_tag ==1 or inner_second_svlan_tag ==1 */ From d19a79ee38c8fda6d297e4227e80db8bf51c71a6 Mon Sep 17 00:00:00 2001 From: Bodong Wang Date: Mon, 26 Aug 2019 16:34:12 -0500 Subject: [PATCH 169/334] net/mlx5: Add device ID of upcoming BlueField-2 Add the device ID of upcoming BlueField-2 integrated ConnectX-6 Dx network controller. Its VFs will be using the generic VF device ID: 0x101e "ConnectX Family mlx5Gen Virtual Function". Fixes: 2e9d3e83ab82 ("net/mlx5: Update the list of the PCI supported devices") Signed-off-by: Bodong Wang Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 9648c2297803..e47dd7c1b909 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1568,6 +1568,7 @@ static const struct pci_device_id mlx5_core_pci_table[] = { { PCI_VDEVICE(MELLANOX, 0x101e), MLX5_PCI_DEV_IS_VF}, /* ConnectX Family mlx5Gen Virtual Function */ { PCI_VDEVICE(MELLANOX, 0xa2d2) }, /* BlueField integrated ConnectX-5 network controller */ { PCI_VDEVICE(MELLANOX, 0xa2d3), MLX5_PCI_DEV_IS_VF}, /* BlueField integrated ConnectX-5 network controller VF */ + { PCI_VDEVICE(MELLANOX, 0xa2d6) }, /* BlueField-2 integrated ConnectX-6 Dx network controller */ { 0, } }; From d22fcc806b84b9818de08b32e494f3c05dd236c7 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Wed, 11 Sep 2019 07:50:13 -0700 Subject: [PATCH 170/334] net/mlx5e: Fix traffic duplication in ethtool steering Before this patch, when adding multiple ethtool steering rules with identical classification, the driver used to append the new destination to the already existing hw rule, which caused the hw to forward the traffic to all destinations (rx queues). Here we avoid this by setting the "no append" mlx5 fs core flag when adding a new ethtool rule. 
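For illustration, a minimal sketch of the insertion semantics the flag selects; the helper and type names here are hypothetical, the real logic lives in the mlx5 fs core (fs_core.c):

	/* Sketch only: with FLOW_ACT_NO_APPEND, an insert whose matcher is
	 * identical to an existing rule gets its own flow table entry instead
	 * of having its destination appended to the existing one.
	 */
	static struct fte *get_fte_for_insert(struct fg *fg, const struct match *m,
					      const struct mlx5_flow_act *act)
	{
		struct fte *fte;

		if (!(act->flags & FLOW_ACT_NO_APPEND)) {
			fte = find_identical_fte(fg, m);	/* hypothetical lookup */
			if (fte)
				return fte;	/* destination appended to this fte */
		}
		return alloc_fte(fg, m);	/* always a fresh, standalone entry */
	}
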
Fixes: 6dc6071cfcde ("net/mlx5e: Add ethtool flow steering support") Signed-off-by: Saeed Mahameed Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c index eed7101e8bb7..acd946f2ddbe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c @@ -399,10 +399,10 @@ add_ethtool_flow_rule(struct mlx5e_priv *priv, struct mlx5_flow_table *ft, struct ethtool_rx_flow_spec *fs) { + struct mlx5_flow_act flow_act = { .flags = FLOW_ACT_NO_APPEND }; struct mlx5_flow_destination *dst = NULL; - struct mlx5_flow_act flow_act = {0}; - struct mlx5_flow_spec *spec; struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; int err = 0; spec = kvzalloc(sizeof(*spec), GFP_KERNEL); From fe1587a7de94912ed75ba5ddbfabf0741f9f8239 Mon Sep 17 00:00:00 2001 From: Dmytro Linkin Date: Fri, 13 Sep 2019 10:42:21 +0000 Subject: [PATCH 171/334] net/mlx5e: Fix matching on tunnel addresses type In mlx5 parse_tunnel_attr() function dispatch on encap IP address type is performed by directly checking flow_rule_match_key() on FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, and then on FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS. However, since those are stored in union, first check is always true if any type of encap address is set, which leads to IPv6 tunnel encap address being parsed as IPv4 by mlx5. Determine correct IP address type by checking control key first and if it set, take address type from match.key->addr_type. Fixes: d1bda7eecd88 ("net/mlx5e: Allow matching only enc_key_id/enc_dst_port for decapsulation action") Signed-off-by: Dmytro Linkin Reviewed-by: Vlad Buslov Reviewed-by: Eli Britstein Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_tc.c | 85 +++++++++++-------- 1 file changed, 51 insertions(+), 34 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index da7555fdb4d5..3e78a727f3e6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1664,46 +1664,63 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv, return err; } - if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { - struct flow_match_ipv4_addrs match; + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_CONTROL)) { + struct flow_match_control match; + u16 addr_type; - flow_rule_match_enc_ipv4_addrs(rule, &match); - MLX5_SET(fte_match_set_lyr_2_4, headers_c, - src_ipv4_src_ipv6.ipv4_layout.ipv4, - ntohl(match.mask->src)); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, - src_ipv4_src_ipv6.ipv4_layout.ipv4, - ntohl(match.key->src)); + flow_rule_match_enc_control(rule, &match); + addr_type = match.key->addr_type; - MLX5_SET(fte_match_set_lyr_2_4, headers_c, - dst_ipv4_dst_ipv6.ipv4_layout.ipv4, - ntohl(match.mask->dst)); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, - dst_ipv4_dst_ipv6.ipv4_layout.ipv4, - ntohl(match.key->dst)); + /* For tunnel addr_type used same key id`s as for non-tunnel */ + if (addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + struct flow_match_ipv4_addrs match; - MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ethertype); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, ETH_P_IP); - } else if (flow_rule_match_key(rule, 
FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) { - struct flow_match_ipv6_addrs match; + flow_rule_match_enc_ipv4_addrs(rule, &match); + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + src_ipv4_src_ipv6.ipv4_layout.ipv4, + ntohl(match.mask->src)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv4_layout.ipv4, + ntohl(match.key->src)); - flow_rule_match_enc_ipv6_addrs(rule, &match); - memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, - src_ipv4_src_ipv6.ipv6_layout.ipv6), - &match.mask->src, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6)); - memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, - src_ipv4_src_ipv6.ipv6_layout.ipv6), - &match.key->src, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6)); + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4, + ntohl(match.mask->dst)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4, + ntohl(match.key->dst)); - memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, - dst_ipv4_dst_ipv6.ipv6_layout.ipv6), - &match.mask->dst, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6)); - memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, - dst_ipv4_dst_ipv6.ipv6_layout.ipv6), - &match.key->dst, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6)); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, + ethertype); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, + ETH_P_IP); + } else if (addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + struct flow_match_ipv6_addrs match; - MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ethertype); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, ETH_P_IPV6); + flow_rule_match_enc_ipv6_addrs(rule, &match); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &match.mask->src, MLX5_FLD_SZ_BYTES(ipv6_layout, + ipv6)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &match.key->src, MLX5_FLD_SZ_BYTES(ipv6_layout, + ipv6)); + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &match.mask->dst, MLX5_FLD_SZ_BYTES(ipv6_layout, + ipv6)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &match.key->dst, MLX5_FLD_SZ_BYTES(ipv6_layout, + ipv6)); + + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, + ethertype); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, + ETH_P_IPV6); + } } if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_IP)) { From 0b15e02f0cc4fb34a9160de7ba6db3a4013dc1b7 Mon Sep 17 00:00:00 2001 From: Filippo Sironi Date: Tue, 10 Sep 2019 19:49:21 +0200 Subject: [PATCH 172/334] iommu/amd: Wait for completion of IOTLB flush in attach_device To make sure the domain tlb flush completes before the function returns, explicitly wait for its completion. 
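As a reference for the hunk below: domain_flush_tlb_pde() only queues the invalidation commands to the IOMMUs backing the domain, while domain_flush_complete() issues a completion-wait and polls until everything queued before it has been executed, so the resulting pattern is roughly:

	domain_flush_tlb_pde(domain);	/* queue the TLB/PDE invalidations */
	domain_flush_complete(domain);	/* block until the IOMMU executed them */
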
Signed-off-by: Filippo Sironi Fixes: 42a49f965a8d ("amd-iommu: flush domain tlb when attaching a new device") [joro: Added commit message and fixes tag] Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index fda9923542c9..7bdce3b10f3d 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2212,6 +2212,8 @@ skip_ats_check: */ domain_flush_tlb_pde(domain); + domain_flush_complete(domain); + return ret; } From 1211ee61b4a8e60d6dc77211cdcf01906915bfba Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Fri, 20 Sep 2019 15:05:22 +0200 Subject: [PATCH 173/334] powerpc/pseries: Read TLB Block Invalidate Characteristics The PAPR document specifies the TLB Block Invalidate Characteristics which tells for each pair of segment base page size, actual page size, the size of the block the hcall H_BLOCK_REMOVE supports. These characteristics are loaded at boot time in a new table hblkr_size. The table is separate from the mmu_psize_def because this is specific to the pseries platform. A new init function, pseries_lpar_read_hblkrm_characteristics() is added to read the characteristics. It is called from pSeries_setup_arch(). Fixes: ba2dd8a26baa ("powerpc/pseries/mm: call H_BLOCK_REMOVE") Signed-off-by: Laurent Dufour Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190920130523.20441-2-ldufour@linux.ibm.com --- arch/powerpc/platforms/pseries/lpar.c | 140 +++++++++++++++++++++++ arch/powerpc/platforms/pseries/pseries.h | 1 + arch/powerpc/platforms/pseries/setup.c | 1 + 3 files changed, 142 insertions(+) diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 36b846f6e74e..8e3ac7ae4e11 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -56,6 +56,15 @@ EXPORT_SYMBOL(plpar_hcall); EXPORT_SYMBOL(plpar_hcall9); EXPORT_SYMBOL(plpar_hcall_norets); +/* + * H_BLOCK_REMOVE supported block size for this page size in segment who's base + * page size is that page size. + * + * The first index is the segment base page size, the second one is the actual + * page size. + */ +static int hblkrm_size[MMU_PAGE_COUNT][MMU_PAGE_COUNT] __ro_after_init; + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE static u8 dtl_mask = DTL_LOG_PREEMPT; #else @@ -1311,6 +1320,137 @@ static void do_block_remove(unsigned long number, struct ppc64_tlb_batch *batch, (void)call_block_remove(pix, param, true); } +/* + * TLB Block Invalidate Characteristics + * + * These characteristics define the size of the block the hcall H_BLOCK_REMOVE + * is able to process for each couple segment base page size, actual page size. + * + * The ibm,get-system-parameter properties is returning a buffer with the + * following layout: + * + * [ 2 bytes size of the RTAS buffer (excluding these 2 bytes) ] + * ----------------- + * TLB Block Invalidate Specifiers: + * [ 1 byte LOG base 2 of the TLB invalidate block size being specified ] + * [ 1 byte Number of page sizes (N) that are supported for the specified + * TLB invalidate block size ] + * [ 1 byte Encoded segment base page size and actual page size + * MSB=0 means 4k segment base page size and actual page size + * MSB=1 the penc value in mmu_psize_def ] + * ... + * ----------------- + * Next TLB Block Invalidate Specifiers... 
+ * ----------------- + * [ 0 ] + */ +static inline void set_hblkrm_bloc_size(int bpsize, int psize, + unsigned int block_size) +{ + if (block_size > hblkrm_size[bpsize][psize]) + hblkrm_size[bpsize][psize] = block_size; +} + +/* + * Decode the Encoded segment base page size and actual page size. + * PAPR specifies: + * - bit 7 is the L bit + * - bits 0-5 are the penc value + * If the L bit is 0, this means 4K segment base page size and actual page size + * otherwise the penc value should be read. + */ +#define HBLKRM_L_MASK 0x80 +#define HBLKRM_PENC_MASK 0x3f +static inline void __init check_lp_set_hblkrm(unsigned int lp, + unsigned int block_size) +{ + unsigned int bpsize, psize; + + /* First, check the L bit, if not set, this means 4K */ + if ((lp & HBLKRM_L_MASK) == 0) { + set_hblkrm_bloc_size(MMU_PAGE_4K, MMU_PAGE_4K, block_size); + return; + } + + lp &= HBLKRM_PENC_MASK; + for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) { + struct mmu_psize_def *def = &mmu_psize_defs[bpsize]; + + for (psize = 0; psize < MMU_PAGE_COUNT; psize++) { + if (def->penc[psize] == lp) { + set_hblkrm_bloc_size(bpsize, psize, block_size); + return; + } + } + } +} + +#define SPLPAR_TLB_BIC_TOKEN 50 + +/* + * The size of the TLB Block Invalidate Characteristics is variable. But at the + * maximum it will be the number of possible page sizes *2 + 10 bytes. + * Currently MMU_PAGE_COUNT is 16, which means 42 bytes. Use a cache line size + * (128 bytes) for the buffer to get plenty of space. + */ +#define SPLPAR_TLB_BIC_MAXLENGTH 128 + +void __init pseries_lpar_read_hblkrm_characteristics(void) +{ + unsigned char local_buffer[SPLPAR_TLB_BIC_MAXLENGTH]; + int call_status, len, idx, bpsize; + + spin_lock(&rtas_data_buf_lock); + memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE); + call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, + NULL, + SPLPAR_TLB_BIC_TOKEN, + __pa(rtas_data_buf), + RTAS_DATA_BUF_SIZE); + memcpy(local_buffer, rtas_data_buf, SPLPAR_TLB_BIC_MAXLENGTH); + local_buffer[SPLPAR_TLB_BIC_MAXLENGTH - 1] = '\0'; + spin_unlock(&rtas_data_buf_lock); + + if (call_status != 0) { + pr_warn("%s %s Error calling get-system-parameter (0x%x)\n", + __FILE__, __func__, call_status); + return; + } + + /* + * The first two (2) bytes of the data in the buffer are the length of + * the returned data, not counting these first two (2) bytes. + */ + len = be16_to_cpu(*((u16 *)local_buffer)) + 2; + if (len > SPLPAR_TLB_BIC_MAXLENGTH) { + pr_warn("%s too large returned buffer %d", __func__, len); + return; + } + + idx = 2; + while (idx < len) { + u8 block_shift = local_buffer[idx++]; + u32 block_size; + unsigned int npsize; + + if (!block_shift) + break; + + block_size = 1 << block_shift; + + for (npsize = local_buffer[idx++]; + npsize > 0 && idx < len; npsize--) + check_lp_set_hblkrm((unsigned int) local_buffer[idx++], + block_size); + } + + for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) + for (idx = 0; idx < MMU_PAGE_COUNT; idx++) + if (hblkrm_size[bpsize][idx]) + pr_info("H_BLOCK_REMOVE supports base psize:%d psize:%d block size:%d", + bpsize, idx, hblkrm_size[bpsize][idx]); +} + /* * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie * lock. 
diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index a6624d4bd9d0..13fa370a87e4 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -112,5 +112,6 @@ static inline unsigned long cmo_get_page_size(void) int dlpar_workqueue_init(void); void pseries_setup_rfi_flush(void); +void pseries_lpar_read_hblkrm_characteristics(void); #endif /* _PSERIES_PSERIES_H */ diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index f8adcd0e4589..0a40201f315f 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -744,6 +744,7 @@ static void __init pSeries_setup_arch(void) pseries_setup_rfi_flush(); setup_stf_barrier(); + pseries_lpar_read_hblkrm_characteristics(); /* By default, only probe PCI (can be overridden by rtas_pci) */ pci_add_flags(PCI_PROBE_ONLY); From 59545ebe331917afcb1ebbd6f3e5dc3ed51beb05 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Fri, 20 Sep 2019 15:05:23 +0200 Subject: [PATCH 174/334] powerpc/pseries: Call H_BLOCK_REMOVE when supported Depending on the hardware and the hypervisor, the hcall H_BLOCK_REMOVE may not be able to process all the page sizes for a segment base page size, as reported by the TLB Invalidate Characteristics. For each pair of base segment page size and actual page size, this characteristic tells us the size of the block the hcall supports. In the case, the hcall is not supporting a pair of base segment page size, actual page size, it is returning H_PARAM which leads to a panic like this: kernel BUG at /home/srikar/work/linux.git/arch/powerpc/platforms/pseries/lpar.c:466! Oops: Exception in kernel mode, sig: 5 [#1] BE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries Modules linked in: CPU: 28 PID: 583 Comm: modprobe Not tainted 5.2.0-master #5 NIP: c0000000000be8dc LR: c0000000000be880 CTR: 0000000000000000 REGS: c0000007e77fb130 TRAP: 0700 Not tainted (5.2.0-master) MSR: 8000000000029032 CR: 42224824 XER: 20000000 CFAR: c0000000000be8fc IRQMASK: 0 GPR00: 0000000022224828 c0000007e77fb3c0 c000000001434d00 0000000000000005 GPR04: 9000000004fa8c00 0000000000000000 0000000000000003 0000000000000001 GPR08: c0000007e77fb450 0000000000000000 0000000000000001 ffffffffffffffff GPR12: c0000007e77fb450 c00000000edfcb80 0000cd7d3ea30000 c0000000016022b0 GPR16: 00000000000000b0 0000cd7d3ea30000 0000000000000001 c080001f04f00105 GPR20: 0000000000000003 0000000000000004 c000000fbeb05f58 c000000001602200 GPR24: 0000000000000000 0000000000000004 8800000000000000 c000000000c5d148 GPR28: c000000000000000 8000000000000000 a000000000000000 c0000007e77fb580 NIP [c0000000000be8dc] .call_block_remove+0x12c/0x220 LR [c0000000000be880] .call_block_remove+0xd0/0x220 Call Trace: 0xc000000fb8c00240 (unreliable) .pSeries_lpar_flush_hash_range+0x578/0x670 .flush_hash_range+0x44/0x100 .__flush_tlb_pending+0x3c/0xc0 .zap_pte_range+0x7ec/0x830 .unmap_page_range+0x3f4/0x540 .unmap_vmas+0x94/0x120 .exit_mmap+0xac/0x1f0 .mmput+0x9c/0x1f0 .do_exit+0x388/0xd60 .do_group_exit+0x54/0x100 .__se_sys_exit_group+0x14/0x20 system_call+0x5c/0x70 Instruction dump: 39400001 38a00000 4800003c 60000000 60420000 7fa9e800 38e00000 419e0014 7d29d278 7d290074 7929d182 69270001 <0b070000> 7d495378 394a0001 7fa93040 The call to H_BLOCK_REMOVE should only be made for the supported pair of base segment page size, actual page size and using the correct maximum block size. 
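Condensed from the lpar.c hunk in this patch, the guard on the hash flush path becomes:

	/* Take the H_BLOCK_REMOVE path only when the reported characteristics
	 * advertise the one block size we implement for this pair of base
	 * segment page size and actual page size; otherwise keep using the
	 * bulk-remove path.
	 */
	if (is_supported_hlbkrm(batch->psize, batch->psize)) {
		do_block_remove(number, batch, param);
		goto out;
	}
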
Due to the required complexity in do_block_remove() and call_block_remove(), and the fact that currently a block size of 8 is returned by the hypervisor, we only support a block size of 8 for the H_BLOCK_REMOVE hcall. In order to identify this limitation easily in the code, a local define HBLKRM_SUPPORTED_BLOCK_SIZE defining the currently supported block size, and a dedicated checking helper is_supported_hlbkrm() are introduced. For regular pages and hugetlb, the assumption is made that the page size is equal to the base page size. For THP the page size is assumed to be 16M. Fixes: ba2dd8a26baa ("powerpc/pseries/mm: call H_BLOCK_REMOVE") Signed-off-by: Laurent Dufour Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190920130523.20441-3-ldufour@linux.ibm.com --- arch/powerpc/platforms/pseries/lpar.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 8e3ac7ae4e11..b53359258d99 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -65,6 +65,13 @@ EXPORT_SYMBOL(plpar_hcall_norets); */ static int hblkrm_size[MMU_PAGE_COUNT][MMU_PAGE_COUNT] __ro_after_init; +/* + * Due to the involved complexity, and that the current hypervisor is only + * returning this value or 0, we are limiting the support of the H_BLOCK_REMOVE + * buffer size to 8 size block. + */ +#define HBLKRM_SUPPORTED_BLOCK_SIZE 8 + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE static u8 dtl_mask = DTL_LOG_PREEMPT; #else @@ -993,6 +1000,17 @@ static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn, #define HBLKR_CTRL_ERRNOTFOUND 0x8800000000000000UL #define HBLKR_CTRL_ERRBUSY 0xa000000000000000UL +/* + * Returned true if we are supporting this block size for the specified segment + * base page size and actual page size. + * + * Currently, we only support 8 size block. + */ +static inline bool is_supported_hlbkrm(int bpsize, int psize) +{ + return (hblkrm_size[bpsize][psize] == HBLKRM_SUPPORTED_BLOCK_SIZE); +} + /** * H_BLOCK_REMOVE caller. * @idx should point to the latest @param entry set with a PTEX. @@ -1152,7 +1170,8 @@ static inline void __pSeries_lpar_hugepage_invalidate(unsigned long *slot, if (lock_tlbie) spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags); - if (firmware_has_feature(FW_FEATURE_BLOCK_REMOVE)) + /* Assuming THP size is 16M */ + if (is_supported_hlbkrm(psize, MMU_PAGE_16M)) hugepage_block_invalidate(slot, vpn, count, psize, ssize); else hugepage_bulk_invalidate(slot, vpn, count, psize, ssize); @@ -1470,7 +1489,7 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local) if (lock_tlbie) spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags); - if (firmware_has_feature(FW_FEATURE_BLOCK_REMOVE)) { + if (is_supported_hlbkrm(batch->psize, batch->psize)) { do_block_remove(number, batch, param); goto out; } From 677733e296b5c7a37c47da391fc70a43dc40bd67 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 24 Sep 2019 09:22:51 +0530 Subject: [PATCH 175/334] powerpc/book3s64/mm: Don't do tlbie fixup for some hardware revisions The store ordering vs tlbie issue mentioned in commit a5d4b5891c2f ("powerpc/mm: Fixup tlbie vs store ordering issue on POWER9") is fixed for Nimbus 2.3 and Cumulus 1.3 revisions. We don't need to apply the fixup if we are running on them. We can only do this on PowerNV. On pseries guest with KVM we still don't support redoing the feature fixup after migration.
So we should be enabling all the workarounds needed, because we can possibly migrate between DD 2.3 and DD 2.2. Fixes: a5d4b5891c2f ("powerpc/mm: Fixup tlbie vs store ordering issue on POWER9") Cc: stable@vger.kernel.org # v4.16+ Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190924035254.24612-1-aneesh.kumar@linux.ibm.com --- arch/powerpc/kernel/dt_cpu_ftrs.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index bceee2fde885..5fc1b527de46 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -692,9 +692,35 @@ static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f) return true; } +/* + * Handle POWER9 broadcast tlbie invalidation issue using + * cpu feature flag. + */ +static __init void update_tlbie_feature_flag(unsigned long pvr) +{ + if (PVR_VER(pvr) == PVR_POWER9) { + /* + * Set the tlbie feature flag for anything below + * Nimbus DD 2.3 and Cumulus DD 1.3 + */ + if ((pvr & 0xe000) == 0) { + /* Nimbus */ + if ((pvr & 0xfff) < 0x203) + cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG; + } else if ((pvr & 0xc000) == 0) { + /* Cumulus */ + if ((pvr & 0xfff) < 0x103) + cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG; + } else { + WARN_ONCE(1, "Unknown PVR"); + cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG; + } + } +} + static __init void cpufeatures_cpu_quirks(void) { - int version = mfspr(SPRN_PVR); + unsigned long version = mfspr(SPRN_PVR); /* * Not all quirks can be derived from the cpufeatures device tree. @@ -713,10 +739,10 @@ static __init void cpufeatures_cpu_quirks(void) if ((version & 0xffff0000) == 0x004e0000) { cur_cpu_spec->cpu_features &= ~(CPU_FTR_DAWR); - cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG; cur_cpu_spec->cpu_features |= CPU_FTR_P9_TIDR; } + update_tlbie_feature_flag(version); /* * PKEY was not in the initial base or feature node * specification, but it should become optional in the next From 09ce98cacd51fcd0fa0af2f79d1e1d3192f4cbb0 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 24 Sep 2019 09:22:52 +0530 Subject: [PATCH 176/334] powerpc/book3s64/radix: Rename CPU_FTR_P9_TLBIE_BUG feature flag Rename the #define to indicate this is related to the store vs tlbie ordering issue. In the next patch, we will be adding another feature flag that is used to handle the ERAT flush vs tlbie ordering issue.
Fixes: a5d4b5891c2f ("powerpc/mm: Fixup tlbie vs store ordering issue on POWER9") Cc: stable@vger.kernel.org # v4.16+ Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190924035254.24612-2-aneesh.kumar@linux.ibm.com --- arch/powerpc/include/asm/cputable.h | 4 ++-- arch/powerpc/kernel/dt_cpu_ftrs.c | 6 +++--- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2 +- arch/powerpc/mm/book3s64/hash_native.c | 2 +- arch/powerpc/mm/book3s64/radix_tlb.c | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index a1ebcbc3931f..f080fba48619 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -209,7 +209,7 @@ static inline void cpu_feature_keys_init(void) { } #define CPU_FTR_POWER9_DD2_1 LONG_ASM_CONST(0x0000080000000000) #define CPU_FTR_P9_TM_HV_ASSIST LONG_ASM_CONST(0x0000100000000000) #define CPU_FTR_P9_TM_XER_SO_BUG LONG_ASM_CONST(0x0000200000000000) -#define CPU_FTR_P9_TLBIE_BUG LONG_ASM_CONST(0x0000400000000000) +#define CPU_FTR_P9_TLBIE_STQ_BUG LONG_ASM_CONST(0x0000400000000000) #define CPU_FTR_P9_TIDR LONG_ASM_CONST(0x0000800000000000) #ifndef __ASSEMBLY__ @@ -457,7 +457,7 @@ static inline void cpu_feature_keys_init(void) { } CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \ CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_ARCH_207S | \ CPU_FTR_TM_COMP | CPU_FTR_ARCH_300 | CPU_FTR_PKEY | \ - CPU_FTR_P9_TLBIE_BUG | CPU_FTR_P9_TIDR) + CPU_FTR_P9_TLBIE_STQ_BUG | CPU_FTR_P9_TIDR) #define CPU_FTRS_POWER9_DD2_0 CPU_FTRS_POWER9 #define CPU_FTRS_POWER9_DD2_1 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1) #define CPU_FTRS_POWER9_DD2_2 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1 | \ diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 5fc1b527de46..a86486390c70 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -706,14 +706,14 @@ static __init void update_tlbie_feature_flag(unsigned long pvr) if ((pvr & 0xe000) == 0) { /* Nimbus */ if ((pvr & 0xfff) < 0x203) - cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG; + cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_STQ_BUG; } else if ((pvr & 0xc000) == 0) { /* Cumulus */ if ((pvr & 0xfff) < 0x103) - cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG; + cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_STQ_BUG; } else { WARN_ONCE(1, "Unknown PVR"); - cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG; + cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_STQ_BUG; } } } diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 7186c65c61c9..cfdf232aac34 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -451,7 +451,7 @@ static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues, "r" (rbvalues[i]), "r" (kvm->arch.lpid)); } - if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { /* * Need the extra ptesync to make sure we don't * re-order the tlbie diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 90ab4f31e2b3..02568dae4695 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -199,7 +199,7 @@ static inline unsigned long ___tlbie(unsigned long vpn, int psize, static inline void fixup_tlbie(unsigned long vpn, int psize, int apsize, int ssize) { - if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { + if 
(cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { /* Need the extra ptesync to ensure we don't reorder tlbie*/ asm volatile("ptesync": : :"memory"); ___tlbie(vpn, psize, apsize, ssize); diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 631be42abd33..69fdc004d83f 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -201,7 +201,7 @@ static inline void fixup_tlbie(void) unsigned long pid = 0; unsigned long va = ((1UL << 52) - 1); - if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { asm volatile("ptesync": : :"memory"); __tlbie_va(va, pid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB); } @@ -211,7 +211,7 @@ static inline void fixup_tlbie_lpid(unsigned long lpid) { unsigned long va = ((1UL << 52) - 1); - if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { asm volatile("ptesync": : :"memory"); __tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB); } From 047e6575aec71d75b765c22111820c4776cd1c43 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 24 Sep 2019 09:22:53 +0530 Subject: [PATCH 177/334] powerpc/mm: Fixup tlbie vs mtpidr/mtlpidr ordering issue on POWER9 On POWER9, under some circumstances, a broadcast TLB invalidation will fail to invalidate the ERAT cache on some threads when there are parallel mtpidr/mtlpidr happening on other threads of the same core. This can cause stores to continue to go to a page after it's unmapped. The workaround is to force an ERAT flush using PID=0 or LPID=0 tlbie flush. This additional TLB flush will cause the ERAT cache invalidation. Since we are using PID=0 or LPID=0, we don't get filtered out by the TLB snoop filtering logic. We need to still follow this up with another tlbie to take care of store vs tlbie ordering issue explained in commit: a5d4b5891c2f ("powerpc/mm: Fixup tlbie vs store ordering issue on POWER9"). The presence of ERAT cache implies we can still get new stores and they may miss store queue marking flush. 
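Condensed from the radix_tlb.c hunks below, the per-VA fixup becomes a two-step sequence, each step gated on its own feature bit:

	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
		asm volatile("ptesync" : : : "memory");
		__tlbie_va(va, 0, ap, RIC_FLUSH_TLB);	/* PID=0 forces the ERAT flush */
	}

	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
		asm volatile("ptesync" : : : "memory");
		__tlbie_va(va, pid, ap, RIC_FLUSH_TLB);	/* store vs tlbie ordering fixup */
	}
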
Cc: stable@vger.kernel.org Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190924035254.24612-3-aneesh.kumar@linux.ibm.com --- arch/powerpc/include/asm/cputable.h | 3 +- arch/powerpc/kernel/dt_cpu_ftrs.c | 2 + arch/powerpc/kvm/book3s_hv_rm_mmu.c | 42 ++++++++++---- arch/powerpc/mm/book3s64/hash_native.c | 29 +++++++++- arch/powerpc/mm/book3s64/radix_tlb.c | 80 +++++++++++++++++++++++--- 5 files changed, 134 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index f080fba48619..cf00ff0d121d 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -211,6 +211,7 @@ static inline void cpu_feature_keys_init(void) { } #define CPU_FTR_P9_TM_XER_SO_BUG LONG_ASM_CONST(0x0000200000000000) #define CPU_FTR_P9_TLBIE_STQ_BUG LONG_ASM_CONST(0x0000400000000000) #define CPU_FTR_P9_TIDR LONG_ASM_CONST(0x0000800000000000) +#define CPU_FTR_P9_TLBIE_ERAT_BUG LONG_ASM_CONST(0x0001000000000000) #ifndef __ASSEMBLY__ @@ -457,7 +458,7 @@ static inline void cpu_feature_keys_init(void) { } CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \ CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_ARCH_207S | \ CPU_FTR_TM_COMP | CPU_FTR_ARCH_300 | CPU_FTR_PKEY | \ - CPU_FTR_P9_TLBIE_STQ_BUG | CPU_FTR_P9_TIDR) + CPU_FTR_P9_TLBIE_STQ_BUG | CPU_FTR_P9_TLBIE_ERAT_BUG | CPU_FTR_P9_TIDR) #define CPU_FTRS_POWER9_DD2_0 CPU_FTRS_POWER9 #define CPU_FTRS_POWER9_DD2_1 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1) #define CPU_FTRS_POWER9_DD2_2 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1 | \ diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index a86486390c70..180b3a5d1001 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -715,6 +715,8 @@ static __init void update_tlbie_feature_flag(unsigned long pvr) WARN_ONCE(1, "Unknown PVR"); cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_STQ_BUG; } + + cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_ERAT_BUG; } } diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index cfdf232aac34..220305454c23 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -433,6 +433,37 @@ static inline int is_mmio_hpte(unsigned long v, unsigned long r) (HPTE_R_KEY_HI | HPTE_R_KEY_LO)); } +static inline void fixup_tlbie_lpid(unsigned long rb_value, unsigned long lpid) +{ + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + /* Radix flush for a hash guest */ + + unsigned long rb,rs,prs,r,ric; + + rb = PPC_BIT(52); /* IS = 2 */ + rs = 0; /* lpid = 0 */ + prs = 0; /* partition scoped */ + r = 1; /* radix format */ + ric = 0; /* RIC_FLSUH_TLB */ + + /* + * Need the extra ptesync to make sure we don't + * re-order the tlbie + */ + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), + "i"(ric), "r"(rs) : "memory"); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_TLBIE_5(%0,%1,0,0,0) : : + "r" (rb_value), "r" (lpid)); + } +} + static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues, long npages, int global, bool need_sync) { @@ -451,16 +482,7 @@ static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues, "r" (rbvalues[i]), "r" (kvm->arch.lpid)); } - if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { - /* - * Need the extra ptesync to make sure we don't - * re-order the tlbie - */ - asm volatile("ptesync": : 
:"memory"); - asm volatile(PPC_TLBIE_5(%0,%1,0,0,0) : : - "r" (rbvalues[0]), "r" (kvm->arch.lpid)); - } - + fixup_tlbie_lpid(rbvalues[i - 1], kvm->arch.lpid); asm volatile("eieio; tlbsync; ptesync" : : : "memory"); } else { if (need_sync) diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 02568dae4695..523e42eb11da 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -197,8 +197,31 @@ static inline unsigned long ___tlbie(unsigned long vpn, int psize, return va; } -static inline void fixup_tlbie(unsigned long vpn, int psize, int apsize, int ssize) +static inline void fixup_tlbie_vpn(unsigned long vpn, int psize, + int apsize, int ssize) { + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + /* Radix flush for a hash guest */ + + unsigned long rb,rs,prs,r,ric; + + rb = PPC_BIT(52); /* IS = 2 */ + rs = 0; /* lpid = 0 */ + prs = 0; /* partition scoped */ + r = 1; /* radix format */ + ric = 0; /* RIC_FLSUH_TLB */ + + /* + * Need the extra ptesync to make sure we don't + * re-order the tlbie + */ + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), + "i"(ric), "r"(rs) : "memory"); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { /* Need the extra ptesync to ensure we don't reorder tlbie*/ asm volatile("ptesync": : :"memory"); @@ -283,7 +306,7 @@ static inline void tlbie(unsigned long vpn, int psize, int apsize, asm volatile("ptesync": : :"memory"); } else { __tlbie(vpn, psize, apsize, ssize); - fixup_tlbie(vpn, psize, apsize, ssize); + fixup_tlbie_vpn(vpn, psize, apsize, ssize); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } if (lock_tlbie && !use_local) @@ -856,7 +879,7 @@ static void native_flush_hash_range(unsigned long number, int local) /* * Just do one more with the last used values. */ - fixup_tlbie(vpn, psize, psize, ssize); + fixup_tlbie_vpn(vpn, psize, psize, ssize); asm volatile("eieio; tlbsync; ptesync":::"memory"); if (lock_tlbie) diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 69fdc004d83f..67af871190c6 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -196,21 +196,82 @@ static __always_inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid trace_tlbie(lpid, 0, rb, rs, ric, prs, r); } -static inline void fixup_tlbie(void) + +static inline void fixup_tlbie_va(unsigned long va, unsigned long pid, + unsigned long ap) { - unsigned long pid = 0; + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_va(va, 0, ap, RIC_FLUSH_TLB); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_va(va, pid, ap, RIC_FLUSH_TLB); + } +} + +static inline void fixup_tlbie_va_range(unsigned long va, unsigned long pid, + unsigned long ap) +{ + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_pid(0, RIC_FLUSH_TLB); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_va(va, pid, ap, RIC_FLUSH_TLB); + } +} + +static inline void fixup_tlbie_pid(unsigned long pid) +{ + /* + * We can use any address for the invalidation, pick one which is + * probably unused as an optimisation. 
+ */ unsigned long va = ((1UL << 52) - 1); + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_pid(0, RIC_FLUSH_TLB); + } + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { asm volatile("ptesync": : :"memory"); __tlbie_va(va, pid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB); } } + +static inline void fixup_tlbie_lpid_va(unsigned long va, unsigned long lpid, + unsigned long ap) +{ + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_lpid_va(va, 0, ap, RIC_FLUSH_TLB); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_lpid_va(va, lpid, ap, RIC_FLUSH_TLB); + } +} + static inline void fixup_tlbie_lpid(unsigned long lpid) { + /* + * We can use any address for the invalidation, pick one which is + * probably unused as an optimisation. + */ unsigned long va = ((1UL << 52) - 1); + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_lpid(0, RIC_FLUSH_TLB); + } + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { asm volatile("ptesync": : :"memory"); __tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB); @@ -258,6 +319,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric) switch (ric) { case RIC_FLUSH_TLB: __tlbie_pid(pid, RIC_FLUSH_TLB); + fixup_tlbie_pid(pid); break; case RIC_FLUSH_PWC: __tlbie_pid(pid, RIC_FLUSH_PWC); @@ -265,8 +327,8 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric) case RIC_FLUSH_ALL: default: __tlbie_pid(pid, RIC_FLUSH_ALL); + fixup_tlbie_pid(pid); } - fixup_tlbie(); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } @@ -315,6 +377,7 @@ static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric) switch (ric) { case RIC_FLUSH_TLB: __tlbie_lpid(lpid, RIC_FLUSH_TLB); + fixup_tlbie_lpid(lpid); break; case RIC_FLUSH_PWC: __tlbie_lpid(lpid, RIC_FLUSH_PWC); @@ -322,8 +385,8 @@ static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric) case RIC_FLUSH_ALL: default: __tlbie_lpid(lpid, RIC_FLUSH_ALL); + fixup_tlbie_lpid(lpid); } - fixup_tlbie_lpid(lpid); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } @@ -390,6 +453,8 @@ static inline void __tlbie_va_range(unsigned long start, unsigned long end, for (addr = start; addr < end; addr += page_size) __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); + + fixup_tlbie_va_range(addr - page_size, pid, ap); } static __always_inline void _tlbie_va(unsigned long va, unsigned long pid, @@ -399,7 +464,7 @@ static __always_inline void _tlbie_va(unsigned long va, unsigned long pid, asm volatile("ptesync": : :"memory"); __tlbie_va(va, pid, ap, ric); - fixup_tlbie(); + fixup_tlbie_va(va, pid, ap); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } @@ -457,7 +522,7 @@ static __always_inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid, asm volatile("ptesync": : :"memory"); __tlbie_lpid_va(va, lpid, ap, ric); - fixup_tlbie_lpid(lpid); + fixup_tlbie_lpid_va(va, lpid, ap); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } @@ -469,7 +534,6 @@ static inline void _tlbie_va_range(unsigned long start, unsigned long end, if (also_pwc) __tlbie_pid(pid, RIC_FLUSH_PWC); __tlbie_va_range(start, end, pid, page_size, psize); - fixup_tlbie(); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } @@ -856,7 +920,7 @@ is_local: if (gflush) __tlbie_va_range(gstart, gend, pid, PUD_SIZE, MMU_PAGE_1G); - fixup_tlbie(); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); } else { 
_tlbiel_va_range_multicast(mm, From 9f7fec0ba89108b9385f1b9fb167861224912a4a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Sep 2019 13:08:52 +0100 Subject: [PATCH 178/334] Btrfs: fix selftests failure due to uninitialized i_mode in test inodes Some of the self tests create a test inode, setup some extents and then do calls to btrfs_get_extent() to test that the corresponding extent maps exist and are correct. However btrfs_get_extent(), since the 5.2 merge window, now errors out when it finds a regular or prealloc extent for an inode that does not correspond to a regular file (its ->i_mode is not S_IFREG). This causes the self tests to fail sometimes, specially when KASAN, slub_debug and page poisoning are enabled: $ modprobe btrfs modprobe: ERROR: could not insert 'btrfs': Invalid argument $ dmesg [ 9414.691648] Btrfs loaded, crc32c=crc32c-intel, debug=on, assert=on, integrity-checker=on, ref-verify=on [ 9414.692655] BTRFS: selftest: sectorsize: 4096 nodesize: 4096 [ 9414.692658] BTRFS: selftest: running btrfs free space cache tests [ 9414.692918] BTRFS: selftest: running extent only tests [ 9414.693061] BTRFS: selftest: running bitmap only tests [ 9414.693366] BTRFS: selftest: running bitmap and extent tests [ 9414.696455] BTRFS: selftest: running space stealing from bitmap to extent tests [ 9414.697131] BTRFS: selftest: running extent buffer operation tests [ 9414.697133] BTRFS: selftest: running btrfs_split_item tests [ 9414.697564] BTRFS: selftest: running extent I/O tests [ 9414.697583] BTRFS: selftest: running find delalloc tests [ 9415.081125] BTRFS: selftest: running find_first_clear_extent_bit test [ 9415.081278] BTRFS: selftest: running extent buffer bitmap tests [ 9415.124192] BTRFS: selftest: running inode tests [ 9415.124195] BTRFS: selftest: running btrfs_get_extent tests [ 9415.127909] BTRFS: selftest: running hole first btrfs_get_extent test [ 9415.128343] BTRFS critical (device (efault)): regular/prealloc extent found for non-regular inode 256 [ 9415.131428] BTRFS: selftest: fs/btrfs/tests/inode-tests.c:904 expected a real extent, got 0 This happens because the test inodes are created without ever initializing the i_mode field of the inode, and neither VFS's new_inode() nor the btrfs callback btrfs_alloc_inode() initialize the i_mode. Initialization of the i_mode is done through the various callbacks used by the VFS to create new inodes (regular files, directories, symlinks, tmpfiles, etc), which all call btrfs_new_inode() which in turn calls inode_init_owner(), which sets the inode's i_mode. Since the tests only uses new_inode() to create the test inodes, the i_mode was never initialized. This always happens on a VM I used with kasan, slub_debug and many other debug facilities enabled. It also happened to someone who reported this on bugzilla (on a 5.3-rc). Fix this by setting i_mode to S_IFREG at btrfs_new_test_inode(). 
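For reference, the fixed helper reads as follows (the same change appears as a diff below); passing a NULL dir is fine here because inode_init_owner() then just sets i_mode and takes the uid/gid from the current task:

	struct inode *btrfs_new_test_inode(void)
	{
		struct inode *inode;

		inode = new_inode(test_mnt->mnt_sb);
		if (inode)
			/* no parent directory to inherit S_ISGID from */
			inode_init_owner(inode, NULL, S_IFREG);

		return inode;
	}
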
Fixes: 6bf9e4bd6a2778 ("btrfs: inode: Verify inode mode to avoid NULL pointer dereference") Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=204397 Signed-off-by: Filipe Manana Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/tests/btrfs-tests.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index b5e80563efaa..99fe9bf3fdac 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -52,7 +52,13 @@ static struct file_system_type test_type = { struct inode *btrfs_new_test_inode(void) { - return new_inode(test_mnt->mnt_sb); + struct inode *inode; + + inode = new_inode(test_mnt->mnt_sb); + if (inode) + inode_init_owner(inode, NULL, S_IFREG); + + return inode; } static int btrfs_init_test_fs(void) From eb5b64f142504a597d67e2109d603055ff765e52 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Fri, 13 Sep 2019 14:54:07 +0100 Subject: [PATCH 179/334] btrfs: adjust dirty_metadata_bytes after writeback failure of extent buffer Before, if an eb failed to write out, we would end up triggering a BUG_ON(). As of f4340622e0226 ("btrfs: extent_io: Move the BUG_ON() in flush_write_bio() one level up"), we no longer BUG_ON(), so we should make life consistent and add back the unwritten bytes to dirty_metadata_bytes. Fixes: f4340622e022 ("btrfs: extent_io: Move the BUG_ON() in flush_write_bio() one level up") CC: stable@vger.kernel.org # 5.2+ Reviewed-by: Filipe Manana Signed-off-by: Dennis Zhou Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4dc5e6939856..67066ece4de3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3728,11 +3728,20 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb) static void set_btree_ioerr(struct page *page) { struct extent_buffer *eb = (struct extent_buffer *)page->private; + struct btrfs_fs_info *fs_info; SetPageError(page); if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) return; + /* + * If we error out, we should add back the dirty_metadata_bytes + * to make it consistent. + */ + fs_info = eb->fs_info; + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, + eb->len, fs_info->dirty_metadata_batch); + /* * If writeback for a btree extent that doesn't belong to a log tree * failed, increment the counter transaction->eb_write_errors. From 0607eb1d452d45c5ac4c745a9e9e0d95152ea9d0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 11 Sep 2019 17:42:28 +0100 Subject: [PATCH 180/334] Btrfs: fix missing error return if writeback for extent buffer never started If lock_extent_buffer_for_io() fails, it returns a negative value, but its caller btree_write_cache_pages() ignores such errors. This means that a call to flush_write_bio(), from lock_extent_buffer_for_io(), might have failed. We should make btree_write_cache_pages() notice such error values and stop immediately, making sure filemap_fdatawrite_range() returns an error to the transaction commit path. A failure from flush_write_bio() should also result in the endio callback end_bio_extent_buffer_writepage() being invoked, which sets the BTRFS_FS_*_ERR bits appropriately, so that there's no risk that a transaction or log commit doesn't catch a writeback failure.
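Condensed from the extent_io.c hunk below, the eb loop in btree_write_cache_pages() now distinguishes the error case:

	ret = lock_extent_buffer_for_io(eb, &epd);
	if (!ret) {
		free_extent_buffer(eb);
		continue;	/* nothing to submit for this eb */
	} else if (ret < 0) {
		done = 1;	/* abort the whole writeback pass */
		free_extent_buffer(eb);
		break;		/* error propagates via filemap_fdatawrite_range() */
	}

	ret = write_one_eb(eb, wbc, &epd);
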
Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 67066ece4de3..380ced84d4b1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3978,6 +3978,10 @@ retry: if (!ret) { free_extent_buffer(eb); continue; + } else if (ret < 0) { + done = 1; + free_extent_buffer(eb); + break; } ret = write_one_eb(eb, wbc, &epd); From 3d66b89c30f9220a72e92847768fc8ba4d027d88 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Sep 2019 12:57:04 -0700 Subject: [PATCH 181/334] net: sched: fix possible crash in tcf_action_destroy() If the allocation done in tcf_exts_init() failed, we end up with a NULL pointer in exts->actions. kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 8198 Comm: syz-executor.3 Not tainted 5.3.0-rc8+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:tcf_action_destroy+0x71/0x160 net/sched/act_api.c:705 Code: c3 08 44 89 ee e8 4f cb bb fb 41 83 fd 20 0f 84 c9 00 00 00 e8 c0 c9 bb fb 48 89 d8 48 b9 00 00 00 00 00 fc ff df 48 c1 e8 03 <80> 3c 08 00 0f 85 c0 00 00 00 4c 8b 33 4d 85 f6 0f 84 9d 00 00 00 RSP: 0018:ffff888096e16ff0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: dffffc0000000000 RDX: 0000000000040000 RSI: ffffffff85b6ab30 RDI: 0000000000000000 RBP: ffff888096e17020 R08: ffff8880993f6140 R09: fffffbfff11cae67 R10: fffffbfff11cae66 R11: ffffffff88e57333 R12: 0000000000000000 R13: 0000000000000000 R14: ffff888096e177a0 R15: 0000000000000001 FS: 00007f62bc84a700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000758040 CR3: 0000000088b64000 CR4: 00000000001426e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tcf_exts_destroy+0x38/0xb0 net/sched/cls_api.c:3030 tcindex_set_parms+0xf7f/0x1e50 net/sched/cls_tcindex.c:488 tcindex_change+0x230/0x318 net/sched/cls_tcindex.c:519 tc_new_tfilter+0xa4b/0x1c70 net/sched/cls_api.c:2152 rtnetlink_rcv_msg+0x838/0xb00 net/core/rtnetlink.c:5214 netlink_rcv_skb+0x177/0x450 net/netlink/af_netlink.c:2477 rtnetlink_rcv+0x1d/0x30 net/core/rtnetlink.c:5241 netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] netlink_unicast+0x531/0x710 net/netlink/af_netlink.c:1328 netlink_sendmsg+0x8a5/0xd60 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0xd7/0x130 net/socket.c:657 ___sys_sendmsg+0x3e2/0x920 net/socket.c:2311 __sys_sendmmsg+0x1bf/0x4d0 net/socket.c:2413 __do_sys_sendmmsg net/socket.c:2442 [inline] Fixes: 90b73b77d08e ("net: sched: change action API to use array of pointers to actions") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Vlad Buslov Cc: Jiri Pirko Signed-off-by: David S. 
Miller --- net/sched/cls_api.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 32577c248968..64584a1df425 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -2894,8 +2894,10 @@ out: void tcf_exts_destroy(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - tcf_action_destroy(exts->actions, TCA_ACT_UNBIND); - kfree(exts->actions); + if (exts->actions) { + tcf_action_destroy(exts->actions, TCA_ACT_UNBIND); + kfree(exts->actions); + } exts->nr_actions = 0; #endif } From b91ee4aa2a2199ba4d4650706c272985a5a32d80 Mon Sep 17 00:00:00 2001 From: Ori Nimron Date: Fri, 20 Sep 2019 09:35:45 +0200 Subject: [PATCH 182/334] mISDN: enforce CAP_NET_RAW for raw sockets When creating a raw AF_ISDN socket, CAP_NET_RAW needs to be checked first. Signed-off-by: Ori Nimron Signed-off-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- drivers/isdn/mISDN/socket.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index c6ba37df4b9d..dff4132b3702 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -754,6 +754,8 @@ base_sock_create(struct net *net, struct socket *sock, int protocol, int kern) if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; + if (!capable(CAP_NET_RAW)) + return -EPERM; sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto, kern); if (!sk) From 6cc03e8aa36c51f3b26a0d21a3c4ce2809c842ac Mon Sep 17 00:00:00 2001 From: Ori Nimron Date: Fri, 20 Sep 2019 09:35:46 +0200 Subject: [PATCH 183/334] appletalk: enforce CAP_NET_RAW for raw sockets When creating a raw AF_APPLETALK socket, CAP_NET_RAW needs to be checked first. Signed-off-by: Ori Nimron Signed-off-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- net/appletalk/ddp.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 4072e9d394d6..b41375d4d295 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1023,6 +1023,11 @@ static int atalk_create(struct net *net, struct socket *sock, int protocol, */ if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) goto out; + + rc = -EPERM; + if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW)) + goto out; + rc = -ENOMEM; sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto, kern); if (!sk) From 0614e2b73768b502fc32a75349823356d98aae2c Mon Sep 17 00:00:00 2001 From: Ori Nimron Date: Fri, 20 Sep 2019 09:35:47 +0200 Subject: [PATCH 184/334] ax25: enforce CAP_NET_RAW for raw sockets When creating a raw AF_AX25 socket, CAP_NET_RAW needs to be checked first. Signed-off-by: Ori Nimron Signed-off-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- net/ax25/af_ax25.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index ca5207767dc2..bb222b882b67 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -855,6 +855,8 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol, break; case SOCK_RAW: + if (!capable(CAP_NET_RAW)) + return -EPERM; break; default: return -ESOCKTNOSUPPORT; From e69dbd4619e7674c1679cba49afd9dd9ac347eef Mon Sep 17 00:00:00 2001 From: Ori Nimron Date: Fri, 20 Sep 2019 09:35:48 +0200 Subject: [PATCH 185/334] ieee802154: enforce CAP_NET_RAW for raw sockets When creating a raw AF_IEEE802154 socket, CAP_NET_RAW needs to be checked first. Signed-off-by: Ori Nimron Signed-off-by: Greg Kroah-Hartman Acked-by: Stefan Schmidt Signed-off-by: David S. 
Miller --- net/ieee802154/socket.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index badc5cfe4dc6..d93d4531aa9b 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -1008,6 +1008,9 @@ static int ieee802154_create(struct net *net, struct socket *sock, switch (sock->type) { case SOCK_RAW: + rc = -EPERM; + if (!capable(CAP_NET_RAW)) + goto out; proto = &ieee802154_raw_prot; ops = &ieee802154_raw_ops; break; From 3a359798b176183ef09efb7a3dc59abad1cc7104 Mon Sep 17 00:00:00 2001 From: Ori Nimron Date: Fri, 20 Sep 2019 09:35:49 +0200 Subject: [PATCH 186/334] nfc: enforce CAP_NET_RAW for raw sockets When creating a raw AF_NFC socket, CAP_NET_RAW needs to be checked first. Signed-off-by: Ori Nimron Signed-off-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- net/nfc/llcp_sock.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 9b8742947aff..8dfea26536c9 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -1004,10 +1004,13 @@ static int llcp_sock_create(struct net *net, struct socket *sock, sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; - if (sock->type == SOCK_RAW) + if (sock->type == SOCK_RAW) { + if (!capable(CAP_NET_RAW)) + return -EPERM; sock->ops = &llcp_rawsock_ops; - else + } else { sock->ops = &llcp_sock_ops; + } sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC, kern); if (sk == NULL) From 13fc1d271a2e3ab8a02071e711add01fab9271f6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 24 Sep 2019 10:49:54 +0100 Subject: [PATCH 187/334] Btrfs: fix race setting up and completing qgroup rescan workers There is a race between setting up a qgroup rescan worker and completing a qgroup rescan worker that can lead to callers of the qgroup rescan wait ioctl to either not wait for the rescan worker to complete or to hang forever due to missing wake ups. The following diagram shows a sequence of steps that illustrates the race.

CPU 1:
 btrfs_ioctl_quota_rescan()
  btrfs_qgroup_rescan()
   qgroup_rescan_init()
    mutex_lock(&fs_info->qgroup_rescan_lock)
    spin_lock(&fs_info->qgroup_lock)
    fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN
    init_completion(&fs_info->qgroup_rescan_completion)
    fs_info->qgroup_rescan_running = true
    mutex_unlock(&fs_info->qgroup_rescan_lock)
    spin_unlock(&fs_info->qgroup_lock)
   btrfs_init_work() --> starts the worker

CPU 2:
 btrfs_qgroup_rescan_worker()
  mutex_lock(&fs_info->qgroup_rescan_lock)
  fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN
  mutex_unlock(&fs_info->qgroup_rescan_lock)
  starts transaction, updates qgroup status item, etc

CPU 3:
 btrfs_ioctl_quota_rescan()
  btrfs_qgroup_rescan()
   qgroup_rescan_init()
    mutex_lock(&fs_info->qgroup_rescan_lock)
    spin_lock(&fs_info->qgroup_lock)
    fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN
    init_completion(&fs_info->qgroup_rescan_completion)
    fs_info->qgroup_rescan_running = true
    mutex_unlock(&fs_info->qgroup_rescan_lock)
    spin_unlock(&fs_info->qgroup_lock)
   btrfs_init_work() --> starts another worker

CPU 2 (first worker, finishing):
 mutex_lock(&fs_info->qgroup_rescan_lock)
 fs_info->qgroup_rescan_running = false
 mutex_unlock(&fs_info->qgroup_rescan_lock)
 complete_all(&fs_info->qgroup_rescan_completion)

Before the rescan worker started by the task at CPU 3 completes, if another task calls btrfs_ioctl_quota_rescan(), it will get -EINPROGRESS because the flag BTRFS_QGROUP_STATUS_FLAG_RESCAN is set at fs_info->qgroup_flags, which is expected and correct behaviour.
However, if another task calls btrfs_ioctl_quota_rescan_wait() before the rescan worker started by the task at CPU 3 completes, it will return immediately without waiting for the new rescan worker to complete, because fs_info->qgroup_rescan_running is set to false by CPU 2. This race makes test case btrfs/171 (from fstests) fail often: btrfs/171 9s ... - output mismatch (see /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad) --- tests/btrfs/171.out 2018-09-16 21:30:48.505104287 +0100 +++ /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad 2019-09-19 02:01:36.938486039 +0100 @@ -1,2 +1,3 @@ QA output created by 171 +ERROR: quota rescan failed: Operation now in progress Silence is golden ... (Run 'diff -u /home/fdmanana/git/hub/xfstests/tests/btrfs/171.out /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad' to see the entire diff) That is because the test calls the btrfs-progs commands "qgroup quota rescan -w", "qgroup assign" and "qgroup remove" in a sequence that makes calls to the rescan start ioctl fail with -EINPROGRESS (note the "btrfs" commands 'qgroup assign' and 'qgroup remove' often call the rescan start ioctl after calling the qgroup assign ioctl, btrfs_ioctl_qgroup_assign()), since previous waits didn't actually wait. Another problem the race can cause is missing wake ups for waiters, since the call to complete_all() happens outside a critical section and after clearing the flag BTRFS_QGROUP_STATUS_FLAG_RESCAN. In the sequence diagram above, if we have a waiter for the first rescan task (executed by CPU 2), then fs_info->qgroup_rescan_completion.wait is not empty, and if after the rescan worker clears BTRFS_QGROUP_STATUS_FLAG_RESCAN and before it calls complete_all() against fs_info->qgroup_rescan_completion, the task at CPU 3 calls init_completion() against fs_info->qgroup_rescan_completion which re-initializes its wait queue to an empty queue, therefore causing the rescan worker at CPU 2 to call complete_all() against an empty queue, never waking up the task waiting for that rescan worker. Fix this by clearing BTRFS_QGROUP_STATUS_FLAG_RESCAN and setting fs_info->qgroup_rescan_running to false in the same critical section, delimited by the mutex fs_info->qgroup_rescan_lock, as well as doing the call to complete_all() in that same critical section. This gives the protection needed to avoid rescan wait ioctl callers not waiting for a running rescan worker and the lost wake ups problem, since setting that rescan flag and boolean as well as initializing the wait queue is done already in a critical section delimited by that mutex (at qgroup_rescan_init()).
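The lost wake-up window closed by this change can be modelled in userspace with a condition variable standing in for the kernel completion; the essential rule is that clearing the running state and waking waiters happen inside the same critical section that a re-initializing starter must take. A hedged sketch, with illustrative names rather than btrfs code:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t rescan_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t rescan_done = PTHREAD_COND_INITIALIZER;
static bool rescan_running;

static void worker_finish(void)
{
	pthread_mutex_lock(&rescan_lock);
	rescan_running = false;
	/* broadcast while still holding the lock: a concurrent "init"
	 * cannot slip in between the state change and the wake-up and
	 * reset the wait queue, so no waiter is lost */
	pthread_cond_broadcast(&rescan_done);
	pthread_mutex_unlock(&rescan_lock);
}

static void rescan_wait(void)
{
	pthread_mutex_lock(&rescan_lock);
	while (rescan_running)
		pthread_cond_wait(&rescan_done, &rescan_lock);
	pthread_mutex_unlock(&rescan_lock);
}

int main(void)
{
	rescan_running = true;
	worker_finish();	/* in the kernel these run on different CPUs */
	rescan_wait();		/* returns immediately; no lost wake-up */
	return 0;
}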
Fixes: 57254b6ebce4ce ("Btrfs: add ioctl to wait for qgroup rescan completion") Fixes: d2c609b834d62f ("btrfs: properly track when rescan worker is running") CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 8d3bd799ac7d..52701c1be109 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3166,9 +3166,6 @@ out: btrfs_free_path(path); mutex_lock(&fs_info->qgroup_rescan_lock); - if (!btrfs_fs_closing(fs_info)) - fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; - if (err > 0 && fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; @@ -3184,16 +3181,30 @@ out: trans = btrfs_start_transaction(fs_info->quota_root, 1); if (IS_ERR(trans)) { err = PTR_ERR(trans); + trans = NULL; btrfs_err(fs_info, "fail to start transaction for status update: %d", err); - goto done; } - ret = update_qgroup_status_item(trans); - if (ret < 0) { - err = ret; - btrfs_err(fs_info, "fail to update qgroup status: %d", err); + + mutex_lock(&fs_info->qgroup_rescan_lock); + if (!btrfs_fs_closing(fs_info)) + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + if (trans) { + ret = update_qgroup_status_item(trans); + if (ret < 0) { + err = ret; + btrfs_err(fs_info, "fail to update qgroup status: %d", + err); + } } + fs_info->qgroup_rescan_running = false; + complete_all(&fs_info->qgroup_rescan_completion); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + if (!trans) + return; + btrfs_end_transaction(trans); if (btrfs_fs_closing(fs_info)) { @@ -3204,12 +3215,6 @@ out: } else { btrfs_err(fs_info, "qgroup scan failed with %d", err); } - -done: - mutex_lock(&fs_info->qgroup_rescan_lock); - fs_info->qgroup_rescan_running = false; - mutex_unlock(&fs_info->qgroup_rescan_lock); - complete_all(&fs_info->qgroup_rescan_completion); } /* From 9d4d0d06bbf9f7e576b0ebbb2f77672d0fc7f503 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sun, 22 Sep 2019 15:36:03 +0200 Subject: [PATCH 188/334] mt76: mt7615: fix mt7615 firmware path definitions mt7615 patch/n9/cr4 firmwares are available in mediatek folder in linux-firmware repository. Because of this mt7615 won't work on regular distributions like Ubuntu. Fix path definitions. 
Moreover remove useless firmware name pointers and use definitions directly Fixes: 04b8e65922f6 ("mt76: add mac80211 driver for MT7615 PCIe-based chipsets") Cc: stable@vger.kernel.org Signed-off-by: Lorenzo Bianconi Signed-off-by: Kalle Valo --- drivers/net/wireless/mediatek/mt76/mt7615/mcu.c | 11 ++++------- drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h | 6 +++--- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c index 275d5eaed3b7..842cd81704db 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c @@ -333,7 +333,6 @@ static int mt7615_driver_own(struct mt7615_dev *dev) static int mt7615_load_patch(struct mt7615_dev *dev) { - const char *firmware = MT7615_ROM_PATCH; const struct mt7615_patch_hdr *hdr; const struct firmware *fw = NULL; int len, ret, sem; @@ -349,7 +348,7 @@ static int mt7615_load_patch(struct mt7615_dev *dev) return -EAGAIN; } - ret = request_firmware(&fw, firmware, dev->mt76.dev); + ret = request_firmware(&fw, MT7615_ROM_PATCH, dev->mt76.dev); if (ret) goto out; @@ -447,13 +446,11 @@ mt7615_mcu_send_ram_firmware(struct mt7615_dev *dev, static int mt7615_load_ram(struct mt7615_dev *dev) { - const struct firmware *fw; const struct mt7615_fw_trailer *hdr; - const char *n9_firmware = MT7615_FIRMWARE_N9; - const char *cr4_firmware = MT7615_FIRMWARE_CR4; + const struct firmware *fw; int ret; - ret = request_firmware(&fw, n9_firmware, dev->mt76.dev); + ret = request_firmware(&fw, MT7615_FIRMWARE_N9, dev->mt76.dev); if (ret) return ret; @@ -482,7 +479,7 @@ static int mt7615_load_ram(struct mt7615_dev *dev) release_firmware(fw); - ret = request_firmware(&fw, cr4_firmware, dev->mt76.dev); + ret = request_firmware(&fw, MT7615_FIRMWARE_CR4, dev->mt76.dev); if (ret) return ret; diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h b/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h index cef3fd43cb00..7963e302d705 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h +++ b/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h @@ -26,9 +26,9 @@ #define MT7615_RX_RING_SIZE 1024 #define MT7615_RX_MCU_RING_SIZE 512 -#define MT7615_FIRMWARE_CR4 "mt7615_cr4.bin" -#define MT7615_FIRMWARE_N9 "mt7615_n9.bin" -#define MT7615_ROM_PATCH "mt7615_rom_patch.bin" +#define MT7615_FIRMWARE_CR4 "mediatek/mt7615_cr4.bin" +#define MT7615_FIRMWARE_N9 "mediatek/mt7615_n9.bin" +#define MT7615_ROM_PATCH "mediatek/mt7615_rom_patch.bin" #define MT7615_EEPROM_SIZE 1024 #define MT7615_TOKEN_SIZE 4096 From fddbfeece9c7882cc47754c7da460fe427e3e85b Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Tue, 24 Sep 2019 13:30:57 +0300 Subject: [PATCH 189/334] iwlwifi: fw: don't send GEO_TX_POWER_LIMIT command to FW version 36 The intention was to have the GEO_TX_POWER_LIMIT command in FW version 36 as well, but not all 8000 family got this feature enabled. The 8000 family is the only one using version 36, so skip this version entirely. If we try to send this command to the firmwares that do not support it, we get a BAD_COMMAND response from the firmware. This fixes https://bugzilla.kernel.org/show_bug.cgi?id=204151. 
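The resulting version gate, distilled into a standalone predicate (a sketch of the logic in the hunk that follows, not the driver function itself):

#include <stdbool.h>
#include <stdio.h>

/* major firmware version -> is GEO_TX_POWER_LIMIT safe to send? */
static bool geo_tx_power_limit_supported(unsigned int major)
{
	/* 38 and later always have it; 29 and 17 received backports;
	 * 36 is used only by the 8000 family, where the feature was not
	 * enabled everywhere, so it is skipped entirely */
	return major >= 38 || major == 29 || major == 17;
}

int main(void)
{
	unsigned int v[] = { 17, 29, 36, 38, 41 };

	for (int i = 0; i < 5; i++)
		printf("major %u: %s\n", v[i],
		       geo_tx_power_limit_supported(v[i]) ? "send" : "skip");
	return 0;
}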
Cc: stable@vger.kernel.org # 4.19+ Signed-off-by: Luca Coelho Signed-off-by: Kalle Valo --- drivers/net/wireless/intel/iwlwifi/mvm/fw.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c index 014eca6596e2..32a5e4e5461f 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c @@ -889,11 +889,13 @@ static bool iwl_mvm_sar_geo_support(struct iwl_mvm *mvm) * firmware versions. Unfortunately, we don't have a TLV API * flag to rely on, so rely on the major version which is in * the first byte of ucode_ver. This was implemented - * initially on version 38 and then backported to 36, 29 and - * 17. + * initially on version 38 and then backported to 29 and 17. + * The intention was to have it in 36 as well, but not all + * 8000 family got this feature enabled. The 8000 family is + * the only one using version 36, so skip this version + * entirely. */ return IWL_UCODE_SERIAL(mvm->fw->ucode_ver) >= 38 || - IWL_UCODE_SERIAL(mvm->fw->ucode_ver) == 36 || IWL_UCODE_SERIAL(mvm->fw->ucode_ver) == 29 || IWL_UCODE_SERIAL(mvm->fw->ucode_ver) == 17; } From 02a07046834e64970f3bcd87a422ac2b0adb80de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 20 Sep 2019 16:08:21 +0200 Subject: [PATCH 190/334] arcnet: provide a buffer big enough to actually receive packets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit struct archdr is only big enough to hold the header of various types of arcnet packets. So to provide enough space to hold the data read from hardware provide a buffer large enough to hold a packet with maximal size. The problem was noticed by the stack protector which makes the kernel oops. Signed-off-by: Uwe Kleine-König Acked-by: Michael Grzeschik Signed-off-by: David S.
Miller --- drivers/net/arcnet/arcnet.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/drivers/net/arcnet/arcnet.c b/drivers/net/arcnet/arcnet.c index 8459115d9d4e..553776cc1d29 100644 --- a/drivers/net/arcnet/arcnet.c +++ b/drivers/net/arcnet/arcnet.c @@ -1063,31 +1063,34 @@ EXPORT_SYMBOL(arcnet_interrupt); static void arcnet_rx(struct net_device *dev, int bufnum) { struct arcnet_local *lp = netdev_priv(dev); - struct archdr pkt; + union { + struct archdr pkt; + char buf[512]; + } rxdata; struct arc_rfc1201 *soft; int length, ofs; - soft = &pkt.soft.rfc1201; + soft = &rxdata.pkt.soft.rfc1201; - lp->hw.copy_from_card(dev, bufnum, 0, &pkt, ARC_HDR_SIZE); - if (pkt.hard.offset[0]) { - ofs = pkt.hard.offset[0]; + lp->hw.copy_from_card(dev, bufnum, 0, &rxdata.pkt, ARC_HDR_SIZE); + if (rxdata.pkt.hard.offset[0]) { + ofs = rxdata.pkt.hard.offset[0]; length = 256 - ofs; } else { - ofs = pkt.hard.offset[1]; + ofs = rxdata.pkt.hard.offset[1]; length = 512 - ofs; } /* get the full header, if possible */ - if (sizeof(pkt.soft) <= length) { - lp->hw.copy_from_card(dev, bufnum, ofs, soft, sizeof(pkt.soft)); + if (sizeof(rxdata.pkt.soft) <= length) { + lp->hw.copy_from_card(dev, bufnum, ofs, soft, sizeof(rxdata.pkt.soft)); } else { - memset(&pkt.soft, 0, sizeof(pkt.soft)); + memset(&rxdata.pkt.soft, 0, sizeof(rxdata.pkt.soft)); lp->hw.copy_from_card(dev, bufnum, ofs, soft, length); } arc_printk(D_DURING, dev, "Buffer #%d: received packet from %02Xh to %02Xh (%d+4 bytes)\n", - bufnum, pkt.hard.source, pkt.hard.dest, length); + bufnum, rxdata.pkt.hard.source, rxdata.pkt.hard.dest, length); dev->stats.rx_packets++; dev->stats.rx_bytes += length + ARC_HDR_SIZE; @@ -1096,13 +1099,13 @@ static void arcnet_rx(struct net_device *dev, int bufnum) if (arc_proto_map[soft->proto]->is_ip) { if (BUGLVL(D_PROTO)) { struct ArcProto - *oldp = arc_proto_map[lp->default_proto[pkt.hard.source]], + *oldp = arc_proto_map[lp->default_proto[rxdata.pkt.hard.source]], *newp = arc_proto_map[soft->proto]; if (oldp != newp) { arc_printk(D_PROTO, dev, "got protocol %02Xh; encap for host %02Xh is now '%c' (was '%c')\n", - soft->proto, pkt.hard.source, + soft->proto, rxdata.pkt.hard.source, newp->suffix, oldp->suffix); } } @@ -1111,10 +1114,10 @@ static void arcnet_rx(struct net_device *dev, int bufnum) lp->default_proto[0] = soft->proto; /* in striking contrast, the following isn't a hack. */ - lp->default_proto[pkt.hard.source] = soft->proto; + lp->default_proto[rxdata.pkt.hard.source] = soft->proto; } /* call the protocol-specific receiver. */ - arc_proto_map[soft->proto]->rx(dev, bufnum, &pkt, length); + arc_proto_map[soft->proto]->rx(dev, bufnum, &rxdata.pkt, length); } static void null_rx(struct net_device *dev, int bufnum, From 5aafeb74b5bb65b34cc87c7623f9fa163a34fa3b Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 20 Sep 2019 18:18:26 +0200 Subject: [PATCH 191/334] skge: fix checksum byte order Running old skge driver on PowerPC causes checksum errors because hardware reported 1's complement checksum is in little-endian byte order. Reported-by: Benoit Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- drivers/net/ethernet/marvell/skge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/skge.c b/drivers/net/ethernet/marvell/skge.c index 0a2ec387a482..095f6c71b4fa 100644 --- a/drivers/net/ethernet/marvell/skge.c +++ b/drivers/net/ethernet/marvell/skge.c @@ -3108,7 +3108,7 @@ static struct sk_buff *skge_rx_get(struct net_device *dev, skb_put(skb, len); if (dev->features & NETIF_F_RXCSUM) { - skb->csum = csum; + skb->csum = le16_to_cpu(csum); skb->ip_summed = CHECKSUM_COMPLETE; } From a6f197f889ce0a5fd583674d9fa39259f5c8f72f Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 24 Sep 2019 09:54:40 +0530 Subject: [PATCH 192/334] powerpc/book3s64: Export has_transparent_hugepage() related functions. In later patch, we want to use hash_transparent_hugepage() in a kernel module. Export two related functions. Acked-by: Michael Ellerman Signed-off-by: Aneesh Kumar K.V Link: https://lore.kernel.org/r/20190924042440.27946-1-aneesh.kumar@linux.ibm.com Signed-off-by: Dan Williams --- arch/powerpc/include/asm/book3s/64/radix.h | 8 +++++++- arch/powerpc/mm/book3s64/hash_pgtable.c | 2 ++ arch/powerpc/mm/book3s64/radix_pgtable.c | 7 ------- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index e04a839cb5b9..65a6966f1de4 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -254,7 +254,13 @@ extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, extern pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); extern pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp); -extern int radix__has_transparent_hugepage(void); +static inline int radix__has_transparent_hugepage(void) +{ + /* For radix 2M at PMD level means thp */ + if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT) + return 1; + return 0; +} #endif extern int __meminit radix__vmemmap_create_mapping(unsigned long start, diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index d1f390ac9cdb..64733b9cb20a 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -406,6 +406,8 @@ int hash__has_transparent_hugepage(void) return 1; } +EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage); + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_STRICT_KERNEL_RWX diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index b4ca9e95e678..dc7a38f0a45b 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -1057,13 +1057,6 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, return old_pmd; } -int radix__has_transparent_hugepage(void) -{ - /* For radix 2M at PMD level means thp */ - if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT) - return 1; - return 0; -} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, From f537669978a7abae29c1d3b489f300e4d8f47005 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 5 Sep 2019 21:16:03 +0530 Subject: [PATCH 193/334] libnvdimm/dax: Pick the right alignment default when creating dax devices Allow arch to provide the supported alignments and use hugepage alignment only if we support hugepage. 
Right now we depend on compile time configs whereas this patch switch this to runtime discovery. Architectures like ppc64 can have THP enabled in code, but then can have hugepage size disabled by the hypervisor. This allows us to create dax devices with PAGE_SIZE alignment in this case. Existing dax namespace with alignment larger than PAGE_SIZE will fail to initialize in this specific case. We still allow fsdax namespace initialization. With respect to identifying whether to enable hugepage fault for a dax device, if THP is enabled during compile, we default to taking hugepage fault and in dax fault handler if we find the fault size > alignment we retry with PAGE_SIZE fault size. This also addresses the below failure scenario on ppc64 ndctl create-namespace --mode=devdax | grep align "align":16777216, "align":16777216 cat /sys/devices/ndbus0/region0/dax0.0/supported_alignments 65536 16777216 daxio.static-debug -z -o /dev/dax0.0 Bus error (core dumped) $ dmesg | tail lpar: Failed hash pte insert with error -4 hash-mmu: mm: Hashing failure ! EA=0x7fff17000000 access=0x8000000000000006 current=daxio hash-mmu: trap=0x300 vsid=0x22cb7a3 ssize=1 base psize=2 psize 10 pte=0xc000000501002b86 daxio[3860]: bus error (7) at 7fff17000000 nip 7fff973c007c lr 7fff973bff34 code 2 in libpmem.so.1.0.0[7fff973b0000+20000] daxio[3860]: code: 792945e4 7d494b78 e95f0098 7d494b78 f93f00a0 4800012c e93f0088 f93f0120 daxio[3860]: code: e93f00a0 f93f0128 e93f0120 e95f0128 e93f0088 39290008 f93f0110 The failure was due to guest kernel using wrong page size. The namespaces created with 16M alignment will appear as below on a config with 16M page size disabled. $ ndctl list -Ni [ { "dev":"namespace0.1", "mode":"fsdax", "map":"dev", "size":5351931904, "uuid":"fc6e9667-461a-4718-82b4-69b24570bddb", "align":16777216, "blockdev":"pmem0.1", "supported_alignments":[ 65536 ] }, { "dev":"namespace0.0", "mode":"fsdax", <==== devdax 16M alignment marked disabled. "map":"mem", "size":5368709120, "uuid":"a4bdf81a-f2ee-4bc6-91db-7b87eddd0484", "state":"disabled" } ] Cc: linux-mm@kvack.org Cc: "Kirill A. 
Shutemov" Signed-off-by: Aneesh Kumar K.V Link: https://lore.kernel.org/r/20190905154603.10349-8-aneesh.kumar@linux.ibm.com Signed-off-by: Dan Williams --- drivers/nvdimm/nd.h | 6 +--- drivers/nvdimm/pfn_devs.c | 75 ++++++++++++++++++++++++++++----------- include/linux/huge_mm.h | 7 +++- 3 files changed, 61 insertions(+), 27 deletions(-) diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index e89af4b2d8e9..ee5c04070ef9 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -289,11 +289,7 @@ static inline struct device *nd_btt_create(struct nd_region *nd_region) struct nd_pfn *to_nd_pfn(struct device *dev); #if IS_ENABLED(CONFIG_NVDIMM_PFN) -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define PFN_DEFAULT_ALIGNMENT HPAGE_PMD_SIZE -#else -#define PFN_DEFAULT_ALIGNMENT PAGE_SIZE -#endif +#define MAX_NVDIMM_ALIGN 4 int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns); bool is_nd_pfn(struct device *dev); diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index ce9ef18282dd..934cdcaaae97 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -103,39 +103,42 @@ static ssize_t align_show(struct device *dev, return sprintf(buf, "%ld\n", nd_pfn->align); } -static const unsigned long *nd_pfn_supported_alignments(void) +static unsigned long *nd_pfn_supported_alignments(unsigned long *alignments) { - /* - * This needs to be a non-static variable because the *_SIZE - * macros aren't always constants. - */ - const unsigned long supported_alignments[] = { - PAGE_SIZE, -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - HPAGE_PMD_SIZE, -#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD - HPAGE_PUD_SIZE, -#endif -#endif - 0, - }; - static unsigned long data[ARRAY_SIZE(supported_alignments)]; - memcpy(data, supported_alignments, sizeof(data)); + alignments[0] = PAGE_SIZE; - return data; + if (has_transparent_hugepage()) { + alignments[1] = HPAGE_PMD_SIZE; + if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) + alignments[2] = HPAGE_PUD_SIZE; + } + + return alignments; +} + +/* + * Use pmd mapping if supported as default alignment + */ +static unsigned long nd_pfn_default_alignment(void) +{ + + if (has_transparent_hugepage()) + return HPAGE_PMD_SIZE; + return PAGE_SIZE; } static ssize_t align_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + unsigned long aligns[MAX_NVDIMM_ALIGN] = { [0] = 0, }; ssize_t rc; nd_device_lock(dev); nvdimm_bus_lock(dev); rc = nd_size_select_store(dev, buf, &nd_pfn->align, - nd_pfn_supported_alignments()); + nd_pfn_supported_alignments(aligns)); dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, buf[len - 1] == '\n' ? 
"" : "\n"); nvdimm_bus_unlock(dev); @@ -259,7 +262,10 @@ static DEVICE_ATTR_RO(size); static ssize_t supported_alignments_show(struct device *dev, struct device_attribute *attr, char *buf) { - return nd_size_select_show(0, nd_pfn_supported_alignments(), buf); + unsigned long aligns[MAX_NVDIMM_ALIGN] = { [0] = 0, }; + + return nd_size_select_show(0, + nd_pfn_supported_alignments(aligns), buf); } static DEVICE_ATTR_RO(supported_alignments); @@ -302,7 +308,7 @@ struct device *nd_pfn_devinit(struct nd_pfn *nd_pfn, return NULL; nd_pfn->mode = PFN_MODE_NONE; - nd_pfn->align = PFN_DEFAULT_ALIGNMENT; + nd_pfn->align = nd_pfn_default_alignment(); dev = &nd_pfn->dev; device_initialize(&nd_pfn->dev); if (ndns && !__nd_attach_ndns(&nd_pfn->dev, ndns, &nd_pfn->ndns)) { @@ -412,6 +418,21 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn) return 0; } +static bool nd_supported_alignment(unsigned long align) +{ + int i; + unsigned long supported[MAX_NVDIMM_ALIGN] = { [0] = 0, }; + + if (align == 0) + return false; + + nd_pfn_supported_alignments(supported); + for (i = 0; supported[i]; i++) + if (align == supported[i]) + return true; + return false; +} + /** * nd_pfn_validate - read and validate info-block * @nd_pfn: fsdax namespace runtime state / properties @@ -496,6 +517,18 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig) return -EOPNOTSUPP; } + /* + * Check whether the we support the alignment. For Dax if the + * superblock alignment is not matching, we won't initialize + * the device. + */ + if (!nd_supported_alignment(align) && + !memcmp(pfn_sb->signature, DAX_SIG, PFN_SIG_LEN)) { + dev_err(&nd_pfn->dev, "init failed, alignment mismatch: " + "%ld:%ld\n", nd_pfn->align, align); + return -EOPNOTSUPP; + } + if (!nd_pfn->uuid) { /* * When probing a namepace via nd_pfn_probe() the uuid diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 45ede62aa85b..376a81ff2c96 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -108,7 +108,12 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG)) return true; - + /* + * For dax vmas, try to always use hugepage mappings. If the kernel does + * not support hugepages, fsdax mappings will fallback to PAGE_SIZE + * mappings, and device-dax namespaces, that try to guarantee a given + * mapping size, will fail to enable + */ if (vma_is_dax(vma)) return true; From 86aa66687442ef45909ff9814b82b4d2bb892294 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 9 Aug 2019 13:17:26 +0530 Subject: [PATCH 194/334] =?UTF-8?q?libnvdimm:=20Fix=20endian=20conversion?= =?UTF-8?q?=20issues=C2=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nd_label->dpa issue was observed when trying to enable the namespace created with little-endian kernel on a big-endian kernel. That made me run `sparse` on the rest of the code and other changes are the result of that. 
Fixes: d9b83c756953 ("libnvdimm, btt: rework error clearing") Fixes: 9dedc73a4658 ("libnvdimm/btt: Fix LBA masking during 'free list' population") Reviewed-by: Vishal Verma Signed-off-by: Aneesh Kumar K.V Link: https://lore.kernel.org/r/20190809074726.27815-1-aneesh.kumar@linux.ibm.com Signed-off-by: Dan Williams --- drivers/nvdimm/btt.c | 8 ++++---- drivers/nvdimm/namespace_devs.c | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index a8d56887ec88..3e9f45aec8d1 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -392,9 +392,9 @@ static int btt_flog_write(struct arena_info *arena, u32 lane, u32 sub, arena->freelist[lane].sub = 1 - arena->freelist[lane].sub; if (++(arena->freelist[lane].seq) == 4) arena->freelist[lane].seq = 1; - if (ent_e_flag(ent->old_map)) + if (ent_e_flag(le32_to_cpu(ent->old_map))) arena->freelist[lane].has_err = 1; - arena->freelist[lane].block = le32_to_cpu(ent_lba(ent->old_map)); + arena->freelist[lane].block = ent_lba(le32_to_cpu(ent->old_map)); return ret; } @@ -560,8 +560,8 @@ static int btt_freelist_init(struct arena_info *arena) * FIXME: if error clearing fails during init, we want to make * the BTT read-only */ - if (ent_e_flag(log_new.old_map) && - !ent_normal(log_new.old_map)) { + if (ent_e_flag(le32_to_cpu(log_new.old_map)) && + !ent_normal(le32_to_cpu(log_new.old_map))) { arena->freelist[i].has_err = 1; ret = arena_clear_freelist_error(arena, i); if (ret) diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 43401325c874..cca0a3ba1d2c 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -1987,7 +1987,7 @@ static struct device *create_namespace_pmem(struct nd_region *nd_region, nd_mapping = &nd_region->mapping[i]; label_ent = list_first_entry_or_null(&nd_mapping->labels, typeof(*label_ent), list); - label0 = label_ent ? label_ent->label : 0; + label0 = label_ent ? label_ent->label : NULL; if (!label0) { WARN_ON(1); @@ -2322,8 +2322,9 @@ static struct device **scan_labels(struct nd_region *nd_region) continue; /* skip labels that describe extents outside of the region */ - if (nd_label->dpa < nd_mapping->start || nd_label->dpa > map_end) - continue; + if (__le64_to_cpu(nd_label->dpa) < nd_mapping->start || + __le64_to_cpu(nd_label->dpa) > map_end) + continue; i = add_namespace_resource(nd_region, nd_label, devs, count); if (i < 0) From cf387d9644d8c78721cf9b77af9f67bb5b04da16 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 10 Sep 2019 11:58:25 +0530 Subject: [PATCH 195/334] libnvdimm/altmap: Track namespace boundaries in altmap With PFN_MODE_PMEM namespace, the memmap area is allocated from the device area. Some architectures map the memmap area with large page size. On architectures like ppc64, a 16MB page for memmap mapping can map 262144 pfns. This maps a namespace size of 16G. When populating the memmap region with a 16MB page from the device area, make sure the allocated space is not used to map resources outside this namespace. Such usage of the device area will prevent a namespace destroy. Add the resource end pfn in altmap and use that to check if the memmap area allocation can map pfn outside the namespace. On ppc64, in such cases we fall back to allocation from memory.
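For reference, the geometry behind the 262144/16G figures above, assuming the usual 64-byte struct page: a 16MB mapping holds 16MB / 64B = 262144 memmap entries, and with 64KB pages each entry describes one 64KB pfn, so a single such mapping covers 262144 * 64KB = 16GB of namespace.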
This fixes the kernel crash reported below: [ 132.034989] WARNING: CPU: 13 PID: 13719 at mm/memremap.c:133 devm_memremap_pages_release+0x2d8/0x2e0 [ 133.464754] BUG: Unable to handle kernel data access at 0xc00c00010b204000 [ 133.464760] Faulting instruction address: 0xc00000000007580c [ 133.464766] Oops: Kernel access of bad area, sig: 11 [#1] [ 133.464771] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries ..... [ 133.464901] NIP [c00000000007580c] vmemmap_free+0x2ac/0x3d0 [ 133.464906] LR [c0000000000757f8] vmemmap_free+0x298/0x3d0 [ 133.464910] Call Trace: [ 133.464914] [c000007cbfd0f7b0] [c0000000000757f8] vmemmap_free+0x298/0x3d0 (unreliable) [ 133.464921] [c000007cbfd0f8d0] [c000000000370a44] section_deactivate+0x1a4/0x240 [ 133.464928] [c000007cbfd0f980] [c000000000386270] __remove_pages+0x3a0/0x590 [ 133.464935] [c000007cbfd0fa50] [c000000000074158] arch_remove_memory+0x88/0x160 [ 133.464942] [c000007cbfd0fae0] [c0000000003be8c0] devm_memremap_pages_release+0x150/0x2e0 [ 133.464949] [c000007cbfd0fb70] [c000000000738ea0] devm_action_release+0x30/0x50 [ 133.464955] [c000007cbfd0fb90] [c00000000073a5a4] release_nodes+0x344/0x400 [ 133.464961] [c000007cbfd0fc40] [c00000000073378c] device_release_driver_internal+0x15c/0x250 [ 133.464968] [c000007cbfd0fc80] [c00000000072fd14] unbind_store+0x104/0x110 [ 133.464973] [c000007cbfd0fcd0] [c00000000072ee24] drv_attr_store+0x44/0x70 [ 133.464981] [c000007cbfd0fcf0] [c0000000004a32bc] sysfs_kf_write+0x6c/0xa0 [ 133.464987] [c000007cbfd0fd10] [c0000000004a1dfc] kernfs_fop_write+0x17c/0x250 [ 133.464993] [c000007cbfd0fd60] [c0000000003c348c] __vfs_write+0x3c/0x70 [ 133.464999] [c000007cbfd0fd80] [c0000000003c75d0] vfs_write+0xd0/0x250 djbw: Aneesh notes that this crash can likely be triggered in any kernel that supports 'papr_scm', so flagging that commit for -stable consideration. Fixes: b5beae5e224f ("powerpc/pseries: Add driver for PAPR SCM regions") Cc: Reported-by: Sachin Sant Signed-off-by: Aneesh Kumar K.V Reviewed-by: Pankaj Gupta Tested-by: Santosh Sivaraj Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20190910062826.10041-1-aneesh.kumar@linux.ibm.com Signed-off-by: Dan Williams --- arch/powerpc/mm/init_64.c | 17 ++++++++++++++++- drivers/nvdimm/pfn_devs.c | 2 ++ include/linux/memremap.h | 1 + 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index a44f6281ca3a..4e08246acd79 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -172,6 +172,21 @@ static __meminit void vmemmap_list_populate(unsigned long phys, vmemmap_list = vmem_back; } +static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, + unsigned long page_size) +{ + unsigned long nr_pfn = page_size / sizeof(struct page); + unsigned long start_pfn = page_to_pfn((struct page *)start); + + if ((start_pfn + nr_pfn) > altmap->end_pfn) + return true; + + if (start_pfn < altmap->base_pfn) + return true; + + return false; +} + int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { @@ -194,7 +209,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, * fail due to alignment issues when using 16MB hugepages, so * fall back to system memory if the altmap allocation fail.
*/ - if (altmap) { + if (altmap && !altmap_cross_boundary(altmap, start, page_size)) { p = altmap_alloc_block_buf(page_size, altmap); if (!p) pr_debug("altmap block allocation failed, falling back to system memory"); diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 934cdcaaae97..80c7992bc538 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -672,9 +672,11 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) struct nd_namespace_common *ndns = nd_pfn->ndns; struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); resource_size_t base = nsio->res.start + start_pad; + resource_size_t end = nsio->res.end - end_trunc; struct vmem_altmap __altmap = { .base_pfn = init_altmap_base(base), .reserve = init_altmap_reserve(base), + .end_pfn = PHYS_PFN(end), }; memcpy(res, &nsio->res, sizeof(*res)); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index f8a5b2a19945..c70996fe48c8 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -17,6 +17,7 @@ struct device; */ struct vmem_altmap { const unsigned long base_pfn; + const unsigned long end_pfn; const unsigned long reserve; unsigned long free; unsigned long align; From 59f08896f058a92f03a0041b397a1a227c5e8529 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 17 Sep 2019 21:21:49 -0700 Subject: [PATCH 196/334] libnvdimm/nfit_test: Fix acpi_handle redefinition After commit 62974fc389b3 ("libnvdimm: Enable unit test infrastructure compile checks"), clang warns: In file included from ../drivers/nvdimm/../../tools/testing/nvdimm/test/iomap.c:15: ../drivers/nvdimm/../../tools/testing/nvdimm/test/nfit_test.h:206:15: warning: redefinition of typedef 'acpi_handle' is a C11 feature [-Wtypedef-redefinition] typedef void *acpi_handle; ^ ../include/acpi/actypes.h:424:15: note: previous definition is here typedef void *acpi_handle; /* Actually a ptr to a NS Node */ ^ 1 warning generated. The include chain: iomap.c -> linux/acpi.h -> acpi/acpi.h -> acpi/actypes.h nfit_test.h Avoid this by including linux/acpi.h in nfit_test.h, which allows us to remove both the typedef and the forward declaration of acpi_object. Link: https://github.com/ClangBuiltLinux/linux/issues/660 Signed-off-by: Nathan Chancellor Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20190918042148.77553-1-natechancellor@gmail.com Signed-off-by: Dan Williams --- tools/testing/nvdimm/test/nfit_test.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/testing/nvdimm/test/nfit_test.h b/tools/testing/nvdimm/test/nfit_test.h index 448d686da8b1..0bf5640f1f07 100644 --- a/tools/testing/nvdimm/test/nfit_test.h +++ b/tools/testing/nvdimm/test/nfit_test.h @@ -4,6 +4,7 @@ */ #ifndef __NFIT_TEST_H__ #define __NFIT_TEST_H__ +#include #include #include #include @@ -202,9 +203,6 @@ struct nd_intel_lss { __u32 status; } __packed; -union acpi_object; -typedef void *acpi_handle; - typedef struct nfit_test_resource *(*nfit_test_lookup_fn)(resource_size_t); typedef union acpi_object *(*nfit_test_evaluate_dsm_fn)(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, From c42adf87e4e7ed77f6ffe288dc90f980d07d68df Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 19 Sep 2019 14:03:55 +0530 Subject: [PATCH 197/334] libnvdimm/region: Initialize bad block for volatile namespaces We do check for a bad block during namespace init and that use region bad block list. We need to initialize the bad block for volatile regions for this to work. 
We also observe a lockdep warning as below because the lock is not initialized correctly since we skip bad block init for volatile regions. INFO: trying to register non-static key. the code is fine but needs lockdep annotation. turning off the locking correctness validator. CPU: 2 PID: 1 Comm: swapper/0 Not tainted 5.3.0-rc1-15699-g3dee241c937e #149 Call Trace: [c0000000f95cb250] [c00000000147dd84] dump_stack+0xe8/0x164 (unreliable) [c0000000f95cb2a0] [c00000000022ccd8] register_lock_class+0x308/0xa60 [c0000000f95cb3a0] [c000000000229cc0] __lock_acquire+0x170/0x1ff0 [c0000000f95cb4c0] [c00000000022c740] lock_acquire+0x220/0x270 [c0000000f95cb580] [c000000000a93230] badblocks_check+0xc0/0x290 [c0000000f95cb5f0] [c000000000d97540] nd_pfn_validate+0x5c0/0x7f0 [c0000000f95cb6d0] [c000000000d98300] nd_dax_probe+0xd0/0x1f0 [c0000000f95cb760] [c000000000d9b66c] nd_pmem_probe+0x10c/0x160 [c0000000f95cb790] [c000000000d7f5ec] nvdimm_bus_probe+0x10c/0x240 [c0000000f95cb820] [c000000000d0f844] really_probe+0x254/0x4e0 [c0000000f95cb8b0] [c000000000d0fdfc] driver_probe_device+0x16c/0x1e0 [c0000000f95cb930] [c000000000d10238] device_driver_attach+0x68/0xa0 [c0000000f95cb970] [c000000000d1040c] __driver_attach+0x19c/0x1c0 [c0000000f95cb9f0] [c000000000d0c4c4] bus_for_each_dev+0x94/0x130 [c0000000f95cba50] [c000000000d0f014] driver_attach+0x34/0x50 [c0000000f95cba70] [c000000000d0e208] bus_add_driver+0x178/0x2f0 [c0000000f95cbb00] [c000000000d117c8] driver_register+0x108/0x170 [c0000000f95cbb70] [c000000000d7edb0] __nd_driver_register+0xe0/0x100 [c0000000f95cbbd0] [c000000001a6baa4] nd_pmem_driver_init+0x34/0x48 [c0000000f95cbbf0] [c0000000000106f4] do_one_initcall+0x1d4/0x4b0 [c0000000f95cbcd0] [c0000000019f499c] kernel_init_freeable+0x544/0x65c [c0000000f95cbdb0] [c000000000010d6c] kernel_init+0x2c/0x180 [c0000000f95cbe20] [c00000000000b954] ret_from_kernel_thread+0x5c/0x68 Signed-off-by: Aneesh Kumar K.V Link: https://lore.kernel.org/r/20190919083355.26340-1-aneesh.kumar@linux.ibm.com Signed-off-by: Dan Williams --- drivers/nvdimm/bus.c | 2 +- drivers/nvdimm/region.c | 4 ++-- drivers/nvdimm/region_devs.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 75a58a6e9615..d47412dcdf38 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -180,7 +180,7 @@ static int nvdimm_clear_badblocks_region(struct device *dev, void *data) sector_t sector; /* make sure device is a region */ - if (!is_nd_pmem(dev)) + if (!is_memory(dev)) return 0; nd_region = to_nd_region(dev); diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index 37bf8719a2a4..0f6978e72e7c 100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -34,7 +34,7 @@ static int nd_region_probe(struct device *dev) if (rc) return rc; - if (is_nd_pmem(&nd_region->dev)) { + if (is_memory(&nd_region->dev)) { struct resource ndr_res; if (devm_init_badblocks(dev, &nd_region->bb)) @@ -123,7 +123,7 @@ static void nd_region_notify(struct device *dev, enum nvdimm_event event) struct nd_region *nd_region = to_nd_region(dev); struct resource res; - if (is_nd_pmem(&nd_region->dev)) { + if (is_memory(&nd_region->dev)) { res.start = nd_region->ndr_start; res.end = nd_region->ndr_start + nd_region->ndr_size - 1; diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 3fd6b59abd33..ab91890f2486 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -632,11 +632,11 @@ static umode_t region_visible(struct kobject *kobj, 
struct attribute *a, int n) if (!is_memory(dev) && a == &dev_attr_dax_seed.attr) return 0; - if (!is_nd_pmem(dev) && a == &dev_attr_badblocks.attr) + if (!is_memory(dev) && a == &dev_attr_badblocks.attr) return 0; if (a == &dev_attr_resource.attr) { - if (is_nd_pmem(dev)) + if (is_memory(dev)) return 0400; else return 0; From 674f31a352da5e9f621f757b9a89262f486533a0 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 24 Sep 2019 10:34:49 -0700 Subject: [PATCH 198/334] libnvdimm: prevent nvdimm from requesting key when security is disabled Current implementation attempts to request keys from the keyring even when security is not enabled. Change behavior so when security is disabled it will skip key request. Error messages seen when no keys are installed and libnvdimm is loaded: request-key[4598]: Cannot find command to construct key 661489677 request-key[4606]: Cannot find command to construct key 34713726 Cc: stable@vger.kernel.org Fixes: 4c6926a23b76 ("acpi/nfit, libnvdimm: Add unlock of nvdimm support for Intel DIMMs") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/156934642272.30222.5230162488753445916.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Dan Williams --- drivers/nvdimm/security.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nvdimm/security.c b/drivers/nvdimm/security.c index 9e45b207ff01..89b85970912d 100644 --- a/drivers/nvdimm/security.c +++ b/drivers/nvdimm/security.c @@ -177,6 +177,10 @@ static int __nvdimm_security_unlock(struct nvdimm *nvdimm) || !nvdimm->sec.flags) return -EIO; + /* No need to go further if security is disabled */ + if (test_bit(NVDIMM_SECURITY_DISABLED, &nvdimm->sec.flags)) + return 0; + if (test_bit(NDD_SECURITY_OVERWRITE, &nvdimm->flags)) { dev_dbg(dev, "Security operation in progress.\n"); return -EBUSY; From 4c806b897d6075bfa5067e524fb058c57ab64e7b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 24 Sep 2019 17:13:27 +0530 Subject: [PATCH 199/334] libnvdimm/region: Enable MAP_SYNC for volatile regions Some environments want to use a host tmpfs/ramdisk to back guest pmem. While the data is not persisted relative to the host it *is* persisted relative to guest crashes / reboots. The guest is free to use dax and MAP_SYNC to keep filesystem metadata consistent with dax accesses without requiring guest fsync(). The guest can also observe that the region is volatile and skip cache flushing as global visibility is enough to "persist" data relative to the host staying alive over guest reset events. 
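What this enables can be sketched from a guest's userspace: with the region now reported as synchronous, mapping a file on a dax filesystem with MAP_SYNC succeeds instead of failing with EOPNOTSUPP. The mount point below is hypothetical, and older C libraries may lack the flag definitions, hence the fallbacks:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MAP_SHARED_VALIDATE
#define MAP_SHARED_VALIDATE 0x03
#endif
#ifndef MAP_SYNC
#define MAP_SYNC 0x80000
#endif

int main(void)
{
	int fd = open("/mnt/pmem/data", O_RDWR);	/* hypothetical dax mount */
	char *p;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		 MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap MAP_SYNC");	/* EOPNOTSUPP on non-sync regions */
		close(fd);
		return 1;
	}
	p[0] = 1;	/* consistent relative to the guest without fsync() */
	munmap(p, 4096);
	close(fd);
	return 0;
}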
Signed-off-by: Aneesh Kumar K.V Reviewed-by: Pankaj Gupta Link: https://lore.kernel.org/r/20190924114327.14700-1-aneesh.kumar@linux.ibm.com [djbw: reword the changelog] Signed-off-by: Dan Williams --- drivers/nvdimm/region_devs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index ab91890f2486..ef423ba1a711 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -1168,6 +1168,9 @@ EXPORT_SYMBOL_GPL(nvdimm_has_cache); bool is_nvdimm_sync(struct nd_region *nd_region) { + if (is_nd_volatile(&nd_region->dev)) + return true; + return is_nd_pmem(&nd_region->dev) && !test_bit(ND_REGION_ASYNC, &nd_region->flags); } From 93cad5f789951eaa27c3392b15294b4e51253944 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 24 Sep 2019 09:22:54 +0530 Subject: [PATCH 200/334] selftests/powerpc: Add test case for tlbie vs mtpidr ordering issue Signed-off-by: Aneesh Kumar K.V [mpe: Some minor fixes to make it build] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190924035254.24612-4-aneesh.kumar@linux.ibm.com --- tools/testing/selftests/powerpc/mm/Makefile | 2 + .../testing/selftests/powerpc/mm/tlbie_test.c | 734 ++++++++++++++++++ 2 files changed, 736 insertions(+) create mode 100644 tools/testing/selftests/powerpc/mm/tlbie_test.c diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile index f1fbc15800c4..ed1565809d2b 100644 --- a/tools/testing/selftests/powerpc/mm/Makefile +++ b/tools/testing/selftests/powerpc/mm/Makefile @@ -4,6 +4,7 @@ noarg: TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors wild_bctr \ large_vm_fork_separation +TEST_GEN_PROGS_EXTENDED := tlbie_test TEST_GEN_FILES := tempfile top_srcdir = ../../../../.. @@ -19,3 +20,4 @@ $(OUTPUT)/large_vm_fork_separation: CFLAGS += -m64 $(OUTPUT)/tempfile: dd if=/dev/zero of=$@ bs=64k count=1 +$(OUTPUT)/tlbie_test: LDLIBS += -lpthread diff --git a/tools/testing/selftests/powerpc/mm/tlbie_test.c b/tools/testing/selftests/powerpc/mm/tlbie_test.c new file mode 100644 index 000000000000..9868a5ddd847 --- /dev/null +++ b/tools/testing/selftests/powerpc/mm/tlbie_test.c @@ -0,0 +1,734 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2019, Nick Piggin, Gautham R. Shenoy, Aneesh Kumar K.V, IBM Corp. + */ + +/* + * + * Test tlbie/mtpidr race. We have 4 threads doing flush/load/compare/store + * sequence in a loop. The same threads also rung a context switch task + * that does sched_yield() in loop. + * + * The snapshot thread mark the mmap area PROT_READ in between, make a copy + * and copy it back to the original area. This helps us to detect if any + * store continued to happen after we marked the memory PROT_READ. 
+ */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline void dcbf(volatile unsigned int *addr) +{ + __asm__ __volatile__ ("dcbf %y0; sync" : : "Z"(*(unsigned char *)addr) : "memory"); +} + +static void err_msg(char *msg) +{ + + time_t now; + time(&now); + printf("=================================\n"); + printf(" Error: %s\n", msg); + printf(" %s", ctime(&now)); + printf("=================================\n"); + exit(1); +} + +static char *map1; +static char *map2; +static pid_t rim_process_pid; + +/* + * A "rim-sequence" is defined to be the sequence of the following + * operations performed on a memory word: + * 1) FLUSH the contents of that word. + * 2) LOAD the contents of that word. + * 3) COMPARE the contents of that word with the content that was + * previously stored at that word + * 4) STORE new content into that word. + * + * The threads in this test that perform the rim-sequence are termed + * as rim_threads. + */ + +/* + * A "corruption" is defined to be the failed COMPARE operation in a + * rim-sequence. + * + * A rim_thread that detects a corruption informs about it to all the + * other rim_threads, and the mem_snapshot thread. + */ +static volatile unsigned int corruption_found; + +/* + * This defines the maximum number of rim_threads in this test. + * + * The THREAD_ID_BITS denote the number of bits required + * to represent the thread_ids [0..MAX_THREADS - 1]. + * We are being a bit paranoid here and set it to 8 bits, + * though 6 bits suffice. + * + */ +#define MAX_THREADS 64 +#define THREAD_ID_BITS 8 +#define THREAD_ID_MASK ((1 << THREAD_ID_BITS) - 1) +static unsigned int rim_thread_ids[MAX_THREADS]; +static pthread_t rim_threads[MAX_THREADS]; + + +/* + * Each rim_thread works on an exclusive "chunk" of size + * RIM_CHUNK_SIZE. + * + * The ith rim_thread works on the ith chunk. + * + * The ith chunk begins at + * map1 + (i * RIM_CHUNK_SIZE) + */ +#define RIM_CHUNK_SIZE 1024 +#define BITS_PER_BYTE 8 +#define WORD_SIZE (sizeof(unsigned int)) +#define WORD_BITS (WORD_SIZE * BITS_PER_BYTE) +#define WORDS_PER_CHUNK (RIM_CHUNK_SIZE/WORD_SIZE) + +static inline char *compute_chunk_start_addr(unsigned int thread_id) +{ + char *chunk_start; + + chunk_start = (char *)((unsigned long)map1 + + (thread_id * RIM_CHUNK_SIZE)); + + return chunk_start; +} + +/* + * The "word-offset" of a word-aligned address inside a chunk, is + * defined to be the number of words that precede the address in that + * chunk. + * + * WORD_OFFSET_BITS denote the number of bits required to represent + * the word-offsets of all the word-aligned addresses of a chunk. + */ +#define WORD_OFFSET_BITS (__builtin_ctz(WORDS_PER_CHUNK)) +#define WORD_OFFSET_MASK ((1 << WORD_OFFSET_BITS) - 1) + +static inline unsigned int compute_word_offset(char *start, unsigned int *addr) +{ + unsigned int delta_bytes, ret; + delta_bytes = (unsigned long)addr - (unsigned long)start; + + ret = delta_bytes/WORD_SIZE; + + return ret; +} + +/* + * A "sweep" is defined to be the sequential execution of the + * rim-sequence by a rim_thread on its chunk one word at a time, + * starting from the first word of its chunk and ending with the last + * word of its chunk. + * + * Each sweep of a rim_thread is uniquely identified by a sweep_id. + * SWEEP_ID_BITS denote the number of bits required to represent + * the sweep_ids of rim_threads. 
+ * + * As to why SWEEP_ID_BITS are computed as a function of THREAD_ID_BITS, + * WORD_OFFSET_BITS, and WORD_BITS, see the "store-pattern" below. + */ +#define SWEEP_ID_BITS (WORD_BITS - (THREAD_ID_BITS + WORD_OFFSET_BITS)) +#define SWEEP_ID_MASK ((1 << SWEEP_ID_BITS) - 1) + +/* + * A "store-pattern" is the word-pattern that is stored into a word + * location in the 4)STORE step of the rim-sequence. + * + * In the store-pattern, we shall encode: + * + * - The thread-id of the rim_thread performing the store + * (The most significant THREAD_ID_BITS) + * + * - The word-offset of the address into which the store is being + * performed (The next WORD_OFFSET_BITS) + * + * - The sweep_id of the current sweep in which the store is + * being performed. (The lower SWEEP_ID_BITS) + * + * Store Pattern: 32 bits + * |------------------|--------------------|---------------------------------| + * | Thread id | Word offset | sweep_id | + * |------------------|--------------------|---------------------------------| + * THREAD_ID_BITS WORD_OFFSET_BITS SWEEP_ID_BITS + * + * In the store pattern, the (Thread-id + Word-offset) uniquely identify the + * address to which the store is being performed i.e, + * address == map1 + + * (Thread-id * RIM_CHUNK_SIZE) + (Word-offset * WORD_SIZE) + * + * And the sweep_id in the store pattern identifies the time when the + * store was performed by the rim_thread. + * + * We shall use this property in the 3)COMPARE step of the + * rim-sequence. + */ +#define SWEEP_ID_SHIFT 0 +#define WORD_OFFSET_SHIFT (SWEEP_ID_BITS) +#define THREAD_ID_SHIFT (WORD_OFFSET_BITS + SWEEP_ID_BITS) + +/* + * Compute the store pattern for a given thread with id @tid, at + * location @addr in the sweep identified by @sweep_id + */ +static inline unsigned int compute_store_pattern(unsigned int tid, + unsigned int *addr, + unsigned int sweep_id) +{ + unsigned int ret = 0; + char *start = compute_chunk_start_addr(tid); + unsigned int word_offset = compute_word_offset(start, addr); + + ret += (tid & THREAD_ID_MASK) << THREAD_ID_SHIFT; + ret += (word_offset & WORD_OFFSET_MASK) << WORD_OFFSET_SHIFT; + ret += (sweep_id & SWEEP_ID_MASK) << SWEEP_ID_SHIFT; + return ret; +} + +/* Extract the thread-id from the given store-pattern */ +static inline unsigned int extract_tid(unsigned int pattern) +{ + unsigned int ret; + + ret = (pattern >> THREAD_ID_SHIFT) & THREAD_ID_MASK; + return ret; +} + +/* Extract the word-offset from the given store-pattern */ +static inline unsigned int extract_word_offset(unsigned int pattern) +{ + unsigned int ret; + + ret = (pattern >> WORD_OFFSET_SHIFT) & WORD_OFFSET_MASK; + + return ret; +} + +/* Extract the sweep-id from the given store-pattern */ +static inline unsigned int extract_sweep_id(unsigned int pattern) + +{ + unsigned int ret; + + ret = (pattern >> SWEEP_ID_SHIFT) & SWEEP_ID_MASK; + + return ret; +} + +/************************************************************ + * * + * Logging the output of the verification * + * * + ************************************************************/ +#define LOGDIR_NAME_SIZE 100 +static char logdir[LOGDIR_NAME_SIZE]; + +static FILE *fp[MAX_THREADS]; +static const char logfilename[] ="Thread-%02d-Chunk"; + +static inline void start_verification_log(unsigned int tid, + unsigned int *addr, + unsigned int cur_sweep_id, + unsigned int prev_sweep_id) +{ + FILE *f; + char logfile[30]; + char path[LOGDIR_NAME_SIZE + 30]; + char separator[2] = "/"; + char *chunk_start = compute_chunk_start_addr(tid); + unsigned int size = RIM_CHUNK_SIZE; + + 
sprintf(logfile, logfilename, tid);
+ strcpy(path, logdir);
+ strcat(path, separator);
+ strcat(path, logfile);
+ f = fopen(path, "w");
+
+ if (!f) {
+ err_msg("Unable to create logfile\n");
+ }
+
+ fp[tid] = f;
+
+ fprintf(f, "----------------------------------------------------------\n");
+ fprintf(f, "PID = %d\n", rim_process_pid);
+ fprintf(f, "Thread id = %02d\n", tid);
+ fprintf(f, "Chunk Start Addr = 0x%016lx\n", (unsigned long)chunk_start);
+ fprintf(f, "Chunk Size = %d\n", size);
+ fprintf(f, "Next Store Addr = 0x%016lx\n", (unsigned long)addr);
+ fprintf(f, "Current sweep-id = 0x%08x\n", cur_sweep_id);
+ fprintf(f, "Previous sweep-id = 0x%08x\n", prev_sweep_id);
+ fprintf(f, "----------------------------------------------------------\n");
+}
+
+static inline void log_anamoly(unsigned int tid, unsigned int *addr,
+ unsigned int expected, unsigned int observed)
+{
+ FILE *f = fp[tid];
+
+ fprintf(f, "Thread %02d: Addr 0x%lx: Expected 0x%x, Observed 0x%x\n",
+ tid, (unsigned long)addr, expected, observed);
+ fprintf(f, "Thread %02d: Expected Thread id = %02d\n", tid, extract_tid(expected));
+ fprintf(f, "Thread %02d: Observed Thread id = %02d\n", tid, extract_tid(observed));
+ fprintf(f, "Thread %02d: Expected Word offset = %03d\n", tid, extract_word_offset(expected));
+ fprintf(f, "Thread %02d: Observed Word offset = %03d\n", tid, extract_word_offset(observed));
+ fprintf(f, "Thread %02d: Expected sweep-id = 0x%x\n", tid, extract_sweep_id(expected));
+ fprintf(f, "Thread %02d: Observed sweep-id = 0x%x\n", tid, extract_sweep_id(observed));
+ fprintf(f, "----------------------------------------------------------\n");
+}
+
+static inline void end_verification_log(unsigned int tid, unsigned nr_anamolies)
+{
+ FILE *f = fp[tid];
+ char logfile[30];
+ char path[LOGDIR_NAME_SIZE + 30];
+ char separator[] = "/";
+
+ fclose(f);
+
+ /* Build the path before it is used below, for remove() as well */
+ sprintf(logfile, logfilename, tid);
+ strcpy(path, logdir);
+ strcat(path, separator);
+ strcat(path, logfile);
+
+ /* A clean chunk: drop the (empty) per-thread logfile */
+ if (nr_anamolies == 0) {
+ remove(path);
+ return;
+ }
+
+ printf("Thread %02d chunk has %d corrupted words. For details check %s\n",
+ tid, nr_anamolies, path);
+}
+
+/*
+ * When a COMPARE step of a rim-sequence fails, the rim_thread informs
+ * everyone else via the shared corruption_found variable. On seeing
+ * this, every thread verifies the content of its chunk as follows.
+ *
+ * Suppose a thread identified with @tid was about to store (but not
+ * yet stored) to @next_store_addr in its current sweep identified by
+ * @cur_sweep_id. Let @prev_sweep_id indicate the previous sweep_id.
+ *
+ * This implies that for all the addresses @addr < @next_store_addr,
+ * Thread @tid has already performed a store as part of its current
+ * sweep. Hence we expect the content of such @addr to be:
+ * |-------------------------------------------------|
+ * | tid | word_offset(addr) | cur_sweep_id |
+ * |-------------------------------------------------|
+ *
+ * Since Thread @tid is yet to perform stores on address
+ * @next_store_addr and above, we expect the content of such an
+ * address @addr to be:
+ * |-------------------------------------------------|
+ * | tid | word_offset(addr) | prev_sweep_id |
+ * |-------------------------------------------------|
+ *
+ * The verifier function @verify_chunk does this verification and logs
+ * any anamolies that it finds.
+ */
+static void verify_chunk(unsigned int tid, unsigned int *next_store_addr,
+ unsigned int cur_sweep_id,
+ unsigned int prev_sweep_id)
+{
+ unsigned int *iter_ptr;
+ unsigned int size = RIM_CHUNK_SIZE;
+ unsigned int expected;
+ unsigned int observed;
+ char *chunk_start = compute_chunk_start_addr(tid);
+
+ int nr_anamolies = 0;
+
+ start_verification_log(tid, next_store_addr,
+ cur_sweep_id, prev_sweep_id);
+
+ for (iter_ptr = (unsigned int *)chunk_start;
+ (unsigned long)iter_ptr < (unsigned long)chunk_start + size;
+ iter_ptr++) {
+ unsigned int expected_sweep_id;
+
+ if (iter_ptr < next_store_addr) {
+ expected_sweep_id = cur_sweep_id;
+ } else {
+ expected_sweep_id = prev_sweep_id;
+ }
+
+ expected = compute_store_pattern(tid, iter_ptr, expected_sweep_id);
+
+ dcbf((volatile unsigned int*)iter_ptr); //Flush before reading
+ observed = *iter_ptr;
+
+ if (observed != expected) {
+ nr_anamolies++;
+ log_anamoly(tid, iter_ptr, expected, observed);
+ }
+ }
+
+ end_verification_log(tid, nr_anamolies);
+}
+
+static void set_pthread_cpu(pthread_t th, int cpu)
+{
+ cpu_set_t run_cpu_mask;
+ struct sched_param param;
+
+ CPU_ZERO(&run_cpu_mask);
+ CPU_SET(cpu, &run_cpu_mask);
+ pthread_setaffinity_np(th, sizeof(cpu_set_t), &run_cpu_mask);
+
+ param.sched_priority = 1;
+ if (0 && sched_setscheduler(0, SCHED_FIFO, &param) == -1) {
+ /* haven't reproduced with this setting, it kills random preemption which may be a factor */
+ fprintf(stderr, "could not set SCHED_FIFO, run as root?\n");
+ }
+}
+
+static void set_mycpu(int cpu)
+{
+ cpu_set_t run_cpu_mask;
+ struct sched_param param;
+
+ CPU_ZERO(&run_cpu_mask);
+ CPU_SET(cpu, &run_cpu_mask);
+ sched_setaffinity(0, sizeof(cpu_set_t), &run_cpu_mask);
+
+ param.sched_priority = 1;
+ if (0 && sched_setscheduler(0, SCHED_FIFO, &param) == -1) {
+ fprintf(stderr, "could not set SCHED_FIFO, run as root?\n");
+ }
+}
+
+static volatile int segv_wait;
+
+static void segv_handler(int signo, siginfo_t *info, void *extra)
+{
+ while (segv_wait) {
+ sched_yield();
+ }
+}
+
+static void set_segv_handler(void)
+{
+ struct sigaction sa;
+
+ sa.sa_flags = SA_SIGINFO;
+ sa.sa_sigaction = segv_handler;
+
+ if (sigaction(SIGSEGV, &sa, NULL) == -1) {
+ perror("sigaction");
+ exit(EXIT_FAILURE);
+ }
+}
+
+int timeout = 0;
+/*
+ * This function is executed by every rim_thread.
+ *
+ * This function performs sweeps over the exclusive chunks of the
+ * rim_threads executing the rim-sequence one word at a time.
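+ *
+ * On a corruption report from any thread (or on timeout), it stops,
+ * verifies the contents of its own chunk, and returns.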
+ */ +static void *rim_fn(void *arg) +{ + unsigned int tid = *((unsigned int *)arg); + + int size = RIM_CHUNK_SIZE; + char *chunk_start = compute_chunk_start_addr(tid); + + unsigned int prev_sweep_id; + unsigned int cur_sweep_id = 0; + + /* word access */ + unsigned int pattern = cur_sweep_id; + unsigned int *pattern_ptr = &pattern; + unsigned int *w_ptr, read_data; + + set_segv_handler(); + + /* + * Let us initialize the chunk: + * + * Each word-aligned address addr in the chunk, + * is initialized to : + * |-------------------------------------------------| + * | tid | word_offset(addr) | 0 | + * |-------------------------------------------------| + */ + for (w_ptr = (unsigned int *)chunk_start; + (unsigned long)w_ptr < (unsigned long)(chunk_start) + size; + w_ptr++) { + + *pattern_ptr = compute_store_pattern(tid, w_ptr, cur_sweep_id); + *w_ptr = *pattern_ptr; + } + + while (!corruption_found && !timeout) { + prev_sweep_id = cur_sweep_id; + cur_sweep_id = cur_sweep_id + 1; + + for (w_ptr = (unsigned int *)chunk_start; + (unsigned long)w_ptr < (unsigned long)(chunk_start) + size; + w_ptr++) { + unsigned int old_pattern; + + /* + * Compute the pattern that we would have + * stored at this location in the previous + * sweep. + */ + old_pattern = compute_store_pattern(tid, w_ptr, prev_sweep_id); + + /* + * FLUSH:Ensure that we flush the contents of + * the cache before loading + */ + dcbf((volatile unsigned int*)w_ptr); //Flush + + /* LOAD: Read the value */ + read_data = *w_ptr; //Load + + /* + * COMPARE: Is it the same as what we had stored + * in the previous sweep ? It better be! + */ + if (read_data != old_pattern) { + /* No it isn't! Tell everyone */ + corruption_found = 1; + } + + /* + * Before performing a store, let us check if + * any rim_thread has found a corruption. + */ + if (corruption_found || timeout) { + /* + * Yes. Someone (including us!) has found + * a corruption :( + * + * Let us verify that our chunk is + * correct. + */ + /* But first, let us allow the dust to settle down! */ + verify_chunk(tid, w_ptr, cur_sweep_id, prev_sweep_id); + + return 0; + } + + /* + * Compute the new pattern that we are going + * to write to this location + */ + *pattern_ptr = compute_store_pattern(tid, w_ptr, cur_sweep_id); + + /* + * STORE: Now let us write this pattern into + * the location + */ + *w_ptr = *pattern_ptr; + } + } + + return NULL; +} + + +static unsigned long start_cpu = 0; +static unsigned long nrthreads = 4; + +static pthread_t mem_snapshot_thread; + +static void *mem_snapshot_fn(void *arg) +{ + int page_size = getpagesize(); + size_t size = page_size; + void *tmp = malloc(size); + + while (!corruption_found && !timeout) { + /* Stop memory migration once corruption is found */ + segv_wait = 1; + + mprotect(map1, size, PROT_READ); + + /* + * Load from the working alias (map1). Loading from map2 + * also fails. + */ + memcpy(tmp, map1, size); + + /* + * Stores must go via map2 which has write permissions, but + * the corrupted data tends to be seen in the snapshot buffer, + * so corruption does not appear to be introduced at the + * copy-back via map2 alias here. 
+ */
+ memcpy(map2, tmp, size);
+ /*
+ * Before releasing other threads, must ensure the copy
+ * back to map2 is complete.
+ */
+ asm volatile("sync" ::: "memory");
+ mprotect(map1, size, PROT_READ|PROT_WRITE);
+ asm volatile("sync" ::: "memory");
+ segv_wait = 0;
+
+ usleep(1); /* This value makes a big difference */
+ }
+
+ return 0;
+}
+
+void alrm_sighandler(int sig)
+{
+ timeout = 1;
+}
+
+int main(int argc, char *argv[])
+{
+ int c;
+ int page_size = getpagesize();
+ time_t now;
+ int i, dir_error;
+ pthread_attr_t attr;
+ key_t shm_key = (key_t) getpid();
+ int shmid, run_time = 20 * 60;
+ struct sigaction sa_alrm;
+
+ snprintf(logdir, LOGDIR_NAME_SIZE,
+ "/tmp/logdir-%u", (unsigned int)getpid());
+ while ((c = getopt(argc, argv, "r:hn:l:t:")) != -1) {
+ switch(c) {
+ case 'r':
+ start_cpu = strtoul(optarg, NULL, 10);
+ break;
+ case 'h':
+ printf("%s [-r <start cpu>] [-n <number of threads>] [-l <log directory>] [-t <timeout in seconds>]\n", argv[0]);
+ exit(0);
+ break;
+ case 'n':
+ nrthreads = strtoul(optarg, NULL, 10);
+ break;
+ case 'l':
+ strncpy(logdir, optarg, LOGDIR_NAME_SIZE);
+ break;
+ case 't':
+ run_time = strtoul(optarg, NULL, 10);
+ break;
+ default:
+ printf("invalid option\n");
+ exit(0);
+ break;
+ }
+ }
+
+ if (nrthreads > MAX_THREADS)
+ nrthreads = MAX_THREADS;
+
+ shmid = shmget(shm_key, page_size, IPC_CREAT|0666);
+ if (shmid < 0) {
+ err_msg("Failed shmget\n");
+ }
+
+ map1 = shmat(shmid, NULL, 0);
+ if (map1 == (void *) -1) {
+ err_msg("Failed shmat");
+ }
+
+ map2 = shmat(shmid, NULL, 0);
+ if (map2 == (void *) -1) {
+ err_msg("Failed shmat");
+ }
+
+ dir_error = mkdir(logdir, 0755);
+ if (dir_error) {
+ err_msg("Failed mkdir");
+ }
+
+ printf("start_cpu list:%lu\n", start_cpu);
+ printf("number of worker threads:%lu + 1 snapshot thread\n", nrthreads);
+ printf("Allocated address:0x%016lx + secondary map:0x%016lx\n", (unsigned long)map1, (unsigned long)map2);
+ printf("logdir at : %s\n", logdir);
+ printf("Timeout: %d seconds\n", run_time);
+
+ time(&now);
+ printf("=================================\n");
+ printf(" Starting Test\n");
+ printf(" %s", ctime(&now));
+ printf("=================================\n");
+
+ for (i = 0; i < nrthreads; i++) {
+ if (1 && !fork()) {
+ prctl(PR_SET_PDEATHSIG, SIGKILL);
+ set_mycpu(start_cpu + i);
+ for (;;)
+ sched_yield();
+ exit(0);
+ }
+ }
+
+ sa_alrm.sa_handler = &alrm_sighandler;
+ sigemptyset(&sa_alrm.sa_mask);
+ sa_alrm.sa_flags = 0;
+
+ if (sigaction(SIGALRM, &sa_alrm, 0) == -1) {
+ err_msg("Failed signal handler registration\n");
+ }
+
+ alarm(run_time);
+
+ pthread_attr_init(&attr);
+ for (i = 0; i < nrthreads; i++) {
+ rim_thread_ids[i] = i;
+ pthread_create(&rim_threads[i], &attr, rim_fn, &rim_thread_ids[i]);
+ set_pthread_cpu(rim_threads[i], start_cpu + i);
+ }
+
+ pthread_create(&mem_snapshot_thread, &attr, mem_snapshot_fn, map1);
+ set_pthread_cpu(mem_snapshot_thread, start_cpu + i);
+
+ pthread_join(mem_snapshot_thread, NULL);
+ for (i = 0; i < nrthreads; i++) {
+ pthread_join(rim_threads[i], NULL);
+ }
+
+ if (!timeout) {
+ time(&now);
+ printf("=================================\n");
+ printf(" Data Corruption Detected\n");
+ printf(" %s", ctime(&now));
+ printf(" See logfiles in %s\n", logdir);
+ printf("=================================\n");
+ return 1;
+ }
+ return 0;
+}

From 4111cdef0e871c0f7c8874b19ee68a3671f7d63e Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V"
Date: Tue, 3 Sep 2019 18:04:51 +0530
Subject: [PATCH 201/334] powerpc/nvdimm: Use HCALL error as the return value

This simplifies the error handling and also enables us to switch to the
H_SCM_QUERY hcall in a later
patch on H_OVERLAP error. We also do some kernel print formatting
fixup in this patch.

Signed-off-by: Aneesh Kumar K.V
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20190903123452.28620-1-aneesh.kumar@linux.ibm.com
---
 arch/powerpc/platforms/pseries/papr_scm.c | 26 ++++++++++-------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index a5ac371a3f06..3bef4d298ac6 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -66,28 +66,22 @@ static int drc_pmem_bind(struct papr_scm_priv *p)
 } while (rc == H_BUSY);

 if (rc) {
- /* H_OVERLAP needs a separate error path */
- if (rc == H_OVERLAP)
- return -EBUSY;
-
 dev_err(&p->pdev->dev, "bind err: %lld\n", rc);
- return -ENXIO;
+ return rc;
 }

 p->bound_addr = saved;
-
- dev_dbg(&p->pdev->dev, "bound drc %x to %pR\n", p->drc_index, &p->res);
-
- return 0;
+ dev_dbg(&p->pdev->dev, "bound drc 0x%x to %pR\n", p->drc_index, &p->res);
+ return rc;
}

-static int drc_pmem_unbind(struct papr_scm_priv *p)
+static void drc_pmem_unbind(struct papr_scm_priv *p)
 {
 unsigned long ret[PLPAR_HCALL_BUFSIZE];
 uint64_t token = 0;
 int64_t rc;

- dev_dbg(&p->pdev->dev, "unbind drc %x\n", p->drc_index);
+ dev_dbg(&p->pdev->dev, "unbind drc 0x%x\n", p->drc_index);

 /* NB: unbind has the same retry requirements as drc_pmem_bind() */
 do {
@@ -110,10 +104,10 @@ static int drc_pmem_unbind(struct papr_scm_priv *p)
 if (rc)
 dev_err(&p->pdev->dev, "unbind error: %lld\n", rc);
 else
- dev_dbg(&p->pdev->dev, "unbind drc %x complete\n",
+ dev_dbg(&p->pdev->dev, "unbind drc 0x%x complete\n",
 p->drc_index);

- return rc == H_SUCCESS ? 0 : -ENXIO;
+ return;
 }

 static int papr_scm_meta_get(struct papr_scm_priv *p,
@@ -436,14 +430,16 @@ static int papr_scm_probe(struct platform_device *pdev)
 rc = drc_pmem_bind(p);

 /* If phyp says drc memory still bound then force unbound and retry */
- if (rc == -EBUSY) {
+ if (rc == H_OVERLAP) {
 dev_warn(&pdev->dev, "Retrying bind after unbinding\n");
 drc_pmem_unbind(p);
 rc = drc_pmem_bind(p);
 }

- if (rc)
+ if (rc != H_SUCCESS) {
+ rc = -ENXIO;
 goto err;
+ }

 /* setup the resource for the newly bound range */
 p->res.start = p->bound_addr;

From faa6d21153fd11e139dd880044521389b34a24f2 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V"
Date: Tue, 3 Sep 2019 18:04:52 +0530
Subject: [PATCH 202/334] powerpc/nvdimm: use H_SCM_QUERY hcall on H_OVERLAP
 error

Right now we force an unbind of SCM memory at drcindex on H_OVERLAP
error. This really slows down operations like kexec where we get the
H_OVERLAP error because we don't go through a full hypervisor reinit.

An H_OVERLAP error for an H_SCM_BIND_MEM hcall indicates that SCM memory
at the drc index is already bound. Since we don't specify a logical
memory address for the bind hcall, we can use the H_SCM_QUERY hcall to
query the already bound logical address.
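In outline, the probe-time recovery flow after this and the previous
patch looks as follows (a simplified sketch of the code added below,
not additional code):

 rc = drc_pmem_bind(p);
 /* H_OVERLAP: memory already bound, query the bound address instead */
 if (rc == H_OVERLAP)
 rc = drc_pmem_query_n_bind(p);
 /* drc_pmem_query_n_bind() itself falls back to unbind + rebind only
 * if the H_SCM_QUERY_BLOCK_MEM_BINDING calls fail.
 */
 if (rc != H_SUCCESS)
 rc = -ENXIO;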
Boot time difference with and without patch is: [ 5.583617] IOMMU table initialized, virtual merging enabled [ 5.603041] papr_scm ibm,persistent-memory:ibm,pmemory@44104001: Retrying bind after unbinding [ 301.514221] papr_scm ibm,persistent-memory:ibm,pmemory@44108001: Retrying bind after unbinding [ 340.057238] hv-24x7: read 1530 catalog entries, created 537 event attrs (0 failures), 275 descs after fix [ 5.101572] IOMMU table initialized, virtual merging enabled [ 5.116984] papr_scm ibm,persistent-memory:ibm,pmemory@44104001: Querying SCM details [ 5.117223] papr_scm ibm,persistent-memory:ibm,pmemory@44108001: Querying SCM details [ 5.120530] hv-24x7: read 1530 catalog entries, created 537 event attrs (0 failures), 275 descs Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190903123452.28620-2-aneesh.kumar@linux.ibm.com --- arch/powerpc/platforms/pseries/papr_scm.c | 48 +++++++++++++++++++---- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index 3bef4d298ac6..61883291defc 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -65,10 +65,8 @@ static int drc_pmem_bind(struct papr_scm_priv *p) cond_resched(); } while (rc == H_BUSY); - if (rc) { - dev_err(&p->pdev->dev, "bind err: %lld\n", rc); + if (rc) return rc; - } p->bound_addr = saved; dev_dbg(&p->pdev->dev, "bound drc 0x%x to %pR\n", p->drc_index, &p->res); @@ -110,6 +108,42 @@ static void drc_pmem_unbind(struct papr_scm_priv *p) return; } +static int drc_pmem_query_n_bind(struct papr_scm_priv *p) +{ + unsigned long start_addr; + unsigned long end_addr; + unsigned long ret[PLPAR_HCALL_BUFSIZE]; + int64_t rc; + + + rc = plpar_hcall(H_SCM_QUERY_BLOCK_MEM_BINDING, ret, + p->drc_index, 0); + if (rc) + goto err_out; + start_addr = ret[0]; + + /* Make sure the full region is bound. */ + rc = plpar_hcall(H_SCM_QUERY_BLOCK_MEM_BINDING, ret, + p->drc_index, p->blocks - 1); + if (rc) + goto err_out; + end_addr = ret[0]; + + if ((end_addr - start_addr) != ((p->blocks - 1) * p->block_size)) + goto err_out; + + p->bound_addr = start_addr; + dev_dbg(&p->pdev->dev, "bound drc 0x%x to %pR\n", p->drc_index, &p->res); + return rc; + +err_out: + dev_info(&p->pdev->dev, + "Failed to query, trying an unbind followed by bind"); + drc_pmem_unbind(p); + return drc_pmem_bind(p); +} + + static int papr_scm_meta_get(struct papr_scm_priv *p, struct nd_cmd_get_config_data_hdr *hdr) { @@ -430,13 +464,11 @@ static int papr_scm_probe(struct platform_device *pdev) rc = drc_pmem_bind(p); /* If phyp says drc memory still bound then force unbound and retry */ - if (rc == H_OVERLAP) { - dev_warn(&pdev->dev, "Retrying bind after unbinding\n"); - drc_pmem_unbind(p); - rc = drc_pmem_bind(p); - } + if (rc == H_OVERLAP) + rc = drc_pmem_query_n_bind(p); if (rc != H_SUCCESS) { + dev_err(&p->pdev->dev, "bind err: %d\n", rc); rc = -ENXIO; goto err; } From 131ea1ed3322c6ec06eb8f276f226c8a1f3bbf1b Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 24 Sep 2019 23:27:34 -0500 Subject: [PATCH 203/334] smb3: Add missing reparse tags Additional reparse tags were described for WSL and file sync. Add missing defines for these tags. Some will be useful for POSIX extensions (as discussed at Storage Developer Conference). 
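As background for the defines below: reparse tags are 32-bit values
whose high bits are flags per MS-FSCC (bit 31 marks a Microsoft-defined
tag, bit 29 a name-surrogate tag such as a symlink), which is why
IO_REPARSE_TAG_LX_SYMLINK begins with 0xA while the other WSL tags
begin with 0x8. A minimal illustrative check (the helper name is ours,
not part of the patch):

 /* Bit 29 set: the tag stands in for another named entity */
 static inline bool reparse_tag_is_name_surrogate(__u32 tag)
 {
 return (tag & 0x20000000) != 0;
 }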
Signed-off-by: Steve French
Reviewed-by: Aurelien Aptel
---
 fs/cifs/smbfsctl.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index 08628e6a42ac..1ff28529cf4b 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -144,6 +144,17 @@
 #define IO_REPARSE_APPXSTREAM 0xC0000014
 /* NFS symlinks, Win 8/SMB3 and later */
 #define IO_REPARSE_TAG_NFS 0x80000014
+/*
+ * AzureFileSync - see
+ * https://docs.microsoft.com/en-us/azure/storage/files/storage-sync-cloud-tiering
+ */
+#define IO_REPARSE_TAG_AZ_FILE_SYNC 0x8000001e
+/* WSL reparse tags */
+#define IO_REPARSE_TAG_LX_SYMLINK 0xA000001D
+#define IO_REPARSE_TAG_AF_UNIX 0x80000023
+#define IO_REPARSE_TAG_LX_FIFO 0x80000024
+#define IO_REPARSE_TAG_LX_CHR 0x80000025
+#define IO_REPARSE_TAG_LX_BLK 0x80000026

 /* fsctl flags */
 /* If Flags is set to this value, the request is an FSCTL not ioctl request */

From cb063a83ca321fbf0cb2b4044186f241d89f3dc1 Mon Sep 17 00:00:00 2001
From: Linus Walleij
Date: Wed, 28 Aug 2019 15:03:18 +0200
Subject: [PATCH 204/334] thermal: db8500: Finalize device tree conversion

At some point there was an attempt to convert the DB8500 thermal sensor
to device tree: a probe path was added and the device tree was
augmented for the Snowball board.

The switchover was never completed: instead the thermal devices came
from the PRCMU MFD device, and the probe on the Snowball was confused,
as another set of configuration appeared from the device tree.

Move over to a device-tree only approach, as we fixed up the device
trees.

Cc: Vincent Guittot
Acked-by: Lee Jones
Reviewed-by: Daniel Lezcano
Signed-off-by: Linus Walleij
Signed-off-by: Eduardo Valentin
---
 drivers/mfd/db8500-prcmu.c | 53 +-------------------
 drivers/thermal/Kconfig | 2 +-
 drivers/thermal/db8500_thermal.c | 30 +++++------
 include/linux/platform_data/db8500_thermal.h | 29 -----------
 4 files changed, 17 insertions(+), 97 deletions(-)
 delete mode 100644 include/linux/platform_data/db8500_thermal.h

diff --git a/drivers/mfd/db8500-prcmu.c b/drivers/mfd/db8500-prcmu.c
index 90e0f21bc49c..518439c79826 100644
--- a/drivers/mfd/db8500-prcmu.c
+++ b/drivers/mfd/db8500-prcmu.c
@@ -36,7 +36,6 @@
 #include
 #include
 #include
-#include <linux/platform_data/db8500_thermal.h>
 #include "dbx500-prcmu-regs.h"

 /* Index of different voltages to be used when accessing AVSData */
@@ -2984,53 +2983,6 @@ static struct ux500_wdt_data db8500_wdt_pdata = {
 .timeout = 600, /* 10 minutes */
 .has_28_bits_resolution = true,
 };
-/*
- * Thermal Sensor
- */
-
-static struct resource db8500_thsens_resources[] = {
- {
- .name = "IRQ_HOTMON_LOW",
- .start = IRQ_PRCMU_HOTMON_LOW,
- .end = IRQ_PRCMU_HOTMON_LOW,
- .flags = IORESOURCE_IRQ,
- },
- {
- .name = "IRQ_HOTMON_HIGH",
- .start = IRQ_PRCMU_HOTMON_HIGH,
- .end = IRQ_PRCMU_HOTMON_HIGH,
- .flags = IORESOURCE_IRQ,
- },
-};
-
-static struct db8500_thsens_platform_data db8500_thsens_data = {
- .trip_points[0] = {
- .temp = 70000,
- .type = THERMAL_TRIP_ACTIVE,
- .cdev_name = {
- [0] = "thermal-cpufreq-0",
- },
- },
- .trip_points[1] = {
- .temp = 75000,
- .type = THERMAL_TRIP_ACTIVE,
- .cdev_name = {
- [0] = "thermal-cpufreq-0",
- },
- },
- .trip_points[2] = {
- .temp = 80000,
- .type = THERMAL_TRIP_ACTIVE,
- .cdev_name = {
- [0] = "thermal-cpufreq-0",
- },
- },
- .trip_points[3] = {
- .temp = 85000,
- .type = THERMAL_TRIP_CRITICAL,
- },
- .num_trips = 4,
-};

 static const struct mfd_cell common_prcmu_devs[] = {
 {
@@ -3054,10 +3006,7 @@ static const struct mfd_cell db8500_prcmu_devs[] = {
 },
 {
 .name = "db8500-thermal",
- .num_resources =
ARRAY_SIZE(db8500_thsens_resources), - .resources = db8500_thsens_resources, - .platform_data = &db8500_thsens_data, - .pdata_size = sizeof(db8500_thsens_data), + .of_compatible = "stericsson,db8500-thermal", }, }; diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index 9966364a6deb..001a21abcc28 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -310,7 +310,7 @@ config DOVE_THERMAL config DB8500_THERMAL tristate "DB8500 thermal management" - depends on MFD_DB8500_PRCMU + depends on MFD_DB8500_PRCMU && OF default y help Adds DB8500 thermal management implementation according to the thermal diff --git a/drivers/thermal/db8500_thermal.c b/drivers/thermal/db8500_thermal.c index b71a999d17d6..d650ae5fdf2a 100644 --- a/drivers/thermal/db8500_thermal.c +++ b/drivers/thermal/db8500_thermal.c @@ -13,13 +13,24 @@ #include #include #include -#include #include #include #include #define PRCMU_DEFAULT_MEASURE_TIME 0xFFF #define PRCMU_DEFAULT_LOW_TEMP 0 +#define COOLING_DEV_MAX 8 + +struct db8500_trip_point { + unsigned long temp; + enum thermal_trip_type type; + char cdev_name[COOLING_DEV_MAX][THERMAL_NAME_LENGTH]; +}; + +struct db8500_thsens_platform_data { + struct db8500_trip_point trip_points[THERMAL_MAX_TRIPS]; + int num_trips; +}; struct db8500_thermal_zone { struct thermal_zone_device *therm_dev; @@ -301,7 +312,6 @@ static void db8500_thermal_work(struct work_struct *work) dev_dbg(&pzone->therm_dev->device, "thermal work finished.\n"); } -#ifdef CONFIG_OF static struct db8500_thsens_platform_data* db8500_thermal_parse_dt(struct platform_device *pdev) { @@ -370,13 +380,6 @@ err_parse_dt: dev_err(&pdev->dev, "Parsing device tree data error.\n"); return NULL; } -#else -static inline struct db8500_thsens_platform_data* - db8500_thermal_parse_dt(struct platform_device *pdev) -{ - return NULL; -} -#endif static int db8500_thermal_probe(struct platform_device *pdev) { @@ -386,11 +389,10 @@ static int db8500_thermal_probe(struct platform_device *pdev) int low_irq, high_irq, ret = 0; unsigned long dft_low, dft_high; - if (np) - ptrips = db8500_thermal_parse_dt(pdev); - else - ptrips = dev_get_platdata(&pdev->dev); + if (!np) + return -EINVAL; + ptrips = db8500_thermal_parse_dt(pdev); if (!ptrips) return -EINVAL; @@ -498,13 +500,11 @@ static int db8500_thermal_resume(struct platform_device *pdev) return 0; } -#ifdef CONFIG_OF static const struct of_device_id db8500_thermal_match[] = { { .compatible = "stericsson,db8500-thermal" }, {}, }; MODULE_DEVICE_TABLE(of, db8500_thermal_match); -#endif static struct platform_driver db8500_thermal_driver = { .driver = { diff --git a/include/linux/platform_data/db8500_thermal.h b/include/linux/platform_data/db8500_thermal.h deleted file mode 100644 index 55e55750a165..000000000000 --- a/include/linux/platform_data/db8500_thermal.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * db8500_thermal.h - DB8500 Thermal Management Implementation - * - * Copyright (C) 2012 ST-Ericsson - * Copyright (C) 2012 Linaro Ltd. 
- * - * Author: Hongbo Zhang - */ - -#ifndef _DB8500_THERMAL_H_ -#define _DB8500_THERMAL_H_ - -#include - -#define COOLING_DEV_MAX 8 - -struct db8500_trip_point { - unsigned long temp; - enum thermal_trip_type type; - char cdev_name[COOLING_DEV_MAX][THERMAL_NAME_LENGTH]; -}; - -struct db8500_thsens_platform_data { - struct db8500_trip_point trip_points[THERMAL_MAX_TRIPS]; - int num_trips; -}; - -#endif /* _DB8500_THERMAL_H_ */ From 3de9e4dff889531d517c8808898972e96fa507b2 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 28 Aug 2019 15:03:19 +0200 Subject: [PATCH 205/334] thermal: db8500: Use dev helper variable The code gets easier to read like this. Cc: Vincent Guittot Reviewed-by: Daniel Lezcano Signed-off-by: Linus Walleij Signed-off-by: Eduardo Valentin --- drivers/thermal/db8500_thermal.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/thermal/db8500_thermal.c b/drivers/thermal/db8500_thermal.c index d650ae5fdf2a..d250bcf3c10c 100644 --- a/drivers/thermal/db8500_thermal.c +++ b/drivers/thermal/db8500_thermal.c @@ -313,16 +313,16 @@ static void db8500_thermal_work(struct work_struct *work) } static struct db8500_thsens_platform_data* - db8500_thermal_parse_dt(struct platform_device *pdev) + db8500_thermal_parse_dt(struct device *dev) { struct db8500_thsens_platform_data *ptrips; - struct device_node *np = pdev->dev.of_node; + struct device_node *np = dev->of_node; char prop_name[32]; const char *tmp_str; u32 tmp_data; int i, j; - ptrips = devm_kzalloc(&pdev->dev, sizeof(*ptrips), GFP_KERNEL); + ptrips = devm_kzalloc(dev, sizeof(*ptrips), GFP_KERNEL); if (!ptrips) return NULL; @@ -377,7 +377,7 @@ static struct db8500_thsens_platform_data* return ptrips; err_parse_dt: - dev_err(&pdev->dev, "Parsing device tree data error.\n"); + dev_err(dev, "Parsing device tree data error.\n"); return NULL; } @@ -385,18 +385,19 @@ static int db8500_thermal_probe(struct platform_device *pdev) { struct db8500_thermal_zone *pzone = NULL; struct db8500_thsens_platform_data *ptrips = NULL; - struct device_node *np = pdev->dev.of_node; + struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; int low_irq, high_irq, ret = 0; unsigned long dft_low, dft_high; if (!np) return -EINVAL; - ptrips = db8500_thermal_parse_dt(pdev); + ptrips = db8500_thermal_parse_dt(dev); if (!ptrips) return -EINVAL; - pzone = devm_kzalloc(&pdev->dev, sizeof(*pzone), GFP_KERNEL); + pzone = devm_kzalloc(dev, sizeof(*pzone), GFP_KERNEL); if (!pzone) return -ENOMEM; @@ -410,31 +411,31 @@ static int db8500_thermal_probe(struct platform_device *pdev) low_irq = platform_get_irq_byname(pdev, "IRQ_HOTMON_LOW"); if (low_irq < 0) { - dev_err(&pdev->dev, "Get IRQ_HOTMON_LOW failed.\n"); + dev_err(dev, "Get IRQ_HOTMON_LOW failed.\n"); ret = low_irq; goto out_unlock; } - ret = devm_request_threaded_irq(&pdev->dev, low_irq, NULL, + ret = devm_request_threaded_irq(dev, low_irq, NULL, prcmu_low_irq_handler, IRQF_NO_SUSPEND | IRQF_ONESHOT, "dbx500_temp_low", pzone); if (ret < 0) { - dev_err(&pdev->dev, "Failed to allocate temp low irq.\n"); + dev_err(dev, "Failed to allocate temp low irq.\n"); goto out_unlock; } high_irq = platform_get_irq_byname(pdev, "IRQ_HOTMON_HIGH"); if (high_irq < 0) { - dev_err(&pdev->dev, "Get IRQ_HOTMON_HIGH failed.\n"); + dev_err(dev, "Get IRQ_HOTMON_HIGH failed.\n"); ret = high_irq; goto out_unlock; } - ret = devm_request_threaded_irq(&pdev->dev, high_irq, NULL, + ret = devm_request_threaded_irq(dev, high_irq, NULL, prcmu_high_irq_handler, 
IRQF_NO_SUSPEND | IRQF_ONESHOT,
 "dbx500_temp_high", pzone);
 if (ret < 0) {
- dev_err(&pdev->dev, "Failed to allocate temp high irq.\n");
+ dev_err(dev, "Failed to allocate temp high irq.\n");
 goto out_unlock;
 }

@@ -442,11 +443,11 @@ static int db8500_thermal_probe(struct platform_device *pdev)
 ptrips->num_trips, 0, pzone, &thdev_ops, NULL, 0, 0);

 if (IS_ERR(pzone->therm_dev)) {
- dev_err(&pdev->dev, "Register thermal zone device failed.\n");
+ dev_err(dev, "Register thermal zone device failed.\n");
 ret = PTR_ERR(pzone->therm_dev);
 goto out_unlock;
 }
- dev_info(&pdev->dev, "Thermal zone device registered.\n");
+ dev_info(dev, "Thermal zone device registered.\n");

 dft_low = PRCMU_DEFAULT_LOW_TEMP;
 dft_high = ptrips->trip_points[0].temp;

From 6c375eccded41df8033ed55a1b785531b304fc67 Mon Sep 17 00:00:00 2001
From: Linus Walleij
Date: Wed, 28 Aug 2019 15:03:20 +0200
Subject: [PATCH 206/334] thermal: db8500: Rewrite to be a pure OF sensor

This patch rewrites the DB8500 thermal sensor to be a pure OF sensor,
so that it can be used with thermal zones defined in the device tree.
This driver was initially merged before we had generic thermal zone
device tree bindings, and now it gets modernized to the way we do
things these days.

The old driver depended on a set of trigger points provided in the
device tree or platform data to interpolate the current temperature
between trigger points depending on whether the trend was rising or
falling. This was bad because the trigger points should be used for
defining temperature zone policies and bind to cooling devices.

As the PRCMU (power reset control management unit) can only issue IRQs
when we pass temperature trigger points upward or downward, we instead
define a number of temperature points inside the driver ranging from
15 to 100 degrees celsius. The effect is that when we register the
device we quickly trigger 15, 20 ... up to the room temperature in
succession, and then we get continuous event IRQs also under normal
operating conditions, and the temperature of the system is now
reported more accurately (+/- 2.5 degrees celsius), while in the past
the first trigger point was at 70 degrees and the average temperature
was simply reported as 35 degrees celsius (between 70 degrees and 0)
until we passed 70 degrees, which didn't accurately represent the
temperature of the system.

As a result of dropping all the trigger points from the driver and
reusing the core DT thermal zone management code we reduce the code
footprint quite a bit.

Cc: Vincent Guittot
Suggested-by: Daniel Lezcano
Signed-off-by: Linus Walleij
Reviewed-by: Daniel Lezcano
Signed-off-by: Eduardo Valentin
---
 drivers/thermal/db8500_thermal.c | 477 +++++++------------------------
 1 file changed, 107 insertions(+), 370 deletions(-)

diff --git a/drivers/thermal/db8500_thermal.c b/drivers/thermal/db8500_thermal.c
index d250bcf3c10c..372dbbaaafb8 100644
--- a/drivers/thermal/db8500_thermal.c
+++ b/drivers/thermal/db8500_thermal.c
@@ -3,9 +3,9 @@
 * db8500_thermal.c - DB8500 Thermal Management Implementation
 *
 * Copyright (C) 2012 ST-Ericsson
- * Copyright (C) 2012 Linaro Ltd.
+ * Copyright (C) 2012-2019 Linaro Ltd.
* - * Author: Hongbo Zhang + * Authors: Hongbo Zhang, Linus Walleij */ #include @@ -19,458 +19,202 @@ #define PRCMU_DEFAULT_MEASURE_TIME 0xFFF #define PRCMU_DEFAULT_LOW_TEMP 0 -#define COOLING_DEV_MAX 8 -struct db8500_trip_point { - unsigned long temp; - enum thermal_trip_type type; - char cdev_name[COOLING_DEV_MAX][THERMAL_NAME_LENGTH]; -}; - -struct db8500_thsens_platform_data { - struct db8500_trip_point trip_points[THERMAL_MAX_TRIPS]; - int num_trips; +/** + * db8500_thermal_points - the interpolation points that trigger + * interrupts + */ +static const unsigned long db8500_thermal_points[] = { + 15000, + 20000, + 25000, + 30000, + 35000, + 40000, + 45000, + 50000, + 55000, + 60000, + 65000, + 70000, + 75000, + 80000, + /* + * This is where things start to get really bad for the + * SoC and the thermal zones should be set up to trigger + * critical temperature at 85000 mC so we don't get above + * this point. + */ + 85000, + 90000, + 95000, + 100000, }; struct db8500_thermal_zone { - struct thermal_zone_device *therm_dev; - struct mutex th_lock; - struct work_struct therm_work; - struct db8500_thsens_platform_data *trip_tab; - enum thermal_device_mode mode; + struct thermal_zone_device *tz; enum thermal_trend trend; - unsigned long cur_temp_pseudo; + unsigned long interpolated_temp; unsigned int cur_index; }; -/* Local function to check if thermal zone matches cooling devices */ -static int db8500_thermal_match_cdev(struct thermal_cooling_device *cdev, - struct db8500_trip_point *trip_point) -{ - int i; - - if (!strlen(cdev->type)) - return -EINVAL; - - for (i = 0; i < COOLING_DEV_MAX; i++) { - if (!strcmp(trip_point->cdev_name[i], cdev->type)) - return 0; - } - - return -ENODEV; -} - -/* Callback to bind cooling device to thermal zone */ -static int db8500_cdev_bind(struct thermal_zone_device *thermal, - struct thermal_cooling_device *cdev) -{ - struct db8500_thermal_zone *pzone = thermal->devdata; - struct db8500_thsens_platform_data *ptrips = pzone->trip_tab; - unsigned long max_state, upper, lower; - int i, ret = -EINVAL; - - cdev->ops->get_max_state(cdev, &max_state); - - for (i = 0; i < ptrips->num_trips; i++) { - if (db8500_thermal_match_cdev(cdev, &ptrips->trip_points[i])) - continue; - - upper = lower = i > max_state ? max_state : i; - - ret = thermal_zone_bind_cooling_device(thermal, i, cdev, - upper, lower, THERMAL_WEIGHT_DEFAULT); - - dev_info(&cdev->device, "%s bind to %d: %d-%s\n", cdev->type, - i, ret, ret ? "fail" : "succeed"); - } - - return ret; -} - -/* Callback to unbind cooling device from thermal zone */ -static int db8500_cdev_unbind(struct thermal_zone_device *thermal, - struct thermal_cooling_device *cdev) -{ - struct db8500_thermal_zone *pzone = thermal->devdata; - struct db8500_thsens_platform_data *ptrips = pzone->trip_tab; - int i, ret = -EINVAL; - - for (i = 0; i < ptrips->num_trips; i++) { - if (db8500_thermal_match_cdev(cdev, &ptrips->trip_points[i])) - continue; - - ret = thermal_zone_unbind_cooling_device(thermal, i, cdev); - - dev_info(&cdev->device, "%s unbind from %d: %s\n", cdev->type, - i, ret ? 
"fail" : "succeed"); - } - - return ret; -} - /* Callback to get current temperature */ -static int db8500_sys_get_temp(struct thermal_zone_device *thermal, int *temp) +static int db8500_thermal_get_temp(void *data, int *temp) { - struct db8500_thermal_zone *pzone = thermal->devdata; + struct db8500_thermal_zone *th = data; /* * TODO: There is no PRCMU interface to get temperature data currently, * so a pseudo temperature is returned , it works for thermal framework * and this will be fixed when the PRCMU interface is available. */ - *temp = pzone->cur_temp_pseudo; + *temp = th->interpolated_temp; return 0; } /* Callback to get temperature changing trend */ -static int db8500_sys_get_trend(struct thermal_zone_device *thermal, - int trip, enum thermal_trend *trend) +static int db8500_thermal_get_trend(void *data, int trip, enum thermal_trend *trend) { - struct db8500_thermal_zone *pzone = thermal->devdata; + struct db8500_thermal_zone *th = data; - *trend = pzone->trend; + *trend = th->trend; return 0; } -/* Callback to get thermal zone mode */ -static int db8500_sys_get_mode(struct thermal_zone_device *thermal, - enum thermal_device_mode *mode) -{ - struct db8500_thermal_zone *pzone = thermal->devdata; - - mutex_lock(&pzone->th_lock); - *mode = pzone->mode; - mutex_unlock(&pzone->th_lock); - - return 0; -} - -/* Callback to set thermal zone mode */ -static int db8500_sys_set_mode(struct thermal_zone_device *thermal, - enum thermal_device_mode mode) -{ - struct db8500_thermal_zone *pzone = thermal->devdata; - - mutex_lock(&pzone->th_lock); - - pzone->mode = mode; - if (mode == THERMAL_DEVICE_ENABLED) - schedule_work(&pzone->therm_work); - - mutex_unlock(&pzone->th_lock); - - return 0; -} - -/* Callback to get trip point type */ -static int db8500_sys_get_trip_type(struct thermal_zone_device *thermal, - int trip, enum thermal_trip_type *type) -{ - struct db8500_thermal_zone *pzone = thermal->devdata; - struct db8500_thsens_platform_data *ptrips = pzone->trip_tab; - - if (trip >= ptrips->num_trips) - return -EINVAL; - - *type = ptrips->trip_points[trip].type; - - return 0; -} - -/* Callback to get trip point temperature */ -static int db8500_sys_get_trip_temp(struct thermal_zone_device *thermal, - int trip, int *temp) -{ - struct db8500_thermal_zone *pzone = thermal->devdata; - struct db8500_thsens_platform_data *ptrips = pzone->trip_tab; - - if (trip >= ptrips->num_trips) - return -EINVAL; - - *temp = ptrips->trip_points[trip].temp; - - return 0; -} - -/* Callback to get critical trip point temperature */ -static int db8500_sys_get_crit_temp(struct thermal_zone_device *thermal, - int *temp) -{ - struct db8500_thermal_zone *pzone = thermal->devdata; - struct db8500_thsens_platform_data *ptrips = pzone->trip_tab; - int i; - - for (i = ptrips->num_trips - 1; i > 0; i--) { - if (ptrips->trip_points[i].type == THERMAL_TRIP_CRITICAL) { - *temp = ptrips->trip_points[i].temp; - return 0; - } - } - - return -EINVAL; -} - -static struct thermal_zone_device_ops thdev_ops = { - .bind = db8500_cdev_bind, - .unbind = db8500_cdev_unbind, - .get_temp = db8500_sys_get_temp, - .get_trend = db8500_sys_get_trend, - .get_mode = db8500_sys_get_mode, - .set_mode = db8500_sys_set_mode, - .get_trip_type = db8500_sys_get_trip_type, - .get_trip_temp = db8500_sys_get_trip_temp, - .get_crit_temp = db8500_sys_get_crit_temp, +static struct thermal_zone_of_device_ops thdev_ops = { + .get_temp = db8500_thermal_get_temp, + .get_trend = db8500_thermal_get_trend, }; -static void db8500_thermal_update_config(struct 
db8500_thermal_zone *pzone, - unsigned int idx, enum thermal_trend trend, - unsigned long next_low, unsigned long next_high) +static void db8500_thermal_update_config(struct db8500_thermal_zone *th, + unsigned int idx, + enum thermal_trend trend, + unsigned long next_low, + unsigned long next_high) { prcmu_stop_temp_sense(); - pzone->cur_index = idx; - pzone->cur_temp_pseudo = (next_low + next_high)/2; - pzone->trend = trend; + th->cur_index = idx; + th->interpolated_temp = (next_low + next_high)/2; + th->trend = trend; + /* + * The PRCMU accept absolute temperatures in celsius so divide + * down the millicelsius with 1000 + */ prcmu_config_hotmon((u8)(next_low/1000), (u8)(next_high/1000)); prcmu_start_temp_sense(PRCMU_DEFAULT_MEASURE_TIME); } static irqreturn_t prcmu_low_irq_handler(int irq, void *irq_data) { - struct db8500_thermal_zone *pzone = irq_data; - struct db8500_thsens_platform_data *ptrips = pzone->trip_tab; - unsigned int idx = pzone->cur_index; + struct db8500_thermal_zone *th = irq_data; + unsigned int idx = th->cur_index; unsigned long next_low, next_high; - if (unlikely(idx == 0)) + if (idx == 0) /* Meaningless for thermal management, ignoring it */ return IRQ_HANDLED; if (idx == 1) { - next_high = ptrips->trip_points[0].temp; + next_high = db8500_thermal_points[0]; next_low = PRCMU_DEFAULT_LOW_TEMP; } else { - next_high = ptrips->trip_points[idx-1].temp; - next_low = ptrips->trip_points[idx-2].temp; + next_high = db8500_thermal_points[idx - 1]; + next_low = db8500_thermal_points[idx - 2]; } idx -= 1; - db8500_thermal_update_config(pzone, idx, THERMAL_TREND_DROPPING, - next_low, next_high); - - dev_dbg(&pzone->therm_dev->device, + db8500_thermal_update_config(th, idx, THERMAL_TREND_DROPPING, + next_low, next_high); + dev_dbg(&th->tz->device, "PRCMU set max %ld, min %ld\n", next_high, next_low); - schedule_work(&pzone->therm_work); + thermal_zone_device_update(th->tz, THERMAL_EVENT_UNSPECIFIED); return IRQ_HANDLED; } static irqreturn_t prcmu_high_irq_handler(int irq, void *irq_data) { - struct db8500_thermal_zone *pzone = irq_data; - struct db8500_thsens_platform_data *ptrips = pzone->trip_tab; - unsigned int idx = pzone->cur_index; + struct db8500_thermal_zone *th = irq_data; + unsigned int idx = th->cur_index; unsigned long next_low, next_high; + int num_points = ARRAY_SIZE(db8500_thermal_points); - if (idx < ptrips->num_trips - 1) { - next_high = ptrips->trip_points[idx+1].temp; - next_low = ptrips->trip_points[idx].temp; + if (idx < num_points - 1) { + next_high = db8500_thermal_points[idx+1]; + next_low = db8500_thermal_points[idx]; idx += 1; - db8500_thermal_update_config(pzone, idx, THERMAL_TREND_RAISING, - next_low, next_high); + db8500_thermal_update_config(th, idx, THERMAL_TREND_RAISING, + next_low, next_high); - dev_dbg(&pzone->therm_dev->device, - "PRCMU set max %ld, min %ld\n", next_high, next_low); - } else if (idx == ptrips->num_trips - 1) - pzone->cur_temp_pseudo = ptrips->trip_points[idx].temp + 1; + dev_info(&th->tz->device, + "PRCMU set max %ld, min %ld\n", next_high, next_low); + } else if (idx == num_points - 1) + /* So we roof out 1 degree over the max point */ + th->interpolated_temp = db8500_thermal_points[idx] + 1; - schedule_work(&pzone->therm_work); + thermal_zone_device_update(th->tz, THERMAL_EVENT_UNSPECIFIED); return IRQ_HANDLED; } -static void db8500_thermal_work(struct work_struct *work) -{ - enum thermal_device_mode cur_mode; - struct db8500_thermal_zone *pzone; - - pzone = container_of(work, struct db8500_thermal_zone, therm_work); - - 
mutex_lock(&pzone->th_lock); - cur_mode = pzone->mode; - mutex_unlock(&pzone->th_lock); - - if (cur_mode == THERMAL_DEVICE_DISABLED) - return; - - thermal_zone_device_update(pzone->therm_dev, THERMAL_EVENT_UNSPECIFIED); - dev_dbg(&pzone->therm_dev->device, "thermal work finished.\n"); -} - -static struct db8500_thsens_platform_data* - db8500_thermal_parse_dt(struct device *dev) -{ - struct db8500_thsens_platform_data *ptrips; - struct device_node *np = dev->of_node; - char prop_name[32]; - const char *tmp_str; - u32 tmp_data; - int i, j; - - ptrips = devm_kzalloc(dev, sizeof(*ptrips), GFP_KERNEL); - if (!ptrips) - return NULL; - - if (of_property_read_u32(np, "num-trips", &tmp_data)) - goto err_parse_dt; - - if (tmp_data > THERMAL_MAX_TRIPS) - goto err_parse_dt; - - ptrips->num_trips = tmp_data; - - for (i = 0; i < ptrips->num_trips; i++) { - sprintf(prop_name, "trip%d-temp", i); - if (of_property_read_u32(np, prop_name, &tmp_data)) - goto err_parse_dt; - - ptrips->trip_points[i].temp = tmp_data; - - sprintf(prop_name, "trip%d-type", i); - if (of_property_read_string(np, prop_name, &tmp_str)) - goto err_parse_dt; - - if (!strcmp(tmp_str, "active")) - ptrips->trip_points[i].type = THERMAL_TRIP_ACTIVE; - else if (!strcmp(tmp_str, "passive")) - ptrips->trip_points[i].type = THERMAL_TRIP_PASSIVE; - else if (!strcmp(tmp_str, "hot")) - ptrips->trip_points[i].type = THERMAL_TRIP_HOT; - else if (!strcmp(tmp_str, "critical")) - ptrips->trip_points[i].type = THERMAL_TRIP_CRITICAL; - else - goto err_parse_dt; - - sprintf(prop_name, "trip%d-cdev-num", i); - if (of_property_read_u32(np, prop_name, &tmp_data)) - goto err_parse_dt; - - if (tmp_data > COOLING_DEV_MAX) - goto err_parse_dt; - - for (j = 0; j < tmp_data; j++) { - sprintf(prop_name, "trip%d-cdev-name%d", i, j); - if (of_property_read_string(np, prop_name, &tmp_str)) - goto err_parse_dt; - - if (strlen(tmp_str) >= THERMAL_NAME_LENGTH) - goto err_parse_dt; - - strcpy(ptrips->trip_points[i].cdev_name[j], tmp_str); - } - } - return ptrips; - -err_parse_dt: - dev_err(dev, "Parsing device tree data error.\n"); - return NULL; -} - static int db8500_thermal_probe(struct platform_device *pdev) { - struct db8500_thermal_zone *pzone = NULL; - struct db8500_thsens_platform_data *ptrips = NULL; + struct db8500_thermal_zone *th = NULL; struct device *dev = &pdev->dev; - struct device_node *np = dev->of_node; int low_irq, high_irq, ret = 0; - unsigned long dft_low, dft_high; - if (!np) - return -EINVAL; - - ptrips = db8500_thermal_parse_dt(dev); - if (!ptrips) - return -EINVAL; - - pzone = devm_kzalloc(dev, sizeof(*pzone), GFP_KERNEL); - if (!pzone) + th = devm_kzalloc(dev, sizeof(*th), GFP_KERNEL); + if (!th) return -ENOMEM; - mutex_init(&pzone->th_lock); - mutex_lock(&pzone->th_lock); - - pzone->mode = THERMAL_DEVICE_DISABLED; - pzone->trip_tab = ptrips; - - INIT_WORK(&pzone->therm_work, db8500_thermal_work); - low_irq = platform_get_irq_byname(pdev, "IRQ_HOTMON_LOW"); if (low_irq < 0) { - dev_err(dev, "Get IRQ_HOTMON_LOW failed.\n"); - ret = low_irq; - goto out_unlock; + dev_err(dev, "Get IRQ_HOTMON_LOW failed\n"); + return low_irq; } ret = devm_request_threaded_irq(dev, low_irq, NULL, prcmu_low_irq_handler, IRQF_NO_SUSPEND | IRQF_ONESHOT, - "dbx500_temp_low", pzone); + "dbx500_temp_low", th); if (ret < 0) { - dev_err(dev, "Failed to allocate temp low irq.\n"); - goto out_unlock; + dev_err(dev, "failed to allocate temp low irq\n"); + return ret; } high_irq = platform_get_irq_byname(pdev, "IRQ_HOTMON_HIGH"); if (high_irq < 0) { - dev_err(dev, "Get 
IRQ_HOTMON_HIGH failed.\n"); - ret = high_irq; - goto out_unlock; + dev_err(dev, "Get IRQ_HOTMON_HIGH failed\n"); + return high_irq; } ret = devm_request_threaded_irq(dev, high_irq, NULL, prcmu_high_irq_handler, IRQF_NO_SUSPEND | IRQF_ONESHOT, - "dbx500_temp_high", pzone); + "dbx500_temp_high", th); if (ret < 0) { - dev_err(dev, "Failed to allocate temp high irq.\n"); - goto out_unlock; + dev_err(dev, "failed to allocate temp high irq\n"); + return ret; } - pzone->therm_dev = thermal_zone_device_register("db8500_thermal_zone", - ptrips->num_trips, 0, pzone, &thdev_ops, NULL, 0, 0); - - if (IS_ERR(pzone->therm_dev)) { - dev_err(dev, "Register thermal zone device failed.\n"); - ret = PTR_ERR(pzone->therm_dev); - goto out_unlock; + /* register of thermal sensor and get info from DT */ + th->tz = devm_thermal_zone_of_sensor_register(dev, 0, th, &thdev_ops); + if (IS_ERR(th->tz)) { + dev_err(dev, "register thermal zone sensor failed\n"); + return PTR_ERR(th->tz); } - dev_info(dev, "Thermal zone device registered.\n"); + dev_info(dev, "thermal zone sensor registered\n"); - dft_low = PRCMU_DEFAULT_LOW_TEMP; - dft_high = ptrips->trip_points[0].temp; + /* Start measuring at the lowest point */ + db8500_thermal_update_config(th, 0, THERMAL_TREND_STABLE, + PRCMU_DEFAULT_LOW_TEMP, + db8500_thermal_points[0]); - db8500_thermal_update_config(pzone, 0, THERMAL_TREND_STABLE, - dft_low, dft_high); - - platform_set_drvdata(pdev, pzone); - pzone->mode = THERMAL_DEVICE_ENABLED; - -out_unlock: - mutex_unlock(&pzone->th_lock); - - return ret; -} - -static int db8500_thermal_remove(struct platform_device *pdev) -{ - struct db8500_thermal_zone *pzone = platform_get_drvdata(pdev); - - thermal_zone_device_unregister(pzone->therm_dev); - cancel_work_sync(&pzone->therm_work); - mutex_destroy(&pzone->th_lock); + platform_set_drvdata(pdev, th); return 0; } @@ -478,9 +222,6 @@ static int db8500_thermal_remove(struct platform_device *pdev) static int db8500_thermal_suspend(struct platform_device *pdev, pm_message_t state) { - struct db8500_thermal_zone *pzone = platform_get_drvdata(pdev); - - flush_work(&pzone->therm_work); prcmu_stop_temp_sense(); return 0; @@ -488,15 +229,12 @@ static int db8500_thermal_suspend(struct platform_device *pdev, static int db8500_thermal_resume(struct platform_device *pdev) { - struct db8500_thermal_zone *pzone = platform_get_drvdata(pdev); - struct db8500_thsens_platform_data *ptrips = pzone->trip_tab; - unsigned long dft_low, dft_high; + struct db8500_thermal_zone *th = platform_get_drvdata(pdev); - dft_low = PRCMU_DEFAULT_LOW_TEMP; - dft_high = ptrips->trip_points[0].temp; - - db8500_thermal_update_config(pzone, 0, THERMAL_TREND_STABLE, - dft_low, dft_high); + /* Resume and start measuring at the lowest point */ + db8500_thermal_update_config(th, 0, THERMAL_TREND_STABLE, + PRCMU_DEFAULT_LOW_TEMP, + db8500_thermal_points[0]); return 0; } @@ -515,7 +253,6 @@ static struct platform_driver db8500_thermal_driver = { .probe = db8500_thermal_probe, .suspend = db8500_thermal_suspend, .resume = db8500_thermal_resume, - .remove = db8500_thermal_remove, }; module_platform_driver(db8500_thermal_driver); From 2b481835cf4e7384b80d7762074b32a45b792d99 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 21 Sep 2019 09:01:45 +0300 Subject: [PATCH 207/334] wil6210: use after free in wil_netif_rx_any() The debug code dereferences "skb" to print "skb->len" so we have to print the message before we free "skb". 
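The bug class, in miniature (illustrative snippet, not the driver
source): any read of an skb field after dev_kfree_skb() is a use after
free,

 dev_kfree_skb(skb);
 wil_dbg_txrx(wil, "Rx drop %d bytes\n", skb->len); /* use after free */

so the fix is simply to emit the diagnostic while skb is still valid:

 wil_dbg_txrx(wil, "Rx drop %d bytes\n", skb->len);
 dev_kfree_skb(skb);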
Fixes: f99fe49ff372 ("wil6210: add wil_netif_rx() helper function") Signed-off-by: Dan Carpenter Signed-off-by: Kalle Valo --- drivers/net/wireless/ath/wil6210/txrx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/wil6210/txrx.c b/drivers/net/wireless/ath/wil6210/txrx.c index cb13652491ad..598c1fba9dac 100644 --- a/drivers/net/wireless/ath/wil6210/txrx.c +++ b/drivers/net/wireless/ath/wil6210/txrx.c @@ -1012,11 +1012,11 @@ void wil_netif_rx_any(struct sk_buff *skb, struct net_device *ndev) skb_orphan(skb); if (security && (wil->txrx_ops.rx_crypto_check(wil, skb) != 0)) { + wil_dbg_txrx(wil, "Rx drop %d bytes\n", skb->len); dev_kfree_skb(skb); ndev->stats.rx_dropped++; stats->rx_replay++; stats->rx_dropped++; - wil_dbg_txrx(wil, "Rx drop %d bytes\n", skb->len); return; } From 20ff1cb506727f81acba59acab8a0f37e1a13e43 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 24 Sep 2019 07:40:06 +0900 Subject: [PATCH 208/334] netfilter: ebtables: use __u8 instead of uint8_t in uapi header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CONFIG_UAPI_HEADER_TEST=y, exported headers are compile-tested to make sure they can be included from user-space. Currently, linux/netfilter_bridge/ebtables.h is excluded from the test coverage. To make it join the compile-test, we need to fix the build errors attached below. For a case like this, we decided to use __u{8,16,32,64} variable types in this discussion: https://lkml.org/lkml/2019/6/5/18 Build log: CC usr/include/linux/netfilter_bridge/ebtables.h.s In file included from :32:0: ./usr/include/linux/netfilter_bridge/ebtables.h:126:4: error: unknown type name ‘uint8_t’ uint8_t revision; ^~~~~~~ ./usr/include/linux/netfilter_bridge/ebtables.h:139:4: error: unknown type name ‘uint8_t’ uint8_t revision; ^~~~~~~ ./usr/include/linux/netfilter_bridge/ebtables.h:152:4: error: unknown type name ‘uint8_t’ uint8_t revision; ^~~~~~~ Signed-off-by: Masahiro Yamada Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_bridge/ebtables.h | 6 +++--- usr/include/Makefile | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/netfilter_bridge/ebtables.h b/include/uapi/linux/netfilter_bridge/ebtables.h index 3b86c14ea49d..8076c940ffeb 100644 --- a/include/uapi/linux/netfilter_bridge/ebtables.h +++ b/include/uapi/linux/netfilter_bridge/ebtables.h @@ -123,7 +123,7 @@ struct ebt_entry_match { union { struct { char name[EBT_EXTENSION_MAXNAMELEN]; - uint8_t revision; + __u8 revision; }; struct xt_match *match; } u; @@ -136,7 +136,7 @@ struct ebt_entry_watcher { union { struct { char name[EBT_EXTENSION_MAXNAMELEN]; - uint8_t revision; + __u8 revision; }; struct xt_target *watcher; } u; @@ -149,7 +149,7 @@ struct ebt_entry_target { union { struct { char name[EBT_EXTENSION_MAXNAMELEN]; - uint8_t revision; + __u8 revision; }; struct xt_target *target; } u; diff --git a/usr/include/Makefile b/usr/include/Makefile index 1fb6abe29b2f..379cc5abc162 100644 --- a/usr/include/Makefile +++ b/usr/include/Makefile @@ -38,7 +38,6 @@ header-test- += linux/ivtv.h header-test- += linux/jffs2.h header-test- += linux/kexec.h header-test- += linux/matroxfb.h -header-test- += linux/netfilter_bridge/ebtables.h header-test- += linux/netfilter_ipv4/ipt_LOG.h header-test- += linux/netfilter_ipv6/ip6t_LOG.h header-test- += linux/nfc.h From 9b05b6e11d5e93a3a517cadc12b9836e0470c255 Mon Sep 17 00:00:00 2001 From: Laura Garcia Liebana Date: Tue, 24 Sep 2019 14:42:44 +0200 
Subject: [PATCH 209/334] netfilter: nf_tables: bogus EBUSY when deleting flowtable after flush The deletion of a flowtable after a flush in the same transaction results in EBUSY. This patch adds an activation and deactivation of flowtables in order to update the _use_ counter. Signed-off-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++++ net/netfilter/nf_tables_api.c | 16 ++++++++++++++++ net/netfilter/nft_flow_offload.c | 19 +++++++++++++++++++ 3 files changed, 39 insertions(+) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index a26d64056fc8..001d294edf57 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1183,6 +1183,10 @@ struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, const struct nlattr *nla, u8 genmask); +void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, + struct nft_flowtable *flowtable, + enum nft_trans_phase phase); + void nft_register_flowtable_type(struct nf_flowtable_type *type); void nft_unregister_flowtable_type(struct nf_flowtable_type *type); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6dc46f9b5f7b..d481f9baca2f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5598,6 +5598,22 @@ struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, } EXPORT_SYMBOL_GPL(nft_flowtable_lookup); +void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, + struct nft_flowtable *flowtable, + enum nft_trans_phase phase) +{ + switch (phase) { + case NFT_TRANS_PREPARE: + case NFT_TRANS_ABORT: + case NFT_TRANS_RELEASE: + flowtable->use--; + /* fall through */ + default: + return; + } +} +EXPORT_SYMBOL_GPL(nf_tables_deactivate_flowtable); + static struct nft_flowtable * nft_flowtable_lookup_byhandle(const struct nft_table *table, const struct nlattr *nla, u8 genmask) diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 22cf236eb5d5..f29bbc74c4bf 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -177,6 +177,23 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx, return nf_ct_netns_get(ctx->net, ctx->family); } +static void nft_flow_offload_deactivate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + enum nft_trans_phase phase) +{ + struct nft_flow_offload *priv = nft_expr_priv(expr); + + nf_tables_deactivate_flowtable(ctx, priv->flowtable, phase); +} + +static void nft_flow_offload_activate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_flow_offload *priv = nft_expr_priv(expr); + + priv->flowtable->use++; +} + static void nft_flow_offload_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { @@ -205,6 +222,8 @@ static const struct nft_expr_ops nft_flow_offload_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)), .eval = nft_flow_offload_eval, .init = nft_flow_offload_init, + .activate = nft_flow_offload_activate, + .deactivate = nft_flow_offload_deactivate, .destroy = nft_flow_offload_destroy, .validate = nft_flow_offload_validate, .dump = nft_flow_offload_dump, From b27507bb59ed504d7fa4d6a35f25a8cc39903b54 Mon Sep 17 00:00:00 2001 From: Juliet Kim Date: Fri, 20 Sep 2019 16:11:22 -0400 Subject: [PATCH 210/334] net/ibmvnic: unlock rtnl_lock in reset so linkwatch_event can run Commit a5681e20b541 ("net/ibmnvic: Fix deadlock problem in reset") made the change to hold the RTNL lock during a reset to avoid 
deadlock but linkwatch_event is fired during the reset and needs the RTNL lock. That keeps linkwatch_event process from proceeding until the reset is complete. The reset process cannot tolerate the linkwatch_event processing after reset completes, so release the RTNL lock during the process to allow a chance for linkwatch_event to run during reset. This does not guarantee that the linkwatch_event will be processed as soon as link state changes, but is an improvement over the current code where linkwatch_event processing is always delayed, which prevents transmissions on the device from being deactivated leading transmit watchdog timer to time-out. Release the RTNL lock before link state change and re-acquire after the link state change to allow linkwatch_event to grab the RTNL lock and run during the reset. Fixes: a5681e20b541 ("net/ibmnvic: Fix deadlock problem in reset") Signed-off-by: Juliet Kim Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 226 ++++++++++++++++++++--------- drivers/net/ethernet/ibm/ibmvnic.h | 1 + 2 files changed, 158 insertions(+), 69 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 3816fff75bb5..d7db5cc51f6a 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1723,6 +1723,86 @@ static int ibmvnic_set_mac(struct net_device *netdev, void *p) return rc; } +/** + * do_change_param_reset returns zero if we are able to keep processing reset + * events, or non-zero if we hit a fatal error and must halt. + */ +static int do_change_param_reset(struct ibmvnic_adapter *adapter, + struct ibmvnic_rwi *rwi, + u32 reset_state) +{ + struct net_device *netdev = adapter->netdev; + int i, rc; + + netdev_dbg(adapter->netdev, "Change param resetting driver (%d)\n", + rwi->reset_reason); + + netif_carrier_off(netdev); + adapter->reset_reason = rwi->reset_reason; + + ibmvnic_cleanup(netdev); + + if (reset_state == VNIC_OPEN) { + rc = __ibmvnic_close(netdev); + if (rc) + return rc; + } + + release_resources(adapter); + release_sub_crqs(adapter, 1); + release_crq_queue(adapter); + + adapter->state = VNIC_PROBED; + + rc = init_crq_queue(adapter); + + if (rc) { + netdev_err(adapter->netdev, + "Couldn't initialize crq. rc=%d\n", rc); + return rc; + } + + rc = ibmvnic_reset_init(adapter); + if (rc) + return IBMVNIC_INIT_FAILED; + + /* If the adapter was in PROBE state prior to the reset, + * exit here. + */ + if (reset_state == VNIC_PROBED) + return 0; + + rc = ibmvnic_login(netdev); + if (rc) { + adapter->state = reset_state; + return rc; + } + + rc = init_resources(adapter); + if (rc) + return rc; + + ibmvnic_disable_irqs(adapter); + + adapter->state = VNIC_CLOSED; + + if (reset_state == VNIC_CLOSED) + return 0; + + rc = __ibmvnic_open(netdev); + if (rc) + return IBMVNIC_OPEN_FAILED; + + /* refresh device's multicast list */ + ibmvnic_set_multi(netdev); + + /* kick napi */ + for (i = 0; i < adapter->req_rx_queues; i++) + napi_schedule(&adapter->napi[i]); + + return 0; +} + /** * do_reset returns zero if we are able to keep processing reset events, or * non-zero if we hit a fatal error and must halt. 
@@ -1738,6 +1818,8 @@ static int do_reset(struct ibmvnic_adapter *adapter, netdev_dbg(adapter->netdev, "Re-setting driver (%d)\n", rwi->reset_reason); + rtnl_lock(); + netif_carrier_off(netdev); adapter->reset_reason = rwi->reset_reason; @@ -1751,16 +1833,25 @@ static int do_reset(struct ibmvnic_adapter *adapter, if (reset_state == VNIC_OPEN && adapter->reset_reason != VNIC_RESET_MOBILITY && adapter->reset_reason != VNIC_RESET_FAILOVER) { - rc = __ibmvnic_close(netdev); - if (rc) - return rc; - } + adapter->state = VNIC_CLOSING; - if (adapter->reset_reason == VNIC_RESET_CHANGE_PARAM || - adapter->wait_for_reset) { - release_resources(adapter); - release_sub_crqs(adapter, 1); - release_crq_queue(adapter); + /* Release the RTNL lock before link state change and + * re-acquire after the link state change to allow + * linkwatch_event to grab the RTNL lock and run during + * a reset. + */ + rtnl_unlock(); + rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN); + rtnl_lock(); + if (rc) + goto out; + + if (adapter->state != VNIC_CLOSING) { + rc = -1; + goto out; + } + + adapter->state = VNIC_CLOSED; } if (adapter->reset_reason != VNIC_RESET_NON_FATAL) { @@ -1769,9 +1860,7 @@ static int do_reset(struct ibmvnic_adapter *adapter, */ adapter->state = VNIC_PROBED; - if (adapter->wait_for_reset) { - rc = init_crq_queue(adapter); - } else if (adapter->reset_reason == VNIC_RESET_MOBILITY) { + if (adapter->reset_reason == VNIC_RESET_MOBILITY) { rc = ibmvnic_reenable_crq_queue(adapter); release_sub_crqs(adapter, 1); } else { @@ -1783,36 +1872,35 @@ static int do_reset(struct ibmvnic_adapter *adapter, if (rc) { netdev_err(adapter->netdev, "Couldn't initialize crq. rc=%d\n", rc); - return rc; + goto out; } rc = ibmvnic_reset_init(adapter); - if (rc) - return IBMVNIC_INIT_FAILED; + if (rc) { + rc = IBMVNIC_INIT_FAILED; + goto out; + } /* If the adapter was in PROBE state prior to the reset, * exit here. 
*/ - if (reset_state == VNIC_PROBED) - return 0; + if (reset_state == VNIC_PROBED) { + rc = 0; + goto out; + } rc = ibmvnic_login(netdev); if (rc) { adapter->state = reset_state; - return rc; + goto out; } - if (adapter->reset_reason == VNIC_RESET_CHANGE_PARAM || - adapter->wait_for_reset) { - rc = init_resources(adapter); - if (rc) - return rc; - } else if (adapter->req_rx_queues != old_num_rx_queues || - adapter->req_tx_queues != old_num_tx_queues || - adapter->req_rx_add_entries_per_subcrq != - old_num_rx_slots || - adapter->req_tx_entries_per_subcrq != - old_num_tx_slots) { + if (adapter->req_rx_queues != old_num_rx_queues || + adapter->req_tx_queues != old_num_tx_queues || + adapter->req_rx_add_entries_per_subcrq != + old_num_rx_slots || + adapter->req_tx_entries_per_subcrq != + old_num_tx_slots) { release_rx_pools(adapter); release_tx_pools(adapter); release_napi(adapter); @@ -1820,32 +1908,30 @@ static int do_reset(struct ibmvnic_adapter *adapter, rc = init_resources(adapter); if (rc) - return rc; + goto out; } else { rc = reset_tx_pools(adapter); if (rc) - return rc; + goto out; rc = reset_rx_pools(adapter); if (rc) - return rc; + goto out; } ibmvnic_disable_irqs(adapter); } adapter->state = VNIC_CLOSED; - if (reset_state == VNIC_CLOSED) - return 0; + if (reset_state == VNIC_CLOSED) { + rc = 0; + goto out; + } rc = __ibmvnic_open(netdev); if (rc) { - if (list_empty(&adapter->rwi_list)) - adapter->state = VNIC_CLOSED; - else - adapter->state = reset_state; - - return 0; + rc = IBMVNIC_OPEN_FAILED; + goto out; } /* refresh device's multicast list */ @@ -1855,11 +1941,15 @@ static int do_reset(struct ibmvnic_adapter *adapter, for (i = 0; i < adapter->req_rx_queues; i++) napi_schedule(&adapter->napi[i]); - if (adapter->reset_reason != VNIC_RESET_FAILOVER && - adapter->reset_reason != VNIC_RESET_CHANGE_PARAM) + if (adapter->reset_reason != VNIC_RESET_FAILOVER) call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, netdev); - return 0; + rc = 0; + +out: + rtnl_unlock(); + + return rc; } static int do_hard_reset(struct ibmvnic_adapter *adapter, @@ -1919,14 +2009,8 @@ static int do_hard_reset(struct ibmvnic_adapter *adapter, return 0; rc = __ibmvnic_open(netdev); - if (rc) { - if (list_empty(&adapter->rwi_list)) - adapter->state = VNIC_CLOSED; - else - adapter->state = reset_state; - - return 0; - } + if (rc) + return IBMVNIC_OPEN_FAILED; return 0; } @@ -1965,20 +2049,11 @@ static void __ibmvnic_reset(struct work_struct *work) { struct ibmvnic_rwi *rwi; struct ibmvnic_adapter *adapter; - bool we_lock_rtnl = false; u32 reset_state; int rc = 0; adapter = container_of(work, struct ibmvnic_adapter, ibmvnic_reset); - /* netif_set_real_num_xx_queues needs to take rtnl lock here - * unless wait_for_reset is set, in which case the rtnl lock - * has already been taken before initializing the reset - */ - if (!adapter->wait_for_reset) { - rtnl_lock(); - we_lock_rtnl = true; - } reset_state = adapter->state; rwi = get_next_rwi(adapter); @@ -1990,14 +2065,32 @@ static void __ibmvnic_reset(struct work_struct *work) break; } - if (adapter->force_reset_recovery) { - adapter->force_reset_recovery = false; - rc = do_hard_reset(adapter, rwi, reset_state); + if (rwi->reset_reason == VNIC_RESET_CHANGE_PARAM) { + /* CHANGE_PARAM requestor holds rtnl_lock */ + rc = do_change_param_reset(adapter, rwi, reset_state); + } else if (adapter->force_reset_recovery) { + /* Transport event occurred during previous reset */ + if (adapter->wait_for_reset) { + /* Previous was CHANGE_PARAM; caller locked */ + 
adapter->force_reset_recovery = false; + rc = do_hard_reset(adapter, rwi, reset_state); + } else { + rtnl_lock(); + adapter->force_reset_recovery = false; + rc = do_hard_reset(adapter, rwi, reset_state); + rtnl_unlock(); + } } else { rc = do_reset(adapter, rwi, reset_state); } kfree(rwi); - if (rc && rc != IBMVNIC_INIT_FAILED && + if (rc == IBMVNIC_OPEN_FAILED) { + if (list_empty(&adapter->rwi_list)) + adapter->state = VNIC_CLOSED; + else + adapter->state = reset_state; + rc = 0; + } else if (rc && rc != IBMVNIC_INIT_FAILED && !adapter->force_reset_recovery) break; @@ -2005,7 +2098,6 @@ static void __ibmvnic_reset(struct work_struct *work) } if (adapter->wait_for_reset) { - adapter->wait_for_reset = false; adapter->reset_done_rc = rc; complete(&adapter->reset_done); } @@ -2016,8 +2108,6 @@ static void __ibmvnic_reset(struct work_struct *work) } adapter->resetting = false; - if (we_lock_rtnl) - rtnl_unlock(); } static int ibmvnic_reset(struct ibmvnic_adapter *adapter, @@ -2078,8 +2168,6 @@ static int ibmvnic_reset(struct ibmvnic_adapter *adapter, return 0; err: - if (adapter->wait_for_reset) - adapter->wait_for_reset = false; return -ret; } diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h index 70bd286f8932..9d3d35cc91d6 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.h +++ b/drivers/net/ethernet/ibm/ibmvnic.h @@ -20,6 +20,7 @@ #define IBMVNIC_INVALID_MAP -1 #define IBMVNIC_STATS_TIMEOUT 1 #define IBMVNIC_INIT_FAILED 2 +#define IBMVNIC_OPEN_FAILED 3 /* basic structures plus 100 2k buffers */ #define IBMVNIC_IO_ENTITLEMENT_DEFAULT 610305 From 7ed5b31f4a6695a21f617df07646e9b15c6c1d29 Mon Sep 17 00:00:00 2001 From: Juliet Kim Date: Fri, 20 Sep 2019 16:11:23 -0400 Subject: [PATCH 211/334] net/ibmvnic: prevent more than one thread from running in reset The current code allows more than one thread to run in reset. This can corrupt struct adapter data. Check adapter->resetting before performing a reset; if another reset is already running, delay (100 msec) before trying again. Signed-off-by: Juliet Kim Signed-off-by: David S.
Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 40 ++++++++++++++++++++++-------- drivers/net/ethernet/ibm/ibmvnic.h | 5 +++- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index d7db5cc51f6a..2b073a3c0b84 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1207,7 +1207,7 @@ static void ibmvnic_cleanup(struct net_device *netdev) struct ibmvnic_adapter *adapter = netdev_priv(netdev); /* ensure that transmissions are stopped if called by do_reset */ - if (adapter->resetting) + if (test_bit(0, &adapter->resetting)) netif_tx_disable(netdev); else netif_tx_stop_all_queues(netdev); @@ -1428,7 +1428,7 @@ static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) u8 proto = 0; netdev_tx_t ret = NETDEV_TX_OK; - if (adapter->resetting) { + if (test_bit(0, &adapter->resetting)) { if (!netif_subqueue_stopped(netdev, skb)) netif_stop_subqueue(netdev, queue_num); dev_kfree_skb_any(skb); @@ -2054,6 +2054,12 @@ static void __ibmvnic_reset(struct work_struct *work) adapter = container_of(work, struct ibmvnic_adapter, ibmvnic_reset); + if (test_and_set_bit_lock(0, &adapter->resetting)) { + schedule_delayed_work(&adapter->ibmvnic_delayed_reset, + IBMVNIC_RESET_DELAY); + return; + } + reset_state = adapter->state; rwi = get_next_rwi(adapter); @@ -2095,6 +2101,10 @@ static void __ibmvnic_reset(struct work_struct *work) break; rwi = get_next_rwi(adapter); + + if (rwi && (rwi->reset_reason == VNIC_RESET_FAILOVER || + rwi->reset_reason == VNIC_RESET_MOBILITY)) + adapter->force_reset_recovery = true; } if (adapter->wait_for_reset) { @@ -2107,7 +2117,16 @@ static void __ibmvnic_reset(struct work_struct *work) free_all_rwi(adapter); } - adapter->resetting = false; + clear_bit_unlock(0, &adapter->resetting); +} + +static void __ibmvnic_delayed_reset(struct work_struct *work) +{ + struct ibmvnic_adapter *adapter; + + adapter = container_of(work, struct ibmvnic_adapter, + ibmvnic_delayed_reset.work); + __ibmvnic_reset(&adapter->ibmvnic_reset); } static int ibmvnic_reset(struct ibmvnic_adapter *adapter, @@ -2162,7 +2181,6 @@ static int ibmvnic_reset(struct ibmvnic_adapter *adapter, rwi->reset_reason = reason; list_add_tail(&rwi->list, &adapter->rwi_list); spin_unlock_irqrestore(&adapter->rwi_lock, flags); - adapter->resetting = true; netdev_dbg(adapter->netdev, "Scheduling reset (reason %d)\n", reason); schedule_work(&adapter->ibmvnic_reset); @@ -2207,7 +2225,7 @@ restart_poll: u16 offset; u8 flags = 0; - if (unlikely(adapter->resetting && + if (unlikely(test_bit(0, &adapter->resetting) && adapter->reset_reason != VNIC_RESET_NON_FATAL)) { enable_scrq_irq(adapter, adapter->rx_scrq[scrq_num]); napi_complete_done(napi, frames_processed); @@ -2858,7 +2876,7 @@ static int enable_scrq_irq(struct ibmvnic_adapter *adapter, return 1; } - if (adapter->resetting && + if (test_bit(0, &adapter->resetting) && adapter->reset_reason == VNIC_RESET_MOBILITY) { u64 val = (0xff000000) | scrq->hw_irq; @@ -3408,7 +3426,7 @@ static int ibmvnic_send_crq(struct ibmvnic_adapter *adapter, if (rc) { if (rc == H_CLOSED) { dev_warn(dev, "CRQ Queue closed\n"); - if (adapter->resetting) + if (test_bit(0, &adapter->resetting)) ibmvnic_reset(adapter, VNIC_RESET_FATAL); } @@ -4484,7 +4502,7 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq, case IBMVNIC_CRQ_XPORT_EVENT: netif_carrier_off(netdev); adapter->crq.active = false; - if (adapter->resetting) + if (test_bit(0, &adapter->resetting)) 
adapter->force_reset_recovery = true; if (gen_crq->cmd == IBMVNIC_PARTITION_MIGRATED) { dev_info(dev, "Migrated, re-enabling adapter\n"); @@ -4822,7 +4840,7 @@ static int ibmvnic_reset_init(struct ibmvnic_adapter *adapter) return -1; } - if (adapter->resetting && !adapter->wait_for_reset && + if (test_bit(0, &adapter->resetting) && !adapter->wait_for_reset && adapter->reset_reason != VNIC_RESET_MOBILITY) { if (adapter->req_rx_queues != old_num_rx_queues || adapter->req_tx_queues != old_num_tx_queues) { @@ -4934,10 +4952,12 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) spin_lock_init(&adapter->stats_lock); INIT_WORK(&adapter->ibmvnic_reset, __ibmvnic_reset); + INIT_DELAYED_WORK(&adapter->ibmvnic_delayed_reset, + __ibmvnic_delayed_reset); INIT_LIST_HEAD(&adapter->rwi_list); spin_lock_init(&adapter->rwi_lock); init_completion(&adapter->init_done); - adapter->resetting = false; + clear_bit(0, &adapter->resetting); do { rc = init_crq_queue(adapter); diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h index 9d3d35cc91d6..ebc39248b334 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.h +++ b/drivers/net/ethernet/ibm/ibmvnic.h @@ -39,6 +39,8 @@ #define IBMVNIC_MAX_LTB_SIZE ((1 << (MAX_ORDER - 1)) * PAGE_SIZE) #define IBMVNIC_BUFFER_HLEN 500 +#define IBMVNIC_RESET_DELAY 100 + static const char ibmvnic_priv_flags[][ETH_GSTRING_LEN] = { #define IBMVNIC_USE_SERVER_MAXES 0x1 "use-server-maxes" @@ -1077,7 +1079,8 @@ struct ibmvnic_adapter { spinlock_t rwi_lock; struct list_head rwi_list; struct work_struct ibmvnic_reset; - bool resetting; + struct delayed_work ibmvnic_delayed_reset; + unsigned long resetting; bool napi_enabled, from_passive_init; bool failover_pending; From 4c247de564f1ff614d11b3bb5313fb70d7b9598b Mon Sep 17 00:00:00 2001 From: Takeshi Misawa Date: Sun, 22 Sep 2019 16:45:31 +0900 Subject: [PATCH 212/334] ppp: Fix memory leak in ppp_write When ppp is closing, __ppp_xmit_process() failed to enqueue skb and skb allocated in ppp_write() is leaked. syzbot reported : BUG: memory leak unreferenced object 0xffff88812a17bc00 (size 224): comm "syz-executor673", pid 6952, jiffies 4294942888 (age 13.040s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000d110fff9>] kmemleak_alloc_recursive include/linux/kmemleak.h:43 [inline] [<00000000d110fff9>] slab_post_alloc_hook mm/slab.h:522 [inline] [<00000000d110fff9>] slab_alloc_node mm/slab.c:3262 [inline] [<00000000d110fff9>] kmem_cache_alloc_node+0x163/0x2f0 mm/slab.c:3574 [<000000002d616113>] __alloc_skb+0x6e/0x210 net/core/skbuff.c:197 [<000000000167fc45>] alloc_skb include/linux/skbuff.h:1055 [inline] [<000000000167fc45>] ppp_write+0x48/0x120 drivers/net/ppp/ppp_generic.c:502 [<000000009ab42c0b>] __vfs_write+0x43/0xa0 fs/read_write.c:494 [<00000000086b2e22>] vfs_write fs/read_write.c:558 [inline] [<00000000086b2e22>] vfs_write+0xee/0x210 fs/read_write.c:542 [<00000000a2b70ef9>] ksys_write+0x7c/0x130 fs/read_write.c:611 [<00000000ce5e0fdd>] __do_sys_write fs/read_write.c:623 [inline] [<00000000ce5e0fdd>] __se_sys_write fs/read_write.c:620 [inline] [<00000000ce5e0fdd>] __x64_sys_write+0x1e/0x30 fs/read_write.c:620 [<00000000d9d7b370>] do_syscall_64+0x76/0x1a0 arch/x86/entry/common.c:296 [<0000000006e6d506>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fix this by freeing skb, if ppp is closing. 
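To illustrate the ownership rule the fix restores (every path out of a consumer that takes ownership of a buffer must either queue it or free it), here is a minimal userspace sketch; the names are hypothetical and plain malloc/free stand in for skb handling:

        #include <stdlib.h>

        /* Hypothetical reduction of the leak: consume() takes ownership of
         * @b and must either queue it or free it on every path. Before the
         * fix, the closing path simply dropped the pointer. */
        struct buf { char data[64]; };

        struct chan {
                int closing;
                struct buf *queued;     /* one-slot stand-in for the tx queue */
        };

        static void consume(struct chan *ch, struct buf *b)
        {
                if (!ch->closing)
                        ch->queued = b; /* ownership passes to the queue */
                else
                        free(b);        /* previously missing: the leaked path */
        }

        int main(void)
        {
                struct chan ch = { .closing = 1, .queued = NULL };

                consume(&ch, malloc(sizeof(struct buf)));       /* freed, not leaked */
                return 0;
        }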
Fixes: 6d066734e9f0 ("ppp: avoid loop in xmit recursion detection code") Reported-and-tested-by: syzbot+d9c8bf24e56416d7ce2c@syzkaller.appspotmail.com Signed-off-by: Takeshi Misawa Reviewed-by: Guillaume Nault Tested-by: Guillaume Nault Signed-off-by: David S. Miller --- drivers/net/ppp/ppp_generic.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index a30e41a56085..9a1b006904a7 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1415,6 +1415,8 @@ static void __ppp_xmit_process(struct ppp *ppp, struct sk_buff *skb) netif_wake_queue(ppp->dev); else netif_stop_queue(ppp->dev); + } else { + kfree_skb(skb); } ppp_xmit_unlock(ppp); } From 5c94ad1793f1c8743388f329d55a61e5001dc58e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 22 Sep 2019 13:42:16 +0200 Subject: [PATCH 213/334] atm: he: clean up an indentation issue There is a statement that is indented one level too many; remove the extraneous tab. Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/atm/he.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/atm/he.c b/drivers/atm/he.c index 70b00ae4ec38..8af793f5e811 100644 --- a/drivers/atm/he.c +++ b/drivers/atm/he.c @@ -1690,7 +1690,7 @@ he_service_rbrq(struct he_dev *he_dev, int group) if (RBRQ_HBUF_ERR(he_dev->rbrq_head)) { hprintk("HBUF_ERR! (cid 0x%x)\n", cid); - atomic_inc(&vcc->stats->rx_drop); + atomic_inc(&vcc->stats->rx_drop); goto return_host_buffers; } From 9f5c44cf61a7cf29d85d2c0d2065ab2d62764a41 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Mon, 23 Sep 2019 14:16:03 +0800 Subject: [PATCH 214/334] gianfar: Make reset_gfar static Fix sparse warning: drivers/net/ethernet/freescale/gianfar.c:2070:6: warning: symbol 'reset_gfar' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/gianfar.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index 24bf7f68375f..51ad86417cb1 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -2067,7 +2067,7 @@ static int gfar_change_mtu(struct net_device *dev, int new_mtu) return 0; } -void reset_gfar(struct net_device *ndev) +static void reset_gfar(struct net_device *ndev) { struct gfar_private *priv = netdev_priv(ndev); From b0ce902febef24f917c33d5a5982030dac53141d Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Mon, 23 Sep 2019 09:49:08 +0200 Subject: [PATCH 215/334] net: stmmac: selftests: Flow Control test can also run with ASYM Pause The Flow Control selftest is also available with ASYM Pause. Let's add this check to the test and fix potential false-positive failures. Fixes: 091810dbded9 ("net: stmmac: Introduce selftests support") Signed-off-by: Jose Abreu Signed-off-by: David S.
Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c index 9c8d210b2d6a..5f66f6161629 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c @@ -670,7 +670,7 @@ static int stmmac_test_flowctrl(struct stmmac_priv *priv) unsigned int pkt_count; int i, ret = 0; - if (!phydev || !phydev->pause) + if (!phydev || (!phydev->pause && !phydev->asym_pause)) return -EOPNOTSUPP; tpriv = kzalloc(sizeof(*tpriv), GFP_KERNEL); From 99dcb8432af062fffd5eb81692966a0de9ccc072 Mon Sep 17 00:00:00 2001 From: Shubhrajyoti Datta Date: Mon, 23 Sep 2019 14:03:51 +0530 Subject: [PATCH 216/334] net: macb: Remove dead code macb_64b_desc is always called when HW_DMA_CAP_64B is defined. So the return NULL can never be reached. Remove the dead code. Signed-off-by: Shubhrajyoti Datta Reviewed-by: Claudiu Beznea Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb_main.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 35b59b5edf0f..8e8d557901a9 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -165,9 +165,8 @@ static unsigned int macb_adj_dma_desc_idx(struct macb *bp, unsigned int desc_idx #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT static struct macb_dma_desc_64 *macb_64b_desc(struct macb *bp, struct macb_dma_desc *desc) { - if (bp->hw_dma_cap & HW_DMA_CAP_64B) - return (struct macb_dma_desc_64 *)((void *)desc + sizeof(struct macb_dma_desc)); - return NULL; + return (struct macb_dma_desc_64 *)((void *)desc + + sizeof(struct macb_dma_desc)); } #endif From 1fac4a54374f7ef385938f3c6cf7649c0fe4f6cd Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 23 Sep 2019 14:56:14 +0800 Subject: [PATCH 217/334] btrfs: relocation: fix use-after-free on dead relocation roots [BUG] One user reported a reproducible KASAN report about use-after-free: BTRFS info (device sdi1): balance: start -dvrange=1256811659264..1256811659265 BTRFS info (device sdi1): relocating block group 1256811659264 flags data|raid0 ================================================================== BUG: KASAN: use-after-free in btrfs_init_reloc_root+0x2cd/0x340 [btrfs] Write of size 8 at addr ffff88856f671710 by task kworker/u24:10/261579 CPU: 2 PID: 261579 Comm: kworker/u24:10 Tainted: P OE 5.2.11-arch1-1-kasan #4 Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./X99 Extreme4, BIOS P3.80 04/06/2018 Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs] Call Trace: dump_stack+0x7b/0xba print_address_description+0x6c/0x22e ? btrfs_init_reloc_root+0x2cd/0x340 [btrfs] __kasan_report.cold+0x1b/0x3b ? btrfs_init_reloc_root+0x2cd/0x340 [btrfs] kasan_report+0x12/0x17 __asan_report_store8_noabort+0x17/0x20 btrfs_init_reloc_root+0x2cd/0x340 [btrfs] record_root_in_trans+0x2a0/0x370 [btrfs] btrfs_record_root_in_trans+0xf4/0x140 [btrfs] start_transaction+0x1ab/0xe90 [btrfs] btrfs_join_transaction+0x1d/0x20 [btrfs] btrfs_finish_ordered_io+0x7bf/0x18a0 [btrfs] ? lock_repin_lock+0x400/0x400 ? __kmem_cache_shutdown.cold+0x140/0x1ad ? btrfs_unlink_subvol+0x9b0/0x9b0 [btrfs] finish_ordered_fn+0x15/0x20 [btrfs] normal_work_helper+0x1bd/0xca0 [btrfs] ? process_one_work+0x819/0x1720 ? 
kasan_check_read+0x11/0x20 btrfs_endio_write_helper+0x12/0x20 [btrfs] process_one_work+0x8c9/0x1720 ? pwq_dec_nr_in_flight+0x2f0/0x2f0 ? worker_thread+0x1d9/0x1030 worker_thread+0x98/0x1030 kthread+0x2bb/0x3b0 ? process_one_work+0x1720/0x1720 ? kthread_park+0x120/0x120 ret_from_fork+0x35/0x40 Allocated by task 369692: __kasan_kmalloc.part.0+0x44/0xc0 __kasan_kmalloc.constprop.0+0xba/0xc0 kasan_kmalloc+0x9/0x10 kmem_cache_alloc_trace+0x138/0x260 btrfs_read_tree_root+0x92/0x360 [btrfs] btrfs_read_fs_root+0x10/0xb0 [btrfs] create_reloc_root+0x47d/0xa10 [btrfs] btrfs_init_reloc_root+0x1e2/0x340 [btrfs] record_root_in_trans+0x2a0/0x370 [btrfs] btrfs_record_root_in_trans+0xf4/0x140 [btrfs] start_transaction+0x1ab/0xe90 [btrfs] btrfs_start_transaction+0x1e/0x20 [btrfs] __btrfs_prealloc_file_range+0x1c2/0xa00 [btrfs] btrfs_prealloc_file_range+0x13/0x20 [btrfs] prealloc_file_extent_cluster+0x29f/0x570 [btrfs] relocate_file_extent_cluster+0x193/0xc30 [btrfs] relocate_data_extent+0x1f8/0x490 [btrfs] relocate_block_group+0x600/0x1060 [btrfs] btrfs_relocate_block_group+0x3a0/0xa00 [btrfs] btrfs_relocate_chunk+0x9e/0x180 [btrfs] btrfs_balance+0x14e4/0x2fc0 [btrfs] btrfs_ioctl_balance+0x47f/0x640 [btrfs] btrfs_ioctl+0x119d/0x8380 [btrfs] do_vfs_ioctl+0x9f5/0x1060 ksys_ioctl+0x67/0x90 __x64_sys_ioctl+0x73/0xb0 do_syscall_64+0xa5/0x370 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 369692: __kasan_slab_free+0x14f/0x210 kasan_slab_free+0xe/0x10 kfree+0xd8/0x270 btrfs_drop_snapshot+0x154c/0x1eb0 [btrfs] clean_dirty_subvols+0x227/0x340 [btrfs] relocate_block_group+0x972/0x1060 [btrfs] btrfs_relocate_block_group+0x3a0/0xa00 [btrfs] btrfs_relocate_chunk+0x9e/0x180 [btrfs] btrfs_balance+0x14e4/0x2fc0 [btrfs] btrfs_ioctl_balance+0x47f/0x640 [btrfs] btrfs_ioctl+0x119d/0x8380 [btrfs] do_vfs_ioctl+0x9f5/0x1060 ksys_ioctl+0x67/0x90 __x64_sys_ioctl+0x73/0xb0 do_syscall_64+0xa5/0x370 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The buggy address belongs to the object at ffff88856f671100 which belongs to the cache kmalloc-4k of size 4096 The buggy address is located 1552 bytes inside of 4096-byte region [ffff88856f671100, ffff88856f672100) The buggy address belongs to the page: page:ffffea0015bd9c00 refcount:1 mapcount:0 mapping:ffff88864400e600 index:0x0 compound_mapcount: 0 flags: 0x2ffff0000010200(slab|head) raw: 02ffff0000010200 dead000000000100 dead000000000200 ffff88864400e600 raw: 0000000000000000 0000000000070007 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88856f671600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88856f671680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff88856f671700: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88856f671780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88856f671800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== BTRFS info (device sdi1): 1 enospc errors during balance BTRFS info (device sdi1): balance: ended with status: -28 [CAUSE] The problem happens when finish_ordered_io() gets called with balance still running, while the reloc root of that subvolume is already dead. (The tree swap is already done, but the tree is not yet deleted, to allow for possible qgroup usage.) That means root->reloc_root still exists, but that reloc_root can be under btrfs_drop_snapshot(), thus we shouldn't access it.
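Reduced to its essence, the hazard and the flag-based guard the fix introduces look like this (a hypothetical single-threaded sketch; the real locking and ordering are elided):

        #include <stdio.h>
        #include <stdlib.h>

        /* Hypothetical reduction: the drop path sets a dead flag before the
         * free and clears the shared pointer only afterwards. A reader that
         * tests only the pointer can dereference the freed object; testing
         * the dead flag first (the fix) cannot. */
        struct reloc_root_like { long last_trans; };

        static struct reloc_root_like *reloc_root;
        static int dead_reloc_tree;

        static void drop_begin(void)
        {
                dead_reloc_tree = 1;    /* announced before the free */
                free(reloc_root);       /* reloc_root is now dangling */
        }

        static void reader(void)
        {
                if (dead_reloc_tree)    /* the added early return */
                        return;
                if (reloc_root)
                        reloc_root->last_trans = 42;    /* would be a UAF */
        }

        int main(void)
        {
                reloc_root = calloc(1, sizeof(*reloc_root));
                drop_begin();
                reader();               /* safe only because of the flag */
                return 0;
        }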
The following race could cause the use-after-free problem:

               CPU1                     |                CPU2
--------------------------------------------------------------------------
                                        | relocate_block_group()
                                        | |- unset_reloc_control(rc)
                                        | |- btrfs_commit_transaction()
btrfs_finish_ordered_io()               | |- clean_dirty_subvols()
|- btrfs_join_transaction()             |    |
   |- record_root_in_trans()            |    |
      |- btrfs_init_reloc_root()        |    |
         |- if (root->reloc_root)       |    |
         |                              |    |- root->reloc_root = NULL
         |                              |    |- btrfs_drop_snapshot(reloc_root);
         |- reloc_root->last_trans      |
                = trans->transid        |
           ^^^^^^^^^^^^^^^^^^^^^^       |
           Use after free               |

[FIX] Fix it by the following modifications:
- Test if the root has a dead reloc tree before accessing root->reloc_root. If the root has BTRFS_ROOT_DEAD_RELOC_TREE set, then we don't need to create or update root->reloc_root.
- Clear the BTRFS_ROOT_DEAD_RELOC_TREE flag only after we have fully dropped the reloc tree. This cooperates with the above modification: as long as BTRFS_ROOT_DEAD_RELOC_TREE is still set, we won't try to re-create the reloc tree at record_root_in_trans().
Reported-by: Cebtenzzre Fixes: d2311e698578 ("btrfs: relocation: Delay reloc tree deletion after merge_reloc_roots") CC: stable@vger.kernel.org # 5.1+ Reviewed-by: Josef Bacik Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2f0e25afa486..00504657b602 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1435,6 +1435,13 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, int clear_rsv = 0; int ret; + /* + * The subvolume has reloc tree but the swap is finished, no need to + * create/update the dead reloc tree + */ + if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)) + return 0; + if (root->reloc_root) { reloc_root = root->reloc_root; reloc_root->last_trans = trans->transid; @@ -2187,7 +2194,6 @@ static int clean_dirty_subvols(struct reloc_control *rc) /* Merged subvolume, cleanup its reloc root */ struct btrfs_root *reloc_root = root->reloc_root; - clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); list_del_init(&root->reloc_dirty_list); root->reloc_root = NULL; if (reloc_root) { @@ -2196,6 +2202,7 @@ static int clean_dirty_subvols(struct reloc_control *rc) if (ret2 < 0 && !ret) ret = ret2; } + clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); btrfs_put_fs_root(root); } else { /* Orphan reloc tree, just clean it up */ From fab273595507a9ec7035df6d5512a955d80a80ba Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 25 Sep 2019 10:13:27 +0800 Subject: [PATCH 218/334] btrfs: Fix a regression which we can't convert to SINGLE profile [BUG] With v5.3 kernel, we can't convert to SINGLE profile: # btrfs balance start -f -dconvert=single $mnt ERROR: error during balancing '/mnt/btrfs': Invalid argument # dmesg -t | tail validate_convert_profile: data profile=0x1000000000000 allowed=0x20 is_valid=1 final=0x1000000000000 ret=1 BTRFS error (device dm-3): balance: invalid convert data profile single [CAUSE] With the extra debug output added, it shows that the @allowed bit is lacking the special in-memory only SINGLE profile bit. Thus we fail at that (profile & ~allowed) check. This regression is caused by commit 081db89b13cb ("btrfs: use raid_attr to get allowed profiles for balance conversion") and the fact that we don't use any bit to indicate SINGLE profile on-disk, but use a special in-memory-only bit to help distinguish different profiles.
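To make the failing check concrete, here is a small sketch of the validation logic; validate_convert_profile() is simplified to its core test and the setup in main() is hypothetical:

        #include <stdio.h>
        #include <stdint.h>

        /* SINGLE has no on-disk profile bit, only this special in-memory bit
         * (1ULL << 48, matching profile=0x1000000000000 in the dmesg output
         * above). */
        #define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)

        /* Simplified core of validate_convert_profile(): reject a target
         * profile that has any bit outside @allowed. */
        static int profile_is_valid(uint64_t profile, uint64_t allowed)
        {
                return (profile & ~allowed) == 0;
        }

        int main(void)
        {
                uint64_t allowed = 0;   /* before the fix: no SINGLE bit */

                printf("%d\n", profile_is_valid(BTRFS_AVAIL_ALLOC_BIT_SINGLE, allowed));
                allowed |= BTRFS_AVAIL_ALLOC_BIT_SINGLE;        /* the fix */
                printf("%d\n", profile_is_valid(BTRFS_AVAIL_ALLOC_BIT_SINGLE, allowed));
                return 0;       /* prints 0, then 1 */
        }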
[FIX] Add BTRFS_AVAIL_ALLOC_BIT_SINGLE to @allowed, so the code behaves the same as it did before and the regression is fixed. Reported-by: Chris Murphy Fixes: 081db89b13cb ("btrfs: use raid_attr to get allowed profiles for balance conversion") CC: stable@vger.kernel.org # 5.3+ Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a324480bc88b..cdd7af424033 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4063,7 +4063,13 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, } num_devices = btrfs_num_devices(fs_info); - allowed = 0; + + /* + * SINGLE profile on-disk has no profile bit, but in-memory we have a + * special bit for it, to make it easier to distinguish. Thus we need + * to set it manually, or balance would refuse the profile. + */ + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) if (num_devices >= btrfs_raid_array[i].devs_min) allowed |= btrfs_raid_array[i].bg_flag; From e41f9efb85d38d95744b9f35b9903109032b93d4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 25 Sep 2019 14:09:30 +0100 Subject: [PATCH 219/334] sunrpc: clean up indentation issue There are statements that are indented incorrectly; remove the extraneous spacing. Signed-off-by: Colin Ian King Signed-off-by: J. Bruce Fields --- net/sunrpc/svc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 220b79988000..d11b70552c33 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1233,8 +1233,8 @@ svc_generic_init_request(struct svc_rqst *rqstp, if (rqstp->rq_vers >= progp->pg_nvers ) goto err_bad_vers; - versp = progp->pg_vers[rqstp->rq_vers]; - if (!versp) + versp = progp->pg_vers[rqstp->rq_vers]; + if (!versp) goto err_bad_vers; /* From 3fbd7ee285b2bbc6eebd15a3c8786d9776a402a8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 14 Sep 2019 07:33:34 -0500 Subject: [PATCH 220/334] tasks: Add a count of task RCU users Add a count of the number of RCU users (currently 1) of the task struct so that we can later add the scheduler case and get rid of the very subtle task_rcu_dereference(), and just use rcu_dereference(). As suggested by Oleg, have the count overlap rcu_head so that no additional space in task_struct is required. Inspired-by: Linus Torvalds Inspired-by: Oleg Nesterov Signed-off-by: Eric W. Biederman Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Davidlohr Bueso Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/87woebdplt.fsf_-_@x220.int.ebiederm.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 ++++- include/linux/sched/task.h | 1 + kernel/exit.c | 7 ++++++- kernel/fork.c | 7 +++---- 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index e2e91960d79f..8e43e54a02c7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1147,7 +1147,10 @@ struct task_struct { struct tlbflush_unmap_batch tlb_ubc; - struct rcu_head rcu; + union { + refcount_t rcu_users; + struct rcu_head rcu; + }; /* Cache last used pipe for splice(): */ struct pipe_inode_info *splice_pipe; diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 3d90ed8f75f0..153a683646ac 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -120,6 +120,7 @@ static inline void put_task_struct(struct task_struct *t) } struct task_struct *task_rcu_dereference(struct task_struct **ptask); +void put_task_struct_rcu_user(struct task_struct *task); #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT extern int arch_task_struct_size __read_mostly; diff --git a/kernel/exit.c b/kernel/exit.c index 22ab6a4bdc51..3bcaec2ea3ba 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -182,6 +182,11 @@ static void delayed_put_task_struct(struct rcu_head *rhp) put_task_struct(tsk); } +void put_task_struct_rcu_user(struct task_struct *task) +{ + if (refcount_dec_and_test(&task->rcu_users)) + call_rcu(&task->rcu, delayed_put_task_struct); +} void release_task(struct task_struct *p) { @@ -222,7 +227,7 @@ repeat: write_unlock_irq(&tasklist_lock); release_thread(p); - call_rcu(&p->rcu, delayed_put_task_struct); + put_task_struct_rcu_user(p); p = leader; if (unlikely(zap_leader)) diff --git a/kernel/fork.c b/kernel/fork.c index 1d1cd06edbc1..7eefe338d7a2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -902,10 +902,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (orig->cpus_ptr == &orig->cpus_mask) tsk->cpus_ptr = &tsk->cpus_mask; - /* - * One for us, one for whoever does the "release_task()" (usually - * parent) - */ + /* One for the user space visible state that goes away when reaped. */ + refcount_set(&tsk->rcu_users, 1); + /* One for the rcu users, and one for the scheduler */ refcount_set(&tsk->usage, 2); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; From 0ff7b2cfbae36ebcd216c6a5ad7f8534eebeaee2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 14 Sep 2019 07:33:58 -0500 Subject: [PATCH 221/334] tasks, sched/core: Ensure tasks are available for a grace period after leaving the runqueue In the ordinary case today the RCU grace period for a task_struct is triggered when another process waits for its zombie and causes the kernel to call release_task(). As the waiting task has to receive a signal and then act upon it before this happens, typically this will occur after the original task has been removed from the runqueue. Unfortunately in some cases such as self-reaping tasks it can be shown that release_task() will be called starting the grace period for task_struct long before the task leaves the runqueue. Therefore use put_task_struct_rcu_user() in finish_task_switch() to guarantee that there is an RCU lifetime after the task leaves the runqueue.
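The shape of the scheme can be sketched as a hypothetical userspace analogy, with C11 atomics and a direct free standing in for refcount_t and call_rcu():

        #include <stdatomic.h>
        #include <stdlib.h>

        /* Hypothetical analogy of the rcu_users scheme: a counter of logical
         * users where the last put triggers the deferred free. In the kernel
         * the counter overlaps rcu_head via a union, so it costs no space. */
        struct task_like {
                atomic_int rcu_users;
                /* ... payload ... */
        };

        static void delayed_put(struct task_like *t)
        {
                free(t);        /* stands in for call_rcu() + grace period */
        }

        static void put_rcu_user(struct task_like *t)
        {
                if (atomic_fetch_sub(&t->rcu_users, 1) == 1)
                        delayed_put(t);
        }

        int main(void)
        {
                struct task_like *t = malloc(sizeof(*t));

                atomic_init(&t->rcu_users, 1);  /* one user, as in this patch */
                put_rcu_user(t);                /* the last put frees */
                return 0;
        }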
Besides the change in the start of the RCU grace period for the task_struct, this change may delay when perf_event_delayed_put and trace_sched_process_free are called. The function perf_event_delayed_put boils down to just a WARN_ON for cases that I assume never happen. So I don't see any problem with delaying it. The function trace_sched_process_free is a tracepoint and thus visible to user space. Occasionally userspace has the strangest dependencies so this has a minuscule chance of causing a regression. This change only changes the timing of when the tracepoint is called. The change in timing arguably gives userspace a more accurate picture of what is going on. So I don't expect there to be a regression. In the case where a task self reaps we are pretty much guaranteed that the RCU grace period is delayed. So we should get quite a bit of coverage of this worst case for the change in a normal threaded workload. So I expect any issues to turn up quickly or not at all. I have lightly tested this change and everything appears to work fine. Inspired-by: Linus Torvalds Inspired-by: Oleg Nesterov Signed-off-by: Eric W. Biederman Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Davidlohr Bueso Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/87r24jdpl5.fsf_-_@x220.int.ebiederm.org Signed-off-by: Ingo Molnar --- kernel/fork.c | 11 +++++++---- kernel/sched/core.c | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 7eefe338d7a2..d6e552552e52 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -902,10 +902,13 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (orig->cpus_ptr == &orig->cpus_mask) tsk->cpus_ptr = &tsk->cpus_mask; - /* One for the user space visible state that goes away when reaped. */ - refcount_set(&tsk->rcu_users, 1); - /* One for the rcu users, and one for the scheduler */ - refcount_set(&tsk->usage, 2); + /* + * One for the user space visible state that goes away when reaped. + * One for the scheduler. + */ + refcount_set(&tsk->rcu_users, 2); + /* One for the rcu users */ + refcount_set(&tsk->usage, 1); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 06961b997ed6..5e5fefbd4cc7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3254,7 +3254,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) /* Task is done with its stack. */ put_task_stack(prev); - put_task_struct(prev); + put_task_struct_rcu_user(prev); } tick_nohz_task_switch(); From 154abafc68bfb7c2ef2ad5308a3b2de8968c3f61 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 14 Sep 2019 07:34:30 -0500 Subject: [PATCH 222/334] tasks, sched/core: With a grace period after finish_task_switch(), remove unnecessary code Remove workarounds that were written before there was a grace period after tasks left the runqueue in finish_task_switch(). In particular, now that tasks exiting the runqueue experience an RCU grace period, none of the work performed by task_rcu_dereference() except the rcu_dereference() is necessary, so replace task_rcu_dereference() with rcu_dereference(). Remove the code in rcuwait_wait_event() that checks to ensure the current task has not exited.
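For reference, a hypothetical userspace reduction of the simplified wait/wake protocol, with C11 atomics in place of rcu_assign_pointer()/rcu_dereference() and the actual blocking elided:

        #include <stdatomic.h>
        #include <stdio.h>

        /* Hypothetical sketch: the waiter publishes itself, the waker loads
         * the pointer and wakes whatever it sees. No "has the task exited?"
         * re-check is needed once reclamation of the waiter is guaranteed to
         * be deferred past any concurrent reader (the new grace period). */
        struct waiter { const char *name; };

        static _Atomic(struct waiter *) w_task;

        static void rcuwait_wait_begin(struct waiter *w)
        {
                atomic_store_explicit(&w_task, w, memory_order_release);
                /* ... sleep until the condition holds, then clear w_task ... */
        }

        static void rcuwait_wake(void)
        {
                struct waiter *w =
                        atomic_load_explicit(&w_task, memory_order_acquire);

                if (w)
                        printf("waking %s\n", w->name); /* wake_up_process() analog */
        }

        int main(void)
        {
                struct waiter me = { "waiter" };

                rcuwait_wait_begin(&me);
                rcuwait_wake();
                return 0;
        }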
That check is no longer necessary, as it is guaranteed that any running task will experience an RCU grace period after it leaves the runqueue. Remove the comment in rcuwait_wake_up() as it is no longer relevant. Ref: 8f95c90ceb54 ("sched/wait, RCU: Introduce rcuwait machinery") Ref: 150593bf8693 ("sched/api: Introduce task_rcu_dereference() and try_get_task_struct()") Signed-off-by: Eric W. Biederman Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Davidlohr Bueso Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/87lfurdpk9.fsf_-_@x220.int.ebiederm.org Signed-off-by: Ingo Molnar --- include/linux/rcuwait.h | 20 +++--------- include/linux/sched/task.h | 1 - kernel/exit.c | 67 -------------------------------------- kernel/sched/fair.c | 2 +- kernel/sched/membarrier.c | 4 +-- 5 files changed, 7 insertions(+), 87 deletions(-) diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h index 563290fc194f..75c97e4bbc57 100644 --- a/include/linux/rcuwait.h +++ b/include/linux/rcuwait.h @@ -6,16 +6,11 @@ /* * rcuwait provides a way of blocking and waking up a single - * task in an rcu-safe manner; where it is forbidden to use - * after exit_notify(). task_struct is not properly rcu protected, - * unless dealing with rcu-aware lists, ie: find_task_by_*(). + * task in an rcu-safe manner. * - * Alternatively we have task_rcu_dereference(), but the return - * semantics have different implications which would break the - * wakeup side. The only time @task is non-nil is when a user is - * blocked (or checking if it needs to) on a condition, and reset - * as soon as we know that the condition has succeeded and are - * awoken. + * The only time @task is non-nil is when a user is blocked (or + * checking if it needs to) on a condition, and reset as soon as we + * know that the condition has succeeded and are awoken. */ struct rcuwait { struct task_struct __rcu *task; @@ -37,13 +32,6 @@ extern void rcuwait_wake_up(struct rcuwait *w); */ #define rcuwait_wait_event(w, condition) \ ({ \ - /* \ - * Complain if we are called after do_exit()/exit_notify(), \ - * as we cannot rely on the rcu critical region for the \ - * wakeup side. \ - */ \ - WARN_ON(current->exit_state); \ - \ rcu_assign_pointer((w)->task, current); \ for (;;) { \ /* \ diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 153a683646ac..4b1c3b664f51 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -119,7 +119,6 @@ static inline void put_task_struct(struct task_struct *t) __put_task_struct(t); } -struct task_struct *task_rcu_dereference(struct task_struct **ptask); void put_task_struct_rcu_user(struct task_struct *task); #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT diff --git a/kernel/exit.c b/kernel/exit.c index 3bcaec2ea3ba..a46a50d67002 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -234,69 +234,6 @@ repeat: goto repeat; } -/* - * Note that if this function returns a valid task_struct pointer (!NULL) - * task->usage must remain >0 for the duration of the RCU critical section. - */ -struct task_struct *task_rcu_dereference(struct task_struct **ptask) -{ - struct sighand_struct *sighand; - struct task_struct *task; - - /* - * We need to verify that release_task() was not called and thus - * delayed_put_task_struct() can't run and drop the last reference - * before rcu_read_unlock().
We check task->sighand != NULL, - * but we can read the already freed and reused memory. */ -retry: - task = rcu_dereference(*ptask); - if (!task) - return NULL; - - probe_kernel_address(&task->sighand, sighand); - - /* - * Pairs with atomic_dec_and_test() in put_task_struct(). If this task - * was already freed we can not miss the preceding update of this - * pointer. - */ - smp_rmb(); - if (unlikely(task != READ_ONCE(*ptask))) - goto retry; - - /* - * We've re-checked that "task == *ptask", now we have two different - * cases: - * - * 1. This is actually the same task/task_struct. In this case - * sighand != NULL tells us it is still alive. - * - * 2. This is another task which got the same memory for task_struct. - * We can't know this of course, and we can not trust - * sighand != NULL. - * - * In this case we actually return a random value, but this is - * correct. - * - * If we return NULL - we can pretend that we actually noticed that - * *ptask was updated when the previous task has exited. Or pretend - * that probe_slab_address(&sighand) reads NULL. - * - * If we return the new task (because sighand is not NULL for any - * reason) - this is fine too. This (new) task can't go away before - * another gp pass. - * - * And note: We could even eliminate the false positive if re-read - * task->sighand once again to avoid the falsely NULL. But this case - * is very unlikely so we don't care. - */ - if (!sighand) - return NULL; - - return task; -} void rcuwait_wake_up(struct rcuwait *w) { struct task_struct *task; @@ -316,10 +253,6 @@ void rcuwait_wake_up(struct rcuwait *w) */ smp_mb(); /* (B) */ - /* - * Avoid using task_rcu_dereference() magic as long as we are careful, - * see comment in rcuwait_wait_event() regarding ->exit_state. - */ task = rcu_dereference(w->task); if (task) wake_up_process(task); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3101c662426d..5bc23996ffae 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1602,7 +1602,7 @@ static void task_numa_compare(struct task_numa_env *env, return; rcu_read_lock(); - cur = task_rcu_dereference(&dst_rq->curr); + cur = rcu_dereference(dst_rq->curr); if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) cur = NULL; diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index aa8d75804108..b14250a11608 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -71,7 +71,7 @@ static int membarrier_global_expedited(void) continue; rcu_read_lock(); - p = task_rcu_dereference(&cpu_rq(cpu)->curr); + p = rcu_dereference(cpu_rq(cpu)->curr); if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & MEMBARRIER_STATE_GLOBAL_EXPEDITED)) { if (!fallback) @@ -150,7 +150,7 @@ static int membarrier_private_expedited(int flags) if (cpu == raw_smp_processor_id()) continue; rcu_read_lock(); - p = task_rcu_dereference(&cpu_rq(cpu)->curr); + p = rcu_dereference(cpu_rq(cpu)->curr); if (p && p->mm == current->mm) { if (!fallback) __cpumask_set_cpu(cpu, tmpmask); From 5311a98fef7d0dc2e8040ae0e18f5568d6d1dd5a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 14 Sep 2019 07:35:02 -0500 Subject: [PATCH 223/334] tasks, sched/core: RCUify the assignment of rq->curr The current task on the runqueue is read with rcu_dereference(). To obtain ordinary RCU semantics for an rcu_dereference() of rq->curr it needs to be paired with rcu_assign_pointer() of rq->curr, which provides the memory barrier necessary to order assignments to the task_struct and the assignment to rq->curr.
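In C11 terms, the pairing being discussed can be sketched as follows (hypothetical types; a release store stands in for rcu_assign_pointer() and an acquire load stands in, conservatively, for rcu_dereference()):

        #include <stdatomic.h>
        #include <stdio.h>

        struct task_like { int flags; };

        static _Atomic(struct task_like *) curr;

        static void publish(struct task_like *next)
        {
                next->flags = 0x1;      /* initialized first ... */
                /* ... then published; the release store orders the init
                 * before the pointer becomes visible. */
                atomic_store_explicit(&curr, next, memory_order_release);
        }

        static void observe(void)
        {
                struct task_like *t =
                        atomic_load_explicit(&curr, memory_order_acquire);

                if (t)
                        printf("flags=%d\n", t->flags); /* guaranteed to see 0x1 */
        }

        int main(void)
        {
                static struct task_like next;

                publish(&next);
                observe();
                return 0;
        }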
Unfortunately the assignment of rq->curr in __schedule is a hot path, and it has already been shown that additional barriers in that code will reduce the performance of the scheduler. So I will attempt to describe below why you can effectively have ordinary RCU semantics without any additional barriers. The assignment of rq->curr in init_idle is a slow path called once per cpu and that can use rcu_assign_pointer() without any concerns. As I write this there are effectively two users of rcu_dereference() on rq->curr. There is the membarrier code in kernel/sched/membarrier.c that only looks at "->mm" after the rcu_dereference(). Then there is task_numa_compare() in kernel/sched/fair.c. My best reading of the code shows that task_numa_compare() only accesses: "->flags", "->cpus_ptr", "->numa_group", "->numa_faults[]", "->total_numa_faults", and "->se.cfs_rq". The code in __schedule() essentially does: rq_lock(...); smp_mb__after_spinlock(); next = pick_next_task(...); rq->curr = next; context_switch(prev, next); At the start of the function the rq_lock/smp_mb__after_spinlock pair provides a full memory barrier. Further there is a full memory barrier in context_switch(). This means that any task that has already run and modified itself (the common case) has already seen two memory barriers before __schedule() runs and begins executing. A task that modifies itself then sees a third full memory barrier, paired with the rq_lock(). For a brand new task that is enqueued with wake_up_new_task() there are the memory barriers present from taking and releasing the pi_lock and the rq_lock as the process is enqueued, as well as the full memory barrier at the start of __schedule(), assuming __schedule() happens on the same cpu. This means that by the time we reach the assignment of rq->curr, except for values on the task struct modified in pick_next_task, the code has the same guarantees as if it used rcu_assign_pointer(). Reading through all of the implementations of pick_next_task it appears pick_next_task is limited to modifying the task_struct fields "->se", "->rt", "->dl". These fields are the sched_entity structures of the various schedulers. Further "->se.cfs_rq" is only changed in cgroup attach/move operations initiated by userspace. Unless I have missed something, this means that in practice the users of "rcu_dereference(rq->curr)" get normal RCU semantics of rcu_dereference() for the fields they care about, despite the assignment of rq->curr in __schedule() not using rcu_assign_pointer(). Signed-off-by: Eric W. Biederman Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Davidlohr Bueso Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lore.kernel.org/r/20190903200603.GW2349@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5e5fefbd4cc7..84c71160beb1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4033,7 +4033,11 @@ static void __sched notrace __schedule(bool preempt) if (likely(prev != next)) { rq->nr_switches++; - rq->curr = next; + /* + * RCU users of rcu_dereference(rq->curr) may not see + * changes to task_struct made by pick_next_task().
+ */ + RCU_INIT_POINTER(rq->curr, next); /* * The membarrier system call requires each architecture * to have a full memory barrier after updating @@ -6060,7 +6064,8 @@ void init_idle(struct task_struct *idle, int cpu) __set_task_cpu(idle, cpu); rcu_read_unlock(); - rq->curr = rq->idle = idle; + rq->idle = idle; + rcu_assign_pointer(rq->curr, idle); idle->on_rq = TASK_ON_RQ_QUEUED; #ifdef CONFIG_SMP idle->on_cpu = 1; From fc0d77387cb5ae883fd774fc559e056a8dde024c Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 19 Sep 2019 13:36:59 -0400 Subject: [PATCH 224/334] sched/membarrier: Fix private expedited registration check Fix a logic flaw in the way membarrier_register_private_expedited() handles ready state checks for private expedited sync core and private expedited registrations. If a private expedited membarrier registration is first performed, and then a private expedited sync_core registration is performed, the ready state check will skip the second registration when it really should not. Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Eric W. Biederman Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190919173705.2181-2-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/sched/membarrier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index b14250a11608..d48b95fc4026 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -226,7 +226,7 @@ static int membarrier_register_private_expedited(int flags) * groups, which use the same mm. (CLONE_VM but not * CLONE_THREAD). */ - if (atomic_read(&mm->membarrier_state) & state) + if ((atomic_read(&mm->membarrier_state) & state) == state) return 0; atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state); if (flags & MEMBARRIER_FLAG_SYNC_CORE) From 09554009c0cad4cb2223dd943c813c9257c6883a Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 19 Sep 2019 13:37:00 -0400 Subject: [PATCH 225/334] sched/membarrier: Remove redundant check Checking that the number of threads is 1 is redundant with checking mm_users == 1. No change in functionality intended. Suggested-by: Oleg Nesterov Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Eric W. Biederman Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190919173705.2181-3-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/sched/membarrier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index d48b95fc4026..7ccbd0e19626 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -186,7 +186,7 @@ static int membarrier_register_global_expedited(void) MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY) return 0; atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state); - if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) { + if (atomic_read(&mm->mm_users) == 1) { /* * For single mm user, single threaded process, we can * simply issue a memory barrier after setting @@ -232,7 +232,7 @@ static int membarrier_register_private_expedited(int flags) if (flags & MEMBARRIER_FLAG_SYNC_CORE) atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE, &mm->membarrier_state); - if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) { + if (atomic_read(&mm->mm_users) != 1) { /* * Ensure all future scheduler executions will observe the * new thread flag state for this process. From 2840cf02fae627860156737e83326df354ee4ec6 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 19 Sep 2019 13:37:01 -0400 Subject: [PATCH 226/334] sched/membarrier: Call sync_core only before usermode for same mm When the prev and next task's mm change, switch_mm() provides the core serializing guarantees before returning to usermode. The only case where an explicit core serialization is needed is when the scheduler keeps the same mm for prev and next. Suggested-by: Oleg Nesterov Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Eric W. Biederman Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190919173705.2181-4-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- include/linux/sched/mm.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 4a7944078cc3..8557ec664213 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -362,6 +362,8 @@ enum { static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { + if (current->mm != mm) + return; if (likely(!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE))) return; From 227a4aadc75ba22fcb6c4e1c078817b8cbaae4ce Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 19 Sep 2019 13:37:02 -0400 Subject: [PATCH 227/334] sched/membarrier: Fix p->mm->membarrier_state racy load The membarrier_state field is located within the mm_struct, which is not guaranteed to exist when used from runqueue-lock-free iteration on runqueues by the membarrier system call. Copy the membarrier_state from the mm_struct into the scheduler runqueue when the scheduler switches between mm. When registering membarrier for mm, after setting the registration bit in the mm membarrier state, issue a synchronize_rcu() to ensure the scheduler observes the change. 
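The caching scheme itself can be sketched as follows (hypothetical *_like types; a relaxed load stands in for the real code, which runs under the scheduler's own ordering guarantees):

        #include <stdatomic.h>

        /* Hypothetical reduction: the authoritative flags live in the shared
         * mm, and each runqueue keeps a plain per-CPU copy refreshed on every
         * switch to that mm, so runqueue-lock-free iteration never has to
         * dereference the mm. */
        struct mm_like { atomic_int membarrier_state; };
        struct rq_like { int membarrier_state; };       /* per-CPU cached copy */

        static void membarrier_switch_mm_like(struct rq_like *rq,
                                              struct mm_like *next_mm)
        {
                rq->membarrier_state =
                        atomic_load_explicit(&next_mm->membarrier_state,
                                             memory_order_relaxed);
        }

        int main(void)
        {
                struct mm_like mm;
                struct rq_like rq = { 0 };

                atomic_init(&mm.membarrier_state, 0x3);
                membarrier_switch_mm_like(&rq, &mm);    /* rq now mirrors mm */
                return rq.membarrier_state == 0x3 ? 0 : 1;
        }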
In order to take care of the case where a runqueue keeps executing the target mm without switching to another mm, iterate over each runqueue and issue an IPI to copy the membarrier_state from the mm_struct into each runqueue whose current task is using the mm whose state has just been modified. Move the mm membarrier_state field closer to pgd in mm_struct to use a cache line already touched by the scheduler switch_mm. The membarrier_execve() (now membarrier_exec_mmap) hook now needs to clear the runqueue's membarrier state in addition to clearing the mm membarrier state, so move its implementation into the scheduler membarrier code so it can access the runqueue structure. Add a memory barrier in membarrier_exec_mmap() prior to clearing the membarrier state, ensuring memory accesses executed prior to exec are not reordered with the stores clearing the membarrier state. As suggested by Linus, move all membarrier.c RCU read-side locks outside of the for-each-cpu loops. Suggested-by: Linus Torvalds Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Eric W. Biederman Cc: Kirill Tkhai Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190919173705.2181-5-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- fs/exec.c | 2 +- include/linux/mm_types.h | 14 ++- include/linux/sched/mm.h | 8 +- kernel/sched/core.c | 4 +- kernel/sched/membarrier.c | 175 ++++++++++++++++++++++++++++---------- kernel/sched/sched.h | 34 ++++++++ 6 files changed, 183 insertions(+), 54 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index f7f6a140856a..555e93c7dec8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1033,6 +1033,7 @@ static int exec_mmap(struct mm_struct *mm) } task_lock(tsk); active_mm = tsk->active_mm; + membarrier_exec_mmap(mm); tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); @@ -1825,7 +1826,6 @@ static int __do_execve_file(int fd, struct filename *filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; - membarrier_execve(current); rseq_execve(current); acct_update_integrals(current); task_numa_free(current, false); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6a7a1083b6fb..ec9bd3a6c827 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -383,6 +383,16 @@ struct mm_struct { unsigned long highest_vm_end; /* highest vma end address */ pgd_t * pgd; +#ifdef CONFIG_MEMBARRIER + /** + * @membarrier_state: Flags controlling membarrier behavior. + * + * This field is close to @pgd to hopefully fit in the same + * cache-line, which needs to be touched by switch_mm(). + */ + atomic_t membarrier_state; +#endif + /** + * @mm_users: The number of users including userspace.
* @@ -452,9 +462,7 @@ struct mm_struct { unsigned long flags; /* Must use atomic bitops to access */ struct core_state *core_state; /* coredumping support */ -#ifdef CONFIG_MEMBARRIER - atomic_t membarrier_state; -#endif + #ifdef CONFIG_AIO spinlock_t ioctx_lock; struct kioctx_table __rcu *ioctx_table; diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 8557ec664213..e6770012db18 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -370,10 +370,8 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) sync_core_before_usermode(); } -static inline void membarrier_execve(struct task_struct *t) -{ - atomic_set(&t->mm->membarrier_state, 0); -} +extern void membarrier_exec_mmap(struct mm_struct *mm); + #else #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS static inline void membarrier_arch_switch_mm(struct mm_struct *prev, @@ -382,7 +380,7 @@ static inline void membarrier_arch_switch_mm(struct mm_struct *prev, { } #endif -static inline void membarrier_execve(struct task_struct *t) +static inline void membarrier_exec_mmap(struct mm_struct *mm) { } static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 84c71160beb1..2d9a3947bef4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3358,15 +3358,15 @@ context_switch(struct rq *rq, struct task_struct *prev, else prev->active_mm = NULL; } else { // to user + membarrier_switch_mm(rq, prev->active_mm, next->mm); /* * sys_membarrier() requires an smp_mb() between setting - * rq->curr and returning to userspace. + * rq->curr / membarrier_switch_mm() and returning to userspace. * * The below provides this either through switch_mm(), or in * case 'prev->active_mm == next->mm' through * finish_task_switch()'s mmdrop(). */ - switch_mm_irqs_off(prev->active_mm, next->mm, next); if (!prev->mm) { // from kernel diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 7ccbd0e19626..070cf433bb9a 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -30,6 +30,39 @@ static void ipi_mb(void *info) smp_mb(); /* IPIs should be serializing but paranoid. */ } +static void ipi_sync_rq_state(void *info) +{ + struct mm_struct *mm = (struct mm_struct *) info; + + if (current->mm != mm) + return; + this_cpu_write(runqueues.membarrier_state, + atomic_read(&mm->membarrier_state)); + /* + * Issue a memory barrier after setting + * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to + * guarantee that no memory access following registration is reordered + * before registration. + */ + smp_mb(); +} + +void membarrier_exec_mmap(struct mm_struct *mm) +{ + /* + * Issue a memory barrier before clearing membarrier_state to + * guarantee that no memory access prior to exec is reordered after + * clearing this state. + */ + smp_mb(); + atomic_set(&mm->membarrier_state, 0); + /* + * Keep the runqueue membarrier_state in sync with this mm + * membarrier_state. 
+ */ + this_cpu_write(runqueues.membarrier_state, 0); +} + static int membarrier_global_expedited(void) { int cpu; @@ -56,6 +89,7 @@ static int membarrier_global_expedited(void) } cpus_read_lock(); + rcu_read_lock(); for_each_online_cpu(cpu) { struct task_struct *p; @@ -70,17 +104,25 @@ static int membarrier_global_expedited(void) if (cpu == raw_smp_processor_id()) continue; - rcu_read_lock(); + if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) & + MEMBARRIER_STATE_GLOBAL_EXPEDITED)) + continue; + + /* + * Skip the CPU if it runs a kernel thread. The scheduler + * leaves the prior task mm in place as an optimization when + * scheduling a kthread. + */ p = rcu_dereference(cpu_rq(cpu)->curr); - if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & - MEMBARRIER_STATE_GLOBAL_EXPEDITED)) { - if (!fallback) - __cpumask_set_cpu(cpu, tmpmask); - else - smp_call_function_single(cpu, ipi_mb, NULL, 1); - } - rcu_read_unlock(); + if (p->flags & PF_KTHREAD) + continue; + + if (!fallback) + __cpumask_set_cpu(cpu, tmpmask); + else + smp_call_function_single(cpu, ipi_mb, NULL, 1); } + rcu_read_unlock(); if (!fallback) { preempt_disable(); smp_call_function_many(tmpmask, ipi_mb, NULL, 1); @@ -136,6 +178,7 @@ static int membarrier_private_expedited(int flags) } cpus_read_lock(); + rcu_read_lock(); for_each_online_cpu(cpu) { struct task_struct *p; @@ -157,8 +200,8 @@ static int membarrier_private_expedited(int flags) else smp_call_function_single(cpu, ipi_mb, NULL, 1); } - rcu_read_unlock(); } + rcu_read_unlock(); if (!fallback) { preempt_disable(); smp_call_function_many(tmpmask, ipi_mb, NULL, 1); @@ -177,32 +220,78 @@ static int membarrier_private_expedited(int flags) return 0; } +static int sync_runqueues_membarrier_state(struct mm_struct *mm) +{ + int membarrier_state = atomic_read(&mm->membarrier_state); + cpumask_var_t tmpmask; + int cpu; + + if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) { + this_cpu_write(runqueues.membarrier_state, membarrier_state); + + /* + * For single mm user, we can simply issue a memory barrier + * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the + * mm and in the current runqueue to guarantee that no memory + * access following registration is reordered before + * registration. + */ + smp_mb(); + return 0; + } + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + /* + * For mm with multiple users, we need to ensure all future + * scheduler executions will observe @mm's new membarrier + * state. + */ + synchronize_rcu(); + + /* + * For each cpu runqueue, if the task's mm match @mm, ensure that all + * @mm's membarrier state set bits are also set in in the runqueue's + * membarrier state. This ensures that a runqueue scheduling + * between threads which are users of @mm has its membarrier state + * updated. 
+ */ + cpus_read_lock(); + rcu_read_lock(); + for_each_online_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct task_struct *p; + + p = rcu_dereference(&rq->curr); + if (p && p->mm == mm) + __cpumask_set_cpu(cpu, tmpmask); + } + rcu_read_unlock(); + + preempt_disable(); + smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1); + preempt_enable(); + + free_cpumask_var(tmpmask); + cpus_read_unlock(); + + return 0; +} + static int membarrier_register_global_expedited(void) { struct task_struct *p = current; struct mm_struct *mm = p->mm; + int ret; if (atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY) return 0; atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state); - if (atomic_read(&mm->mm_users) == 1) { - /* - * For single mm user, single threaded process, we can - * simply issue a memory barrier after setting - * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that - * no memory access following registration is reordered - * before registration. - */ - smp_mb(); - } else { - /* - * For multi-mm user threads, we need to ensure all - * future scheduler executions will observe the new - * thread flag state for this mm. - */ - synchronize_rcu(); - } + ret = sync_runqueues_membarrier_state(mm); + if (ret) + return ret; atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, &mm->membarrier_state); @@ -213,12 +302,15 @@ static int membarrier_register_private_expedited(int flags) { struct task_struct *p = current; struct mm_struct *mm = p->mm; - int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY; + int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, + set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED, + ret; if (flags & MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; - state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; + ready_state = + MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; } /* @@ -226,20 +318,15 @@ static int membarrier_register_private_expedited(int flags) * groups, which use the same mm. (CLONE_VM but not * CLONE_THREAD). */ - if ((atomic_read(&mm->membarrier_state) & state) == state) + if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state) return 0; - atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state); if (flags & MEMBARRIER_FLAG_SYNC_CORE) - atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE, - &mm->membarrier_state); - if (atomic_read(&mm->mm_users) != 1) { - /* - * Ensure all future scheduler executions will observe the - * new thread flag state for this process. - */ - synchronize_rcu(); - } - atomic_or(state, &mm->membarrier_state); + set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE; + atomic_or(set_state, &mm->membarrier_state); + ret = sync_runqueues_membarrier_state(mm); + if (ret) + return ret; + atomic_or(ready_state, &mm->membarrier_state); return 0; } @@ -253,8 +340,10 @@ static int membarrier_register_private_expedited(int flags) * command specified does not exist, not available on the running * kernel, or if the command argument is invalid, this system call * returns -EINVAL. For a given command, with flags argument set to 0, - * this system call is guaranteed to always return the same value until - * reboot. + * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to + * always return the same value until reboot. In addition, it can return + * -ENOMEM if there is not enough memory available to perform the system + * call. 
* * All memory accesses performed in program order from each targeted thread * is guaranteed to be ordered with respect to sys_membarrier(). If we use diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b3cb895d14a2..0db2c1b3361e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -911,6 +911,10 @@ struct rq { atomic_t nr_iowait; +#ifdef CONFIG_MEMBARRIER + int membarrier_state; +#endif + #ifdef CONFIG_SMP struct root_domain *rd; struct sched_domain __rcu *sd; @@ -2438,3 +2442,33 @@ static inline bool sched_energy_enabled(void) static inline bool sched_energy_enabled(void) { return false; } #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ + +#ifdef CONFIG_MEMBARRIER +/* + * The scheduler provides memory barriers required by membarrier between: + * - prior user-space memory accesses and store to rq->membarrier_state, + * - store to rq->membarrier_state and following user-space memory accesses. + * In the same way it provides those guarantees around store to rq->curr. + */ +static inline void membarrier_switch_mm(struct rq *rq, + struct mm_struct *prev_mm, + struct mm_struct *next_mm) +{ + int membarrier_state; + + if (prev_mm == next_mm) + return; + + membarrier_state = atomic_read(&next_mm->membarrier_state); + if (READ_ONCE(rq->membarrier_state) == membarrier_state) + return; + + WRITE_ONCE(rq->membarrier_state, membarrier_state); +} +#else +static inline void membarrier_switch_mm(struct rq *rq, + struct mm_struct *prev_mm, + struct mm_struct *next_mm) +{ +} +#endif From 19a4ff534bb09686f53800564cb977bad2177c00 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 19 Sep 2019 13:37:03 -0400 Subject: [PATCH 228/334] selftests, sched/membarrier: Add multi-threaded test membarrier commands cover very different code paths if they are in a single-threaded vs multi-threaded process. Therefore, exercise both scenarios in the kernel selftests to increase coverage of this selftest. Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Eric W. Biederman Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Shuah Khan Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190919173705.2181-6-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- tools/testing/selftests/membarrier/.gitignore | 3 +- tools/testing/selftests/membarrier/Makefile | 5 +- ...mbarrier_test.c => membarrier_test_impl.h} | 40 +++++----- .../membarrier/membarrier_test_multi_thread.c | 73 +++++++++++++++++++ .../membarrier_test_single_thread.c | 24 ++++++ 5 files changed, 124 insertions(+), 21 deletions(-) rename tools/testing/selftests/membarrier/{membarrier_test.c => membarrier_test_impl.h} (95%) create mode 100644 tools/testing/selftests/membarrier/membarrier_test_multi_thread.c create mode 100644 tools/testing/selftests/membarrier/membarrier_test_single_thread.c diff --git a/tools/testing/selftests/membarrier/.gitignore b/tools/testing/selftests/membarrier/.gitignore index 020c44f49a9e..f2f7ec0a99b4 100644 --- a/tools/testing/selftests/membarrier/.gitignore +++ b/tools/testing/selftests/membarrier/.gitignore @@ -1 +1,2 @@ -membarrier_test +membarrier_test_multi_thread +membarrier_test_single_thread diff --git a/tools/testing/selftests/membarrier/Makefile b/tools/testing/selftests/membarrier/Makefile index 97e3bdf3d1e9..34d1c81a2324 100644 --- a/tools/testing/selftests/membarrier/Makefile +++ b/tools/testing/selftests/membarrier/Makefile @@ -1,7 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -g -I../../../../usr/include/ +LDLIBS += -lpthread -TEST_GEN_PROGS := membarrier_test +TEST_GEN_PROGS := membarrier_test_single_thread \ + membarrier_test_multi_thread include ../lib.mk - diff --git a/tools/testing/selftests/membarrier/membarrier_test.c b/tools/testing/selftests/membarrier/membarrier_test_impl.h similarity index 95% rename from tools/testing/selftests/membarrier/membarrier_test.c rename to tools/testing/selftests/membarrier/membarrier_test_impl.h index 70b4ddbf126b..186be69f0a59 100644 --- a/tools/testing/selftests/membarrier/membarrier_test.c +++ b/tools/testing/selftests/membarrier/membarrier_test_impl.h @@ -1,10 +1,11 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ #define _GNU_SOURCE #include #include #include #include #include +#include #include "../kselftest.h" @@ -223,7 +224,7 @@ static int test_membarrier_global_expedited_success(void) return 0; } -static int test_membarrier(void) +static int test_membarrier_fail(void) { int status; @@ -233,10 +234,27 @@ static int test_membarrier(void) status = test_membarrier_flags_fail(); if (status) return status; - status = test_membarrier_global_success(); + status = test_membarrier_private_expedited_fail(); if (status) return status; - status = test_membarrier_private_expedited_fail(); + status = sys_membarrier(MEMBARRIER_CMD_QUERY, 0); + if (status < 0) { + ksft_test_result_fail("sys_membarrier() failed\n"); + return status; + } + if (status & MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE) { + status = test_membarrier_private_expedited_sync_core_fail(); + if (status) + return status; + } + return 0; +} + +static int test_membarrier_success(void) +{ + int status; + + status = test_membarrier_global_success(); if (status) return status; status = test_membarrier_register_private_expedited_success(); @@ -251,9 +269,6 @@ static int test_membarrier(void) return status; } if (status & MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE) { - status = test_membarrier_private_expedited_sync_core_fail(); - if (status) - return status; status = 
test_membarrier_register_private_expedited_sync_core_success(); if (status) return status; @@ -300,14 +315,3 @@ static int test_membarrier_query(void) ksft_test_result_pass("sys_membarrier available\n"); return 0; } - -int main(int argc, char **argv) -{ - ksft_print_header(); - ksft_set_plan(13); - - test_membarrier_query(); - test_membarrier(); - - return ksft_exit_pass(); -} diff --git a/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c b/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c new file mode 100644 index 000000000000..ac5613e5b0eb --- /dev/null +++ b/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +#include "membarrier_test_impl.h" + +static int thread_ready, thread_quit; +static pthread_mutex_t test_membarrier_thread_mutex = + PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t test_membarrier_thread_cond = + PTHREAD_COND_INITIALIZER; + +void *test_membarrier_thread(void *arg) +{ + pthread_mutex_lock(&test_membarrier_thread_mutex); + thread_ready = 1; + pthread_cond_broadcast(&test_membarrier_thread_cond); + pthread_mutex_unlock(&test_membarrier_thread_mutex); + + pthread_mutex_lock(&test_membarrier_thread_mutex); + while (!thread_quit) + pthread_cond_wait(&test_membarrier_thread_cond, + &test_membarrier_thread_mutex); + pthread_mutex_unlock(&test_membarrier_thread_mutex); + + return NULL; +} + +static int test_mt_membarrier(void) +{ + int i; + pthread_t test_thread; + + pthread_create(&test_thread, NULL, + test_membarrier_thread, NULL); + + pthread_mutex_lock(&test_membarrier_thread_mutex); + while (!thread_ready) + pthread_cond_wait(&test_membarrier_thread_cond, + &test_membarrier_thread_mutex); + pthread_mutex_unlock(&test_membarrier_thread_mutex); + + test_membarrier_fail(); + + test_membarrier_success(); + + pthread_mutex_lock(&test_membarrier_thread_mutex); + thread_quit = 1; + pthread_cond_broadcast(&test_membarrier_thread_cond); + pthread_mutex_unlock(&test_membarrier_thread_mutex); + + pthread_join(test_thread, NULL); + + return 0; +} + +int main(int argc, char **argv) +{ + ksft_print_header(); + ksft_set_plan(13); + + test_membarrier_query(); + + /* Multi-threaded */ + test_mt_membarrier(); + + return ksft_exit_pass(); +} diff --git a/tools/testing/selftests/membarrier/membarrier_test_single_thread.c b/tools/testing/selftests/membarrier/membarrier_test_single_thread.c new file mode 100644 index 000000000000..c1c963902854 --- /dev/null +++ b/tools/testing/selftests/membarrier/membarrier_test_single_thread.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +#include "membarrier_test_impl.h" + +int main(int argc, char **argv) +{ + ksft_print_header(); + ksft_set_plan(13); + + test_membarrier_query(); + + test_membarrier_fail(); + + test_membarrier_success(); + + return ksft_exit_pass(); +} From c6d68c1c4a4d6611fc0f8145d764226571d737ca Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 19 Sep 2019 13:37:04 -0400 Subject: [PATCH 229/334] sched/membarrier: Skip IPIs when mm->mm_users == 1 If there is only a single mm_user for the mm, the private expedited membarrier command can skip the IPIs, because only a single thread is using the mm. Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Eric W. 
Biederman Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190919173705.2181-7-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/sched/membarrier.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 070cf433bb9a..fced54ad0f3d 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -145,20 +145,21 @@ static int membarrier_private_expedited(int flags) int cpu; bool fallback = false; cpumask_var_t tmpmask; + struct mm_struct *mm = current->mm; if (flags & MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; - if (!(atomic_read(¤t->mm->membarrier_state) & + if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) return -EPERM; } else { - if (!(atomic_read(¤t->mm->membarrier_state) & + if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) return -EPERM; } - if (num_online_cpus() == 1) + if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) return 0; /* @@ -194,7 +195,7 @@ static int membarrier_private_expedited(int flags) continue; rcu_read_lock(); p = rcu_dereference(cpu_rq(cpu)->curr); - if (p && p->mm == current->mm) { + if (p && p->mm == mm) { if (!fallback) __cpumask_set_cpu(cpu, tmpmask); else From c172e0a3e8e65a4c6fffec5bc4d6de08d6f894f7 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 19 Sep 2019 13:37:05 -0400 Subject: [PATCH 230/334] sched/membarrier: Return -ENOMEM to userspace on memory allocation failure Remove the IPI fallback code from membarrier to deal with very infrequent cpumask memory allocation failure. Use GFP_KERNEL rather than GFP_NOWAIT, and relax the blocking guarantees for the expedited membarrier system call commands, allowing it to block if waiting for memory to be made available. In addition, now -ENOMEM can be returned to user-space if the cpumask memory allocation fails. Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Eric W. Biederman Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190919173705.2181-8-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/sched/membarrier.c | 63 +++++++++++++-------------------------- 1 file changed, 20 insertions(+), 43 deletions(-) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index fced54ad0f3d..a39bed2c784f 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -66,7 +66,6 @@ void membarrier_exec_mmap(struct mm_struct *mm) static int membarrier_global_expedited(void) { int cpu; - bool fallback = false; cpumask_var_t tmpmask; if (num_online_cpus() == 1) @@ -78,15 +77,8 @@ static int membarrier_global_expedited(void) */ smp_mb(); /* system call entry is not a mb. */ - /* - * Expedited membarrier commands guarantee that they won't - * block, hence the GFP_NOWAIT allocation flag and fallback - * implementation. - */ - if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { - /* Fallback for OOM. 
*/ - fallback = true; - } + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; cpus_read_lock(); rcu_read_lock(); @@ -117,18 +109,15 @@ static int membarrier_global_expedited(void) if (p->flags & PF_KTHREAD) continue; - if (!fallback) - __cpumask_set_cpu(cpu, tmpmask); - else - smp_call_function_single(cpu, ipi_mb, NULL, 1); + __cpumask_set_cpu(cpu, tmpmask); } rcu_read_unlock(); - if (!fallback) { - preempt_disable(); - smp_call_function_many(tmpmask, ipi_mb, NULL, 1); - preempt_enable(); - free_cpumask_var(tmpmask); - } + + preempt_disable(); + smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + preempt_enable(); + + free_cpumask_var(tmpmask); cpus_read_unlock(); /* @@ -143,7 +132,6 @@ static int membarrier_private_expedited(int flags) { int cpu; - bool fallback = false; cpumask_var_t tmpmask; struct mm_struct *mm = current->mm; @@ -168,15 +156,8 @@ static int membarrier_private_expedited(int flags) */ smp_mb(); /* system call entry is not a mb. */ - /* - * Expedited membarrier commands guarantee that they won't - * block, hence the GFP_NOWAIT allocation flag and fallback - * implementation. - */ - if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { - /* Fallback for OOM. */ - fallback = true; - } + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; cpus_read_lock(); rcu_read_lock(); @@ -195,20 +176,16 @@ static int membarrier_private_expedited(int flags) continue; rcu_read_lock(); p = rcu_dereference(cpu_rq(cpu)->curr); - if (p && p->mm == mm) { - if (!fallback) - __cpumask_set_cpu(cpu, tmpmask); - else - smp_call_function_single(cpu, ipi_mb, NULL, 1); - } + if (p && p->mm == mm) + __cpumask_set_cpu(cpu, tmpmask); } rcu_read_unlock(); - if (!fallback) { - preempt_disable(); - smp_call_function_many(tmpmask, ipi_mb, NULL, 1); - preempt_enable(); - free_cpumask_var(tmpmask); - } + + preempt_disable(); + smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + preempt_enable(); + + free_cpumask_var(tmpmask); cpus_read_unlock(); /* @@ -264,7 +241,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm) struct rq *rq = cpu_rq(cpu); struct task_struct *p; - p = rcu_dereference(&rq->curr); + p = rcu_dereference(rq->curr); if (p && p->mm == mm) __cpumask_set_cpu(cpu, tmpmask); } From 714e501e16cd473538b609b3e351b2cc9f7f09ed Mon Sep 17 00:00:00 2001 From: KeMeng Shi Date: Mon, 16 Sep 2019 06:53:28 +0000 Subject: [PATCH 231/334] sched/core: Fix migration to invalid CPU in __set_cpus_allowed_ptr() An oops can be triggered in the scheduler when running qemu on arm64: Unable to handle kernel paging request at virtual address ffff000008effe40 Internal error: Oops: 96000007 [#1] SMP Process migration/0 (pid: 12, stack limit = 0x00000000084e3736) pstate: 20000085 (nzCv daIf -PAN -UAO) pc : __ll_sc___cmpxchg_case_acq_4+0x4/0x20 lr : move_queued_task.isra.21+0x124/0x298 ... Call trace: __ll_sc___cmpxchg_case_acq_4+0x4/0x20 __migrate_task+0xc8/0xe0 migration_cpu_stop+0x170/0x180 cpu_stopper_thread+0xec/0x178 smpboot_thread_fn+0x1ac/0x1e8 kthread+0x134/0x138 ret_from_fork+0x10/0x18 __set_cpus_allowed_ptr() will choose an active dest_cpu in the affinity mask to migrate the process if the process is not currently running on any one of the CPUs specified in the affinity mask. __set_cpus_allowed_ptr() will choose an invalid dest_cpu (dest_cpu >= nr_cpu_ids, 1024 in my virtual machine) if the CPUs in the affinity mask are deactivated by cpu_down after the cpumask_intersects() check.
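As a minimal userspace model of this failure mode and of the fix below (toy names only; this is not the kernel's cpumask API, just a sketch of the same search-then-range-check idiom):

#include <stdio.h>

#define NR_CPU_IDS 8U	/* toy stand-in for the kernel's nr_cpu_ids */

/* first CPU set in (valid & mask), or NR_CPU_IDS if none intersect,
 * mirroring how cpumask_any_and() reports "no CPU" via an out-of-range id */
static unsigned int toy_cpumask_any_and(unsigned int valid, unsigned int mask)
{
	for (unsigned int cpu = 0; cpu < NR_CPU_IDS; cpu++)
		if ((valid & mask) & (1U << cpu))
			return cpu;
	return NR_CPU_IDS;
}

int main(void)
{
	unsigned int valid = 0x1;	/* only cpu0 is still active */
	unsigned int mask  = 0x2;	/* affinity wants cpu1, which was just downed */
	unsigned int dest  = toy_cpumask_any_and(valid, mask);

	if (dest >= NR_CPU_IDS)		/* the check the fix adds before any use */
		puts("-EINVAL: no active CPU left in the affinity mask");
	else
		printf("migrate to cpu%u\n", dest);
	return 0;
}

The search result is only meaningful while it is below nr_cpu_ids; anything else must be treated as "no valid destination CPU" before it is ever used as an index.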
The subsequent cpumask_test_cpu() of dest_cpu then indexes past the end of the cpumask and may pass if the corresponding bit is coincidentally set. As a consequence, the kernel will access an invalid rq address associated with the invalid CPU in migration_cpu_stop->__migrate_task->move_queued_task and the Oops occurs. To reproduce the crash: 1) A process repeatedly binds itself to cpu0 and cpu1 in turn by calling sched_setaffinity. 2) A shell script repeatedly does "echo 0 > /sys/devices/system/cpu/cpu1/online" and "echo 1 > /sys/devices/system/cpu/cpu1/online" in turn. 3) The Oops appears if the bit for the invalid CPU happens to be set in memory just past the tested cpumask. Signed-off-by: KeMeng Shi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/1568616808-16808-1-git-send-email-shikemeng@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d9a3947bef4..83ea23e9e91f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1656,7 +1656,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(p->cpus_ptr, new_mask)) goto out; - if (!cpumask_intersects(new_mask, cpu_valid_mask)) { + dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); + if (dest_cpu >= nr_cpu_ids) { ret = -EINVAL; goto out; } @@ -1677,7 +1678,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ From 763a9ec06c409dcde2a761aac4bb83ff3938e0b3 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 20 Aug 2019 14:40:55 -0400 Subject: [PATCH 232/334] sched/fair: Fix -Wunused-but-set-variable warnings Commit: de53fd7aedb1 ("sched/fair: Fix low cpu usage with high throttling by removing expiration of cpu-local slices") introduced a few compilation warnings: kernel/sched/fair.c: In function '__refill_cfs_bandwidth_runtime': kernel/sched/fair.c:4365:6: warning: variable 'now' set but not used [-Wunused-but-set-variable] kernel/sched/fair.c: In function 'start_cfs_bandwidth': kernel/sched/fair.c:4992:6: warning: variable 'overrun' set but not used [-Wunused-but-set-variable] Also, __refill_cfs_bandwidth_runtime() no longer updates the expiration time, so fix the comments accordingly. Signed-off-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Reviewed-by: Dave Chiluk Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: pauld@redhat.com Fixes: de53fd7aedb1 ("sched/fair: Fix low cpu usage with high throttling by removing expiration of cpu-local slices") Link: https://lkml.kernel.org/r/1566326455-8038-1-git-send-email-cai@lca.pw Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5bc23996ffae..dfdac90fd211 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4353,21 +4353,16 @@ static inline u64 sched_cfs_bandwidth_slice(void) } /* - * Replenish runtime according to assigned quota and update expiration time. - * We use sched_clock_cpu directly instead of rq->clock to avoid adding - * additional synchronization around rq->lock. + * Replenish runtime according to assigned quota.
We use sched_clock_cpu + * directly instead of rq->clock to avoid adding additional synchronization + * around rq->lock. * * requires cfs_b->lock */ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) { - u64 now; - - if (cfs_b->quota == RUNTIME_INF) - return; - - now = sched_clock_cpu(smp_processor_id()); - cfs_b->runtime = cfs_b->quota; + if (cfs_b->quota != RUNTIME_INF) + cfs_b->runtime = cfs_b->quota; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -4983,15 +4978,13 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { - u64 overrun; - lockdep_assert_held(&cfs_b->lock); if (cfs_b->period_active) return; cfs_b->period_active = 1; - overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); + hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); } From a49b4f4012ef233143c5f7ce44f97851e54d5ef9 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 23 Sep 2019 15:36:12 +0100 Subject: [PATCH 233/334] sched/core: Fix preempt_schedule() interrupt return comment preempt_schedule_irq() is the one that should be called on return from interrupt, clean up the comment to avoid any ambiguity. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: linux-m68k@lists.linux-m68k.org Cc: linux-riscv@lists.infradead.org Cc: uclinux-h8-devel@lists.sourceforge.jp Link: https://lkml.kernel.org/r/20190923143620.29334-2-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 83ea23e9e91f..00ef44c8f7b5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4218,9 +4218,8 @@ static void __sched notrace preempt_schedule_common(void) #ifdef CONFIG_PREEMPTION /* - * this is the entry point to schedule() from in-kernel preemption - * off of preempt_enable. Kernel preemptions off return from interrupt - * occur there and call schedule directly. + * This is the entry point to schedule() from in-kernel preemption + * off of preempt_enable. */ asmlinkage __visible void __sched notrace preempt_schedule(void) { @@ -4291,7 +4290,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_notrace); #endif /* CONFIG_PREEMPTION */ /* - * this is the entry point to schedule() from kernel preemption + * This is the entry point to schedule() from kernel preemption * off of irq context. * Note, that this is called and return with irqs disabled. This will * protect us against recursive calling from irq. From 9fc41acc89e58262c917b4be89ef00a87485804f Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 23 Sep 2019 10:30:17 +0100 Subject: [PATCH 234/334] sched/core: Remove double update_max_interval() call on CPU startup update_max_interval() is called in both CPUHP_AP_SCHED_STARTING's startup and teardown callbacks, but it turns out it's also called at the end of the startup callback of CPUHP_AP_ACTIVE (which is further down the startup sequence). There's no point in repeating this interval update in the startup sequence since the CPU will remain online until it goes down the teardown path. Remove the redundant call in sched_cpu_activate() (CPUHP_AP_ACTIVE). 
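As a hedged sketch of the hotplug-callback model this relies on (cpuhp_setup_state() and CPUHP_AP_ONLINE_DYN are real kernel interfaces; everything named example_* is a hypothetical stand-in for update_max_interval() and its callers, not the scheduler's actual wiring):

#include <linux/cpuhotplug.h>
#include <linux/module.h>

/* hypothetical stand-in for update_max_interval() */
static void example_recompute(void)
{
}

static int example_cpu_online(unsigned int cpu)
{
	example_recompute();	/* runs once while the CPU comes up */
	return 0;
}

static int example_cpu_offline(unsigned int cpu)
{
	example_recompute();	/* runs once while the CPU goes down */
	return 0;
}

static int __init example_init(void)
{
	int ret;

	/*
	 * One startup/teardown pair is enough: the CPU stays online from
	 * its startup callback until its teardown callback runs, so
	 * repeating the recomputation in a later state's startup hook
	 * adds nothing.
	 */
	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example:online",
				example_cpu_online, example_cpu_offline);
	return ret < 0 ? ret : 0;
}
module_init(example_init);
MODULE_LICENSE("GPL");

The design point is exactly the one the patch makes: each state's startup callback runs once per CPU bring-up, so a recomputation placed in one state needs no echo in a later state.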
Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: juri.lelli@redhat.com Cc: vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20190923093017.11755-1-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 00ef44c8f7b5..1b61cf48eee8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6425,8 +6425,6 @@ int sched_cpu_activate(unsigned int cpu) } rq_unlock_irqrestore(rq, &rf); - update_max_interval(); - return 0; } From 4892f51ad54ddff2883a60b6ad4323c1f632a9d6 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 20 Sep 2019 11:41:15 +0200 Subject: [PATCH 235/334] sched/fair: Avoid redundant EAS calculation The EAS wake-up path computes the system energy for several CPU candidates: the CPU with maximum spare capacity in each performance domain, and the prev_cpu. However, if prev_cpu also happens to be the CPU with maximum spare capacity in its performance domain, the energy calculation is still done twice, unnecessarily. Add a condition to filter out this corner case before doing the energy calculation. Reported-by: Pavan Kondeti Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: juri.lelli@redhat.com Cc: morten.rasmussen@arm.com Cc: qais.yousef@arm.com Cc: rjw@rjwysocki.net Cc: tkjos@google.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Fixes: eb92692b2544 ("sched/fair: Speed-up energy-aware wake-ups") Link: https://lkml.kernel.org/r/20190920094115.GA11503@qperret.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dfdac90fd211..83ab35e2374f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6389,7 +6389,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) } /* Evaluate the energy impact of using this CPU. */ - if (max_spare_cap_cpu >= 0) { + if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) { cur_delta = compute_energy(p, max_spare_cap_cpu, pd); cur_delta -= base_energy_pd; if (cur_delta < best_delta) { From 8a03222f508bf09e03cf38f6bd77b34b450c1d60 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 23 Sep 2019 11:41:12 -0700 Subject: [PATCH 236/334] selftests/bpf: test_progs: fix client/server race in tcp_rtt This is the same problem I found earlier in test_sockopt_inherit: there is a race between the server thread doing accept() and the client thread doing connect(). Let's explicitly synchronize them via a pthread condition variable.
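In isolation, the handshake the fix introduces looks like the following minimal, self-contained sketch (generic names, not the selftest's symbols; build with cc -lpthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int server_ready;

static void *server(void *arg)
{
	/* listen() would go here, before readiness is published */
	pthread_mutex_lock(&mtx);
	server_ready = 1;		/* publish readiness under the mutex... */
	pthread_cond_signal(&cond);	/* ...then wake the waiter */
	pthread_mutex_unlock(&mtx);
	/* accept() and the rest of the server path follow */
	return NULL;
}

int main(void)
{
	pthread_t tid;

	if (pthread_create(&tid, NULL, server, NULL))
		return 1;
	pthread_mutex_lock(&mtx);
	while (!server_ready)		/* loop guards against spurious wakeups */
		pthread_cond_wait(&cond, &mtx);
	pthread_mutex_unlock(&mtx);
	/* connect() here can no longer race ahead of the server's listen() */
	puts("server is listening; safe to connect");
	pthread_join(tid, NULL);
	return 0;
}

The robust form of the pattern keeps a predicate (server_ready) next to the condvar: pthread_cond_wait() may wake spuriously, and a signal sent before the waiter blocks is lost unless the waiter re-checks state it reads under the same mutex.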
v2: * don't exit from server_thread without signaling the condvar; fixes a possible issue where main() would wait forever (Andrii Nakryiko) Fixes: b55873984dab ("selftests/bpf: test BPF_SOCK_OPS_RTT_CB") Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- .../selftests/bpf/prog_tests/tcp_rtt.c | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c index fdc0b3614a9e..a82da555b1b0 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c @@ -203,14 +203,24 @@ static int start_server(void) return fd; } +static pthread_mutex_t server_started_mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t server_started = PTHREAD_COND_INITIALIZER; + static void *server_thread(void *arg) { struct sockaddr_storage addr; socklen_t len = sizeof(addr); int fd = *(int *)arg; int client_fd; + int err; - if (CHECK_FAIL(listen(fd, 1)) < 0) { + err = listen(fd, 1); + + pthread_mutex_lock(&server_started_mtx); + pthread_cond_signal(&server_started); + pthread_mutex_unlock(&server_started_mtx); + + if (CHECK_FAIL(err < 0)) { perror("Failed to listed on socket"); return NULL; } @@ -248,7 +258,14 @@ void test_tcp_rtt(void) if (CHECK_FAIL(server_fd < 0)) goto close_cgroup_fd; - pthread_create(&tid, NULL, server_thread, (void *)&server_fd); + if (CHECK_FAIL(pthread_create(&tid, NULL, server_thread, + (void *)&server_fd))) + goto close_cgroup_fd; + + pthread_mutex_lock(&server_started_mtx); + pthread_cond_wait(&server_started, &server_started_mtx); + pthread_mutex_unlock(&server_started_mtx); + CHECK_FAIL(run_test(cgroup_fd, server_fd)); close(server_fd); close_cgroup_fd: From fcd30ae0665c778e283f73c1c885c7fd26d12ef2 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Tue, 24 Sep 2019 09:25:21 -0700 Subject: [PATCH 237/334] bpf/xskmap: Return ERR_PTR for failure case instead of NULL. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When kzalloc() failed, NULL was returned to the caller, which tested the pointer with IS_ERR(), which didn't match, so the pointer was used later, resulting in a NULL dereference. Return ERR_PTR(-ENOMEM) instead of NULL. Reported-by: syzbot+491c1b7565ba9069ecae@syzkaller.appspotmail.com Fixes: 0402acd683c6 ("xsk: remove AF_XDP socket from map when the socket is released") Signed-off-by: Jonathan Lemon Acked-by: Björn Töpel Signed-off-by: Daniel Borkmann --- kernel/bpf/xskmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 942c662e2eed..82a1ffe15dfa 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -37,7 +37,7 @@ static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); if (!node) - return NULL; + return ERR_PTR(-ENOMEM); err = xsk_map_inc(map); if (err) { From aef70a1f44c0b570e6345c02c2d240471859f0a4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 25 Sep 2019 11:30:38 -0700 Subject: [PATCH 238/334] libbpf: fix false uninitialized variable warning Some compilers emit a warning about potentially uninitialized next_id usage. The code is correct, but the control flow is too complicated for some compilers to figure this out. Re-initialize next_id to satisfy the compiler.
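Reduced to a hedged toy (this is not the btf_dump code, only an illustration of why a redundant initialization is the pragmatic fix), the pattern looks like this:

#include <stdio.h>

/*
 * Every caller guarantees n > 0, so 'last' is written before it is read
 * on every reachable path, but a compiler that cannot prove the loop
 * body executes at least once may warn about a maybe-uninitialized use.
 */
static int last_element(const int *v, int n)
{
	int last = 0;	/* redundant init, purely to satisfy the compiler */

	for (int i = 0; i < n; i++)
		last = v[i];
	return last;
}

int main(void)
{
	int v[] = { 4, 8, 15 };

	printf("last = %d\n", last_element(v, 3));
	return 0;
}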
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- tools/lib/bpf/btf_dump.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 715967762312..84b0661db7f3 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -1167,6 +1167,7 @@ static void btf_dump_emit_type_chain(struct btf_dump *d, return; } + next_id = decls->ids[decls->cnt - 1]; next_t = btf__type_by_id(d->btf, next_id); multidim = btf_is_array(next_t); /* we need space if we have named non-pointer */ From d778c30a056ac352d1c0c58b5850e0fcc5655a58 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 25 Sep 2019 11:36:14 -0700 Subject: [PATCH 239/334] selftests/bpf: delete unused variables in test_sysctl Remove no-longer-used variables and avoid compiler warnings. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_sysctl.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_sysctl.c b/tools/testing/selftests/bpf/test_sysctl.c index 4f8ec1f10a80..a320e3844b17 100644 --- a/tools/testing/selftests/bpf/test_sysctl.c +++ b/tools/testing/selftests/bpf/test_sysctl.c @@ -1385,7 +1385,6 @@ static int fixup_sysctl_value(const char *buf, size_t buf_len, uint8_t raw[sizeof(uint64_t)]; uint64_t num; } value = {}; - uint8_t c, i; if (buf_len > sizeof(value)) { log_err("Value is too big (%zd) to use in fixup", buf_len); From 4670d68b9254710fdeaf794cad54d8b2c9929e0a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 25 Sep 2019 11:52:05 -0700 Subject: [PATCH 240/334] selftests/bpf: adjust strobemeta loop to satisfy latest clang Some recent changes in the latest Clang started causing the following warning when unrolling the strobemeta test case's main loop: progs/strobemeta.h:416:2: warning: loop not unrolled: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning] This patch simplifies the loop's exit condition to depend only on the constant maximum iteration count (STROBE_MAX_MAP_ENTRIES), while moving the early-termination logic inside the loop body. The changes are equivalent from a program-logic standpoint, but fix the warning. It also appears to improve the generated BPF code, as it fixes previously failing non-unrolled strobemeta test cases. Cc: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/progs/strobemeta.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/strobemeta.h b/tools/testing/selftests/bpf/progs/strobemeta.h index 8a399bdfd920..067eb625d01c 100644 --- a/tools/testing/selftests/bpf/progs/strobemeta.h +++ b/tools/testing/selftests/bpf/progs/strobemeta.h @@ -413,7 +413,10 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, #else #pragma unroll #endif - for (int i = 0; i < STROBE_MAX_MAP_ENTRIES && i < map.cnt; ++i) { + for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) { + if (i >= map.cnt) + break; + descr->key_lens[i] = 0; len = bpf_probe_read_str(payload, STROBE_MAX_STR_LEN, map.entries[i].key); From 34b7bb2995b839729a2e793ea1dccc987b6dbab6 Mon Sep 17 00:00:00 2001 From: Rain River Date: Mon, 23 Sep 2019 22:37:46 +0800 Subject: [PATCH 241/334] MAINTAINERS: add Yanjun to FORCEDETH maintainers list Yanjun has been spending quite a lot of time fixing bugs in the FORCEDETH source code.
I'd like to add Yanjun to maintainers list. Signed-off-by: Rain River Acked-by: Zhu Yanjun Signed-off-by: David S. Miller --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index b2326dece28e..d61071f187c6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -643,6 +643,7 @@ F: drivers/net/ethernet/alacritech/* FORCEDETH GIGABIT ETHERNET DRIVER M: Rain River +M: Zhu Yanjun L: netdev@vger.kernel.org S: Maintained F: drivers/net/ethernet/nvidia/* From bf69abad27d8fe1daca9558441fd0205fb2d7bc9 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 23 Sep 2019 17:52:42 +0200 Subject: [PATCH 242/334] net: Fix Kconfig indentation Adjust indentation from spaces to tab (+optional two spaces) as in coding style with command like: $ sed -e 's/^ /\t/' -i */Kconfig Signed-off-by: Krzysztof Kozlowski Acked-by: Sven Eckelmann Signed-off-by: David S. Miller --- net/batman-adv/Kconfig | 10 +-- net/ife/Kconfig | 2 +- net/ipv4/Kconfig | 4 +- net/ipv6/netfilter/Kconfig | 16 ++--- net/netfilter/Kconfig | 2 +- net/netfilter/ipvs/Kconfig | 6 +- net/rds/Kconfig | 4 +- net/sched/Kconfig | 144 ++++++++++++++++++------------------- 8 files changed, 94 insertions(+), 94 deletions(-) diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index a3d188dfbe75..d5028af750d5 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -12,11 +12,11 @@ config BATMAN_ADV depends on NET select LIBCRC32C help - B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is - a routing protocol for multi-hop ad-hoc mesh networks. The - networks may be wired or wireless. See - https://www.open-mesh.org/ for more information and user space - tools. + B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is + a routing protocol for multi-hop ad-hoc mesh networks. The + networks may be wired or wireless. See + https://www.open-mesh.org/ for more information and user space + tools. config BATMAN_ADV_BATMAN_V bool "B.A.T.M.A.N. V protocol" diff --git a/net/ife/Kconfig b/net/ife/Kconfig index 6cd1f6d18f30..bcf650564db4 100644 --- a/net/ife/Kconfig +++ b/net/ife/Kconfig @@ -5,7 +5,7 @@ menuconfig NET_IFE depends on NET - tristate "Inter-FE based on IETF ForCES InterFE LFB" + tristate "Inter-FE based on IETF ForCES InterFE LFB" default n help Say Y here to add support of IFE encapsulation protocol diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 974de4d20f25..03381f3e12ba 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -492,8 +492,8 @@ config TCP_CONG_WESTWOOD wired networks and throughput over wireless links. config TCP_CONG_HTCP - tristate "H-TCP" - default m + tristate "H-TCP" + default m ---help--- H-TCP is a send-side only modifications of the TCP Reno protocol stack that optimizes the performance of TCP diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 6120a7800975..69443e9a3aa5 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -170,13 +170,13 @@ config IP6_NF_MATCH_RT To compile it as a module, choose M here. If unsure, say N. config IP6_NF_MATCH_SRH - tristate '"srh" Segment Routing header match support' - depends on NETFILTER_ADVANCED - help - srh matching allows you to match packets based on the segment + tristate '"srh" Segment Routing header match support' + depends on NETFILTER_ADVANCED + help + srh matching allows you to match packets based on the segment routing header of the packet. - To compile it as a module, choose M here. If unsure, say N. + To compile it as a module, choose M here. 
If unsure, say N. # The targets config IP6_NF_TARGET_HL @@ -249,10 +249,10 @@ config IP6_NF_SECURITY depends on SECURITY depends on NETFILTER_ADVANCED help - This option adds a `security' table to iptables, for use - with Mandatory Access Control (MAC) policy. + This option adds a `security' table to iptables, for use + with Mandatory Access Control (MAC) policy. - If unsure, say N. + If unsure, say N. config IP6_NF_NAT tristate "ip6tables NAT support" diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 34ec7afec116..91efae88e8c2 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -697,7 +697,7 @@ config NF_FLOW_TABLE_INET tristate "Netfilter flow table mixed IPv4/IPv6 module" depends on NF_FLOW_TABLE help - This option adds the flow table mixed IPv4/IPv6 support. + This option adds the flow table mixed IPv4/IPv6 support. To compile it as a module, choose M here. diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index f6f1a0d5c47d..5b672e05d758 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -135,7 +135,7 @@ config IP_VS_WRR module, choose M here. If unsure, say N. config IP_VS_LC - tristate "least-connection scheduling" + tristate "least-connection scheduling" ---help--- The least-connection scheduling algorithm directs network connections to the server with the least number of active @@ -145,7 +145,7 @@ config IP_VS_LC module, choose M here. If unsure, say N. config IP_VS_WLC - tristate "weighted least-connection scheduling" + tristate "weighted least-connection scheduling" ---help--- The weighted least-connection scheduling algorithm directs network connections to the server with the least active connections @@ -333,7 +333,7 @@ config IP_VS_NFCT config IP_VS_PE_SIP tristate "SIP persistence engine" - depends on IP_VS_PROTO_UDP + depends on IP_VS_PROTO_UDP depends on NF_CONNTRACK_SIP ---help--- Allow persistence based on the SIP Call-ID diff --git a/net/rds/Kconfig b/net/rds/Kconfig index 38ea7f0f2699..c64e154bc18f 100644 --- a/net/rds/Kconfig +++ b/net/rds/Kconfig @@ -23,6 +23,6 @@ config RDS_TCP This transport does not support RDMA operations. config RDS_DEBUG - bool "RDS debugging messages" + bool "RDS debugging messages" depends on RDS - default n + default n diff --git a/net/sched/Kconfig b/net/sched/Kconfig index b3faafeafab9..5b044ae6dc1e 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -324,7 +324,7 @@ config NET_SCH_CAKE tristate "Common Applications Kept Enhanced (CAKE)" help Say Y here if you want to use the Common Applications Kept Enhanced - (CAKE) queue management algorithm. + (CAKE) queue management algorithm. To compile this driver as a module, choose M here: the module will be called sch_cake. @@ -730,8 +730,8 @@ config NET_CLS_ACT config NET_ACT_POLICE tristate "Traffic Policing" - depends on NET_CLS_ACT - ---help--- + depends on NET_CLS_ACT + ---help--- Say Y here if you want to do traffic policing, i.e. strict bandwidth limiting. This action replaces the existing policing module. @@ -740,9 +740,9 @@ config NET_ACT_POLICE module will be called act_police. config NET_ACT_GACT - tristate "Generic actions" - depends on NET_CLS_ACT - ---help--- + tristate "Generic actions" + depends on NET_CLS_ACT + ---help--- Say Y here to take generic actions such as dropping and accepting packets. @@ -750,15 +750,15 @@ config NET_ACT_GACT module will be called act_gact. 
config GACT_PROB - bool "Probability support" - depends on NET_ACT_GACT - ---help--- + bool "Probability support" + depends on NET_ACT_GACT + ---help--- Say Y here to use the generic action randomly or deterministically. config NET_ACT_MIRRED - tristate "Redirecting and Mirroring" - depends on NET_CLS_ACT - ---help--- + tristate "Redirecting and Mirroring" + depends on NET_CLS_ACT + ---help--- Say Y here to allow packets to be mirrored or redirected to other devices. @@ -766,10 +766,10 @@ config NET_ACT_MIRRED module will be called act_mirred. config NET_ACT_SAMPLE - tristate "Traffic Sampling" - depends on NET_CLS_ACT - select PSAMPLE - ---help--- + tristate "Traffic Sampling" + depends on NET_CLS_ACT + select PSAMPLE + ---help--- Say Y here to allow packet sampling tc action. The packet sample action consists of statistically choosing packets and sampling them using the psample module. @@ -778,9 +778,9 @@ config NET_ACT_SAMPLE module will be called act_sample. config NET_ACT_IPT - tristate "IPtables targets" - depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES - ---help--- + tristate "IPtables targets" + depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES + ---help--- Say Y here to be able to invoke iptables targets after successful classification. @@ -788,9 +788,9 @@ config NET_ACT_IPT module will be called act_ipt. config NET_ACT_NAT - tristate "Stateless NAT" - depends on NET_CLS_ACT - ---help--- + tristate "Stateless NAT" + depends on NET_CLS_ACT + ---help--- Say Y here to do stateless NAT on IPv4 packets. You should use netfilter for NAT unless you know what you are doing. @@ -798,18 +798,18 @@ config NET_ACT_NAT module will be called act_nat. config NET_ACT_PEDIT - tristate "Packet Editing" - depends on NET_CLS_ACT - ---help--- + tristate "Packet Editing" + depends on NET_CLS_ACT + ---help--- Say Y here if you want to mangle the content of packets. To compile this code as a module, choose M here: the module will be called act_pedit. config NET_ACT_SIMP - tristate "Simple Example (Debug)" - depends on NET_CLS_ACT - ---help--- + tristate "Simple Example (Debug)" + depends on NET_CLS_ACT + ---help--- Say Y here to add a simple action for demonstration purposes. It is meant as an example and for debugging purposes. It will print a configured policy string followed by the packet count @@ -821,9 +821,9 @@ config NET_ACT_SIMP module will be called act_simple. config NET_ACT_SKBEDIT - tristate "SKB Editing" - depends on NET_CLS_ACT - ---help--- + tristate "SKB Editing" + depends on NET_CLS_ACT + ---help--- Say Y here to change skb priority or queue_mapping settings. If unsure, say N. @@ -832,10 +832,10 @@ config NET_ACT_SKBEDIT module will be called act_skbedit. config NET_ACT_CSUM - tristate "Checksum Updating" - depends on NET_CLS_ACT && INET - select LIBCRC32C - ---help--- + tristate "Checksum Updating" + depends on NET_CLS_ACT && INET + select LIBCRC32C + ---help--- Say Y here to update some common checksum after some direct packet alterations. @@ -854,9 +854,9 @@ config NET_ACT_MPLS module will be called act_mpls. config NET_ACT_VLAN - tristate "Vlan manipulation" - depends on NET_CLS_ACT - ---help--- + tristate "Vlan manipulation" + depends on NET_CLS_ACT + ---help--- Say Y here to push or pop vlan headers. If unsure, say N. @@ -865,9 +865,9 @@ config NET_ACT_VLAN module will be called act_vlan. 
config NET_ACT_BPF - tristate "BPF based action" - depends on NET_CLS_ACT - ---help--- + tristate "BPF based action" + depends on NET_CLS_ACT + ---help--- Say Y here to execute BPF code on packets. The BPF code will decide if the packet should be dropped or not. @@ -877,10 +877,10 @@ config NET_ACT_BPF module will be called act_bpf. config NET_ACT_CONNMARK - tristate "Netfilter Connection Mark Retriever" - depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES - depends on NF_CONNTRACK && NF_CONNTRACK_MARK - ---help--- + tristate "Netfilter Connection Mark Retriever" + depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES + depends on NF_CONNTRACK && NF_CONNTRACK_MARK + ---help--- Say Y here to allow retrieving of conn mark If unsure, say N. @@ -889,10 +889,10 @@ config NET_ACT_CONNMARK module will be called act_connmark. config NET_ACT_CTINFO - tristate "Netfilter Connection Mark Actions" - depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES - depends on NF_CONNTRACK && NF_CONNTRACK_MARK - help + tristate "Netfilter Connection Mark Actions" + depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES + depends on NF_CONNTRACK && NF_CONNTRACK_MARK + help Say Y here to allow transfer of a connmark stored information. Current actions transfer connmark stored DSCP into ipv4/v6 diffserv and/or to transfer connmark to packet @@ -906,21 +906,21 @@ config NET_ACT_CTINFO module will be called act_ctinfo. config NET_ACT_SKBMOD - tristate "skb data modification action" - depends on NET_CLS_ACT - ---help--- - Say Y here to allow modification of skb data + tristate "skb data modification action" + depends on NET_CLS_ACT + ---help--- + Say Y here to allow modification of skb data - If unsure, say N. + If unsure, say N. - To compile this code as a module, choose M here: the - module will be called act_skbmod. + To compile this code as a module, choose M here: the + module will be called act_skbmod. config NET_ACT_IFE - tristate "Inter-FE action based on IETF ForCES InterFE LFB" - depends on NET_CLS_ACT - select NET_IFE - ---help--- + tristate "Inter-FE action based on IETF ForCES InterFE LFB" + depends on NET_CLS_ACT + select NET_IFE + ---help--- Say Y here to allow for sourcing and terminating metadata For details refer to netdev01 paper: "Distributing Linux Traffic Control Classifier-Action Subsystem" @@ -930,9 +930,9 @@ config NET_ACT_IFE module will be called act_ife. config NET_ACT_TUNNEL_KEY - tristate "IP tunnel metadata manipulation" - depends on NET_CLS_ACT - ---help--- + tristate "IP tunnel metadata manipulation" + depends on NET_CLS_ACT + ---help--- Say Y here to set/release ip tunnel metadata. If unsure, say N. @@ -941,9 +941,9 @@ config NET_ACT_TUNNEL_KEY module will be called act_tunnel_key. config NET_ACT_CT - tristate "connection tracking tc action" - depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT - help + tristate "connection tracking tc action" + depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT + help Say Y here to allow sending the packets to conntrack module. If unsure, say N. @@ -952,16 +952,16 @@ config NET_ACT_CT module will be called act_ct. 
config NET_IFE_SKBMARK - tristate "Support to encoding decoding skb mark on IFE action" - depends on NET_ACT_IFE + tristate "Support to encoding decoding skb mark on IFE action" + depends on NET_ACT_IFE config NET_IFE_SKBPRIO - tristate "Support to encoding decoding skb prio on IFE action" - depends on NET_ACT_IFE + tristate "Support to encoding decoding skb prio on IFE action" + depends on NET_ACT_IFE config NET_IFE_SKBTCINDEX - tristate "Support to encoding decoding skb tcindex on IFE action" - depends on NET_ACT_IFE + tristate "Support to encoding decoding skb tcindex on IFE action" + depends on NET_ACT_IFE config NET_TC_SKB_EXT bool "TC recirculation support" From 02bc5eb990597796d8e8383d1b98e540af963bf1 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 23 Sep 2019 17:52:43 +0200 Subject: [PATCH 243/334] drivers: net: Fix Kconfig indentation Adjust indentation from spaces to tab (+optional two spaces) as in coding style with command like: $ sed -e 's/^ /\t/' -i */Kconfig Signed-off-by: Krzysztof Kozlowski Acked-by: Kalle Valo Reviewed-by: Leon Romanovsky Signed-off-by: David S. Miller --- drivers/net/Kconfig | 2 +- drivers/net/arcnet/Kconfig | 22 ++-- drivers/net/can/usb/Kconfig | 8 +- drivers/net/ethernet/allwinner/Kconfig | 10 +- drivers/net/ethernet/emulex/benet/Kconfig | 2 +- .../net/ethernet/mellanox/mlx5/core/Kconfig | 36 +++--- drivers/net/ethernet/nxp/Kconfig | 8 +- drivers/net/ethernet/pensando/Kconfig | 4 +- drivers/net/phy/Kconfig | 6 +- drivers/net/wireless/ath/Kconfig | 2 +- drivers/net/wireless/ath/ar5523/Kconfig | 4 +- drivers/net/wireless/ath/ath6kl/Kconfig | 2 +- drivers/net/wireless/ath/ath9k/Kconfig | 2 +- drivers/net/wireless/ath/carl9170/Kconfig | 6 +- drivers/net/wireless/atmel/Kconfig | 32 ++--- drivers/net/wireless/intel/ipw2x00/Kconfig | 116 +++++++++--------- drivers/net/wireless/intel/iwlegacy/Kconfig | 6 +- drivers/net/wireless/intel/iwlwifi/Kconfig | 6 +- drivers/net/wireless/ralink/rt2x00/Kconfig | 24 ++-- 19 files changed, 149 insertions(+), 149 deletions(-) diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 48e209e55843..df1c7989e13d 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -487,7 +487,7 @@ config FUJITSU_ES depends on ACPI help This driver provides support for Extended Socket network device - on Extended Partitioning of FUJITSU PRIMEQUEST 2000 E2 series. + on Extended Partitioning of FUJITSU PRIMEQUEST 2000 E2 series. config THUNDERBOLT_NET tristate "Networking over Thunderbolt cable" diff --git a/drivers/net/arcnet/Kconfig b/drivers/net/arcnet/Kconfig index faeb4419b205..27551bf3d7e4 100644 --- a/drivers/net/arcnet/Kconfig +++ b/drivers/net/arcnet/Kconfig @@ -56,19 +56,19 @@ config ARCNET_CAP tristate "Enable CAP mode packet interface" help ARCnet "cap mode" packet encapsulation. Used to get the hardware - acknowledge back to userspace. After the initial protocol byte every - packet is stuffed with an extra 4 byte "cookie" which doesn't - actually appear on the network. After transmit the driver will send - back a packet with protocol byte 0 containing the status of the - transmission: - 0=no hardware acknowledge - 1=excessive nak - 2=transmission accepted by the receiver hardware + acknowledge back to userspace. After the initial protocol byte every + packet is stuffed with an extra 4 byte "cookie" which doesn't + actually appear on the network. 
After transmit the driver will send + back a packet with protocol byte 0 containing the status of the + transmission: + 0=no hardware acknowledge + 1=excessive nak + 2=transmission accepted by the receiver hardware - Received packets are also stuffed with the extra 4 bytes but it will - be random data. + Received packets are also stuffed with the extra 4 bytes but it will + be random data. - Cap only listens to protocol 1-8. + Cap only listens to protocol 1-8. config ARCNET_COM90xx tristate "ARCnet COM90xx (normal) chipset driver" diff --git a/drivers/net/can/usb/Kconfig b/drivers/net/can/usb/Kconfig index 4b3d0ddcda79..b412f7ba4f89 100644 --- a/drivers/net/can/usb/Kconfig +++ b/drivers/net/can/usb/Kconfig @@ -15,10 +15,10 @@ config CAN_EMS_USB from EMS Dr. Thomas Wuensche (http://www.ems-wuensche.de). config CAN_ESD_USB2 - tristate "ESD USB/2 CAN/USB interface" - ---help--- - This driver supports the CAN-USB/2 interface - from esd electronic system design gmbh (http://www.esd.eu). + tristate "ESD USB/2 CAN/USB interface" + ---help--- + This driver supports the CAN-USB/2 interface + from esd electronic system design gmbh (http://www.esd.eu). config CAN_GS_USB tristate "Geschwister Schneider UG interfaces" diff --git a/drivers/net/ethernet/allwinner/Kconfig b/drivers/net/ethernet/allwinner/Kconfig index a5e2bcbf2722..264a482ec31d 100644 --- a/drivers/net/ethernet/allwinner/Kconfig +++ b/drivers/net/ethernet/allwinner/Kconfig @@ -21,17 +21,17 @@ config NET_VENDOR_ALLWINNER if NET_VENDOR_ALLWINNER config SUN4I_EMAC - tristate "Allwinner A10 EMAC support" + tristate "Allwinner A10 EMAC support" depends on ARCH_SUNXI depends on OF select CRC32 select MII select PHYLIB select MDIO_SUN4I - ---help--- - Support for Allwinner A10 EMAC ethernet driver. + ---help--- + Support for Allwinner A10 EMAC ethernet driver. - To compile this driver as a module, choose M here. The module - will be called sun4i-emac. + To compile this driver as a module, choose M here. The module + will be called sun4i-emac. endif # NET_VENDOR_ALLWINNER diff --git a/drivers/net/ethernet/emulex/benet/Kconfig b/drivers/net/ethernet/emulex/benet/Kconfig index e8c7eb842dbe..17d300ea9955 100644 --- a/drivers/net/ethernet/emulex/benet/Kconfig +++ b/drivers/net/ethernet/emulex/benet/Kconfig @@ -48,5 +48,5 @@ config BE2NET_SKYHAWK chipsets. (e.g. OneConnect OCe14xxx) comment "WARNING: be2net is useless without any enabled chip" - depends on BE2NET_BE2=n && BE2NET_BE3=n && BE2NET_LANCER=n && \ + depends on BE2NET_BE2=n && BE2NET_BE3=n && BE2NET_LANCER=n && \ BE2NET_SKYHAWK=n && BE2NET diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig index 0dba272a5b2f..a1f20b205299 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -20,15 +20,15 @@ config MLX5_ACCEL bool config MLX5_FPGA - bool "Mellanox Technologies Innova support" - depends on MLX5_CORE + bool "Mellanox Technologies Innova support" + depends on MLX5_CORE select MLX5_ACCEL - ---help--- - Build support for the Innova family of network cards by Mellanox - Technologies. Innova network cards are comprised of a ConnectX chip - and an FPGA chip on one board. If you select this option, the - mlx5_core driver will include the Innova FPGA core and allow building - sandbox-specific client drivers. + ---help--- + Build support for the Innova family of network cards by Mellanox + Technologies. 
Innova network cards are comprised of a ConnectX chip + and an FPGA chip on one board. If you select this option, the + mlx5_core driver will include the Innova FPGA core and allow building + sandbox-specific client drivers. config MLX5_CORE_EN bool "Mellanox 5th generation network adapters (ConnectX series) Ethernet support" @@ -58,14 +58,14 @@ config MLX5_EN_RXNFC API. config MLX5_MPFS - bool "Mellanox Technologies MLX5 MPFS support" - depends on MLX5_CORE_EN + bool "Mellanox Technologies MLX5 MPFS support" + depends on MLX5_CORE_EN default y - ---help--- + ---help--- Mellanox Technologies Ethernet Multi-Physical Function Switch (MPFS) - support in ConnectX NIC. MPFs is required for when multi-PF configuration - is enabled to allow passing user configured unicast MAC addresses to the - requesting PF. + support in ConnectX NIC. MPFs is required for when multi-PF configuration + is enabled to allow passing user configured unicast MAC addresses to the + requesting PF. config MLX5_ESWITCH bool "Mellanox Technologies MLX5 SRIOV E-Switch support" @@ -73,10 +73,10 @@ config MLX5_ESWITCH default y ---help--- Mellanox Technologies Ethernet SRIOV E-Switch support in ConnectX NIC. - E-Switch provides internal SRIOV packet steering and switching for the - enabled VFs and PF in two available modes: - Legacy SRIOV mode (L2 mac vlan steering based). - Switchdev mode (eswitch offloads). + E-Switch provides internal SRIOV packet steering and switching for the + enabled VFs and PF in two available modes: + Legacy SRIOV mode (L2 mac vlan steering based). + Switchdev mode (eswitch offloads). config MLX5_CORE_EN_DCB bool "Data Center Bridging (DCB) Support" diff --git a/drivers/net/ethernet/nxp/Kconfig b/drivers/net/ethernet/nxp/Kconfig index 418afb84c84b..ee83a71c2509 100644 --- a/drivers/net/ethernet/nxp/Kconfig +++ b/drivers/net/ethernet/nxp/Kconfig @@ -1,9 +1,9 @@ # SPDX-License-Identifier: GPL-2.0-only config LPC_ENET - tristate "NXP ethernet MAC on LPC devices" - depends on ARCH_LPC32XX || COMPILE_TEST - select PHYLIB - help + tristate "NXP ethernet MAC on LPC devices" + depends on ARCH_LPC32XX || COMPILE_TEST + select PHYLIB + help Say Y or M here if you want to use the NXP ethernet MAC included on some NXP LPC devices. You can safely enable this option for LPC32xx SoC. Also available as a module. diff --git a/drivers/net/ethernet/pensando/Kconfig b/drivers/net/ethernet/pensando/Kconfig index 5ea570be8379..bd0583e409df 100644 --- a/drivers/net/ethernet/pensando/Kconfig +++ b/drivers/net/ethernet/pensando/Kconfig @@ -26,7 +26,7 @@ config IONIC found in . - To compile this driver as a module, choose M here. The module - will be called ionic. + To compile this driver as a module, choose M here. The module + will be called ionic. endif # NET_VENDOR_PENSANDO diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index 03be30cde552..fe602648b99f 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -460,9 +460,9 @@ config RENESAS_PHY Supports the Renesas PHYs uPD60620 and uPD60620A. config ROCKCHIP_PHY - tristate "Driver for Rockchip Ethernet PHYs" - ---help--- - Currently supports the integrated Ethernet PHY. + tristate "Driver for Rockchip Ethernet PHYs" + ---help--- + Currently supports the integrated Ethernet PHY. 
config SMSC_PHY tristate "SMSC PHYs" diff --git a/drivers/net/wireless/ath/Kconfig b/drivers/net/wireless/ath/Kconfig index d98d6ac90f3d..56616d988c96 100644 --- a/drivers/net/wireless/ath/Kconfig +++ b/drivers/net/wireless/ath/Kconfig @@ -34,7 +34,7 @@ config ATH_TRACEPOINTS depends on ATH_DEBUG depends on EVENT_TRACING ---help--- - This option enables tracepoints for atheros wireless drivers. + This option enables tracepoints for atheros wireless drivers. Currently, ath9k makes use of this facility. config ATH_REG_DYNAMIC_USER_REG_HINTS diff --git a/drivers/net/wireless/ath/ar5523/Kconfig b/drivers/net/wireless/ath/ar5523/Kconfig index 41d3c9a48b08..65b39c7d035d 100644 --- a/drivers/net/wireless/ath/ar5523/Kconfig +++ b/drivers/net/wireless/ath/ar5523/Kconfig @@ -5,5 +5,5 @@ config AR5523 select ATH_COMMON select FW_LOADER ---help--- - This module add support for AR5523 based USB dongles such as D-Link - DWL-G132, Netgear WPN111 and many more. + This module add support for AR5523 based USB dongles such as D-Link + DWL-G132, Netgear WPN111 and many more. diff --git a/drivers/net/wireless/ath/ath6kl/Kconfig b/drivers/net/wireless/ath/ath6kl/Kconfig index dcf8ca0dcc52..62c22fdcca38 100644 --- a/drivers/net/wireless/ath/ath6kl/Kconfig +++ b/drivers/net/wireless/ath/ath6kl/Kconfig @@ -2,7 +2,7 @@ config ATH6KL tristate "Atheros mobile chipsets support" depends on CFG80211 - ---help--- + ---help--- This module adds core support for wireless adapters based on Atheros AR6003 and AR6004 chipsets. You still need separate bus drivers for USB and SDIO to be able to use real devices. diff --git a/drivers/net/wireless/ath/ath9k/Kconfig b/drivers/net/wireless/ath/ath9k/Kconfig index 2d1247f61297..c99f42284465 100644 --- a/drivers/net/wireless/ath/ath9k/Kconfig +++ b/drivers/net/wireless/ath/ath9k/Kconfig @@ -148,7 +148,7 @@ config ATH9K_CHANNEL_CONTEXT depends on ATH9K default n ---help--- - This option enables channel context support in ath9k, which is needed + This option enables channel context support in ath9k, which is needed for multi-channel concurrency. Enable this if P2P PowerSave support is required. diff --git a/drivers/net/wireless/ath/carl9170/Kconfig b/drivers/net/wireless/ath/carl9170/Kconfig index 757eb765e17c..b1bce7aad399 100644 --- a/drivers/net/wireless/ath/carl9170/Kconfig +++ b/drivers/net/wireless/ath/carl9170/Kconfig @@ -41,9 +41,9 @@ config CARL9170_WPC default y config CARL9170_HWRNG - bool "Random number generator" - depends on CARL9170 && (HW_RANDOM = y || HW_RANDOM = CARL9170) - default n + bool "Random number generator" + depends on CARL9170 && (HW_RANDOM = y || HW_RANDOM = CARL9170) + default n help Provides a hardware random number generator to the kernel. diff --git a/drivers/net/wireless/atmel/Kconfig b/drivers/net/wireless/atmel/Kconfig index 809bdf331848..4c0556b3a5ba 100644 --- a/drivers/net/wireless/atmel/Kconfig +++ b/drivers/net/wireless/atmel/Kconfig @@ -20,22 +20,22 @@ config ATMEL select FW_LOADER select CRC32 ---help--- - A driver 802.11b wireless cards based on the Atmel fast-vnet - chips. This driver supports standard Linux wireless extensions. + A driver 802.11b wireless cards based on the Atmel fast-vnet + chips. This driver supports standard Linux wireless extensions. - Many cards based on this chipset do not have flash memory - and need their firmware loaded at start-up. If yours is - one of these, you will need to provide a firmware image - to be loaded into the card by the driver. 
The Atmel - firmware package can be downloaded from - + Many cards based on this chipset do not have flash memory + and need their firmware loaded at start-up. If yours is + one of these, you will need to provide a firmware image + to be loaded into the card by the driver. The Atmel + firmware package can be downloaded from + config PCI_ATMEL tristate "Atmel at76c506 PCI cards" depends on ATMEL && PCI ---help--- - Enable support for PCI and mini-PCI cards containing the - Atmel at76c506 chip. + Enable support for PCI and mini-PCI cards containing the + Atmel at76c506 chip. config PCMCIA_ATMEL tristate "Atmel at76c502/at76c504 PCMCIA cards" @@ -48,11 +48,11 @@ config PCMCIA_ATMEL Atmel at76c502 and at76c504 chips. config AT76C50X_USB - tristate "Atmel at76c503/at76c505/at76c505a USB cards" - depends on MAC80211 && USB - select FW_LOADER - ---help--- - Enable support for USB Wireless devices using Atmel at76c503, - at76c505 or at76c505a chips. + tristate "Atmel at76c503/at76c505/at76c505a USB cards" + depends on MAC80211 && USB + select FW_LOADER + ---help--- + Enable support for USB Wireless devices using Atmel at76c503, + at76c505 or at76c505a chips. endif # WLAN_VENDOR_ATMEL diff --git a/drivers/net/wireless/intel/ipw2x00/Kconfig b/drivers/net/wireless/intel/ipw2x00/Kconfig index 5d2878a73732..ab17903ba9f8 100644 --- a/drivers/net/wireless/intel/ipw2x00/Kconfig +++ b/drivers/net/wireless/intel/ipw2x00/Kconfig @@ -13,37 +13,37 @@ config IPW2100 select LIB80211 select LIBIPW ---help--- - A driver for the Intel PRO/Wireless 2100 Network + A driver for the Intel PRO/Wireless 2100 Network Connection 802.11b wireless network adapter. - See + See for information on the capabilities currently enabled in this driver and for tips for debugging issues and problems. In order to use this driver, you will need a firmware image for it. - You can obtain the firmware from - . Once you have the firmware image, you + You can obtain the firmware from + . Once you have the firmware image, you will need to place it in /lib/firmware. - You will also very likely need the Wireless Tools in order to - configure your card: + You will also very likely need the Wireless Tools in order to + configure your card: - . + . + + It is recommended that you compile this driver as a module (M) + rather than built-in (Y). This driver requires firmware at device + initialization time, and when built-in this typically happens + before the filesystem is accessible (hence firmware will be + unavailable and initialization will fail). If you do choose to build + this driver into your kernel image, you can avoid this problem by + including the firmware and a firmware loader in an initramfs. - It is recommended that you compile this driver as a module (M) - rather than built-in (Y). This driver requires firmware at device - initialization time, and when built-in this typically happens - before the filesystem is accessible (hence firmware will be - unavailable and initialization will fail). If you do choose to build - this driver into your kernel image, you can avoid this problem by - including the firmware and a firmware loader in an initramfs. - config IPW2100_MONITOR - bool "Enable promiscuous mode" - depends on IPW2100 - ---help--- + bool "Enable promiscuous mode" + depends on IPW2100 + ---help--- Enables promiscuous/monitor mode support for the ipw2100 driver. 
- With this feature compiled into the driver, you can switch to + With this feature compiled into the driver, you can switch to promiscuous mode via the Wireless Tool's Monitor mode. While in this mode, no packets can be sent. @@ -51,17 +51,17 @@ config IPW2100_DEBUG bool "Enable full debugging output in IPW2100 module." depends on IPW2100 ---help--- - This option will enable debug tracing output for the IPW2100. + This option will enable debug tracing output for the IPW2100. - This will result in the kernel module being ~60k larger. You can - control which debug output is sent to the kernel log by setting the - value in + This will result in the kernel module being ~60k larger. You can + control which debug output is sent to the kernel log by setting the + value in /sys/bus/pci/drivers/ipw2100/debug_level This entry will only exist if this option is enabled. - If you are not trying to debug or develop the IPW2100 driver, you + If you are not trying to debug or develop the IPW2100 driver, you most likely want to say N here. config IPW2200 @@ -75,37 +75,37 @@ config IPW2200 select LIB80211 select LIBIPW ---help--- - A driver for the Intel PRO/Wireless 2200BG and 2915ABG Network - Connection adapters. + A driver for the Intel PRO/Wireless 2200BG and 2915ABG Network + Connection adapters. - See + See for information on the capabilities currently enabled in this driver and for tips for debugging issues and problems. In order to use this driver, you will need a firmware image for it. - You can obtain the firmware from - . See the above referenced README.ipw2200 + You can obtain the firmware from + . See the above referenced README.ipw2200 for information on where to install the firmware images. - You will also very likely need the Wireless Tools in order to - configure your card: + You will also very likely need the Wireless Tools in order to + configure your card: - . + . - It is recommended that you compile this driver as a module (M) - rather than built-in (Y). This driver requires firmware at device - initialization time, and when built-in this typically happens - before the filesystem is accessible (hence firmware will be - unavailable and initialization will fail). If you do choose to build - this driver into your kernel image, you can avoid this problem by - including the firmware and a firmware loader in an initramfs. + It is recommended that you compile this driver as a module (M) + rather than built-in (Y). This driver requires firmware at device + initialization time, and when built-in this typically happens + before the filesystem is accessible (hence firmware will be + unavailable and initialization will fail). If you do choose to build + this driver into your kernel image, you can avoid this problem by + including the firmware and a firmware loader in an initramfs. config IPW2200_MONITOR - bool "Enable promiscuous mode" - depends on IPW2200 - ---help--- + bool "Enable promiscuous mode" + depends on IPW2200 + ---help--- Enables promiscuous/monitor mode support for the ipw2200 driver. - With this feature compiled into the driver, you can switch to + With this feature compiled into the driver, you can switch to promiscuous mode via the Wireless Tool's Monitor mode. While in this mode, no packets can be sent. @@ -118,28 +118,28 @@ config IPW2200_PROMISCUOUS depends on IPW2200_MONITOR select IPW2200_RADIOTAP ---help--- - Enables the creation of a second interface prefixed 'rtap'. 
- This second interface will provide every received in radiotap + Enables the creation of a second interface prefixed 'rtap'. + This second interface will provide every received in radiotap format. - This is useful for performing wireless network analysis while - maintaining an active association. + This is useful for performing wireless network analysis while + maintaining an active association. - Example usage: + Example usage: - % modprobe ipw2200 rtap_iface=1 - % ifconfig rtap0 up - % tethereal -i rtap0 + % modprobe ipw2200 rtap_iface=1 + % ifconfig rtap0 up + % tethereal -i rtap0 - If you do not specify 'rtap_iface=1' as a module parameter then - the rtap interface will not be created and you will need to turn - it on via sysfs: - - % echo 1 > /sys/bus/pci/drivers/ipw2200/*/rtap_iface + If you do not specify 'rtap_iface=1' as a module parameter then + the rtap interface will not be created and you will need to turn + it on via sysfs: + + % echo 1 > /sys/bus/pci/drivers/ipw2200/*/rtap_iface config IPW2200_QOS - bool "Enable QoS support" - depends on IPW2200 + bool "Enable QoS support" + depends on IPW2200 config IPW2200_DEBUG bool "Enable full debugging output in IPW2200 module." diff --git a/drivers/net/wireless/intel/iwlegacy/Kconfig b/drivers/net/wireless/intel/iwlegacy/Kconfig index e329fd7b09c0..100f55858b13 100644 --- a/drivers/net/wireless/intel/iwlegacy/Kconfig +++ b/drivers/net/wireless/intel/iwlegacy/Kconfig @@ -91,9 +91,9 @@ config IWLEGACY_DEBUG any problems you may encounter. config IWLEGACY_DEBUGFS - bool "iwlegacy (iwl 3945/4965) debugfs support" - depends on IWLEGACY && MAC80211_DEBUGFS - ---help--- + bool "iwlegacy (iwl 3945/4965) debugfs support" + depends on IWLEGACY && MAC80211_DEBUGFS + ---help--- Enable creation of debugfs files for the iwlegacy drivers. This is a low-impact option that allows getting insight into the driver's state at runtime. diff --git a/drivers/net/wireless/intel/iwlwifi/Kconfig b/drivers/net/wireless/intel/iwlwifi/Kconfig index 7dbc0d38bb3b..091d621ad25f 100644 --- a/drivers/net/wireless/intel/iwlwifi/Kconfig +++ b/drivers/net/wireless/intel/iwlwifi/Kconfig @@ -119,9 +119,9 @@ config IWLWIFI_DEBUG any problems you may encounter. config IWLWIFI_DEBUGFS - bool "iwlwifi debugfs support" - depends on MAC80211_DEBUGFS - ---help--- + bool "iwlwifi debugfs support" + depends on MAC80211_DEBUGFS + ---help--- Enable creation of debugfs files for the iwlwifi drivers. This is a low-impact option that allows getting insight into the driver's state at runtime. diff --git a/drivers/net/wireless/ralink/rt2x00/Kconfig b/drivers/net/wireless/ralink/rt2x00/Kconfig index 858f8aa3e616..f8a9244ce012 100644 --- a/drivers/net/wireless/ralink/rt2x00/Kconfig +++ b/drivers/net/wireless/ralink/rt2x00/Kconfig @@ -98,17 +98,17 @@ config RT2800PCI_RT53XX bool "rt2800pci - Include support for rt53xx devices (EXPERIMENTAL)" default y ---help--- - This adds support for rt53xx wireless chipset family to the - rt2800pci driver. - Supported chips: RT5390 + This adds support for rt53xx wireless chipset family to the + rt2800pci driver. + Supported chips: RT5390 config RT2800PCI_RT3290 bool "rt2800pci - Include support for rt3290 devices (EXPERIMENTAL)" default y ---help--- - This adds support for rt3290 wireless chipset family to the - rt2800pci driver. - Supported chips: RT3290 + This adds support for rt3290 wireless chipset family to the + rt2800pci driver. 
+ Supported chips: RT3290 endif config RT2500USB @@ -176,16 +176,16 @@ config RT2800USB_RT3573 config RT2800USB_RT53XX bool "rt2800usb - Include support for rt53xx devices (EXPERIMENTAL)" ---help--- - This adds support for rt53xx wireless chipset family to the - rt2800usb driver. - Supported chips: RT5370 + This adds support for rt53xx wireless chipset family to the + rt2800usb driver. + Supported chips: RT5370 config RT2800USB_RT55XX bool "rt2800usb - Include support for rt55xx devices (EXPERIMENTAL)" ---help--- - This adds support for rt55xx wireless chipset family to the - rt2800usb driver. - Supported chips: RT5572 + This adds support for rt55xx wireless chipset family to the + rt2800usb driver. + Supported chips: RT5572 config RT2800USB_UNKNOWN bool "rt2800usb - Include support for unknown (USB) devices" From 3e8b9bfa110896f95d602d8c98d5f9d67e41d78c Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 23 Sep 2019 22:04:58 -0700 Subject: [PATCH 244/334] net/sched: cbs: Fix not adding cbs instance to list When removing a cbs instance while offloading is enabled, the crash below can be observed. The problem happens because, when offloading is enabled, the cbs instance is not added to the list. Also, the current code doesn't correctly handle the case where offload is disabled without removing the qdisc: if the link speed changes, the credit calculations will be wrong. When we create the cbs instance with offloading enabled, it's not added to the notification list; when we later disable offloading, it's not in the list, so link speed changes will not affect it. The solution for both issues is the same: add the cbs instance being created unconditionally to the global list, even if the link state notification isn't useful "right now". Crash log: [518758.189866] BUG: kernel NULL pointer dereference, address: 0000000000000000 [518758.189870] #PF: supervisor read access in kernel mode [518758.189871] #PF: error_code(0x0000) - not-present page [518758.189872] PGD 0 P4D 0 [518758.189874] Oops: 0000 [#1] SMP PTI [518758.189876] CPU: 3 PID: 4825 Comm: tc Not tainted 5.2.9 #1 [518758.189877] Hardware name: Gigabyte Technology Co., Ltd.
Z390 AORUS ULTRA/Z390 AORUS ULTRA-CF, BIOS F7 03/14/2019 [518758.189881] RIP: 0010:__list_del_entry_valid+0x29/0xa0 [518758.189883] Code: 90 48 b8 00 01 00 00 00 00 ad de 55 48 8b 17 4c 8b 47 08 48 89 e5 48 39 c2 74 27 48 b8 00 02 00 00 00 00 ad de 49 39 c0 74 2d <49> 8b 30 48 39 fe 75 3d 48 8b 52 08 48 39 f2 75 4c b8 01 00 00 00 [518758.189885] RSP: 0018:ffffa27e43903990 EFLAGS: 00010207 [518758.189887] RAX: dead000000000200 RBX: ffff8bce69f0f000 RCX: 0000000000000000 [518758.189888] RDX: 0000000000000000 RSI: ffff8bce69f0f064 RDI: ffff8bce69f0f1e0 [518758.189890] RBP: ffffa27e43903990 R08: 0000000000000000 R09: ffff8bce69e788c0 [518758.189891] R10: ffff8bce62acd400 R11: 00000000000003cb R12: ffff8bce69e78000 [518758.189892] R13: ffff8bce69f0f140 R14: 0000000000000000 R15: 0000000000000000 [518758.189894] FS: 00007fa1572c8f80(0000) GS:ffff8bce6e0c0000(0000) knlGS:0000000000000000 [518758.189895] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [518758.189896] CR2: 0000000000000000 CR3: 000000040a398006 CR4: 00000000003606e0 [518758.189898] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [518758.189899] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [518758.189900] Call Trace: [518758.189904] cbs_destroy+0x32/0xa0 [sch_cbs] [518758.189906] qdisc_destroy+0x45/0x120 [518758.189907] qdisc_put+0x25/0x30 [518758.189908] qdisc_graft+0x2c1/0x450 [518758.189910] tc_get_qdisc+0x1c8/0x310 [518758.189912] ? get_page_from_freelist+0x91a/0xcb0 [518758.189914] rtnetlink_rcv_msg+0x293/0x360 [518758.189916] ? kmem_cache_alloc_node_trace+0x178/0x260 [518758.189918] ? __kmalloc_node_track_caller+0x38/0x50 [518758.189920] ? rtnl_calcit.isra.0+0xf0/0xf0 [518758.189922] netlink_rcv_skb+0x48/0x110 [518758.189923] rtnetlink_rcv+0x10/0x20 [518758.189925] netlink_unicast+0x15b/0x1d0 [518758.189926] netlink_sendmsg+0x1ea/0x380 [518758.189929] sock_sendmsg+0x2f/0x40 [518758.189930] ___sys_sendmsg+0x295/0x2f0 [518758.189932] ? ___sys_recvmsg+0x151/0x1e0 [518758.189933] ? 
do_wp_page+0x7e/0x450 [518758.189935] __sys_sendmsg+0x48/0x80 [518758.189937] __x64_sys_sendmsg+0x1a/0x20 [518758.189939] do_syscall_64+0x53/0x1f0 [518758.189941] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [518758.189942] RIP: 0033:0x7fa15755169a [518758.189944] Code: 48 c7 c0 ff ff ff ff eb be 0f 1f 80 00 00 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 18 b8 2e 00 00 00 c5 fc 77 0f 05 <48> 3d 00 f0 ff ff 77 5e c3 0f 1f 44 00 00 48 83 ec 28 89 54 24 1c [518758.189946] RSP: 002b:00007ffda58b60b8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [518758.189948] RAX: ffffffffffffffda RBX: 000055e4b836d9a0 RCX: 00007fa15755169a [518758.189949] RDX: 0000000000000000 RSI: 00007ffda58b6128 RDI: 0000000000000003 [518758.189951] RBP: 00007ffda58b6190 R08: 0000000000000001 R09: 000055e4b9d848a0 [518758.189952] R10: 0000000000000000 R11: 0000000000000246 R12: 000000005d654b49 [518758.189953] R13: 0000000000000000 R14: 00007ffda58b6230 R15: 00007ffda58b6210 [518758.189955] Modules linked in: sch_cbs sch_etf sch_mqprio netlink_diag unix_diag e1000e igb intel_pch_thermal thermal video backlight pcc_cpufreq [518758.189960] CR2: 0000000000000000 [518758.189961] ---[ end trace 6a13f7aaf5376019 ]--- [518758.189963] RIP: 0010:__list_del_entry_valid+0x29/0xa0 [518758.189964] Code: 90 48 b8 00 01 00 00 00 00 ad de 55 48 8b 17 4c 8b 47 08 48 89 e5 48 39 c2 74 27 48 b8 00 02 00 00 00 00 ad de 49 39 c0 74 2d <49> 8b 30 48 39 fe 75 3d 48 8b 52 08 48 39 f2 75 4c b8 01 00 00 00 [518758.189967] RSP: 0018:ffffa27e43903990 EFLAGS: 00010207 [518758.189968] RAX: dead000000000200 RBX: ffff8bce69f0f000 RCX: 0000000000000000 [518758.189969] RDX: 0000000000000000 RSI: ffff8bce69f0f064 RDI: ffff8bce69f0f1e0 [518758.189971] RBP: ffffa27e43903990 R08: 0000000000000000 R09: ffff8bce69e788c0 [518758.189972] R10: ffff8bce62acd400 R11: 00000000000003cb R12: ffff8bce69e78000 [518758.189973] R13: ffff8bce69f0f140 R14: 0000000000000000 R15: 0000000000000000 [518758.189975] FS: 00007fa1572c8f80(0000) GS:ffff8bce6e0c0000(0000) knlGS:0000000000000000 [518758.189976] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [518758.189977] CR2: 0000000000000000 CR3: 000000040a398006 CR4: 00000000003606e0 [518758.189979] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [518758.189980] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Fixes: e0a7683d30e9 ("net/sched: cbs: fix port_rate miscalculation") Signed-off-by: Vinicius Costa Gomes Acked-by: Cong Wang Signed-off-by: David S. 
Miller --- net/sched/sch_cbs.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index 93b58fde99b7..1bef152c5721 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -392,7 +392,6 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt, { struct cbs_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - int err; if (!opt) { NL_SET_ERR_MSG(extack, "Missing CBS qdisc options which are mandatory"); @@ -404,6 +403,10 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt, if (!q->qdisc) return -ENOMEM; + spin_lock(&cbs_list_lock); + list_add(&q->cbs_list, &cbs_list); + spin_unlock(&cbs_list_lock); + qdisc_hash_add(q->qdisc, false); q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); @@ -413,17 +416,7 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt, qdisc_watchdog_init(&q->watchdog, sch); - err = cbs_change(sch, opt, extack); - if (err) - return err; - - if (!q->offload) { - spin_lock(&cbs_list_lock); - list_add(&q->cbs_list, &cbs_list); - spin_unlock(&cbs_list_lock); - } - - return 0; + return cbs_change(sch, opt, extack); } static void cbs_destroy(struct Qdisc *sch) @@ -431,15 +424,18 @@ static void cbs_destroy(struct Qdisc *sch) { struct cbs_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - spin_lock(&cbs_list_lock); - list_del(&q->cbs_list); - spin_unlock(&cbs_list_lock); + /* Nothing to do if we couldn't create the underlying qdisc */ + if (!q->qdisc) + return; qdisc_watchdog_cancel(&q->watchdog); cbs_disable_offload(dev, q); - if (q->qdisc) - qdisc_put(q->qdisc); + spin_lock(&cbs_list_lock); + list_del(&q->cbs_list); + spin_unlock(&cbs_list_lock); + + qdisc_put(q->qdisc); } static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) From adecda5bee0a05c11f1a4a4b16b01d11a832fd43 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 24 Sep 2019 11:09:37 +0200 Subject: [PATCH 245/334] net: print proper warning on dst underflow Proper warnings with stack traces make it much easier to figure out what's doing the double free and create more meaningful bug reports from users. Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- net/core/dst.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/dst.c b/net/core/dst.c index 1325316d9eab..193af526e908 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -172,7 +172,7 @@ void dst_release(struct dst_entry *dst) int newrefcnt; newrefcnt = atomic_dec_return(&dst->__refcnt); - if (unlikely(newrefcnt < 0)) + if (WARN_ONCE(newrefcnt < 0, "dst_release underflow")) net_warn_ratelimited("%s: dst:%p refcnt:%d\n", __func__, dst, newrefcnt); if (!newrefcnt) @@ -187,7 +187,7 @@ void dst_release_immediate(struct dst_entry *dst) int newrefcnt; newrefcnt = atomic_dec_return(&dst->__refcnt); - if (unlikely(newrefcnt < 0)) + if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow")) net_warn_ratelimited("%s: dst:%p refcnt:%d\n", __func__, dst, newrefcnt); if (!newrefcnt) From c3ca78e2174413c136d62ebdf8039580fe72b504 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 25 Sep 2019 00:32:13 -0500 Subject: [PATCH 246/334] smb3: pass mode bits into create calls We need to populate an ACL (security descriptor open context) on file and directory creation correctly. This patch passes in the mode. A follow-on patch will build the open context and the security descriptor (from the mode) that goes in the open context.
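As a minimal user-space sketch of the sentinel pattern this message describes (ACL_NO_MODE is defined as -1 in the diff below; the helper name build_open_context() is invented for illustration and is not part of the patch):

    #include <stdio.h>

    typedef unsigned int umode_t;      /* stand-in for the kernel type */
    #define ACL_NO_MODE ((umode_t)-1)  /* sentinel: "caller supplied no mode" */

    /* Hypothetical helper: only build a security-descriptor open context
     * when the caller actually passed a mode. */
    static void build_open_context(umode_t mode)
    {
            if (mode == ACL_NO_MODE) {
                    printf("no mode given: skip the security descriptor context\n");
                    return;
            }
            printf("encode mode 0%o into a security descriptor context\n", mode);
    }

    int main(void)
    {
            build_open_context(0755);        /* mkdir-style caller with a mode */
            build_open_context(ACL_NO_MODE); /* rmdir/unlink-style callers */
            return 0;
    }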
Signed-off-by: Steve French Reviewed-by: Aurelien Aptel --- fs/cifs/cifsglob.h | 6 ++++-- fs/cifs/cifsproto.h | 3 ++- fs/cifs/cifssmb.c | 3 ++- fs/cifs/inode.c | 3 ++- fs/cifs/smb2inode.c | 34 +++++++++++++++++++--------------- fs/cifs/smb2pdu.c | 20 ++++++++++++++++++++ fs/cifs/smb2proto.h | 3 ++- 7 files changed, 51 insertions(+), 21 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 54e204589cb9..2e960e1049db 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -331,8 +331,9 @@ struct smb_version_operations { umode_t mode, struct cifs_tcon *tcon, const char *full_path, struct cifs_sb_info *cifs_sb); - int (*mkdir)(const unsigned int, struct cifs_tcon *, const char *, - struct cifs_sb_info *); + int (*mkdir)(const unsigned int xid, struct inode *inode, umode_t mode, + struct cifs_tcon *tcon, const char *name, + struct cifs_sb_info *sb); /* set info on created directory */ void (*mkdir_setinfo)(struct inode *, const char *, struct cifs_sb_info *, struct cifs_tcon *, @@ -1209,6 +1210,7 @@ struct cifs_search_info { bool smallBuf:1; /* so we know which buf_release function to call */ }; +#define ACL_NO_MODE -1 struct cifs_open_parms { struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 99b1b1ef558c..e53e9f62b87b 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -372,7 +372,8 @@ extern int CIFSSMBUnixSetPathInfo(const unsigned int xid, const struct nls_table *nls_codepage, int remap); -extern int CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, +extern int CIFSSMBMkDir(const unsigned int xid, struct inode *inode, + umode_t mode, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); extern int CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index dbee2132e419..4f554f019a98 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1078,7 +1078,8 @@ RmDirRetry: } int -CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, +CIFSSMBMkDir(const unsigned int xid, struct inode *inode, umode_t mode, + struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { int rc = 0; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 26cdfbf1e164..3bae2e53f0b8 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1622,13 +1622,14 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) } /* BB add setting the equivalent of mode via CreateX w/ACLs */ - rc = server->ops->mkdir(xid, tcon, full_path, cifs_sb); + rc = server->ops->mkdir(xid, inode, mode, tcon, full_path, cifs_sb); if (rc) { cifs_dbg(FYI, "cifs_mkdir returned 0x%x\n", rc); d_drop(direntry); goto mkdir_out; } + /* TODO: skip this for smb2/smb3 */ rc = cifs_mkdir_qinfo(inode, direntry, mode, full_path, cifs_sb, tcon, xid); mkdir_out: diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index d2a3fb7e5c8d..4121ac1163ca 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -51,7 +51,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, __u32 desired_access, __u32 create_disposition, - __u32 create_options, void *ptr, int command, + __u32 create_options, umode_t mode, void *ptr, int command, struct cifsFileInfo *cfile) { int rc; @@ -103,6 +103,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, oparms.create_options |= CREATE_OPEN_BACKUP_INTENT; 
oparms.fid = &fid; oparms.reconnect = false; + oparms.mode = mode; memset(&open_iov, 0, sizeof(open_iov)); rqst[num_rqst].rq_iov = open_iov; @@ -478,7 +479,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, - smb2_data, SMB2_OP_QUERY_INFO, cfile); + ACL_NO_MODE, smb2_data, SMB2_OP_QUERY_INFO, cfile); if (rc == -EOPNOTSUPP) { *symlink = true; create_options |= OPEN_REPARSE_POINT; @@ -486,8 +487,8 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, /* Failed on a symbolic link - query a reparse point info */ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, smb2_data, - SMB2_OP_QUERY_INFO, NULL); + create_options, ACL_NO_MODE, + smb2_data, SMB2_OP_QUERY_INFO, NULL); } if (rc) goto out; @@ -499,12 +500,14 @@ out: } int -smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, +smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode, + struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { return smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR, NULL); + CREATE_NOT_FILE, mode, NULL, SMB2_OP_MKDIR, + NULL); } void @@ -525,8 +528,8 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, cifs_get_writable_path(tcon, name, &cfile); tmprc = smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO, - cfile); + CREATE_NOT_FILE, ACL_NO_MODE, + &data, SMB2_OP_SET_INFO, cfile); if (tmprc == 0) cifs_i->cifsAttrs = dosattrs; } @@ -536,7 +539,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, - CREATE_NOT_FILE, + CREATE_NOT_FILE, ACL_NO_MODE, NULL, SMB2_OP_RMDIR, NULL); } @@ -546,7 +549,7 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - NULL, SMB2_OP_DELETE, NULL); + ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL); } static int @@ -564,7 +567,8 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, goto smb2_rename_path; } rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access, - FILE_OPEN, 0, smb2_to_name, command, cfile); + FILE_OPEN, 0, ACL_NO_MODE, smb2_to_name, + command, cfile); smb2_rename_path: kfree(smb2_to_name); return rc; @@ -601,8 +605,8 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, __le64 eof = cpu_to_le64(size); return smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_WRITE_DATA, FILE_OPEN, 0, &eof, - SMB2_OP_SET_EOF, NULL); + FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE, + &eof, SMB2_OP_SET_EOF, NULL); } int @@ -623,8 +627,8 @@ smb2_set_file_info(struct inode *inode, const char *full_path, return PTR_ERR(tlink); rc = smb2_compound_op(xid, tlink_tcon(tlink), cifs_sb, full_path, - FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf, - SMB2_OP_SET_INFO, NULL); + FILE_WRITE_ATTRIBUTES, FILE_OPEN, + 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, NULL); cifs_put_tlink(tlink); return rc; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index ea08e6159481..85f9d614d968 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -751,6 +751,8 @@ 
add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode) unsigned int num = *num_iovec; iov[num].iov_base = create_posix_buf(mode); + if (mode == -1) + cifs_dbg(VFS, "illegal mode\n"); /* BB REMOVEME */ if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct create_posix); @@ -2417,6 +2419,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, __u8 *oplock, /* File attributes ignored on open (used in create though) */ req->FileAttributes = cpu_to_le32(file_attributes); req->ShareAccess = FILE_SHARE_ALL_LE; + req->CreateDisposition = cpu_to_le32(oparms->disposition); req->CreateOptions = cpu_to_le32(oparms->create_options & CREATE_OPTIONS_MASK); req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req)); @@ -2518,6 +2521,23 @@ SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, __u8 *oplock, return rc; } + /* TODO: add handling for the mode on create */ + if (oparms->disposition == FILE_CREATE) + cifs_dbg(VFS, "mode is 0x%x\n", oparms->mode); /* BB REMOVEME */ + + if ((oparms->disposition == FILE_CREATE) && (oparms->mode != -1)) { + if (n_iov > 2) { + struct create_context *ccontext = + (struct create_context *)iov[n_iov-1].iov_base; + ccontext->Next = + cpu_to_le32(iov[n_iov-1].iov_len); + } + + /* rc = add_sd_context(iov, &n_iov, oparms->mode); */ + if (rc) + return rc; + } + if (n_iov > 2) { struct create_context *ccontext = (struct create_context *)iov[n_iov-1].iov_base; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 67a91b11fd59..da3a6d580808 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -84,7 +84,8 @@ extern int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, umode_t mode, struct cifs_tcon *tcon, const char *full_path, struct cifs_sb_info *cifs_sb); -extern int smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, +extern int smb2_mkdir(const unsigned int xid, struct inode *inode, + umode_t mode, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); extern void smb2_mkdir_setinfo(struct inode *inode, const char *full_path, struct cifs_sb_info *cifs_sb, From ba56d8ce38c8252fff5b745db3899cf092578ede Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 23 Sep 2019 17:02:46 +0800 Subject: [PATCH 247/334] macsec: drop skb sk before calling gro_cells_receive Fei Liu reported a crash when doing netperf on a topo of macsec dev over veth: [ 448.919128] refcount_t: underflow; use-after-free. [ 449.090460] Call trace: [ 449.092895] refcount_sub_and_test+0xb4/0xc0 [ 449.097155] tcp_wfree+0x2c/0x150 [ 449.100460] ip_rcv+0x1d4/0x3a8 [ 449.103591] __netif_receive_skb_core+0x554/0xae0 [ 449.108282] __netif_receive_skb+0x28/0x78 [ 449.112366] netif_receive_skb_internal+0x54/0x100 [ 449.117144] napi_gro_complete+0x70/0xc0 [ 449.121054] napi_gro_flush+0x6c/0x90 [ 449.124703] napi_complete_done+0x50/0x130 [ 449.128788] gro_cell_poll+0x8c/0xa8 [ 449.132351] net_rx_action+0x16c/0x3f8 [ 449.136088] __do_softirq+0x128/0x320 The issue was caused by the skb's truesize being changed in tcp/skb_gro_receive() without its sk's sk_wmem_alloc being increased accordingly. Later, when the skb is being freed and the skb's truesize is subtracted from its sk's sk_wmem_alloc in tcp_wfree(), underflow occurs. macsec is calling gro_cells_receive() to receive a packet, which actually requires skb->sk to be NULL. However, when the macsec dev is over veth, it's possible the skb->sk is still set if the skb was not unshared or expanded from the peer veth.
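In outline, the accounting hazard looks like the following stubbed, user-space sketch, in which skb_orphan() and gro_cells_receive() merely stand in for the kernel APIs of the same name:

    #include <stdio.h>

    struct sock    { int sk_wmem_alloc; };
    struct sk_buff { struct sock *sk; int truesize; };

    /* Stub of the kernel's skb_orphan(): detach the socket so later
     * truesize growth can no longer unbalance the socket's accounting. */
    static void skb_orphan(struct sk_buff *skb) { skb->sk = NULL; }

    /* Stub GRO entry point: merging segments grows skb->truesize. If
     * skb->sk were still set, freeing the skb would later subtract this
     * larger truesize from sk_wmem_alloc and underflow it. */
    static void gro_cells_receive(struct sk_buff *skb)
    {
            skb->truesize += 128;
    }

    int main(void)
    {
            struct sock sk = { .sk_wmem_alloc = 256 };
            struct sk_buff skb = { .sk = &sk, .truesize = 256 };

            skb_orphan(&skb);       /* the one-line fix: drop skb->sk first */
            gro_cells_receive(&skb);
            printf("sk_wmem_alloc untouched: %d\n", sk.sk_wmem_alloc);
            return 0;
    }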
ip_rcv() is calling skb_orphan() to drop the skb's sk for tproxy, but that happens too late for macsec's call to gro_cells_receive(). So fix it by dropping the skb's sk earlier on the rx path of macsec. Fixes: 5491e7c6b1a9 ("macsec: enable GRO and RPS on macsec devices") Reported-by: Xiumei Mu Reported-by: Fei Liu Signed-off-by: Xin Long Signed-off-by: David S. Miller --- drivers/net/macsec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 8f46aa1ddec0..cb7637364b40 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -1235,6 +1235,7 @@ deliver: macsec_rxsa_put(rx_sa); macsec_rxsc_put(rx_sc); + skb_orphan(skb); ret = gro_cells_receive(&macsec->gro_cells, skb); if (ret == NET_RX_SUCCESS) count_rx(dev, skb->len); From 4f28bd956e081fc018fe9b41ffa31573f17bfb61 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 23 Sep 2019 11:59:15 +0200 Subject: [PATCH 248/334] net: stmmac: Fix page pool size The size of individual pages in the page pool is given by an order. The order is the binary logarithm of the number of pages that make up one of the pages in the pool. However, the driver currently passes the number of pages rather than the order, so it ends up wasting quite a bit of memory. Fix this by taking the binary logarithm and passing that in the order field. Fixes: 2af6106ae949 ("net: stmmac: Introducing support for Page Pool") Signed-off-by: Thierry Reding Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index a6cb2aa60e64..d3232738fb25 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1557,13 +1557,15 @@ static int alloc_dma_rx_desc_resources(struct stmmac_priv *priv) for (queue = 0; queue < rx_count; queue++) { struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue]; struct page_pool_params pp_params = { 0 }; + unsigned int num_pages; rx_q->queue_index = queue; rx_q->priv_data = priv; pp_params.flags = PP_FLAG_DMA_MAP; pp_params.pool_size = DMA_RX_SIZE; - pp_params.order = DIV_ROUND_UP(priv->dma_buf_sz, PAGE_SIZE); + num_pages = DIV_ROUND_UP(priv->dma_buf_sz, PAGE_SIZE); + pp_params.order = ilog2(num_pages); pp_params.nid = dev_to_node(priv->device); pp_params.dev = priv->device; pp_params.dma_dir = DMA_FROM_DEVICE; From c1d419d00494487b8401b2ac38d7e8e2a4977d9e Mon Sep 17 00:00:00 2001 From: Biju Das Date: Mon, 23 Sep 2019 14:32:46 +0100 Subject: [PATCH 249/334] dt-bindings: net: ravb: Add support for r8a774b1 SoC Document RZ/G2N (R8A774B1) SoC bindings. Signed-off-by: Biju Das Reviewed-by: Sergei Shtylyov Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/renesas,ravb.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/net/renesas,ravb.txt b/Documentation/devicetree/bindings/net/renesas,ravb.txt index 7ad36213093e..5df4aa7f6811 100644 --- a/Documentation/devicetree/bindings/net/renesas,ravb.txt +++ b/Documentation/devicetree/bindings/net/renesas,ravb.txt @@ -18,6 +18,7 @@ Required properties: R-Car Gen2 and RZ/G1 devices. - "renesas,etheravb-r8a774a1" for the R8A774A1 SoC. + - "renesas,etheravb-r8a774b1" for the R8A774B1 SoC. - "renesas,etheravb-r8a774c0" for the R8A774C0 SoC. - "renesas,etheravb-r8a7795" for the R8A7795 SoC. - "renesas,etheravb-r8a7796" for the R8A7796 SoC.
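A quick arithmetic check of the stmmac page-pool fix above, assuming 4 KiB pages (an assumption for illustration; PAGE_SIZE varies by architecture) and a 16 KiB dma_buf_sz: DIV_ROUND_UP gives num_pages = 4, and the order field wants ilog2(4) = 2 (an order-2, four-page allocation), whereas passing 4 directly would request order-4, i.e. sixteen pages per buffer:

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    /* Minimal stand-in for the kernel's ilog2() on small positive values. */
    static unsigned int ilog2(unsigned int n)
    {
            unsigned int log = 0;

            while (n >>= 1)
                    log++;
            return log;
    }

    int main(void)
    {
            unsigned int page_size = 4096;    /* assumed PAGE_SIZE */
            unsigned int dma_buf_sz = 16384;  /* example buffer size */
            unsigned int num_pages = DIV_ROUND_UP(dma_buf_sz, page_size);

            /* Bug: passing num_pages as the order requests 2^4 = 16 pages. */
            printf("buggy: order=%u -> %u pages\n", num_pages, 1u << num_pages);
            /* Fix: order = ilog2(4) = 2 -> exactly the 4 pages needed. */
            printf("fixed: order=%u -> %u pages\n", ilog2(num_pages),
                   1u << ilog2(num_pages));
            return 0;
    }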
From ea8564c865299815095bebeb4b25bef474218e4c Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 24 Sep 2019 19:11:52 +0800 Subject: [PATCH 250/334] openvswitch: change type of UPCALL_PID attribute to NLA_UNSPEC The userspace openvswitch patch "(dpif-linux: Implement the API functions to allow multiple handler threads read upcall)" changed its type from U32 to UNSPEC but left the kernel unchanged; after kernel commit 6e237d099fac "(netlink: Relax attr validation for fixed length types)", the bug is exposed by the warning below: [ 57.215841] netlink: 'ovs-vswitchd': attribute type 5 has an invalid length. Fixes: 5cd667b0a456 ("openvswitch: Allow each vport to have an array of 'port_id's") Signed-off-by: Li RongQing Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index dde9d762edee..f30e406fbec5 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2294,7 +2294,7 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) }, [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 }, - [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_UNSPEC }, [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, [OVS_VPORT_ATTR_IFINDEX] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 }, From ca7a03c4175366a92cee0ccc4fec0038c3266e26 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 24 Sep 2019 16:01:28 +0200 Subject: [PATCH 251/334] ipv6: do not free rt if FIB_LOOKUP_NOREF is set on suppress rule Commit 7d9e5f422150 removed references from certain dsts, but accounting for this never translated down into the fib6 suppression code. This bug was triggered by WireGuard users who use wg-quick(8), which uses the "suppress-prefix" directive to ip-rule(8) for routing all of their internet traffic without routing loops. The test case added here causes the reference underflow by causing packets to evaluate a suppress rule. Fixes: 7d9e5f422150 ("ipv6: convert major tx path to use RT6_LOOKUP_F_DST_NOREF") Signed-off-by: Jason A. Donenfeld Acked-by: Wei Wang Signed-off-by: David S. Miller --- net/ipv6/fib6_rules.c | 3 ++- tools/testing/selftests/net/fib_tests.sh | 17 ++++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index d22b6c140f23..f9e8fe3ff0c5 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -287,7 +287,8 @@ static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg return false; suppress_route: - ip6_rt_put(rt); + if (!(arg->flags & FIB_LOOKUP_NOREF)) + ip6_rt_put(rt); return true; } diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index cba83a12da82..c4ba0ff4a53f 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -9,7 +9,7 @@ ret=0 ksft_skip=4 # all tests in this script.
Can be overridden with -t option -TESTS="unregister down carrier nexthop ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw rp_filter" +TESTS="unregister down carrier nexthop suppress ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw rp_filter" VERBOSE=0 PAUSE_ON_FAIL=no @@ -616,6 +616,20 @@ fib_nexthop_test() cleanup } +fib_suppress_test() +{ + $IP link add dummy1 type dummy + $IP link set dummy1 up + $IP -6 route add default dev dummy1 + $IP -6 rule add table main suppress_prefixlength 0 + ping -f -c 1000 -W 1 1234::1 || true + $IP -6 rule del table main suppress_prefixlength 0 + $IP link del dummy1 + + # If we got here without crashing, we're good. + return 0 +} + ################################################################################ # Tests on route add and replace @@ -1593,6 +1607,7 @@ do fib_carrier_test|carrier) fib_carrier_test;; fib_rp_filter_test|rp_filter) fib_rp_filter_test;; fib_nexthop_test|nexthop) fib_nexthop_test;; + fib_suppress_test|suppress) fib_suppress_test;; ipv6_route_test|ipv6_rt) ipv6_route_test;; ipv4_route_test|ipv4_rt) ipv4_route_test;; ipv6_addr_metric) ipv6_addr_metric_test;; From 39529a9948d8f67f39cb72bec914c1adab38562d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 25 Sep 2019 13:37:45 -0700 Subject: [PATCH 252/334] libbpf: Teach btf_dumper to emit stand-alone anonymous enum definitions BTF-to-C converter previously skipped anonymous enums in an assumption that those are embedded in struct's field definitions. This is not always the case and a lot of kernel constants are defined as part of anonymous enums. This change fixes the logic by eagerly marking all types as either referenced by any other type or not. This is enough to distinguish two classes of anonymous enums and emit previously omitted enum definitions. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20190925203745.3173184-1-andriin@fb.com --- tools/lib/bpf/btf_dump.c | 93 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 87 insertions(+), 6 deletions(-) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 84b0661db7f3..ede55fec3618 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -48,6 +48,8 @@ struct btf_dump_type_aux_state { __u8 fwd_emitted: 1; /* whether unique non-duplicate name was already assigned */ __u8 name_resolved: 1; + /* whether type is referenced from any other type */ + __u8 referenced: 1; }; struct btf_dump { @@ -173,6 +175,7 @@ void btf_dump__free(struct btf_dump *d) free(d); } +static int btf_dump_mark_referenced(struct btf_dump *d); static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr); static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id); @@ -213,6 +216,11 @@ int btf_dump__dump_type(struct btf_dump *d, __u32 id) /* VOID is special */ d->type_states[0].order_state = ORDERED; d->type_states[0].emit_state = EMITTED; + + /* eagerly determine referenced types for anon enums */ + err = btf_dump_mark_referenced(d); + if (err) + return err; } d->emit_queue_cnt = 0; @@ -226,6 +234,79 @@ int btf_dump__dump_type(struct btf_dump *d, __u32 id) return 0; } +/* + * Mark all types that are referenced from any other type. This is used to + * determine top-level anonymous enums that need to be emitted as an + * independent type declarations. 
+ * Anonymous enums come in two flavors: either embedded in a struct's field + * definition, in which case they have to be declared inline as part of field + * type declaration; or as a top-level anonymous enum, typically used for + * declaring global constants. It's impossible to distinguish between two + * without knowning whether given enum type was referenced from other type: + * top-level anonymous enum won't be referenced by anything, while embedded + * one will. + */ +static int btf_dump_mark_referenced(struct btf_dump *d) +{ + int i, j, n = btf__get_nr_types(d->btf); + const struct btf_type *t; + __u16 vlen; + + for (i = 1; i <= n; i++) { + t = btf__type_by_id(d->btf, i); + vlen = btf_vlen(t); + + switch (btf_kind(t)) { + case BTF_KIND_INT: + case BTF_KIND_ENUM: + case BTF_KIND_FWD: + break; + + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_VAR: + d->type_states[t->type].referenced = 1; + break; + + case BTF_KIND_ARRAY: { + const struct btf_array *a = btf_array(t); + + d->type_states[a->index_type].referenced = 1; + d->type_states[a->type].referenced = 1; + break; + } + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m = btf_members(t); + + for (j = 0; j < vlen; j++, m++) + d->type_states[m->type].referenced = 1; + break; + } + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *p = btf_params(t); + + for (j = 0; j < vlen; j++, p++) + d->type_states[p->type].referenced = 1; + break; + } + case BTF_KIND_DATASEC: { + const struct btf_var_secinfo *v = btf_var_secinfos(t); + + for (j = 0; j < vlen; j++, v++) + d->type_states[v->type].referenced = 1; + break; + } + default: + return -EINVAL; + } + } + return 0; +} static int btf_dump_add_emit_queue_id(struct btf_dump *d, __u32 id) { __u32 *new_queue; @@ -395,7 +476,12 @@ static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr) } case BTF_KIND_ENUM: case BTF_KIND_FWD: - if (t->name_off != 0) { + /* + * non-anonymous or non-referenced enums are top-level + * declarations and should be emitted. Same logic can be + * applied to FWDs, it won't hurt anyways. + */ + if (t->name_off != 0 || !tstate->referenced) { err = btf_dump_add_emit_queue_id(d, id); if (err) return err; @@ -536,11 +622,6 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id) t = btf__type_by_id(d->btf, id); kind = btf_kind(t); - if (top_level_def && t->name_off == 0) { - pr_warning("unexpected nameless definition, id:[%u]\n", id); - return; - } - if (tstate->emit_state == EMITTING) { if (tstate->fwd_emitted) return; From e3439af4a339acd7fddbd6d59b8ecefaac07a611 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 25 Sep 2019 10:38:35 +0100 Subject: [PATCH 253/334] bpf: Clean up indentation issue in BTF kflag processing There is a statement that is indented one level too deeply, remove the extraneous tab. 
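For background on the btf_dump change above, a minimal C illustration of the two flavors of anonymous enum it distinguishes (the type and constant names here are invented):

    /* Top-level anonymous enum: nothing references it, so btf_dump must
     * now emit it as a standalone definition or the constants would be
     * missing from the generated C. */
    enum {
            EXAMPLE_FLAG_A = 1,
            EXAMPLE_FLAG_B = 2,
    };

    /* Embedded anonymous enum: referenced from a struct member, so it is
     * printed inline as part of the field's type declaration. */
    struct example_config {
            enum { MODE_OFF, MODE_ON } mode;
    };

    int main(void)
    {
            struct example_config c = { .mode = MODE_ON };

            return c.mode == MODE_ON ? 0 : EXAMPLE_FLAG_A + EXAMPLE_FLAG_B;
    }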
Signed-off-by: Colin Ian King Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20190925093835.19515-1-colin.king@canonical.com --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 722d38e543e9..29c7c06c6bd6 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2332,7 +2332,7 @@ static int btf_enum_check_kflag_member(struct btf_verifier_env *env, if (BITS_PER_BYTE_MASKED(struct_bits_off)) { btf_verifier_log_member(env, struct_type, member, "Member is not byte aligned"); - return -EINVAL; + return -EINVAL; } nr_bits = int_bitsize; From ff3ee62a55869b1a64266b5c15af16f2eb37c8a7 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 26 Sep 2019 04:37:18 -0500 Subject: [PATCH 254/334] smb3: missing ACL related flags Various SMB3 ACL related flags (for security descriptor and ACEs for example) were missing and some fields are different in SMB3 and CIFS. Update cifsacl.h definitions based on current MS-DTYP specification. Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg Reviewed-by: Aurelien Aptel --- fs/cifs/cifsacl.h | 81 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index eb428349f29a..439b99cefeb0 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -90,14 +90,93 @@ struct cifs_acl { __le32 num_aces; } __attribute__((packed)); +/* ACE types - see MS-DTYP 2.4.4.1 */ +#define ACCESS_ALLOWED_ACE_TYPE 0x00 +#define ACCESS_DENIED_ACE_TYPE 0x01 +#define SYSTEM_AUDIT_ACE_TYPE 0x02 +#define SYSTEM_ALARM_ACE_TYPE 0x03 +#define ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04 +#define ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 +#define ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 +#define SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 +#define SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 +#define ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09 +#define ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A +#define ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B +#define ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C +#define SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D +#define SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E /* Reserved */ +#define SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F +#define SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 /* reserved */ +#define SYSTEM_MANDATORY_LABEL_ACE_TYPE 0x11 +#define SYSTEM_RESOURCE_ATTRIBUTE_ACE_TYPE 0x12 +#define SYSTEM_SCOPED_POLICY_ID_ACE_TYPE 0x13 + +/* ACE flags */ +#define OBJECT_INHERIT_ACE 0x01 +#define CONTAINER_INHERIT_ACE 0x02 +#define NO_PROPAGATE_INHERIT_ACE 0x04 +#define INHERIT_ONLY_ACE 0x08 +#define INHERITED_ACE 0x10 +#define SUCCESSFUL_ACCESS_ACE_FLAG 0x40 +#define FAILED_ACCESS_ACE_FLAG 0x80 + struct cifs_ace { - __u8 type; + __u8 type; /* see above and MS-DTYP 2.4.4.1 */ __u8 flags; __le16 size; __le32 access_req; struct cifs_sid sid; /* ie UUID of user or group who gets these perms */ } __attribute__((packed)); +/* + * The current SMB3 form of security descriptor is similar to what was used for + * cifs (see above) but some fields are split, and fields in the struct below + * matches names of fields to the the spec, MS-DTYP (see sections 2.4.5 and + * 2.4.6). Note that "CamelCase" fields are used in this struct in order to + * match the MS-DTYP and MS-SMB2 specs which define the wire format. 
+ */ +struct smb3_sd { + __u8 Revision; /* revision level, MUST be one */ + __u8 Sbz1; /* only meaningful if 'RM' flag set below */ + __le16 Control; + __le32 OffsetOwner; + __le32 OffsetGroup; + __le32 OffsetSacl; + __le32 OffsetDacl; +} __packed; + +/* Meaning of 'Control' field flags */ +#define ACL_CONTROL_SR 0x0001 /* Self relative */ +#define ACL_CONTROL_RM 0x0002 /* Resource manager control bits */ +#define ACL_CONTROL_PS 0x0004 /* SACL protected from inherits */ +#define ACL_CONTROL_PD 0x0008 /* DACL protected from inherits */ +#define ACL_CONTROL_SI 0x0010 /* SACL Auto-Inherited */ +#define ACL_CONTROL_DI 0x0020 /* DACL Auto-Inherited */ +#define ACL_CONTROL_SC 0x0040 /* SACL computed through inheritance */ +#define ACL_CONTROL_DC 0x0080 /* DACL computed through inheritence */ +#define ACL_CONTROL_SS 0x0100 /* Create server ACL */ +#define ACL_CONTROL_DT 0x0200 /* DACL provided by trusteed source */ +#define ACL_CONTROL_SD 0x0400 /* SACL defaulted */ +#define ACL_CONTROL_SP 0x0800 /* SACL is present on object */ +#define ACL_CONTROL_DD 0x1000 /* DACL defaulted */ +#define ACL_CONTROL_DP 0x2000 /* DACL is present on object */ +#define ACL_CONTROL_GD 0x4000 /* Group was defaulted */ +#define ACL_CONTROL_OD 0x8000 /* User was defaulted */ + +/* Meaning of AclRevision flags */ +#define ACL_REVISION 0x02 /* See section 2.4.4.1 of MS-DTYP */ +#define ACL_REVISION_DS 0x04 /* Additional AceTypes allowed */ + +struct smb3_acl { + u8 AclRevision; /* revision level */ + u8 Sbz1; /* MBZ */ + __le16 AclSize; + __le16 AceCount; + __le16 Sbz2; /* MBZ */ +} __packed; + + /* * Minimum security identifier can be one for system defined Users * and Groups such as NULL SID and World or Built-in accounts such From a016e2794fc3a245a91946038dd8f34d65e53cc3 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 26 Sep 2019 12:31:20 -0700 Subject: [PATCH 255/334] CIFS: Fix oplock handling for SMB 2.1+ protocols There may be situations when a server negotiates SMB 2.1 protocol version or higher but responds to a CREATE request with an oplock rather than a lease. Currently the client doesn't handle such a case correctly: when another CREATE comes in the server sends an oplock break to the initial CREATE and the client doesn't send an ack back due to a wrong caching level being set (READ instead of RWH). Missing an oplock break ack makes the server wait until the break times out which dramatically increases the latency of the second CREATE. Fix this by properly detecting oplocks when using SMB 2.1 protocol version and higher. Cc: Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg --- fs/cifs/smb2ops.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 3010066a8709..4c0922596467 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3333,6 +3333,11 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) return; + /* Check if the server granted an oplock rather than a lease */ + if (oplock & SMB2_OPLOCK_LEVEL_EXCLUSIVE) + return smb2_set_oplock_level(cinode, oplock, epoch, + purge_cache); + if (oplock & SMB2_LEASE_READ_CACHING_HE) { new_oplock |= CIFS_CACHE_READ_FLG; strcat(message, "R"); From 253c892193ab58da6b1d94371285971b22c63260 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Thu, 26 Sep 2019 22:25:02 +1000 Subject: [PATCH 256/334] powerpc/eeh: Fix eeh eeh_debugfs_break_device() with SRIOV devices s/CONFIG_IOV/CONFIG_PCI_IOV/ Whoops. 
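A small stand-alone sketch of why the misspelled guard compiled silently: the preprocessor treats an unknown macro name in #ifndef as simply undefined, so the VF branch always returned -ENXIO even with SR-IOV built in:

    #include <stdio.h>

    /* Kconfig symbols become CONFIG_* macros at build time; the misspelled
     * CONFIG_IOV is never defined anywhere, so "#ifndef CONFIG_IOV" is
     * unconditionally true and no diagnostic is emitted. Defining the real
     * symbol here mimics a .config with SR-IOV enabled. */
    #define CONFIG_PCI_IOV 1

    int main(void)
    {
    #ifndef CONFIG_IOV
            printf("misspelled guard taken: VF path returns -ENXIO even though\n");
    #endif
    #ifdef CONFIG_PCI_IOV
            printf("SR-IOV support is actually built in\n");
    #endif
            return 0;
    }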
Fixes: bd6461cc7b3c ("powerpc/eeh: Add a eeh_dev_break debugfs interface") Signed-off-by: Oliver O'Halloran [mpe: Fixup the #endif comment as well] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190926122502.14826-1-oohall@gmail.com --- arch/powerpc/kernel/eeh.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 0a91dee51245..bc8a551013be 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1960,7 +1960,7 @@ static int eeh_debugfs_break_device(struct pci_dev *pdev) pci_err(pdev, "Going to break: %pR\n", bar); if (pdev->is_virtfn) { -#ifndef CONFIG_IOV +#ifndef CONFIG_PCI_IOV return -ENXIO; #else /* @@ -1980,7 +1980,7 @@ static int eeh_debugfs_break_device(struct pci_dev *pdev) pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV); pos += PCI_SRIOV_CTRL; bit = PCI_SRIOV_CTRL_MSE; -#endif /* !CONFIG_IOV */ +#endif /* !CONFIG_PCI_IOV */ } else { bit = PCI_COMMAND_MEMORY; pos = PCI_COMMAND; From 424adc329bcbf3bfad80a6b01cffe5f2ee5080a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 24 Sep 2019 18:02:59 +0200 Subject: [PATCH 257/334] dimlib: make DIMLIB a hidden symbol MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to Tal Gilboa, the only benefit from DIM comes from a driver that uses it. So it doesn't make sense to make this symbol user visible; instead, all drivers that use it should select it (as is already the case AFAICT). Signed-off-by: Uwe Kleine-König Acked-by: Randy Dunlap Signed-off-by: David S. Miller --- lib/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/Kconfig b/lib/Kconfig index 4e6b1c3e4c98..d7fc9eb33b9b 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -555,8 +555,7 @@ config SIGNATURE Implementation is done using GnuPG MPI library config DIMLIB - bool "DIM library" - default y + bool help Dynamic Interrupt Moderation library. Implements an algorithm for dynamically changing CQ moderation values From 31aefe14bc9f56566041303d733fda511d3a1c3e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 13:54:30 +0300 Subject: [PATCH 258/334] net: aquantia: Fix aq_vec_isr_legacy() return value The irqreturn_t type is an enum or an unsigned int in GCC. That creates two problems: the code can't detect whether the self->aq_hw_ops->hw_irq_read() call fails, and at the end the function always returns IRQ_HANDLED. drivers/net/ethernet/aquantia/atlantic/aq_vec.c:316 aq_vec_isr_legacy() warn: unsigned 'err' is never less than zero. drivers/net/ethernet/aquantia/atlantic/aq_vec.c:329 aq_vec_isr_legacy() warn: always true condition '(err >= 0) => (0-u32max >= 0)' Fixes: 970a2e9864b0 ("net: ethernet: aquantia: Vector operations") Signed-off-by: Dan Carpenter Reviewed-by: Igor Russkikh Signed-off-by: David S. Miller
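The Smatch warnings above, and the string of driver fixes that follow, stem from the same C pitfall. A self-contained sketch of it (illustrative, not taken from any of the drivers): when every enumerator of an enum is non-negative, GCC typically gives the enum an unsigned underlying type, so storing a negative errno in it and testing with '< 0' becomes an always-false comparison:

#include <stdio.h>

enum phy_mode { PHY_MODE_MII, PHY_MODE_GMII, PHY_MODE_RGMII };

int main(void)
{
	/* Simulate a helper such as of_get_phy_mode() failing with -EINVAL (-22). */
	enum phy_mode mode = (enum phy_mode)-22;

	if (mode < 0)		/* enum is unsigned here: always false */
		puts("error detected without a cast");
	if ((int)mode < 0)	/* the fix applied by these patches */
		puts("error detected with an (int) cast");
	return 0;
}

Only the second message prints; GCC with -Wextra even warns that the first comparison is always false.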
--- drivers/net/ethernet/aquantia/atlantic/aq_vec.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c index 28892b8acd0e..a95c263a45aa 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c @@ -306,15 +306,13 @@ irqreturn_t aq_vec_isr_legacy(int irq, void *private) { struct aq_vec_s *self = private; u64 irq_mask = 0U; - irqreturn_t err = 0; + int err; - if (!self) { - err = -EINVAL; - goto err_exit; - } + if (!self) + return IRQ_NONE; err = self->aq_hw_ops->hw_irq_read(self->aq_hw, &irq_mask); if (err < 0) - goto err_exit; + return IRQ_NONE; if (irq_mask) { self->aq_hw_ops->hw_irq_disable(self->aq_hw, @@ -322,11 +320,10 @@ irqreturn_t aq_vec_isr_legacy(int irq, void *private) napi_schedule(&self->napi); } else { self->aq_hw_ops->hw_irq_enable(self->aq_hw, 1U); - err = IRQ_NONE; + return IRQ_NONE; } -err_exit: - return err >= 0 ? IRQ_HANDLED : IRQ_NONE; + return IRQ_HANDLED; } cpumask_t *aq_vec_get_affinity_mask(struct aq_vec_s *self) From 286183147666fb76c057836c57d86e9e6f508bca Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 13:54:59 +0300 Subject: [PATCH 259/334] cxgb4: Signedness bug in init_one() The "chip" variable is an enum, and it's treated as an unsigned int by GCC in this context, so the error handling isn't triggered. Fixes: e8d452923ae6 ("cxgb4: clean up init_one") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 71854a19cebe..38024877751c 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -5701,7 +5701,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) whoami = t4_read_reg(adapter, PL_WHOAMI_A); pci_read_config_word(pdev, PCI_DEVICE_ID, &device_id); chip = t4_get_chip_type(adapter, CHELSIO_PCI_ID_VER(device_id)); - if (chip < 0) { + if ((int)chip < 0) { dev_err(&pdev->dev, "Device %d is not supported\n", device_id); err = chip; goto out_free_adapter; From 002dfe8085255b7bf1e0758c3d195c5412d35be9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 13:55:32 +0300 Subject: [PATCH 260/334] net: hisilicon: Fix signedness bug in hix5hd2_dev_probe() The "priv->phy_mode" variable is an enum and in this context GCC will treat it as unsigned, so the error handling will never trigger. Fixes: 57c5bc9ad7d7 ("net: hisilicon: add hix5hd2 mac driver") Signed-off-by: Dan Carpenter Signed-off-by: David S.
Miller --- drivers/net/ethernet/hisilicon/hix5hd2_gmac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c b/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c index 95a6b0926170..c41b19c760f8 100644 --- a/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c +++ b/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c @@ -1194,7 +1194,7 @@ static int hix5hd2_dev_probe(struct platform_device *pdev) goto err_free_mdio; priv->phy_mode = of_get_phy_mode(node); - if (priv->phy_mode < 0) { + if ((int)priv->phy_mode < 0) { netdev_err(ndev, "not find phy-mode\n"); ret = -EINVAL; goto err_mdiobus; From 25a584955f020d6ec499c513923fb220f3112d2b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 13:56:04 +0300 Subject: [PATCH 261/334] net: broadcom/bcmsysport: Fix signedness in bcm_sysport_probe() The "priv->phy_interface" variable is an enum and in this context GCC will treat it as unsigned so the error handling will never be triggered. Fixes: 80105befdb4b ("net: systemport: add Broadcom SYSTEMPORT Ethernet MAC driver") Signed-off-by: Dan Carpenter Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bcmsysport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 7df887e4024c..a977a459bd20 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -2481,7 +2481,7 @@ static int bcm_sysport_probe(struct platform_device *pdev) priv->phy_interface = of_get_phy_mode(dn); /* Default to GMII interface mode */ - if (priv->phy_interface < 0) + if ((int)priv->phy_interface < 0) priv->phy_interface = PHY_INTERFACE_MODE_GMII; /* In the case of a fixed PHY, the DT node associated From bd55f8ddbc437c225391ca8f487e7ec10243c4cc Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 13:56:38 +0300 Subject: [PATCH 262/334] net: netsec: Fix signedness bug in netsec_probe() The "priv->phy_interface" variable is an enum and in this context GCC will treat it as an unsigned int so the error handling is never triggered. Fixes: 533dd11a12f6 ("net: socionext: Add Synquacer NetSec driver") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/ethernet/socionext/netsec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index 1502fe8b0456..55db7fbd43cc 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -2007,7 +2007,7 @@ static int netsec_probe(struct platform_device *pdev) NETIF_MSG_LINK | NETIF_MSG_PROBE; priv->phy_interface = device_get_phy_mode(&pdev->dev); - if (priv->phy_interface < 0) { + if ((int)priv->phy_interface < 0) { dev_err(&pdev->dev, "missing required property 'phy-mode'\n"); ret = -ENODEV; goto free_ndev; From ced81eb84d6ab0a993ff79c04cc0238d44415af4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 13:57:14 +0300 Subject: [PATCH 263/334] enetc: Fix a signedness bug in enetc_of_get_phy() The "priv->if_mode" is type phy_interface_t which is an enum. In this context GCC will treat the enum as an unsigned int so this error handling is never triggered. Fixes: d4fd0404c1c9 ("enetc: Introduce basic PF and VF ENETC ethernet drivers") Signed-off-by: Dan Carpenter Signed-off-by: David S. 
Miller --- drivers/net/ethernet/freescale/enetc/enetc_pf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c index 7d6513ff8507..b73421c3e25b 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c @@ -785,7 +785,7 @@ static int enetc_of_get_phy(struct enetc_ndev_priv *priv) } priv->if_mode = of_get_phy_mode(np); - if (priv->if_mode < 0) { + if ((int)priv->if_mode < 0) { dev_err(priv->dev, "missing phy type\n"); of_node_put(priv->phy_node); if (of_phy_is_fixed_link(np)) From 7f9e88e6ef8c971f2c638b5ff7044c59b5d0f58d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 13:57:50 +0300 Subject: [PATCH 264/334] net: socionext: Fix a signedness bug in ave_probe() The "phy_mode" variable is an enum and in this context GCC treats it as an unsigned int so the error handling is never triggered. Fixes: 4c270b55a5af ("net: ethernet: socionext: add AVE ethernet driver") Signed-off-by: Dan Carpenter Reviewed-by: Kunihiko Hayashi Signed-off-by: David S. Miller --- drivers/net/ethernet/socionext/sni_ave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c index 10d0c3e478ab..d047a53f34f2 100644 --- a/drivers/net/ethernet/socionext/sni_ave.c +++ b/drivers/net/ethernet/socionext/sni_ave.c @@ -1566,7 +1566,7 @@ static int ave_probe(struct platform_device *pdev) np = dev->of_node; phy_mode = of_get_phy_mode(np); - if (phy_mode < 0) { + if ((int)phy_mode < 0) { dev_err(dev, "phy-mode not found\n"); return -EINVAL; } From f10210517a2f37feea2edf85eb34c98977265c16 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 13:58:22 +0300 Subject: [PATCH 265/334] net: stmmac: dwmac-meson8b: Fix signedness bug in probe The "dwmac->phy_mode" is an enum and in this context GCC treats it as an unsigned int so the error handling is never triggered. Fixes: 566e82516253 ("net: stmmac: add a glue driver for the Amlogic Meson 8b / GXBB DWMAC") Signed-off-by: Dan Carpenter Reviewed-by: Martin Blumenstingl Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c index 9cda29e4b89d..306da8f6b7d5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c @@ -339,7 +339,7 @@ static int meson8b_dwmac_probe(struct platform_device *pdev) dwmac->dev = &pdev->dev; dwmac->phy_mode = of_get_phy_mode(pdev->dev.of_node); - if (dwmac->phy_mode < 0) { + if ((int)dwmac->phy_mode < 0) { dev_err(&pdev->dev, "missing phy-mode property\n"); ret = -EINVAL; goto err_remove_config_dt; From 73e211e11be86715d66bd3c9d38b3c34b05fca9a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 13:59:11 +0300 Subject: [PATCH 266/334] net: axienet: fix a signedness bug in probe The "lp->phy_mode" is an enum but in this context GCC treats it as an unsigned int so the error handling is never triggered. Fixes: ee06b1728b95 ("net: axienet: add support for standard phy-mode binding") Signed-off-by: Dan Carpenter Reviewed-by: Radhey Shyam Pandey Signed-off-by: David S. 
Miller --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 4fc627fb4d11..676006f32f91 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -1762,7 +1762,7 @@ static int axienet_probe(struct platform_device *pdev) } } else { lp->phy_mode = of_get_phy_mode(pdev->dev.of_node); - if (lp->phy_mode < 0) { + if ((int)lp->phy_mode < 0) { ret = -EINVAL; goto free_netdev; } From d7eb651212fdbafa82d485d8e76095ac3b14c193 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 14:01:00 +0300 Subject: [PATCH 267/334] of: mdio: Fix a signedness bug in of_phy_get_and_connect() The "iface" variable is an enum and in this context GCC treats it as an unsigned int so the error handling is never triggered. Fixes: b78624125304 ("of_mdio: Abstract a general interface for phy connect") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/of/of_mdio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index 000b95787df1..bd6129db6417 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c @@ -362,7 +362,7 @@ struct phy_device *of_phy_get_and_connect(struct net_device *dev, int ret; iface = of_get_phy_mode(np); - if (iface < 0) + if ((int)iface < 0) return NULL; if (of_phy_is_fixed_link(np)) { ret = of_phy_register_fixed_link(np); From 1a4b62a0b8a3b81eca24366f63e214a7144b9f02 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 14:05:24 +0300 Subject: [PATCH 268/334] net: nixge: Fix a signedness bug in nixge_probe() The "priv->phy_mode" is an enum and in this context GCC will treat it as an unsigned int so it can never be less than zero. Fixes: 492caffa8a1a ("net: ethernet: nixge: Add support for National Instruments XGE netdev") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/ethernet/ni/nixge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ni/nixge.c b/drivers/net/ethernet/ni/nixge.c index 0b384f97d2fd..2761f3a3ae50 100644 --- a/drivers/net/ethernet/ni/nixge.c +++ b/drivers/net/ethernet/ni/nixge.c @@ -1347,7 +1347,7 @@ static int nixge_probe(struct platform_device *pdev) } priv->phy_mode = of_get_phy_mode(pdev->dev.of_node); - if (priv->phy_mode < 0) { + if ((int)priv->phy_mode < 0) { netdev_err(ndev, "not find \"phy-mode\" property\n"); err = -EINVAL; goto unregister_mdio; From 231042181dc9d6122c6faba64e99ccb25f13cc6c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 14:05:54 +0300 Subject: [PATCH 269/334] net: ethernet: stmmac: Fix signedness bug in ipq806x_gmac_of_parse() The "gmac->phy_mode" variable is an enum and in this context GCC will treat it as an unsigned int so the error handling will never be triggered. Fixes: b1c17215d718 ("stmmac: add ipq806x glue layer") Signed-off-by: Dan Carpenter Signed-off-by: David S. 
Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c index 2c6d7c69c8f7..0d21082ceb93 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c @@ -191,7 +191,7 @@ static int ipq806x_gmac_of_parse(struct ipq806x_gmac *gmac) struct device *dev = &gmac->pdev->dev; gmac->phy_mode = of_get_phy_mode(dev->of_node); - if (gmac->phy_mode < 0) { + if ((int)gmac->phy_mode < 0) { dev_err(dev, "missing phy mode property\n"); return -EINVAL; } From 0355d6c1d591b8f9e281783ec0cf95fbed893194 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Sep 2019 12:29:34 -0700 Subject: [PATCH 270/334] kcm: disable preemption in kcm_parse_func_strparser() After commit a2c11b034142 ("kcm: use BPF_PROG_RUN") syzbot easily triggers the warning in cant_sleep(). As explained in commit 6cab5e90ab2b ("bpf: run bpf programs with preemption disabled") we need to disable preemption before running bpf programs. BUG: assuming atomic context at net/kcm/kcmsock.c:382 in_atomic(): 0, irqs_disabled(): 0, pid: 7, name: kworker/u4:0 3 locks held by kworker/u4:0/7: #0: ffff888216726128 ((wq_completion)kstrp){+.+.}, at: __write_once_size include/linux/compiler.h:226 [inline] #0: ffff888216726128 ((wq_completion)kstrp){+.+.}, at: arch_atomic64_set arch/x86/include/asm/atomic64_64.h:34 [inline] #0: ffff888216726128 ((wq_completion)kstrp){+.+.}, at: atomic64_set include/asm-generic/atomic-instrumented.h:855 [inline] #0: ffff888216726128 ((wq_completion)kstrp){+.+.}, at: atomic_long_set include/asm-generic/atomic-long.h:40 [inline] #0: ffff888216726128 ((wq_completion)kstrp){+.+.}, at: set_work_data kernel/workqueue.c:620 [inline] #0: ffff888216726128 ((wq_completion)kstrp){+.+.}, at: set_work_pool_and_clear_pending kernel/workqueue.c:647 [inline] #0: ffff888216726128 ((wq_completion)kstrp){+.+.}, at: process_one_work+0x88b/0x1740 kernel/workqueue.c:2240 #1: ffff8880a989fdc0 ((work_completion)(&strp->work)){+.+.}, at: process_one_work+0x8c1/0x1740 kernel/workqueue.c:2244 #2: ffff888098998d10 (sk_lock-AF_INET){+.+.}, at: lock_sock include/net/sock.h:1522 [inline] #2: ffff888098998d10 (sk_lock-AF_INET){+.+.}, at: strp_sock_lock+0x2e/0x40 net/strparser/strparser.c:440 CPU: 0 PID: 7 Comm: kworker/u4:0 Not tainted 5.3.0+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: kstrp strp_work Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 __cant_sleep kernel/sched/core.c:6826 [inline] __cant_sleep.cold+0xa4/0xbc kernel/sched/core.c:6803 kcm_parse_func_strparser+0x54/0x200 net/kcm/kcmsock.c:382 __strp_recv+0x5dc/0x1b20 net/strparser/strparser.c:221 strp_recv+0xcf/0x10b net/strparser/strparser.c:343 tcp_read_sock+0x285/0xa00 net/ipv4/tcp.c:1639 strp_read_sock+0x14d/0x200 net/strparser/strparser.c:366 do_strp_work net/strparser/strparser.c:414 [inline] strp_work+0xe3/0x130 net/strparser/strparser.c:423 process_one_work+0x9af/0x1740 kernel/workqueue.c:2269 Fixes: a2c11b034142 ("kcm: use BPF_PROG_RUN") Fixes: 6cab5e90ab2b ("bpf: run bpf programs with preemption disabled") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. 
Miller --- net/kcm/kcmsock.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 8f12f5c6ab87..ea9e73428ed9 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -378,8 +378,12 @@ static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb) { struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); struct bpf_prog *prog = psock->bpf_prog; + int res; - return BPF_PROG_RUN(prog, skb); + preempt_disable(); + res = BPF_PROG_RUN(prog, skb); + preempt_enable(); + return res; } static int kcm_read_sock_done(struct strparser *strp, int err) From 159d2c7d8106177bd9a986fd005a311fe0d11285 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Sep 2019 13:11:26 -0700 Subject: [PATCH 271/334] sch_netem: fix rcu splat in netem_enqueue() qdisc_root() use from netem_enqueue() triggers a lockdep warning. __dev_queue_xmit() uses rcu_read_lock_bh() which is not equivalent to rcu_read_lock() + local_bh_disable() as far as lockdep is concerned. WARNING: suspicious RCU usage 5.3.0-rc7+ #0 Not tainted ----------------------------- include/net/sch_generic.h:492 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 3 locks held by syz-executor427/8855: #0: 00000000b5525c01 (rcu_read_lock_bh){....}, at: lwtunnel_xmit_redirect include/net/lwtunnel.h:92 [inline] #0: 00000000b5525c01 (rcu_read_lock_bh){....}, at: ip_finish_output2+0x2dc/0x2570 net/ipv4/ip_output.c:214 #1: 00000000b5525c01 (rcu_read_lock_bh){....}, at: __dev_queue_xmit+0x20a/0x3650 net/core/dev.c:3804 #2: 00000000364bae92 (&(&sch->q.lock)->rlock){+.-.}, at: spin_lock include/linux/spinlock.h:338 [inline] #2: 00000000364bae92 (&(&sch->q.lock)->rlock){+.-.}, at: __dev_xmit_skb net/core/dev.c:3502 [inline] #2: 00000000364bae92 (&(&sch->q.lock)->rlock){+.-.}, at: __dev_queue_xmit+0x14b8/0x3650 net/core/dev.c:3838 stack backtrace: CPU: 0 PID: 8855 Comm: syz-executor427 Not tainted 5.3.0-rc7+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 lockdep_rcu_suspicious+0x153/0x15d kernel/locking/lockdep.c:5357 qdisc_root include/net/sch_generic.h:492 [inline] netem_enqueue+0x1cfb/0x2d80 net/sched/sch_netem.c:479 __dev_xmit_skb net/core/dev.c:3527 [inline] __dev_queue_xmit+0x15d2/0x3650 net/core/dev.c:3838 dev_queue_xmit+0x18/0x20 net/core/dev.c:3902 neigh_hh_output include/net/neighbour.h:500 [inline] neigh_output include/net/neighbour.h:509 [inline] ip_finish_output2+0x1726/0x2570 net/ipv4/ip_output.c:228 __ip_finish_output net/ipv4/ip_output.c:308 [inline] __ip_finish_output+0x5fc/0xb90 net/ipv4/ip_output.c:290 ip_finish_output+0x38/0x1f0 net/ipv4/ip_output.c:318 NF_HOOK_COND include/linux/netfilter.h:294 [inline] ip_mc_output+0x292/0xf40 net/ipv4/ip_output.c:417 dst_output include/net/dst.h:436 [inline] ip_local_out+0xbb/0x190 net/ipv4/ip_output.c:125 ip_send_skb+0x42/0xf0 net/ipv4/ip_output.c:1555 udp_send_skb.isra.0+0x6b2/0x1160 net/ipv4/udp.c:887 udp_sendmsg+0x1e96/0x2820 net/ipv4/udp.c:1174 inet_sendmsg+0x9e/0xe0 net/ipv4/af_inet.c:807 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0xd7/0x130 net/socket.c:657 ___sys_sendmsg+0x3e2/0x920 net/socket.c:2311 __sys_sendmmsg+0x1bf/0x4d0 net/socket.c:2413 __do_sys_sendmmsg net/socket.c:2442 [inline] __se_sys_sendmmsg net/socket.c:2439 [inline] __x64_sys_sendmmsg+0x9d/0x100 net/socket.c:2439
do_syscall_64+0xfd/0x6a0 arch/x86/entry/common.c:296 entry_SYSCALL_64_after_hwframe+0x49/0xbe Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/sch_generic.h | 5 +++++ net/sched/sch_netem.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 43f5b7ed02bd..637548d54b3e 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -494,6 +494,11 @@ static inline struct Qdisc *qdisc_root(const struct Qdisc *qdisc) return q; } +static inline struct Qdisc *qdisc_root_bh(const struct Qdisc *qdisc) +{ + return rcu_dereference_bh(qdisc->dev_queue->qdisc); +} + static inline struct Qdisc *qdisc_root_sleeping(const struct Qdisc *qdisc) { return qdisc->dev_queue->qdisc_sleeping; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index f5cb35e550f8..0e44039e729c 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -476,7 +476,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, * skb will be queued. */ if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) { - struct Qdisc *rootq = qdisc_root(sch); + struct Qdisc *rootq = qdisc_root_bh(sch); u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ q->duplicate = 0; From 2b6fd3ea438c742d162a40a124b0181922633163 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 25 Sep 2019 02:47:07 +0200 Subject: [PATCH 272/334] net: dsa: qca8k: Fix port enable for CPU port MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CPU port does not have a PHY connected to it. So calling phy_support_asym_pause() results in an Oops. As with other DSA drivers, add a guard that the port is a user port. Reported-by: Michal Vokáč Fixes: 0394a63acfe2 ("net: dsa: enable and disable all ports") Signed-off-by: Andrew Lunn Tested-by: Michal Vokáč Signed-off-by: David S. Miller --- drivers/net/dsa/qca8k.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index 16f15c93a102..684aa51684db 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -936,6 +936,9 @@ qca8k_port_enable(struct dsa_switch *ds, int port, { struct qca8k_priv *priv = (struct qca8k_priv *)ds->priv; + if (!dsa_is_user_port(ds, port)) + return 0; + qca8k_port_set_status(priv, port, 1); priv->port_sts[port].enabled = 1; From 768fb61fcc13b2acaca758275d54c09a65e2968b Mon Sep 17 00:00:00 2001 From: Allan Zhang Date: Wed, 25 Sep 2019 16:43:12 -0700 Subject: [PATCH 273/334] bpf: Fix bpf_event_output re-entry issue A BPF_PROG_TYPE_SOCK_OPS program can reenter bpf_event_output because it can be called from atomic and non-atomic contexts, and we don't have bpf_prog_active to prevent it from happening. This patch enables 3 levels of nesting to support normal, irq and nmi contexts. We can easily reproduce the issue by running netperf crr mode with 100 flows and 10 threads from the netperf client side.
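In outline, the guard (shown in full in the diff below) keeps a per-CPU nesting depth and hands each context its own scratch buffers. A simplified sketch with illustrative names, not the patch's own:

	int level = this_cpu_inc_return(output_nest_level);	/* 1 = task, 2 = irq, 3 = nmi */
	u64 ret;

	if (WARN_ON_ONCE(level > 3)) {		/* nested deeper than we have buffers for */
		ret = -EBUSY;
		goto out;
	}
	/* each nesting level owns a private pt_regs and sample-data slot */
	regs = this_cpu_ptr(&nested_regs.regs[level - 1]);
	sd = this_cpu_ptr(&nested_sds.sds[level - 1]);
	/* ... emit the perf event using regs and sd ... */
out:
	this_cpu_dec(output_nest_level);	/* always unwind the depth */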
Here is the whole stack dump: [ 515.228898] WARNING: CPU: 20 PID: 14686 at kernel/trace/bpf_trace.c:549 bpf_event_output+0x1f9/0x220 [ 515.228903] CPU: 20 PID: 14686 Comm: tcp_crr Tainted: G W 4.15.0-smp-fixpanic #44 [ 515.228904] Hardware name: Intel TBG,ICH10/Ikaria_QC_1b, BIOS 1.22.0 06/04/2018 [ 515.228905] RIP: 0010:bpf_event_output+0x1f9/0x220 [ 515.228906] RSP: 0018:ffff9a57ffc03938 EFLAGS: 00010246 [ 515.228907] RAX: 0000000000000012 RBX: 0000000000000001 RCX: 0000000000000000 [ 515.228907] RDX: 0000000000000000 RSI: 0000000000000096 RDI: ffffffff836b0f80 [ 515.228908] RBP: ffff9a57ffc039c8 R08: 0000000000000004 R09: 0000000000000012 [ 515.228908] R10: ffff9a57ffc1de40 R11: 0000000000000000 R12: 0000000000000002 [ 515.228909] R13: ffff9a57e13bae00 R14: 00000000ffffffff R15: ffff9a57ffc1e2c0 [ 515.228910] FS: 00007f5a3e6ec700(0000) GS:ffff9a57ffc00000(0000) knlGS:0000000000000000 [ 515.228910] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 515.228911] CR2: 0000537082664fff CR3: 000000061fed6002 CR4: 00000000000226f0 [ 515.228911] Call Trace: [ 515.228913] [ 515.228919] [] bpf_sockopt_event_output+0x3b/0x50 [ 515.228923] [] ? bpf_ktime_get_ns+0xe/0x10 [ 515.228927] [] ? __cgroup_bpf_run_filter_sock_ops+0x85/0x100 [ 515.228930] [] ? tcp_init_transfer+0x125/0x150 [ 515.228933] [] ? tcp_finish_connect+0x89/0x110 [ 515.228936] [] ? tcp_rcv_state_process+0x704/0x1010 [ 515.228939] [] ? sk_filter_trim_cap+0x53/0x2a0 [ 515.228942] [] ? tcp_v6_inbound_md5_hash+0x6f/0x1d0 [ 515.228945] [] ? tcp_v6_do_rcv+0x1c0/0x460 [ 515.228947] [] ? tcp_v6_rcv+0x9f8/0xb30 [ 515.228951] [] ? ip6_route_input+0x190/0x220 [ 515.228955] [] ? ip6_protocol_deliver_rcu+0x6d/0x450 [ 515.228958] [] ? ip6_rcv_finish+0xb6/0x170 [ 515.228961] [] ? ip6_protocol_deliver_rcu+0x450/0x450 [ 515.228963] [] ? ipv6_rcv+0x61/0xe0 [ 515.228966] [] ? ipv6_list_rcv+0x330/0x330 [ 515.228969] [] ? __netif_receive_skb_one_core+0x5b/0xa0 [ 515.228972] [] ? __netif_receive_skb+0x21/0x70 [ 515.228975] [] ? process_backlog+0xb2/0x150 [ 515.228978] [] ? net_rx_action+0x16f/0x410 [ 515.228982] [] ? __do_softirq+0xdd/0x305 [ 515.228986] [] ? irq_exit+0x9c/0xb0 [ 515.228989] [] ? smp_call_function_single_interrupt+0x65/0x120 [ 515.228991] [] ? call_function_single_interrupt+0x81/0x90 [ 515.228992] [ 515.228996] [] ? io_serial_in+0x20/0x20 [ 515.229000] [] ? console_unlock+0x230/0x490 [ 515.229003] [] ? vprintk_emit+0x26a/0x2a0 [ 515.229006] [] ? vprintk_default+0x1f/0x30 [ 515.229008] [] ? vprintk_func+0x35/0x70 [ 515.229011] [] ? printk+0x50/0x66 [ 515.229013] [] ? bpf_event_output+0xb7/0x220 [ 515.229016] [] ? bpf_sockopt_event_output+0x3b/0x50 [ 515.229019] [] ? bpf_ktime_get_ns+0xe/0x10 [ 515.229023] [] ? release_sock+0x97/0xb0 [ 515.229026] [] ? tcp_recvmsg+0x31a/0xda0 [ 515.229029] [] ? __cgroup_bpf_run_filter_sock_ops+0x85/0x100 [ 515.229032] [] ? tcp_set_state+0x191/0x1b0 [ 515.229035] [] ? tcp_disconnect+0x2e/0x600 [ 515.229038] [] ? tcp_close+0x3eb/0x460 [ 515.229040] [] ? inet_release+0x42/0x70 [ 515.229043] [] ? inet6_release+0x39/0x50 [ 515.229046] [] ? __sock_release+0x4d/0xd0 [ 515.229049] [] ? sock_close+0x15/0x20 [ 515.229052] [] ? __fput+0xe7/0x1f0 [ 515.229055] [] ? ____fput+0xe/0x10 [ 515.229058] [] ? task_work_run+0x82/0xb0 [ 515.229061] [] ? exit_to_usermode_loop+0x7e/0x11f [ 515.229064] [] ? do_syscall_64+0x111/0x130 [ 515.229067] [] ? 
entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Fixes: a5a3a828cd00 ("bpf: add perf event notificaton support for sock_ops") Signed-off-by: Allan Zhang Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Reviewed-by: Eric Dumazet Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20190925234312.94063-2-allanzhang@google.com --- kernel/trace/bpf_trace.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ca1255d14576..3e38a010003c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -500,14 +500,17 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; -static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); -static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd); +static DEFINE_PER_CPU(int, bpf_event_output_nest_level); +struct bpf_nested_pt_regs { + struct pt_regs regs[3]; +}; +static DEFINE_PER_CPU(struct bpf_nested_pt_regs, bpf_pt_regs); +static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { - struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd); - struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); + int nest_level = this_cpu_inc_return(bpf_event_output_nest_level); struct perf_raw_frag frag = { .copy = ctx_copy, .size = ctx_size, @@ -522,12 +525,25 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, .data = meta, }, }; + struct perf_sample_data *sd; + struct pt_regs *regs; + u64 ret; + + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) { + ret = -EBUSY; + goto out; + } + sd = this_cpu_ptr(&bpf_misc_sds.sds[nest_level - 1]); + regs = this_cpu_ptr(&bpf_pt_regs.regs[nest_level - 1]); perf_fetch_caller_regs(regs); perf_sample_data_init(sd, 0, 0); sd->raw = &raw; - return __bpf_perf_event_output(regs, map, flags, sd); + ret = __bpf_perf_event_output(regs, map, flags, sd); +out: + this_cpu_dec(bpf_event_output_nest_level); + return ret; } BPF_CALL_0(bpf_get_current_task) From 4f6570d7206bb052f42718d55fbe72977f0318ea Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Sep 2019 08:01:14 -0700 Subject: [PATCH 274/334] ipv6: add priority parameter to ip6_xmit() Currently, ip6_xmit() sets skb->priority based on sk->sk_priority This is not desirable for TCP since TCP shares the same ctl socket for a given netns. We want to be able to send RST or ACK packets with a non zero skb->priority. This patch has no functional change. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/ipv6.h | 2 +- net/dccp/ipv6.c | 5 +++-- net/ipv6/inet6_connection_sock.c | 2 +- net/ipv6/ip6_output.c | 4 ++-- net/ipv6/tcp_ipv6.c | 6 ++++-- net/sctp/ipv6.c | 2 +- 6 files changed, 12 insertions(+), 9 deletions(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 8dfc65639aa4..009605c56f20 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -981,7 +981,7 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb); * upper-layer output functions */ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, - __u32 mark, struct ipv6_txoptions *opt, int tclass); + __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority); int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 1b7381ff787b..25aab672fc99 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -230,7 +230,8 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass); + err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass, + sk->sk_priority); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -284,7 +285,7 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(skb, dst); - ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0); + ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0, 0); DCCP_INC_STATS(DCCP_MIB_OUTSEGS); DCCP_INC_STATS(DCCP_MIB_OUTRSTS); return; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 4da24aa6c696..0a0945a5b30d 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -133,7 +133,7 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused fl6.daddr = sk->sk_v6_daddr; res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt), - np->tclass); + np->tclass, sk->sk_priority); rcu_read_unlock(); return res; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 89a4c7c2e25d..edadee4a7e76 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -193,7 +193,7 @@ bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) * which are using proper atomic operations or spinlocks. 
*/ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, - __u32 mark, struct ipv6_txoptions *opt, int tclass) + __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority) { struct net *net = sock_net(sk); const struct ipv6_pinfo *np = inet6_sk(sk); @@ -258,7 +258,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, hdr->daddr = *first_hop; skb->protocol = htons(ETH_P_IPV6); - skb->priority = sk->sk_priority; + skb->priority = priority; skb->mark = mark; mtu = dst_mtu(dst); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 87f44d3250ee..806064c28867 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -512,7 +512,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass); + err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass, + sk->sk_priority); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -907,7 +908,8 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(buff, dst); - ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass); + ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass, + 0); TCP_INC_STATS(net, TCP_MIB_OUTSEGS); if (rst) TCP_INC_STATS(net, TCP_MIB_OUTRSTS); diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index e5f2fc726a98..dd860fea0148 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -215,7 +215,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) rcu_read_lock(); res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt), - tclass); + tclass, sk->sk_priority); rcu_read_unlock(); return res; } From e9a5dceee56cb527a3498f1a59bd8726baa1e717 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Sep 2019 08:01:15 -0700 Subject: [PATCH 275/334] ipv6: tcp: provide sk->sk_priority to ctl packets We can populate skb->priority for some ctl packets instead of always using zero. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv6/tcp_ipv6.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 806064c28867..5f557bf27da2 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -804,7 +804,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, int rst, - u8 tclass, __be32 label) + u8 tclass, __be32 label, u32 priority) { const struct tcphdr *th = tcp_hdr(skb); struct tcphdr *t1; @@ -909,7 +909,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 if (!IS_ERR(dst)) { skb_dst_set(buff, dst); ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass, - 0); + priority); TCP_INC_STATS(net, TCP_MIB_OUTSEGS); if (rst) TCP_INC_STATS(net, TCP_MIB_OUTRSTS); @@ -932,6 +932,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) struct sock *sk1 = NULL; #endif __be32 label = 0; + u32 priority = 0; struct net *net; int oif = 0; @@ -992,6 +993,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) trace_tcp_send_reset(sk, skb); if (np->repflow) label = ip6_flowlabel(ipv6h); + priority = sk->sk_priority; } if (sk->sk_state == TCP_TIME_WAIT) label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel); @@ -1001,7 +1003,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) } tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, - label); + label, priority); #ifdef CONFIG_TCP_MD5SIG out: @@ -1012,10 +1014,10 @@ out: static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, u8 tclass, - __be32 label) + __be32 label, u32 priority) { tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, key, 0, - tclass, label); + tclass, label, priority); } static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) @@ -1027,7 +1029,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_time_stamp_raw() + tcptw->tw_ts_offset, tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), - tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel)); + tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), 0); inet_twsk_put(tw); } @@ -1050,7 +1052,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, sk->sk_bound_dev_if, tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr), - 0, 0); + 0, 0, sk->sk_priority); } From f6c0f5d209fa80eb808e08aa4206f6e264041ef6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Sep 2019 08:01:16 -0700 Subject: [PATCH 276/334] tcp: honor SO_PRIORITY in TIME_WAIT state ctl packets sent on behalf of TIME_WAIT sockets currently have a zero skb->priority, which can cause various problems. In this patch we : - add a tw_priority field in struct inet_timewait_sock. - populate it from sk->sk_priority when a TIME_WAIT is created. - For IPv4, change ip_send_unicast_reply() and its two callers to propagate tw_priority correctly. ip_send_unicast_reply() no longer changes sk->sk_priority. - For IPv6, make sure TIME_WAIT sockets pass their tw_priority field to tcp_v6_send_response() and tcp_v6_send_ack(). Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/inet_timewait_sock.h | 1 + net/ipv4/ip_output.c | 1 - net/ipv4/tcp_ipv4.c | 4 ++++ net/ipv4/tcp_minisocks.c | 1 + net/ipv6/tcp_ipv6.c | 6 ++++-- 5 files changed, 10 insertions(+), 3 deletions(-) diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index aef38c140014..dfd919b3119e 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -71,6 +71,7 @@ struct inet_timewait_sock { tw_pad : 2, /* 2 bits hole */ tw_tos : 8; u32 tw_txhash; + u32 tw_priority; struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; }; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index a77c3a4c24de..28fca408812c 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1694,7 +1694,6 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, inet_sk(sk)->tos = arg->tos; - sk->sk_priority = skb->priority; sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; sk->sk_sndbuf = sysctl_wmem_default; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fd394ad179a0..2ee45e3755e9 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -771,6 +771,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) if (sk) { ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_mark : sk->sk_mark; + ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? + inet_twsk(sk)->tw_priority : sk->sk_priority; transmit_time = tcp_transmit_time(sk); } ip_send_unicast_reply(ctl_sk, @@ -866,6 +868,8 @@ static void tcp_v4_send_ack(const struct sock *sk, ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_mark : sk->sk_mark; + ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 
+ inet_twsk(sk)->tw_priority : sk->sk_priority; transmit_time = tcp_transmit_time(sk); ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 8bcaf2586b68..bb140a5db8c0 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -266,6 +266,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_transparent = inet->transparent; tw->tw_mark = sk->sk_mark; + tw->tw_priority = sk->sk_priority; tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_snd_nxt = tp->snd_nxt; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5f557bf27da2..e3d9f4559c99 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -995,8 +995,10 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) label = ip6_flowlabel(ipv6h); priority = sk->sk_priority; } - if (sk->sk_state == TCP_TIME_WAIT) + if (sk->sk_state == TCP_TIME_WAIT) { label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel); + priority = inet_twsk(sk)->tw_priority; + } } else { if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET) label = ip6_flowlabel(ipv6h); @@ -1029,7 +1031,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_time_stamp_raw() + tcptw->tw_ts_offset, tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), - tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), 0); + tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority); inet_twsk_put(tw); } From 05733434ee9ae6548723a808647248583e347cca Mon Sep 17 00:00:00 2001 From: Ka-Cheong Poon Date: Tue, 24 Sep 2019 08:51:16 -0700 Subject: [PATCH 277/334] net/rds: Check laddr_check before calling it In rds_bind(), laddr_check is called without first checking whether it is NULL. Also, rs_transport should be reset if rds_add_bound() fails. Fixes: c5c1a030a7db ("net/rds: An rds_sock is added too early to the hash table") Reported-by: syzbot+fae39afd2101a17ec624@syzkaller.appspotmail.com Signed-off-by: Ka-Cheong Poon Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/bind.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/rds/bind.c b/net/rds/bind.c index 20c156a73e73..5b5fb4ca8d3e 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -244,7 +244,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) */ if (rs->rs_transport) { trans = rs->rs_transport; - if (trans->laddr_check(sock_net(sock->sk), + if (!trans->laddr_check || + trans->laddr_check(sock_net(sock->sk), binding_addr, scope_id) != 0) { ret = -ENOPROTOOPT; goto out; @@ -263,6 +264,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) sock_set_flag(sk, SOCK_RCU_FREE); ret = rds_add_bound(rs, binding_addr, &port, scope_id); + if (ret) + rs->rs_transport = NULL; out: release_sock(sk); From 4ce70b4aed5752332b268909336b351721965dc4 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 24 Sep 2019 18:51:16 +0300 Subject: [PATCH 278/334] net: sched: sch_htb: don't call qdisc_put() while holding tree lock Recent changes that removed the rtnl dependency from the tc rules update path also made tcf_block_put() a sleeping function. This function is called from ops->destroy() of several Qdisc implementations, which in turn is called by qdisc_put(). Some Qdiscs call qdisc_put() while holding the sch tree spinlock, which results in a sleeping-while-atomic BUG.
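The fix in this and the following two patches follows a single pattern, sketched here in simplified form (the actual diffs follow): detach and reset the child qdisc while holding the tree lock, but drop the last reference only after the lock is released, because qdisc_put() can end up sleeping in tcf_block_put():

	sch_tree_lock(sch);
	old = q->qdisc;			/* detach under the qdisc tree lock */
	qdisc_purge_queue(old);		/* reset it while still protected */
	q->qdisc = new;
	sch_tree_unlock(sch);
	qdisc_put(old);			/* may sleep: safe now that the lock is released */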
Steps to reproduce for htb: tc qdisc add dev ens1f0 root handle 1: htb default 12 tc class add dev ens1f0 parent 1: classid 1:1 htb rate 100kbps ceil 100kbps tc qdisc add dev ens1f0 parent 1:1 handle 40: sfq perturb 10 tc class add dev ens1f0 parent 1:1 classid 1:2 htb rate 100kbps ceil 100kbps Resulting dmesg: [ 4791.148551] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:909 [ 4791.151354] in_atomic(): 1, irqs_disabled(): 0, pid: 27273, name: tc [ 4791.152805] INFO: lockdep is turned off. [ 4791.153605] CPU: 19 PID: 27273 Comm: tc Tainted: G W 5.3.0-rc8+ #721 [ 4791.154336] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017 [ 4791.155075] Call Trace: [ 4791.155803] dump_stack+0x85/0xc0 [ 4791.156529] ___might_sleep.cold+0xac/0xbc [ 4791.157251] __mutex_lock+0x5b/0x960 [ 4791.157966] ? console_unlock+0x363/0x5d0 [ 4791.158676] ? tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 4791.159395] ? tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 4791.160103] tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 4791.160815] tcf_block_put_ext.part.0+0x21/0x50 [ 4791.161530] tcf_block_put+0x50/0x70 [ 4791.162233] sfq_destroy+0x15/0x50 [sch_sfq] [ 4791.162936] qdisc_destroy+0x5f/0x160 [ 4791.163642] htb_change_class.cold+0x5df/0x69d [sch_htb] [ 4791.164505] tc_ctl_tclass+0x19d/0x480 [ 4791.165360] rtnetlink_rcv_msg+0x170/0x4b0 [ 4791.166191] ? netlink_deliver_tap+0x95/0x400 [ 4791.166907] ? rtnl_dellink+0x2d0/0x2d0 [ 4791.167625] netlink_rcv_skb+0x49/0x110 [ 4791.168345] netlink_unicast+0x171/0x200 [ 4791.169058] netlink_sendmsg+0x224/0x3f0 [ 4791.169771] sock_sendmsg+0x5e/0x60 [ 4791.170475] ___sys_sendmsg+0x2ae/0x330 [ 4791.171183] ? ___sys_recvmsg+0x159/0x1f0 [ 4791.171894] ? do_wp_page+0x9c/0x790 [ 4791.172595] ? __handle_mm_fault+0xcd3/0x19e0 [ 4791.173309] __sys_sendmsg+0x59/0xa0 [ 4791.174024] do_syscall_64+0x5c/0xb0 [ 4791.174725] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 4791.175435] RIP: 0033:0x7f0aa41497b8 [ 4791.176129] Code: 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 65 8f 0c 00 8b 00 85 c0 75 17 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 48 83 ec 28 89 5 4 [ 4791.177532] RSP: 002b:00007fff4e37d588 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 4791.178243] RAX: ffffffffffffffda RBX: 000000005d8132f7 RCX: 00007f0aa41497b8 [ 4791.178947] RDX: 0000000000000000 RSI: 00007fff4e37d5f0 RDI: 0000000000000003 [ 4791.179662] RBP: 0000000000000000 R08: 0000000000000001 R09: 00000000020149a0 [ 4791.180382] R10: 0000000000404eda R11: 0000000000000246 R12: 0000000000000001 [ 4791.181100] R13: 000000000047f640 R14: 0000000000000000 R15: 0000000000000000 In htb_change_class() function save parent->leaf.q to local temporary variable and put reference to it after sch tree lock is released in order not to call potentially sleeping cls API in atomic section. This is safe to do because Qdisc has already been reset by qdisc_purge_queue() inside sch tree lock critical section. Fixes: c266f64dbfa2 ("net: sched: protect block state with mutex") Signed-off-by: Vlad Buslov Signed-off-by: David S. 
Miller --- net/sched/sch_htb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 7bcf20ef9145..8184c87da8be 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1302,6 +1302,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, struct htb_class *cl = (struct htb_class *)*arg, *parent; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_HTB_MAX + 1]; + struct Qdisc *parent_qdisc = NULL; struct tc_htb_opt *hopt; u64 rate64, ceil64; int warn = 0; @@ -1401,7 +1402,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (parent && !parent->level) { /* turn parent into inner node */ qdisc_purge_queue(parent->leaf.q); - qdisc_put(parent->leaf.q); + parent_qdisc = parent->leaf.q; if (parent->prio_activity) htb_deactivate(q, parent); @@ -1480,6 +1481,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer); sch_tree_unlock(sch); + qdisc_put(parent_qdisc); if (warn) pr_warn("HTB: quantum of class %X is %s. Consider r2q change.\n", From c2999f7fb05b87da4060e38150c70fa46794d82b Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 24 Sep 2019 18:51:17 +0300 Subject: [PATCH 279/334] net: sched: multiq: don't call qdisc_put() while holding tree lock Recent changes that removed rtnl dependency from rules update path of tc also made tcf_block_put() function sleeping. This function is called from ops->destroy() of several Qdisc implementations, which in turn is called by qdisc_put(). Some Qdiscs call qdisc_put() while holding sch tree spinlock, which results sleeping-while-atomic BUG. Steps to reproduce for multiq: tc qdisc add dev ens1f0 root handle 1: multiq tc qdisc add dev ens1f0 parent 1:10 handle 50: sfq perturb 10 ethtool -L ens1f0 combined 2 tc qdisc change dev ens1f0 root handle 1: multiq Resulting dmesg: [ 5539.419344] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:909 [ 5539.420945] in_atomic(): 1, irqs_disabled(): 0, pid: 27658, name: tc [ 5539.422435] INFO: lockdep is turned off. [ 5539.423904] CPU: 21 PID: 27658 Comm: tc Tainted: G W 5.3.0-rc8+ #721 [ 5539.425400] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017 [ 5539.426911] Call Trace: [ 5539.428380] dump_stack+0x85/0xc0 [ 5539.429823] ___might_sleep.cold+0xac/0xbc [ 5539.431262] __mutex_lock+0x5b/0x960 [ 5539.432682] ? tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 5539.434103] ? __nla_validate_parse+0x51/0x840 [ 5539.435493] ? tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 5539.436903] tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 5539.438327] tcf_block_put_ext.part.0+0x21/0x50 [ 5539.439752] tcf_block_put+0x50/0x70 [ 5539.441165] sfq_destroy+0x15/0x50 [sch_sfq] [ 5539.442570] qdisc_destroy+0x5f/0x160 [ 5539.444000] multiq_tune+0x14a/0x420 [sch_multiq] [ 5539.445421] tc_modify_qdisc+0x324/0x840 [ 5539.446841] rtnetlink_rcv_msg+0x170/0x4b0 [ 5539.448269] ? netlink_deliver_tap+0x95/0x400 [ 5539.449691] ? rtnl_dellink+0x2d0/0x2d0 [ 5539.451116] netlink_rcv_skb+0x49/0x110 [ 5539.452522] netlink_unicast+0x171/0x200 [ 5539.453914] netlink_sendmsg+0x224/0x3f0 [ 5539.455304] sock_sendmsg+0x5e/0x60 [ 5539.456686] ___sys_sendmsg+0x2ae/0x330 [ 5539.458071] ? ___sys_recvmsg+0x159/0x1f0 [ 5539.459461] ? do_wp_page+0x9c/0x790 [ 5539.460846] ? 
__handle_mm_fault+0xcd3/0x19e0 [ 5539.462263] __sys_sendmsg+0x59/0xa0 [ 5539.463661] do_syscall_64+0x5c/0xb0 [ 5539.465044] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 5539.466454] RIP: 0033:0x7f1fe08177b8 [ 5539.467863] Code: 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 65 8f 0c 00 8b 00 85 c0 75 17 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 48 83 ec 28 89 5 4 [ 5539.470906] RSP: 002b:00007ffe812de5d8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 5539.472483] RAX: ffffffffffffffda RBX: 000000005d8135e3 RCX: 00007f1fe08177b8 [ 5539.474069] RDX: 0000000000000000 RSI: 00007ffe812de640 RDI: 0000000000000003 [ 5539.475655] RBP: 0000000000000000 R08: 0000000000000001 R09: 000000000182e9b0 [ 5539.477203] R10: 0000000000404eda R11: 0000000000000246 R12: 0000000000000001 [ 5539.478699] R13: 000000000047f640 R14: 0000000000000000 R15: 0000000000000000 Rearrange locking in multiq_tune() in following ways: - In loop that removes Qdiscs from disabled queues, call qdisc_purge_queue() instead of qdisc_tree_flush_backlog() on Qdisc that is being destroyed. Save the Qdisc in temporary allocated array and call qdisc_put() on each element of the array after sch tree lock is released. This is safe to do because Qdiscs have already been reset by qdisc_purge_queue() inside sch tree lock critical section. - Do the same change for second loop that initializes Qdiscs for newly enabled queues in multiq_tune() function. Since sch tree lock is obtained and released on each iteration of this loop, just call qdisc_put() directly outside of critical section. Don't verify that old Qdisc is not noop_qdisc before releasing reference to it because such check is already performed by qdisc_put*() functions. Fixes: c266f64dbfa2 ("net: sched: protect block state with mutex") Signed-off-by: Vlad Buslov Signed-off-by: David S. 
Miller --- net/sched/sch_multiq.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index e1087746f6a2..b2b7fdb06fc6 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -174,7 +174,8 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, { struct multiq_sched_data *q = qdisc_priv(sch); struct tc_multiq_qopt *qopt; - int i; + struct Qdisc **removed; + int i, n_removed = 0; if (!netif_is_multiqueue(qdisc_dev(sch))) return -EOPNOTSUPP; @@ -185,6 +186,11 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, qopt->bands = qdisc_dev(sch)->real_num_tx_queues; + removed = kmalloc(sizeof(*removed) * (q->max_bands - q->bands), + GFP_KERNEL); + if (!removed) + return -ENOMEM; + sch_tree_lock(sch); q->bands = qopt->bands; for (i = q->bands; i < q->max_bands; i++) { @@ -192,13 +198,17 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, struct Qdisc *child = q->queues[i]; q->queues[i] = &noop_qdisc; - qdisc_tree_flush_backlog(child); - qdisc_put(child); + qdisc_purge_queue(child); + removed[n_removed++] = child; } } sch_tree_unlock(sch); + for (i = 0; i < n_removed; i++) + qdisc_put(removed[i]); + kfree(removed); + for (i = 0; i < q->bands; i++) { if (q->queues[i] == &noop_qdisc) { struct Qdisc *child, *old; @@ -213,11 +223,10 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, if (child != &noop_qdisc) qdisc_hash_add(child, true); - if (old != &noop_qdisc) { - qdisc_tree_flush_backlog(old); - qdisc_put(old); - } + if (old != &noop_qdisc) + qdisc_purge_queue(old); sch_tree_unlock(sch); + qdisc_put(old); } } } From e3ae1f96accd21405715fe9c56b4d83bc7d96d44 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 24 Sep 2019 18:51:18 +0300 Subject: [PATCH 280/334] net: sched: sch_sfb: don't call qdisc_put() while holding tree lock Recent changes that removed rtnl dependency from rules update path of tc also made tcf_block_put() function sleeping. This function is called from ops->destroy() of several Qdisc implementations, which in turn is called by qdisc_put(). Some Qdiscs call qdisc_put() while holding sch tree spinlock, which results sleeping-while-atomic BUG. Steps to reproduce for sfb: tc qdisc add dev ens1f0 handle 1: root sfb tc qdisc add dev ens1f0 parent 1:10 handle 50: sfq perturb 10 tc qdisc change dev ens1f0 root handle 1: sfb Resulting dmesg: [ 7265.938717] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:909 [ 7265.940152] in_atomic(): 1, irqs_disabled(): 0, pid: 28579, name: tc [ 7265.941455] INFO: lockdep is turned off. [ 7265.942744] CPU: 11 PID: 28579 Comm: tc Tainted: G W 5.3.0-rc8+ #721 [ 7265.944065] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017 [ 7265.945396] Call Trace: [ 7265.946709] dump_stack+0x85/0xc0 [ 7265.947994] ___might_sleep.cold+0xac/0xbc [ 7265.949282] __mutex_lock+0x5b/0x960 [ 7265.950543] ? tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 7265.951803] ? tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 7265.953022] tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 7265.954248] tcf_block_put_ext.part.0+0x21/0x50 [ 7265.955478] tcf_block_put+0x50/0x70 [ 7265.956694] sfq_destroy+0x15/0x50 [sch_sfq] [ 7265.957898] qdisc_destroy+0x5f/0x160 [ 7265.959099] sfb_change+0x175/0x330 [sch_sfb] [ 7265.960304] tc_modify_qdisc+0x324/0x840 [ 7265.961503] rtnetlink_rcv_msg+0x170/0x4b0 [ 7265.962692] ? netlink_deliver_tap+0x95/0x400 [ 7265.963876] ? 
rtnl_dellink+0x2d0/0x2d0 [ 7265.965064] netlink_rcv_skb+0x49/0x110 [ 7265.966251] netlink_unicast+0x171/0x200 [ 7265.967427] netlink_sendmsg+0x224/0x3f0 [ 7265.968595] sock_sendmsg+0x5e/0x60 [ 7265.969753] ___sys_sendmsg+0x2ae/0x330 [ 7265.970916] ? ___sys_recvmsg+0x159/0x1f0 [ 7265.972074] ? do_wp_page+0x9c/0x790 [ 7265.973233] ? __handle_mm_fault+0xcd3/0x19e0 [ 7265.974407] __sys_sendmsg+0x59/0xa0 [ 7265.975591] do_syscall_64+0x5c/0xb0 [ 7265.976753] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 7265.977938] RIP: 0033:0x7f229069f7b8 [ 7265.979117] Code: 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 65 8f 0c 00 8b 00 85 c0 75 17 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 48 83 ec 28 89 54 [ 7265.981681] RSP: 002b:00007ffd7ed2d158 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 7265.983001] RAX: ffffffffffffffda RBX: 000000005d813ca1 RCX: 00007f229069f7b8 [ 7265.984336] RDX: 0000000000000000 RSI: 00007ffd7ed2d1c0 RDI: 0000000000000003 [ 7265.985682] RBP: 0000000000000000 R08: 0000000000000001 R09: 000000000165c9a0 [ 7265.987021] R10: 0000000000404eda R11: 0000000000000246 R12: 0000000000000001 [ 7265.988309] R13: 000000000047f640 R14: 0000000000000000 R15: 0000000000000000 In the sfb_change() function, use qdisc_purge_queue() instead of qdisc_tree_flush_backlog() to properly reset the old child Qdisc, and save the pointer to it in a local temporary variable. Put the reference to the Qdisc after the sch tree lock is released, in order not to call the potentially sleeping cls API in an atomic section. This is safe to do because the Qdisc has already been reset by qdisc_purge_queue() inside the sch tree lock critical section. Reported-by: syzbot+ac54455281db908c581e@syzkaller.appspotmail.com Fixes: c266f64dbfa2 ("net: sched: protect block state with mutex") Suggested-by: Cong Wang Signed-off-by: Vlad Buslov Signed-off-by: David S.
Miller --- net/sched/sch_sfb.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 1dff8506a715..d448fe3068e5 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -488,7 +488,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct sfb_sched_data *q = qdisc_priv(sch); - struct Qdisc *child; + struct Qdisc *child, *old; struct nlattr *tb[TCA_SFB_MAX + 1]; const struct tc_sfb_qopt *ctl = &sfb_default_ops; u32 limit; @@ -518,8 +518,8 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt, qdisc_hash_add(child, true); sch_tree_lock(sch); - qdisc_tree_flush_backlog(q->qdisc); - qdisc_put(q->qdisc); + qdisc_purge_queue(q->qdisc); + old = q->qdisc; q->qdisc = child; q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval); @@ -542,6 +542,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt, sfb_init_perturbation(1, q); sch_tree_unlock(sch); + qdisc_put(old); return 0; } From bab32fc069ce8829c416e8737c119f62a57970f9 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 16 Sep 2019 20:02:38 +0800 Subject: [PATCH 281/334] btrfs: qgroup: Fix the wrong target io_tree when freeing reserved data space [BUG] Under the following case with qgroup enabled, if some error happens after we have reserved delalloc space, then in the error handling path we could cause qgroup data space leakage: From btrfs_truncate_block() in inode.c: ret = btrfs_delalloc_reserve_space(inode, &data_reserved, block_start, blocksize); if (ret) goto out; again: page = find_or_create_page(mapping, index, mask); if (!page) { btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize, true); btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, true); ret = -ENOMEM; goto out; } [CAUSE] In the above case, btrfs_delalloc_reserve_space() will call btrfs_qgroup_reserve_data() and mark the io_tree range with the EXTENT_QGROUP_RESERVED flag. In the error handling path, we have the following call stack: btrfs_delalloc_release_space() |- btrfs_free_reserved_data_space() |- btrfs_qgroup_free_data() |- __btrfs_qgroup_release_data(reserved=@reserved, free=1) |- qgroup_free_reserved_data(reserved=@reserved) |- clear_record_extent_bits(); |- freed += changeset.bytes_changed; However, due to a completion bug, qgroup_free_reserved_data() will clear the EXTENT_QGROUP_RESERVED flag in BTRFS_I(inode)->io_failure_tree, rather than in the correct BTRFS_I(inode)->io_tree. Since io_failure_tree is never marked with that flag, btrfs_qgroup_free_data() will not free any reserved data space at all, causing a leak. This type of error handling can only be triggered by errors outside of the qgroup code, so an EDQUOT error from qgroup can't trigger it. [FIX] Fix the wrong target io_tree. Reported-by: Josef Bacik Fixes: bc42bda22345 ("btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges") CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 52701c1be109..4ab85555a947 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3486,7 +3486,7 @@ static int qgroup_free_reserved_data(struct inode *inode, * EXTENT_QGROUP_RESERVED, we won't double free. * So not need to rush.
*/ - ret = clear_record_extent_bits(&BTRFS_I(inode)->io_failure_tree, + ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, free_start, free_start + free_len - 1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) From d4e204948fe3e0dc8e1fbf3f8f3290c9c2823be3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 16 Sep 2019 20:02:39 +0800 Subject: [PATCH 282/334] btrfs: qgroup: Fix reserved data space leak if we have multiple reserve calls [BUG] The following script can cause a btrfs qgroup data space leak: mkfs.btrfs -f $dev mount $dev -o nospace_cache $mnt btrfs subv create $mnt/subv btrfs quota en $mnt btrfs quota rescan -w $mnt btrfs qgroup limit 128m $mnt/subv for (( i = 0; i < 3; i++)); do # Create 3 64M holes for the later fallocate to fail truncate -s 192m $mnt/subv/file xfs_io -c "pwrite 64m 4k" $mnt/subv/file > /dev/null xfs_io -c "pwrite 128m 4k" $mnt/subv/file > /dev/null sync # it's supposed to fail, and each failure will leak at least 64M # data space xfs_io -f -c "falloc 0 192m" $mnt/subv/file &> /dev/null rm $mnt/subv/file sync done # Shouldn't fail after we removed the file xfs_io -f -c "falloc 0 64m" $mnt/subv/file [CAUSE] The btrfs qgroup data reserve code allows multiple reservations to happen on a single extent_changeset: E.g: btrfs_qgroup_reserve_data(inode, &data_reserved, 0, SZ_1M); btrfs_qgroup_reserve_data(inode, &data_reserved, SZ_1M, SZ_2M); btrfs_qgroup_reserve_data(inode, &data_reserved, 0, SZ_4M); The btrfs qgroup code has its own internal tracking to make sure we don't double-reserve in the above example. The only pattern utilizing this feature is in the main while loop of the btrfs_fallocate() function. However, btrfs_qgroup_reserve_data()'s error handling has a bug in that, on error, it clears all ranges in the io_tree with the EXTENT_QGROUP_RESERVED flag but doesn't free previously reserved bytes. This bug has a twofold effect: - Clearing EXTENT_QGROUP_RESERVED ranges This is the correct behavior, but it prevents btrfs_qgroup_check_reserved_leak() from catching the leak, as the detector is purely EXTENT_QGROUP_RESERVED flag based. - Leaking the previously reserved data bytes. The bug manifests when N calls to btrfs_qgroup_reserve_data are made and the last one fails, leaking space reserved in the previous ones. [FIX] Also free previously reserved data bytes when btrfs_qgroup_reserve_data fails. Fixes: 524725537023 ("btrfs: qgroup: Introduce btrfs_qgroup_reserve_data function") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 4ab85555a947..c4bb69941c77 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3442,6 +3442,9 @@ cleanup: while ((unode = ulist_next(&reserved->range_changed, &uiter))) clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL); + /* Also free data bytes of already reserved one */ + btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, + orig_reserved, BTRFS_QGROUP_RSV_DATA); extent_changeset_release(reserved); return ret; } From dac91170f8e9c73784af5fad6225e954b795601c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 25 Sep 2019 07:53:19 -0700 Subject: [PATCH 283/334] vrf: Do not attempt to create IPv6 mcast rule if IPv6 is disabled A user reported that vrf create fails when IPv6 is disabled at boot using 'ipv6.disable=1': https://bugzilla.kernel.org/show_bug.cgi?id=204903 The failure is adding fib rules at create time.
Add RTNL_FAMILY_IP6MR to the check in vrf_fib_rule if ipv6_mod_enabled is disabled. Fixes: e4a38c0c4b27 ("ipv6: add vrf table handling code for ipv6 mcast") Signed-off-by: David Ahern Cc: Patrick Ruddy Signed-off-by: David S. Miller --- drivers/net/vrf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 6e84328bdd40..a4b38a980c3c 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1154,7 +1154,8 @@ static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it) struct sk_buff *skb; int err; - if (family == AF_INET6 && !ipv6_mod_enabled()) + if ((family == AF_INET6 || family == RTNL_FAMILY_IP6MR) && + !ipv6_mod_enabled()) return 0; skb = nlmsg_new(vrf_fib_rule_nl_size(), GFP_KERNEL); From dfe5999dc03e321d08190772c98843284d5cf419 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Wed, 25 Sep 2019 18:02:35 +0300 Subject: [PATCH 284/334] net/sched: Set default of CONFIG_NET_TC_SKB_EXT to N This is a new feature, so it is preferred that it defaults to N. We will probe for feature support from userspace before actually using it. Fixes: 95a7233c452a ('net: openvswitch: Set OvS recirc_id from tc chain index') Signed-off-by: Paul Blakey Signed-off-by: David S. Miller --- net/sched/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 5b044ae6dc1e..2985509147a2 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -966,7 +966,6 @@ config NET_IFE_SKBTCINDEX config NET_TC_SKB_EXT bool "TC recirculation support" depends on NET_CLS_ACT - default y if NET_CLS_ACT select SKB_EXTENSIONS help From 8572cea1461a006bce1d06c0c4b0575869125fa4 Mon Sep 17 00:00:00 2001 From: Navid Emamdoost Date: Wed, 25 Sep 2019 13:24:02 -0500 Subject: [PATCH 285/334] nfp: flower: prevent memory leak in nfp_flower_spawn_phy_reprs In nfp_flower_spawn_phy_reprs(), in the for loop over eth_tbl, if any of the intermediate allocations or initializations fail, memory is leaked. The required releases are added.
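As a sketch of the unwind shape this patch enforces (simplified, with hypothetical alloc_repr()/free_repr() helpers standing in for the real nfp calls), everything allocated in the current iteration is released before jumping to the shared error path, which only unwinds fully constructed representors from earlier iterations:

	for (i = 0; i < count; i++) {
		repr = alloc_repr();
		if (!repr) {
			err = -ENOMEM;
			goto err_reprs_clean;
		}

		repr_priv = kzalloc(sizeof(*repr_priv), GFP_KERNEL);
		if (!repr_priv) {
			err = -ENOMEM;
			free_repr(repr);	/* undo this iteration first */
			goto err_reprs_clean;
		}
		/* later failures must likewise kfree(repr_priv) and
		 * free_repr(repr) before taking the goto
		 */
	}

Fixes: b94524529741 ("nfp: flower: add per repr private data for LAG offload") Signed-off-by: Navid Emamdoost Acked-by: Jakub Kicinski Signed-off-by: David S.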
Miller --- drivers/net/ethernet/netronome/nfp/flower/main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c index 7a20447cca19..91a47899220f 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.c +++ b/drivers/net/ethernet/netronome/nfp/flower/main.c @@ -515,6 +515,7 @@ nfp_flower_spawn_phy_reprs(struct nfp_app *app, struct nfp_flower_priv *priv) repr_priv = kzalloc(sizeof(*repr_priv), GFP_KERNEL); if (!repr_priv) { err = -ENOMEM; + nfp_repr_free(repr); goto err_reprs_clean; } @@ -525,11 +526,13 @@ nfp_flower_spawn_phy_reprs(struct nfp_app *app, struct nfp_flower_priv *priv) port = nfp_port_alloc(app, NFP_PORT_PHYS_PORT, repr); if (IS_ERR(port)) { err = PTR_ERR(port); + kfree(repr_priv); nfp_repr_free(repr); goto err_reprs_clean; } err = nfp_port_init_phy_port(app->pf, app, port, i); if (err) { + kfree(repr_priv); nfp_port_free(port); nfp_repr_free(repr); goto err_reprs_clean; @@ -542,6 +545,7 @@ nfp_flower_spawn_phy_reprs(struct nfp_app *app, struct nfp_flower_priv *priv) err = nfp_repr_init(app, repr, cmsg_port_id, port, priv->nn->dp.netdev); if (err) { + kfree(repr_priv); nfp_port_free(port); nfp_repr_free(repr); goto err_reprs_clean; From 8ce39eb5a67aee25d9f05b40b673c95b23502e3e Mon Sep 17 00:00:00 2001 From: Navid Emamdoost Date: Wed, 25 Sep 2019 14:05:09 -0500 Subject: [PATCH 286/334] nfp: flower: fix memory leak in nfp_flower_spawn_vnic_reprs In nfp_flower_spawn_vnic_reprs(), in the loop, if initialization or the allocations fail, memory is leaked. The appropriate releases are added. Fixes: b94524529741 ("nfp: flower: add per repr private data for LAG offload") Signed-off-by: Navid Emamdoost Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/flower/main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c index 91a47899220f..d8ad9346a26a 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.c +++ b/drivers/net/ethernet/netronome/nfp/flower/main.c @@ -400,6 +400,7 @@ nfp_flower_spawn_vnic_reprs(struct nfp_app *app, repr_priv = kzalloc(sizeof(*repr_priv), GFP_KERNEL); if (!repr_priv) { err = -ENOMEM; + nfp_repr_free(repr); goto err_reprs_clean; } @@ -413,6 +414,7 @@ nfp_flower_spawn_vnic_reprs(struct nfp_app *app, port = nfp_port_alloc(app, port_type, repr); if (IS_ERR(port)) { err = PTR_ERR(port); + kfree(repr_priv); nfp_repr_free(repr); goto err_reprs_clean; } @@ -433,6 +435,7 @@ nfp_flower_spawn_vnic_reprs(struct nfp_app *app, err = nfp_repr_init(app, repr, port_id, port, priv->nn->dp.netdev); if (err) { + kfree(repr_priv); nfp_port_free(port); nfp_repr_free(repr); goto err_reprs_clean; From a3aa6e65beebf3780026753ebf39db19f4c92990 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Thu, 26 Sep 2019 00:08:42 +0200 Subject: [PATCH 287/334] net: dsa: microchip: Always set regmap stride to 1 The regmap stride is already set to 1 for the regmap describing the 8-bit registers. However, for the 16/32/64-bit registers, the stride is 2/4/8 respectively. This is not correct, as the switch protocol supports unaligned register reads and writes, and the KSZ87xx even uses such unaligned register accesses to read e.g. MIB counters. This patch fixes MIB counter access on KSZ87xx.
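For reference, a sketch of the kind of regmap_config the header generates after this change (an illustrative 16-bit instance; the real macro in ksz_common.h also carries device-specific pad and alignment parameters):

	static const struct regmap_config ksz_regmap_16bit = {
		.name		= "16",
		.val_bits	= 16,
		.reg_stride	= 1,	/* byte-granular, allows unaligned access */
		.reg_bits	= 16,
		.max_register	= BIT(16) - 1,
	};

Signed-off-by: Marek Vasut Cc: Andrew Lunn Cc: David S.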
Miller Cc: Florian Fainelli Cc: George McCollister Cc: Tristram Ha Cc: Vivien Didelot Cc: Woojung Huh Fixes: 46558d601cb6 ("net: dsa: microchip: Initial SPI regmap support") Fixes: 255b59ad0db2 ("net: dsa: microchip: Factor out regmap config generation into common header") Reviewed-by: George McCollister Tested-by: George McCollister Signed-off-by: David S. Miller --- drivers/net/dsa/microchip/ksz_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h index a24d8e61fbe7..dd60d0837fc6 100644 --- a/drivers/net/dsa/microchip/ksz_common.h +++ b/drivers/net/dsa/microchip/ksz_common.h @@ -303,7 +303,7 @@ static inline void ksz_pwrite32(struct ksz_device *dev, int port, int offset, { \ .name = #width, \ .val_bits = (width), \ - .reg_stride = (width) / 8, \ + .reg_stride = 1, \ .reg_bits = (regbits) + (regalign), \ .pad_bits = (regpad), \ .max_register = BIT(regbits) - 1, \ From 991ad2b24da2fa7b4ba9775ca1fed2d660d54ab0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 25 Sep 2019 17:20:42 -0700 Subject: [PATCH 288/334] lib: dimlib: fix help text typos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix help text typos for DIMLIB. Fixes: 4f75da3666c0 ("linux/dim: Move implementation to .c files") Signed-off-by: Randy Dunlap Cc: Uwe Kleine-König Cc: Tal Gilboa Cc: Saeed Mahameed Acked-by: Uwe Kleine-König Signed-off-by: David S. Miller --- lib/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig b/lib/Kconfig index d7fc9eb33b9b..183f92a297ca 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -558,7 +558,7 @@ config DIMLIB bool help Dynamic Interrupt Moderation library. - Implements an algorithm for dynamically change CQ modertion values + Implements an algorithm for dynamically changing CQ moderation values according to run time performance. # From 2df4de1681767df900e15e34195bbf7dc1b23e06 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 25 Sep 2019 19:28:19 -0700 Subject: [PATCH 289/334] ptp: correctly disable flags on old ioctls Commit 415606588c61 ("PTP: introduce new versions of IOCTLs", 2019-09-13) introduced new versions of the PTP ioctls which actually validate that the flags are acceptable values. As part of this, it cleared the flags value using a bitwise and+negation, in an attempt to prevent the old ioctl from accidentally enabling new features. This is incorrect for a couple of reasons. First, it results in accidentally preventing previously working flags on the request ioctl. By clearing the "valid" flags, we now no longer allow setting the enable, rising edge, or falling edge flags. Second, if we add new additional flags in the future, they must not be set by the old ioctl. (Since the flags weren't checked before, we could otherwise potentially break userspace programs which sent garbage flag data.) The correct way to resolve this is to check for and clear all but the originally valid flags. Create defines indicating which flags are correctly checked and interpreted by the original ioctls. Use these to clear any bits which will not be correctly interpreted by the original ioctls. In the future, new flags must be added to the VALID_FLAGS macros, but *not* to the V1_VALID_FLAGS macros. In this way, new features may be exposed over the v2 ioctls, but without breaking previous userspace which happened to not clear the flags value properly.
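To make the macro discipline concrete, here is a sketch of how a hypothetical future flag would be wired up (PTP_EXTTS_STRICT below is purely illustrative and not part of the real UAPI):

	#define PTP_EXTTS_STRICT	(1<<3)	/* hypothetical new flag */

	/* the v2 mask grows to accept the new bit... */
	#define PTP_EXTTS_VALID_FLAGS	(PTP_ENABLE_FEATURE | \
					 PTP_RISING_EDGE | \
					 PTP_FALLING_EDGE | \
					 PTP_EXTTS_STRICT)

	/* ...while the v1 mask stays frozen, so PTP_EXTTS_REQUEST keeps
	 * silently dropping the new bit instead of breaking old callers
	 */
	#define PTP_EXTTS_V1_VALID_FLAGS	(PTP_ENABLE_FEATURE | \
						 PTP_RISING_EDGE | \
						 PTP_FALLING_EDGE)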
The old ioctl will continue to behave the same way, while the new ioctl gains the benefit of using the flags fields. Cc: Richard Cochran Cc: Felipe Balbi Cc: David S. Miller Cc: Christopher Hall Signed-off-by: Jacob Keller Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/ptp/ptp_chardev.c | 4 ++-- include/uapi/linux/ptp_clock.h | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 9c18476d8d10..67d0199840fd 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -155,7 +155,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) err = -EINVAL; break; } else if (cmd == PTP_EXTTS_REQUEST) { - req.extts.flags &= ~PTP_EXTTS_VALID_FLAGS; + req.extts.flags &= PTP_EXTTS_V1_VALID_FLAGS; req.extts.rsv[0] = 0; req.extts.rsv[1] = 0; } @@ -184,7 +184,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) err = -EINVAL; break; } else if (cmd == PTP_PEROUT_REQUEST) { - req.perout.flags &= ~PTP_PEROUT_VALID_FLAGS; + req.perout.flags &= PTP_PEROUT_V1_VALID_FLAGS; req.perout.rsv[0] = 0; req.perout.rsv[1] = 0; req.perout.rsv[2] = 0; diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index f16301015949..59e89a1bc3bb 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -31,15 +31,37 @@ #define PTP_ENABLE_FEATURE (1<<0) #define PTP_RISING_EDGE (1<<1) #define PTP_FALLING_EDGE (1<<2) + +/* + * flag fields valid for the new PTP_EXTTS_REQUEST2 ioctl. + */ #define PTP_EXTTS_VALID_FLAGS (PTP_ENABLE_FEATURE | \ PTP_RISING_EDGE | \ PTP_FALLING_EDGE) +/* + * flag fields valid for the original PTP_EXTTS_REQUEST ioctl. + * DO NOT ADD NEW FLAGS HERE. + */ +#define PTP_EXTTS_V1_VALID_FLAGS (PTP_ENABLE_FEATURE | \ + PTP_RISING_EDGE | \ + PTP_FALLING_EDGE) + /* * Bits of the ptp_perout_request.flags field: */ #define PTP_PEROUT_ONE_SHOT (1<<0) + +/* + * flag fields valid for the new PTP_PEROUT_REQUEST2 ioctl. + */ #define PTP_PEROUT_VALID_FLAGS (PTP_PEROUT_ONE_SHOT) + +/* + * No flags are valid for the original PTP_PEROUT_REQUEST ioctl + */ +#define PTP_PEROUT_V1_VALID_FLAGS (0) + /* * struct ptp_clock_time - represents a time value * From fd4a8093ec0bd37d450587d50f3e10f0af57cc47 Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Thu, 26 Sep 2019 15:35:10 +0900 Subject: [PATCH 290/334] net: socionext: ave: Avoid using netdev_err() before calling register_netdev() Until calling register_netdev(), ndev->dev_name isn't specified, and netdev_err() displays "(unnamed net_device)". ave 65000000.ethernet (unnamed net_device) (uninitialized): invalid phy-mode setting ave: probe of 65000000.ethernet failed with error -22 This replaces netdev_err() with dev_err() before calling register_netdev(). Signed-off-by: Kunihiko Hayashi Signed-off-by: David S. 
Miller --- drivers/net/ethernet/socionext/sni_ave.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c index d047a53f34f2..6e984d5a729f 100644 --- a/drivers/net/ethernet/socionext/sni_ave.c +++ b/drivers/net/ethernet/socionext/sni_ave.c @@ -1662,19 +1662,19 @@ static int ave_probe(struct platform_device *pdev) "socionext,syscon-phy-mode", 1, 0, &args); if (ret) { - netdev_err(ndev, "can't get syscon-phy-mode property\n"); + dev_err(dev, "can't get syscon-phy-mode property\n"); goto out_free_netdev; } priv->regmap = syscon_node_to_regmap(args.np); of_node_put(args.np); if (IS_ERR(priv->regmap)) { - netdev_err(ndev, "can't map syscon-phy-mode\n"); + dev_err(dev, "can't map syscon-phy-mode\n"); ret = PTR_ERR(priv->regmap); goto out_free_netdev; } ret = priv->data->get_pinmode(priv, phy_mode, args.args[0]); if (ret) { - netdev_err(ndev, "invalid phy-mode setting\n"); + dev_err(dev, "invalid phy-mode setting\n"); goto out_free_netdev; } From 407d8098cb1ab338199f4753162799a488d87d23 Mon Sep 17 00:00:00 2001 From: Hans Andersson Date: Thu, 26 Sep 2019 09:54:37 +0200 Subject: [PATCH 291/334] net: phy: micrel: add Asym Pause workaround for KSZ9021 The Micrel KSZ9031 PHY may fail to establish a link when the Asymmetric Pause capability is set. This issue is described in a silicon errata (DS80000691D or DS80000692D), which advises always disabling the capability. The Micrel KSZ9021 has no errata, but has the same issue with Asymmetric Pause. This patch applies the same workaround as the one for the KSZ9031. Fixes: 3aed3e2a143c ("net: phy: micrel: add Asym Pause workaround") Signed-off-by: Hans Andersson Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/micrel.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 3c8186f269f9..2fea5541c35a 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -763,6 +763,8 @@ static int ksz9031_get_features(struct phy_device *phydev) * Whenever the device's Asymmetric Pause capability is set to 1, * link-up may fail after a link-up to link-down transition. * + * The Errata Sheet is for ksz9031, but ksz9021 has the same issue + * * Workaround: * Do not enable the Asymmetric Pause capability bit. */ @@ -1076,6 +1078,7 @@ static struct phy_driver ksphy_driver[] = { /* PHY_GBIT_FEATURES */ .driver_data = &ksz9021_type, .probe = kszphy_probe, + .get_features = ksz9031_get_features, .config_init = ksz9021_config_init, .ack_interrupt = kszphy_ack_interrupt, .config_intr = kszphy_config_intr, From d1c536e3177390da43d99f20143b810c35433d1f Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 22 Sep 2019 11:26:53 +0100 Subject: [PATCH 292/334] mmc: sdhci: improve ADMA error reporting ADMA errors are potentially data-corrupting events; although we print the register state, we do not usefully print the ADMA descriptors. Worse than that, we print them by referencing their virtual address, which is meaningless when the register state gives us the DMA address of the failing descriptor. Print the ADMA descriptors giving their DMA addresses rather than their virtual addresses, and print them using SDHCI_DUMP() rather than DBG(). We also do not show the correct value of the interrupt status register; the register dump shows the current value, after we have cleared the pending interrupts we are going to service.
What is more useful is to print the interrupts that _were_ pending at the time the ADMA error was encountered. Fix that too. Signed-off-by: Russell King Acked-by: Adrian Hunter Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 4b297f397326..922a5b594c5e 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -2874,6 +2874,7 @@ static void sdhci_cmd_irq(struct sdhci_host *host, u32 intmask, u32 *intmask_p) static void sdhci_adma_show_error(struct sdhci_host *host) { void *desc = host->adma_table; + dma_addr_t dma = host->adma_addr; sdhci_dumpregs(host); @@ -2881,18 +2882,21 @@ static void sdhci_adma_show_error(struct sdhci_host *host) struct sdhci_adma2_64_desc *dma_desc = desc; if (host->flags & SDHCI_USE_64_BIT_DMA) - DBG("%p: DMA 0x%08x%08x, LEN 0x%04x, Attr=0x%02x\n", - desc, le32_to_cpu(dma_desc->addr_hi), + SDHCI_DUMP("%08llx: DMA 0x%08x%08x, LEN 0x%04x, Attr=0x%02x\n", + (unsigned long long)dma, + le32_to_cpu(dma_desc->addr_hi), le32_to_cpu(dma_desc->addr_lo), le16_to_cpu(dma_desc->len), le16_to_cpu(dma_desc->cmd)); else - DBG("%p: DMA 0x%08x, LEN 0x%04x, Attr=0x%02x\n", - desc, le32_to_cpu(dma_desc->addr_lo), + SDHCI_DUMP("%08llx: DMA 0x%08x, LEN 0x%04x, Attr=0x%02x\n", + (unsigned long long)dma, + le32_to_cpu(dma_desc->addr_lo), le16_to_cpu(dma_desc->len), le16_to_cpu(dma_desc->cmd)); desc += host->desc_sz; + dma += host->desc_sz; if (dma_desc->cmd & cpu_to_le16(ADMA2_END)) break; @@ -2968,7 +2972,8 @@ static void sdhci_data_irq(struct sdhci_host *host, u32 intmask) != MMC_BUS_TEST_R) host->data->error = -EILSEQ; else if (intmask & SDHCI_INT_ADMA_ERROR) { - pr_err("%s: ADMA error\n", mmc_hostname(host->mmc)); + pr_err("%s: ADMA error: 0x%08x\n", mmc_hostname(host->mmc), + intmask); sdhci_adma_show_error(host); host->data->error = -EIO; if (host->ops->adma_workaround) From 121bd08b029e03404c451bb237729cdff76eafed Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 22 Sep 2019 11:26:58 +0100 Subject: [PATCH 293/334] mmc: sdhci-of-esdhc: set DMA snooping based on DMA coherence We must not unconditionally set the DMA snoop bit; if the DMA API is assuming that the device is not DMA coherent, and the device snoops the CPU caches, the device can see stale cache lines brought in by speculative prefetch. This leads to the device seeing stale data, potentially resulting in corrupted data transfers. 
Commonly, this results in a descriptor fetch error such as: mmc0: ADMA error mmc0: sdhci: ============ SDHCI REGISTER DUMP =========== mmc0: sdhci: Sys addr: 0x00000000 | Version: 0x00002202 mmc0: sdhci: Blk size: 0x00000008 | Blk cnt: 0x00000001 mmc0: sdhci: Argument: 0x00000000 | Trn mode: 0x00000013 mmc0: sdhci: Present: 0x01f50008 | Host ctl: 0x00000038 mmc0: sdhci: Power: 0x00000003 | Blk gap: 0x00000000 mmc0: sdhci: Wake-up: 0x00000000 | Clock: 0x000040d8 mmc0: sdhci: Timeout: 0x00000003 | Int stat: 0x00000001 mmc0: sdhci: Int enab: 0x037f108f | Sig enab: 0x037f108b mmc0: sdhci: ACmd stat: 0x00000000 | Slot int: 0x00002202 mmc0: sdhci: Caps: 0x35fa0000 | Caps_1: 0x0000af00 mmc0: sdhci: Cmd: 0x0000333a | Max curr: 0x00000000 mmc0: sdhci: Resp[0]: 0x00000920 | Resp[1]: 0x001d8a33 mmc0: sdhci: Resp[2]: 0x325b5900 | Resp[3]: 0x3f400e00 mmc0: sdhci: Host ctl2: 0x00000000 mmc0: sdhci: ADMA Err: 0x00000009 | ADMA Ptr: 0x000000236d43820c mmc0: sdhci: ============================================ mmc0: error -5 whilst initialising SD card but can lead to other errors, and potentially direct the SDHCI controller to read/write data to other memory locations (e.g. if a valid descriptor is visible to the device in a stale cache line.) Fix this by ensuring that the DMA snoop bit corresponds with the behaviour of the DMA API. Since the driver currently only supports DT, use of_dma_is_coherent(). Note that device_get_dma_attr() can not be used as that risks re-introducing this bug if/when the driver is converted to ACPI. Signed-off-by: Russell King Acked-by: Adrian Hunter Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-of-esdhc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-of-esdhc.c b/drivers/mmc/host/sdhci-of-esdhc.c index 3271c2d76629..1d1953dfc54b 100644 --- a/drivers/mmc/host/sdhci-of-esdhc.c +++ b/drivers/mmc/host/sdhci-of-esdhc.c @@ -495,7 +495,12 @@ static int esdhc_of_enable_dma(struct sdhci_host *host) dma_set_mask_and_coherent(dev, DMA_BIT_MASK(40)); value = sdhci_readl(host, ESDHC_DMA_SYSCTL); - value |= ESDHC_DMA_SNOOP; + + if (of_dma_is_coherent(dev->of_node)) + value |= ESDHC_DMA_SNOOP; + else + value &= ~ESDHC_DMA_SNOOP; + sdhci_writel(host, value, ESDHC_DMA_SYSCTL); return 0; } From 4ee7dde4c777f14cb0f98dd201491bf6cc15899b Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 23 Sep 2019 12:08:09 +0200 Subject: [PATCH 294/334] mmc: sdhci: Let drivers define their DMA mask Add host operation ->set_dma_mask() so that drivers can define their own DMA masks. Signed-off-by: Adrian Hunter Tested-by: Nicolin Chen Signed-off-by: Thierry Reding Cc: stable@vger.kernel.org # v4.15 + Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci.c | 12 ++++-------- drivers/mmc/host/sdhci.h | 1 + 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 922a5b594c5e..b056400e34b1 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -3781,18 +3781,14 @@ int sdhci_setup_host(struct sdhci_host *host) host->flags &= ~SDHCI_USE_ADMA; } - /* - * It is assumed that a 64-bit capable device has set a 64-bit DMA mask - * and *must* do 64-bit DMA. A driver has the opportunity to change - * that during the first call to ->enable_dma(). Similarly - * SDHCI_QUIRK2_BROKEN_64_BIT_DMA must be left to the drivers to - * implement. 
- */ if (sdhci_can_64bit_dma(host)) host->flags |= SDHCI_USE_64_BIT_DMA; if (host->flags & (SDHCI_USE_SDMA | SDHCI_USE_ADMA)) { - ret = sdhci_set_dma_mask(host); + if (host->ops->set_dma_mask) + ret = host->ops->set_dma_mask(host); + else + ret = sdhci_set_dma_mask(host); if (!ret && host->ops->enable_dma) ret = host->ops->enable_dma(host); diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h index a29c4cd2d92e..0ed3e0eaef5f 100644 --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -622,6 +622,7 @@ struct sdhci_ops { u32 (*irq)(struct sdhci_host *host, u32 intmask); + int (*set_dma_mask)(struct sdhci_host *host); int (*enable_dma)(struct sdhci_host *host); unsigned int (*get_max_clock)(struct sdhci_host *host); unsigned int (*get_min_clock)(struct sdhci_host *host); From b960bc448a252428bacca271f3416a8bda3b599b Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 23 Sep 2019 12:08:10 +0200 Subject: [PATCH 295/334] mmc: tegra: Implement ->set_dma_mask() The SDHCI controller on Tegra186 supports 40-bit addressing, which is usually enough to address all of system memory. However, if the SDHCI controller is behind an IOMMU, the address space can go beyond. This happens on Tegra186 and later, where the ARM SMMU has an input address space of 48 bits. If the DMA API is backed by this ARM SMMU, the top-down IOVA allocator will cause IOV addresses to be returned that the SDHCI controller cannot access. Unfortunately, prior to the introduction of the ->set_dma_mask() host operation, the SDHCI core would set either a 64-bit DMA mask if the controller claimed to support 64-bit addressing, or a 32-bit DMA mask otherwise. Since the full 64 bits cannot be addressed on Tegra, this had to be worked around in commit 68481a7e1c84 ("mmc: tegra: Mark 64 bit dma broken on Tegra186") by setting the SDHCI_QUIRK2_BROKEN_64_BIT_DMA quirk, which effectively restricts the DMA mask to 32 bits. One disadvantage of this is that the dma_map_*() APIs will now try to use the swiotlb to bounce DMA to addresses beyond the controller's DMA mask. This in turn causes degraded performance and can lead to situations where the swiotlb buffer is exhausted, which in turn causes DMA transfers to fail. With the recent introduction of the ->set_dma_mask() host operation, this can now be properly fixed. For each generation of Tegra, the exact supported DMA mask can be configured. This kills two birds with one stone: it avoids the use of bounce buffers because system memory never exceeds the addressable memory range of the SDHCI controllers on these devices, and at the same time, when an IOMMU is involved, it prevents IOV addresses from being allocated beyond the addressable range of the controllers. Since the DMA mask is now properly handled, the 64-bit DMA quirk can be removed.
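The resulting host operation is small; as a sketch (a hypothetical driver for a 40-bit-capable controller, not the Tegra code itself):

	static int foo_sdhci_set_dma_mask(struct sdhci_host *host)
	{
		/* advertise exactly what this controller can address */
		return dma_set_mask_and_coherent(mmc_dev(host->mmc),
						 DMA_BIT_MASK(40));
	}

	static const struct sdhci_ops foo_sdhci_ops = {
		.set_dma_mask	= foo_sdhci_set_dma_mask,
		/* ... other ops ... */
	};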
Signed-off-by: Nicolin Chen [treding@nvidia.com: provide more background in commit message] Tested-by: Nicolin Chen Acked-by: Adrian Hunter Signed-off-by: Thierry Reding Cc: stable@vger.kernel.org # v4.15 + Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-tegra.c | 48 ++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c index 02d8f524bb9e..7bc950520fd9 100644 --- a/drivers/mmc/host/sdhci-tegra.c +++ b/drivers/mmc/host/sdhci-tegra.c @@ -4,6 +4,7 @@ */ #include <linux/delay.h> +#include <linux/dma-mapping.h> #include <linux/err.h> #include <linux/module.h> #include <linux/init.h> @@ -104,6 +105,7 @@ struct sdhci_tegra_soc_data { const struct sdhci_pltfm_data *pdata; + u64 dma_mask; u32 nvquirks; u8 min_tap_delay; u8 max_tap_delay; @@ -1233,11 +1235,25 @@ static const struct cqhci_host_ops sdhci_tegra_cqhci_ops = { .update_dcmd_desc = sdhci_tegra_update_dcmd_desc, }; +static int tegra_sdhci_set_dma_mask(struct sdhci_host *host) +{ + struct sdhci_pltfm_host *platform = sdhci_priv(host); + struct sdhci_tegra *tegra = sdhci_pltfm_priv(platform); + const struct sdhci_tegra_soc_data *soc = tegra->soc_data; + struct device *dev = mmc_dev(host->mmc); + + if (soc->dma_mask) + return dma_set_mask_and_coherent(dev, soc->dma_mask); + + return 0; +} + static const struct sdhci_ops tegra_sdhci_ops = { .get_ro = tegra_sdhci_get_ro, .read_w = tegra_sdhci_readw, .write_l = tegra_sdhci_writel, .set_clock = tegra_sdhci_set_clock, + .set_dma_mask = tegra_sdhci_set_dma_mask, .set_bus_width = sdhci_set_bus_width, .reset = tegra_sdhci_reset, .platform_execute_tuning = tegra_sdhci_execute_tuning, @@ -1257,6 +1273,7 @@ static const struct sdhci_pltfm_data sdhci_tegra20_pdata = { static const struct sdhci_tegra_soc_data soc_data_tegra20 = { .pdata = &sdhci_tegra20_pdata, + .dma_mask = DMA_BIT_MASK(32), .nvquirks = NVQUIRK_FORCE_SDHCI_SPEC_200 | NVQUIRK_ENABLE_BLOCK_GAP_DET, }; @@ -1283,6 +1300,7 @@ static const struct sdhci_pltfm_data sdhci_tegra30_pdata = { static const struct sdhci_tegra_soc_data soc_data_tegra30 = { .pdata = &sdhci_tegra30_pdata, + .dma_mask = DMA_BIT_MASK(32), .nvquirks = NVQUIRK_ENABLE_SDHCI_SPEC_300 | NVQUIRK_ENABLE_SDR50 | NVQUIRK_ENABLE_SDR104 | @@ -1295,6 +1313,7 @@ static const struct sdhci_ops tegra114_sdhci_ops = { .write_w = tegra_sdhci_writew, .write_l = tegra_sdhci_writel, .set_clock = tegra_sdhci_set_clock, + .set_dma_mask = tegra_sdhci_set_dma_mask, .set_bus_width = sdhci_set_bus_width, .reset = tegra_sdhci_reset, .platform_execute_tuning = tegra_sdhci_execute_tuning, @@ -1316,6 +1335,7 @@ static const struct sdhci_pltfm_data sdhci_tegra114_pdata = { static const struct sdhci_tegra_soc_data soc_data_tegra114 = { .pdata = &sdhci_tegra114_pdata, + .dma_mask = DMA_BIT_MASK(32), }; static const struct sdhci_pltfm_data sdhci_tegra124_pdata = { @@ -1325,22 +1345,13 @@ SDHCI_QUIRK_NO_HISPD_BIT | SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC | SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN, - .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN | - /* - * The TRM states that the SD/MMC controller found on - * Tegra124 can address 34 bits (the maximum supported by - * the Tegra memory controller), but tests show that DMA - * to or from above 4 GiB doesn't work. This is possibly - * caused by missing programming, though it's not obvious - * what sequence is required. Mark 64-bit DMA broken for - * now to fix this for existing users (e.g. Nyan boards).
- */ - SDHCI_QUIRK2_BROKEN_64_BIT_DMA, + .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN, .ops = &tegra114_sdhci_ops, }; static const struct sdhci_tegra_soc_data soc_data_tegra124 = { .pdata = &sdhci_tegra124_pdata, + .dma_mask = DMA_BIT_MASK(34), }; static const struct sdhci_ops tegra210_sdhci_ops = { @@ -1349,6 +1360,7 @@ static const struct sdhci_ops tegra210_sdhci_ops = { .write_w = tegra210_sdhci_writew, .write_l = tegra_sdhci_writel, .set_clock = tegra_sdhci_set_clock, + .set_dma_mask = tegra_sdhci_set_dma_mask, .set_bus_width = sdhci_set_bus_width, .reset = tegra_sdhci_reset, .set_uhs_signaling = tegra_sdhci_set_uhs_signaling, @@ -1369,6 +1381,7 @@ static const struct sdhci_pltfm_data sdhci_tegra210_pdata = { static const struct sdhci_tegra_soc_data soc_data_tegra210 = { .pdata = &sdhci_tegra210_pdata, + .dma_mask = DMA_BIT_MASK(34), .nvquirks = NVQUIRK_NEEDS_PAD_CONTROL | NVQUIRK_HAS_PADCALIB | NVQUIRK_DIS_CARD_CLK_CONFIG_TAP | @@ -1383,6 +1396,7 @@ static const struct sdhci_ops tegra186_sdhci_ops = { .read_w = tegra_sdhci_readw, .write_l = tegra_sdhci_writel, .set_clock = tegra_sdhci_set_clock, + .set_dma_mask = tegra_sdhci_set_dma_mask, .set_bus_width = sdhci_set_bus_width, .reset = tegra_sdhci_reset, .set_uhs_signaling = tegra_sdhci_set_uhs_signaling, @@ -1398,20 +1412,13 @@ static const struct sdhci_pltfm_data sdhci_tegra186_pdata = { SDHCI_QUIRK_NO_HISPD_BIT | SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC | SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN, - .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN | - /* SDHCI controllers on Tegra186 support 40-bit addressing. - * IOVA addresses are 48-bit wide on Tegra186. - * With 64-bit dma mask used for SDHCI, accesses can - * be broken. Disable 64-bit dma, which would fall back - * to 32-bit dma mask. Ideally 40-bit dma mask would work, - * But it is not supported as of now. - */ - SDHCI_QUIRK2_BROKEN_64_BIT_DMA, + .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN, .ops = &tegra186_sdhci_ops, }; static const struct sdhci_tegra_soc_data soc_data_tegra186 = { .pdata = &sdhci_tegra186_pdata, + .dma_mask = DMA_BIT_MASK(40), .nvquirks = NVQUIRK_NEEDS_PAD_CONTROL | NVQUIRK_HAS_PADCALIB | NVQUIRK_DIS_CARD_CLK_CONFIG_TAP | @@ -1424,6 +1431,7 @@ static const struct sdhci_tegra_soc_data soc_data_tegra186 = { static const struct sdhci_tegra_soc_data soc_data_tegra194 = { .pdata = &sdhci_tegra186_pdata, + .dma_mask = DMA_BIT_MASK(39), .nvquirks = NVQUIRK_NEEDS_PAD_CONTROL | NVQUIRK_HAS_PADCALIB | NVQUIRK_DIS_CARD_CLK_CONFIG_TAP | From 6ba5bbba95f789d76ce3bf440ee02fdaf52ec486 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 26 Sep 2019 12:13:06 +0100 Subject: [PATCH 296/334] NFC: st95hf: clean up indentation issue The return statement is indented incorrectly, add in a missing tab and remove an extraneous space after the return Signed-off-by: Colin Ian King Signed-off-by: David S. 
Miller --- drivers/nfc/st95hf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nfc/st95hf/core.c b/drivers/nfc/st95hf/core.c index 7eda62a9e0df..9642971e89ce 100644 --- a/drivers/nfc/st95hf/core.c +++ b/drivers/nfc/st95hf/core.c @@ -661,7 +661,7 @@ static int st95hf_error_handling(struct st95hf_context *stcontext, result = -ETIMEDOUT; else result = -EIO; - return result; + return result; } /* Check for CRC err only if CRC is present in the tag response */ From 4208966f65f520d7f392dbaa62e39a8fa88ffb95 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 26 Sep 2019 12:22:52 +0100 Subject: [PATCH 297/334] net: ena: clean up indentation issue The memset is indented incorrectly; remove the extraneous tabs. Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_eth_com.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_eth_com.c b/drivers/net/ethernet/amazon/ena/ena_eth_com.c index 38046bf0ff44..2845ac277724 100644 --- a/drivers/net/ethernet/amazon/ena/ena_eth_com.c +++ b/drivers/net/ethernet/amazon/ena/ena_eth_com.c @@ -211,8 +211,8 @@ static int ena_com_sq_update_llq_tail(struct ena_com_io_sq *io_sq) pkt_ctrl->curr_bounce_buf = ena_com_get_next_bounce_buffer(&io_sq->bounce_buf_ctrl); - memset(io_sq->llq_buf_ctrl.curr_bounce_buf, - 0x0, llq_info->desc_list_entry_size); + memset(io_sq->llq_buf_ctrl.curr_bounce_buf, + 0x0, llq_info->desc_list_entry_size); pkt_ctrl->idx = 0; if (unlikely(llq_info->desc_stride_ctrl == ENA_ADMIN_SINGLE_DESC_PER_ENTRY)) From 979b9b251ae06e3408153bd7b9342a290d65e826 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 26 Sep 2019 14:43:38 +0300 Subject: [PATCH 298/334] mlxsw: spectrum: Clear VLAN filters during port initialization When a port is created, its VLAN filters are not cleared by the firmware. This causes tagged packets to be later dropped by the ingress STP filters, which default to the DISCARD state. The above did not matter much until commit b5ce611fd96e ("mlxsw: spectrum: Add devlink-trap support"), where we exposed the drop reason to users. Without this patch, the drop reason users will see is not consistent. If a port is enslaved to a VLAN-aware bridge and a packet with an invalid VLAN tries to ingress the bridge, it will be dropped due to the ingress STP filter. If the VLAN is later enabled and then disabled, the packet will be dropped by the ingress VLAN filter, despite the above being a seemingly NOP operation. Fix this by clearing all the VLAN filters during port initialization. Adjust the test accordingly. Fixes: b5ce611fd96e ("mlxsw: spectrum: Add devlink-trap support") Reported-by: Alex Kushnarov Tested-by: Alex Kushnarov Acked-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S.
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 9 +++++++++ .../selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh | 7 ------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index dd234cf7b39d..dcf9562bce8a 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -3771,6 +3771,14 @@ static int mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u8 local_port, goto err_port_qdiscs_init; } + err = mlxsw_sp_port_vlan_set(mlxsw_sp_port, 0, VLAN_N_VID - 1, false, + false); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to clear VLAN filter\n", + mlxsw_sp_port->local_port); + goto err_port_vlan_clear; + } + err = mlxsw_sp_port_nve_init(mlxsw_sp_port); if (err) { dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to initialize NVE\n", @@ -3818,6 +3826,7 @@ err_port_vlan_create: err_port_pvid_set: mlxsw_sp_port_nve_fini(mlxsw_sp_port); err_port_nve_init: +err_port_vlan_clear: mlxsw_sp_tc_qdisc_fini(mlxsw_sp_port); err_port_qdiscs_init: mlxsw_sp_port_fids_fini(mlxsw_sp_port); diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh index 5dcdfa20fc6c..126caf28b529 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh @@ -224,13 +224,6 @@ ingress_vlan_filter_test() local vid=10 bridge vlan add vid $vid dev $swp2 master - # During initialization the firmware enables all the VLAN filters and - # the driver does not turn them off since the traffic will be discarded - # by the STP filter whose default is DISCARD state. Add the VID on the - # ingress bridge port and then remove it to make sure it is not member - # in the VLAN. - bridge vlan add vid $vid dev $swp1 master - bridge vlan del vid $vid dev $swp1 master RET=0 From 44bde514eb13ac32b20442880e8175584af7592c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 26 Sep 2019 14:43:39 +0300 Subject: [PATCH 299/334] Documentation: Clarify trap's description Alex noted that the below description might not be obvious to all users. Clarify it by adding an example. Fixes: f3047ca01f12 ("Documentation: Add devlink-trap documentation") Reported-by: Alex Kushnarov Reviewed-by: Alex Kushnarov Acked-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- Documentation/networking/devlink-trap.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/networking/devlink-trap.rst b/Documentation/networking/devlink-trap.rst index c20c7c483664..8e90a85f3bd5 100644 --- a/Documentation/networking/devlink-trap.rst +++ b/Documentation/networking/devlink-trap.rst @@ -143,7 +143,8 @@ be added to the following table: * - ``port_list_is_empty`` - ``drop`` - Traps packets that the device decided to drop in case they need to be - flooded and the flood list is empty + flooded (e.g., unknown unicast, unregistered multicast) and there are + no ports the packets should be flooded to * - ``port_loopback_filter`` - ``drop`` - Traps packets that the device decided to drop in case after layer 2 From 52feb8b588f6d23673dd7cc2b44b203493b627f6 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Thu, 26 Sep 2019 14:43:40 +0300 Subject: [PATCH 300/334] mlxsw: spectrum_flower: Fail in case user specifies multiple mirror actions The ASIC can only mirror a packet to one port, but when the user tries to set more than one mirror action, the request does not fail. Add a check for whether more than one mirror action was specified per rule and, if so, fail as unsupported. Fixes: d0d13c1858a11 ("mlxsw: spectrum_acl: Add support for mirror action") Signed-off-by: Danielle Ratson Acked-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c index 0ad1a24abfc6..b607919c8ad0 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -21,6 +21,7 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, struct netlink_ext_ack *extack) { const struct flow_action_entry *act; + int mirror_act_count = 0; int err, i; if (!flow_action_has_entries(flow_action)) @@ -105,6 +106,11 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, case FLOW_ACTION_MIRRED: { struct net_device *out_dev = act->dev; + if (mirror_act_count++) { + NL_SET_ERR_MSG_MOD(extack, "Multiple mirror actions per rule are not supported"); + return -EOPNOTSUPP; + } + err = mlxsw_sp_acl_rulei_act_mirror(mlxsw_sp, rulei, block, out_dev, extack); From 6b3656a60f2067738d1a423328199720806f0c44 Mon Sep 17 00:00:00 2001 From: "Kevin(Yudong) Yang" Date: Thu, 26 Sep 2019 10:30:05 -0400 Subject: [PATCH 301/334] tcp_bbr: fix quantization code to not raise cwnd if not probing bandwidth There was a bug in the previous logic that attempted to ensure gain cycling gets inflight above BDP even for small BDPs. This code correctly raised and lowered target inflight values during the gain cycle. And this code correctly ensured that cwnd was raised when probing bandwidth. However, it did not correspondingly ensure that cwnd was *not* raised in this way when *not* probing for bandwidth. The result was that small-BDP flows that were always cwnd-bound could go for many cycles with a fixed cwnd, and not probe or yield bandwidth at all. This meant that multiple small-BDP flows could fail to converge in their bandwidth allocations. Fixes: 3c346b233c68 ("tcp_bbr: fix bw probing to raise in-flight data for very small BDPs") Signed-off-by: Kevin(Yudong) Yang Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Acked-by: Priyaranjan Jha Signed-off-by: David S.
Miller --- net/ipv4/tcp_bbr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 95b59540eee1..32772d6ded4e 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -388,7 +388,7 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) * which allows 2 outstanding 2-packet sequences, to try to keep pipe * full even with ACK-every-other-packet delayed ACKs. */ -static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain) +static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) { struct bbr *bbr = inet_csk_ca(sk); @@ -399,7 +399,7 @@ static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain) cwnd = (cwnd + 1) & ~1U; /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ - if (bbr->mode == BBR_PROBE_BW && gain > BBR_UNIT) + if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) cwnd += 2; return cwnd; @@ -411,7 +411,7 @@ static u32 bbr_inflight(struct sock *sk, u32 bw, int gain) u32 inflight; inflight = bbr_bdp(sk, bw, gain); - inflight = bbr_quantization_budget(sk, inflight, gain); + inflight = bbr_quantization_budget(sk, inflight); return inflight; } @@ -531,7 +531,7 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, * due to aggregation (of data and/or ACKs) visible in the ACK stream. */ target_cwnd += bbr_ack_aggregation_cwnd(sk); - target_cwnd = bbr_quantization_budget(sk, target_cwnd, gain); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); /* If we're below target cwnd, slow start cwnd toward target cwnd. */ if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ From 174e23810cd3183dc2ca3f5166ef965a55eaaf54 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 26 Sep 2019 20:37:05 +0200 Subject: [PATCH 302/334] sk_buff: drop all skb extensions on free and skb scrubbing Now that we have a 3rd extension, add a new helper that drops the extension space and use it when we need to scrub an sk_buff. At this time, scrubbing clears secpath and bridge netfilter data, but retains the tc skb extension, after this patch all three get cleared. NAPI reuse/free assumes we can only have a secpath attached to skb, but it seems better to clear all extensions there as well. v2: add unlikely hint (Eric Dumazet) Fixes: 95a7233c452a ("net: openvswitch: Set OvS recirc_id from tc chain index") Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 9 +++++++++ net/core/dev.c | 4 ++-- net/core/skbuff.c | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 907209c0794e..e7d3b1a513ef 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4144,8 +4144,17 @@ static inline void *skb_ext_find(const struct sk_buff *skb, enum skb_ext_id id) return NULL; } + +static inline void skb_ext_reset(struct sk_buff *skb) +{ + if (unlikely(skb->active_extensions)) { + __skb_ext_put(skb->extensions); + skb->active_extensions = 0; + } +} #else static inline void skb_ext_put(struct sk_buff *skb) {} +static inline void skb_ext_reset(struct sk_buff *skb) {} static inline void skb_ext_del(struct sk_buff *skb, int unused) {} static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {} static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {} diff --git a/net/core/dev.c b/net/core/dev.c index 71b18e80389f..bf3ed413abaf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5666,7 +5666,7 @@ EXPORT_SYMBOL(gro_find_complete_by_type); static void napi_skb_free_stolen_head(struct sk_buff *skb) { skb_dst_drop(skb); - secpath_reset(skb); + skb_ext_put(skb); kmem_cache_free(skbuff_head_cache, skb); } @@ -5733,7 +5733,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->encapsulation = 0; skb_shinfo(skb)->gso_type = 0; skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); - secpath_reset(skb); + skb_ext_reset(skb); napi->skb = skb; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f12e8a050edb..01d65206f4fb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5119,7 +5119,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); - secpath_reset(skb); + skb_ext_reset(skb); nf_reset(skb); nf_reset_trace(skb); From a41e8a88b06ee39fad4cb4a8ccf822563560a89c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Sep 2019 15:42:51 -0700 Subject: [PATCH 303/334] tcp: better handle TCP_USER_TIMEOUT in SYN_SENT state Yuchung Cheng and Marek Majkowski independently reported a weird behavior of the TCP_USER_TIMEOUT option when used at connect() time. When the TCP_USER_TIMEOUT is reached, tcp_write_timeout() believes the flow should live, and the following condition in tcp_clamp_rto_to_user_timeout() programs one-jiffy timers: remaining = icsk->icsk_user_timeout - elapsed; if (remaining <= 0) return 1; /* user timeout has passed; fire ASAP */ This silly situation ends when the max syn rtx count is reached. This patch makes sure we honor both TCP_SYNCNT and TCP_USER_TIMEOUT, avoiding these spurious SYN packets.
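For context, a minimal userspace sketch of the affected setup (illustrative values; both limits can now be armed together and are honored):

	int timeout_ms = 5000;	/* TCP_USER_TIMEOUT is in milliseconds */
	int syncnt = 4;		/* cap the number of SYN retransmissions */

	setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT,
		   &timeout_ms, sizeof(timeout_ms));
	setsockopt(fd, IPPROTO_TCP, TCP_SYNCNT, &syncnt, sizeof(syncnt));
	connect(fd, (struct sockaddr *)&addr, sizeof(addr));

With this fix, SYN retransmissions stop at whichever of the two limits is hit first, instead of degenerating into back-to-back one-jiffy retries once the user timeout has elapsed.

Fixes: b701a99e431d ("tcp: Add tcp_clamp_rto_to_user_timeout() helper to improve accuracy") Signed-off-by: Eric Dumazet Reported-by: Yuchung Cheng Reported-by: Marek Majkowski Cc: Jon Maxwell Link: https://marc.info/?l=linux-netdev&m=156940118307949&w=2 Acked-by: Jon Maxwell Tested-by: Marek Majkowski Signed-off-by: Marek Majkowski Acked-by: Yuchung Cheng Signed-off-by: David S.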
Miller --- net/ipv4/tcp_timer.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index dbd9d2d0ee63..40de2d2364a1 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -210,7 +210,7 @@ static int tcp_write_timeout(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); - bool expired, do_reset; + bool expired = false, do_reset; int retry_until; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { @@ -242,9 +242,10 @@ static int tcp_write_timeout(struct sock *sk) if (tcp_out_of_resources(sk, do_reset)) return 1; } + } + if (!expired) expired = retransmits_timed_out(sk, retry_until, icsk->icsk_user_timeout); - } tcp_fastopen_active_detect_blackhole(sk, expired); if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG)) From e51df6ce668a8f75ce27f83ce0f60103c568c375 Mon Sep 17 00:00:00 2001 From: Ben Chuang Date: Wed, 11 Sep 2019 15:23:44 +0800 Subject: [PATCH 304/334] mmc: host: sdhci-pci: Add Genesys Logic GL975x support Add support for the GL9750 and GL9755 chipsets. Enable v4 mode and wait 5 ms after setting the 1.8V signal enable for GL9750/GL9755. Fix the value of the SDHCI_MAX_CURRENT register and use the vendor tuning flow for GL9750. Co-developed-by: Michael K Johnson Signed-off-by: Michael K Johnson Signed-off-by: Ben Chuang Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/Kconfig | 1 + drivers/mmc/host/Makefile | 2 +- drivers/mmc/host/sdhci-pci-core.c | 2 + drivers/mmc/host/sdhci-pci-gli.c | 352 ++++++++++++++++++++++++++++++ drivers/mmc/host/sdhci-pci.h | 5 + 5 files changed, 361 insertions(+), 1 deletion(-) create mode 100644 drivers/mmc/host/sdhci-pci-gli.c diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 3a52f5703286..49ea02c467bf 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -94,6 +94,7 @@ config MMC_SDHCI_PCI depends on MMC_SDHCI && PCI select MMC_CQHCI select IOSF_MBI if X86 + select MMC_SDHCI_IO_ACCESSORS help This selects the PCI Secure Digital Host Controller Interface. Most controllers found today are PCI devices.
diff --git a/drivers/mmc/host/Makefile b/drivers/mmc/host/Makefile index 390ee162fe71..11c4598e91d9 100644 --- a/drivers/mmc/host/Makefile +++ b/drivers/mmc/host/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_MMC_MXS) += mxs-mmc.o obj-$(CONFIG_MMC_SDHCI) += sdhci.o obj-$(CONFIG_MMC_SDHCI_PCI) += sdhci-pci.o sdhci-pci-y += sdhci-pci-core.o sdhci-pci-o2micro.o sdhci-pci-arasan.o \ - sdhci-pci-dwc-mshc.o + sdhci-pci-dwc-mshc.o sdhci-pci-gli.o obj-$(subst m,y,$(CONFIG_MMC_SDHCI_PCI)) += sdhci-pci-data.o obj-$(CONFIG_MMC_SDHCI_ACPI) += sdhci-acpi.o obj-$(CONFIG_MMC_SDHCI_PXAV3) += sdhci-pxav3.o diff --git a/drivers/mmc/host/sdhci-pci-core.c b/drivers/mmc/host/sdhci-pci-core.c index e1ca185d7328..eaffa85bc728 100644 --- a/drivers/mmc/host/sdhci-pci-core.c +++ b/drivers/mmc/host/sdhci-pci-core.c @@ -1685,6 +1685,8 @@ static const struct pci_device_id pci_ids[] = { SDHCI_PCI_DEVICE(O2, SEABIRD1, o2), SDHCI_PCI_DEVICE(ARASAN, PHY_EMMC, arasan), SDHCI_PCI_DEVICE(SYNOPSYS, DWC_MSHC, snps), + SDHCI_PCI_DEVICE(GLI, 9750, gl9750), + SDHCI_PCI_DEVICE(GLI, 9755, gl9755), SDHCI_PCI_DEVICE_CLASS(AMD, SYSTEM_SDHCI, PCI_CLASS_MASK, amd), /* Generic SD host controller */ {PCI_DEVICE_CLASS(SYSTEM_SDHCI, PCI_CLASS_MASK)}, diff --git a/drivers/mmc/host/sdhci-pci-gli.c b/drivers/mmc/host/sdhci-pci-gli.c new file mode 100644 index 000000000000..5eea8d70a85d --- /dev/null +++ b/drivers/mmc/host/sdhci-pci-gli.c @@ -0,0 +1,352 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019 Genesys Logic, Inc. + * + * Authors: Ben Chuang + * + * Version: v0.9.0 (2019-08-08) + */ + +#include <linux/bitfield.h> +#include <linux/bits.h> +#include <linux/pci.h> +#include <linux/mmc/mmc.h> +#include <linux/delay.h> +#include "sdhci.h" +#include "sdhci-pci.h" + +/* Genesys Logic extra registers */ +#define SDHCI_GLI_9750_WT 0x800 +#define SDHCI_GLI_9750_WT_EN BIT(0) +#define GLI_9750_WT_EN_ON 0x1 +#define GLI_9750_WT_EN_OFF 0x0 + +#define SDHCI_GLI_9750_DRIVING 0x860 +#define SDHCI_GLI_9750_DRIVING_1 GENMASK(11, 0) +#define SDHCI_GLI_9750_DRIVING_2 GENMASK(27, 26) +#define GLI_9750_DRIVING_1_VALUE 0xFFF +#define GLI_9750_DRIVING_2_VALUE 0x3 + +#define SDHCI_GLI_9750_PLL 0x864 +#define SDHCI_GLI_9750_PLL_TX2_INV BIT(23) +#define SDHCI_GLI_9750_PLL_TX2_DLY GENMASK(22, 20) +#define GLI_9750_PLL_TX2_INV_VALUE 0x1 +#define GLI_9750_PLL_TX2_DLY_VALUE 0x0 + +#define SDHCI_GLI_9750_SW_CTRL 0x874 +#define SDHCI_GLI_9750_SW_CTRL_4 GENMASK(7, 6) +#define GLI_9750_SW_CTRL_4_VALUE 0x3 + +#define SDHCI_GLI_9750_MISC 0x878 +#define SDHCI_GLI_9750_MISC_TX1_INV BIT(2) +#define SDHCI_GLI_9750_MISC_RX_INV BIT(3) +#define SDHCI_GLI_9750_MISC_TX1_DLY GENMASK(6, 4) +#define GLI_9750_MISC_TX1_INV_VALUE 0x0 +#define GLI_9750_MISC_RX_INV_ON 0x1 +#define GLI_9750_MISC_RX_INV_OFF 0x0 +#define GLI_9750_MISC_RX_INV_VALUE GLI_9750_MISC_RX_INV_OFF +#define GLI_9750_MISC_TX1_DLY_VALUE 0x5 + +#define SDHCI_GLI_9750_TUNING_CONTROL 0x540 +#define SDHCI_GLI_9750_TUNING_CONTROL_EN BIT(4) +#define GLI_9750_TUNING_CONTROL_EN_ON 0x1 +#define GLI_9750_TUNING_CONTROL_EN_OFF 0x0 +#define SDHCI_GLI_9750_TUNING_CONTROL_GLITCH_1 BIT(16) +#define SDHCI_GLI_9750_TUNING_CONTROL_GLITCH_2 GENMASK(20, 19) +#define GLI_9750_TUNING_CONTROL_GLITCH_1_VALUE 0x1 +#define GLI_9750_TUNING_CONTROL_GLITCH_2_VALUE 0x2 + +#define SDHCI_GLI_9750_TUNING_PARAMETERS 0x544 +#define SDHCI_GLI_9750_TUNING_PARAMETERS_RX_DLY GENMASK(2, 0) +#define GLI_9750_TUNING_PARAMETERS_RX_DLY_VALUE 0x1 + +#define GLI_MAX_TUNING_LOOP 40 + +/* Genesys Logic chipset */ +static inline void gl9750_wt_on(struct sdhci_host *host) +{ + u32 wt_value; + u32 wt_enable; + + wt_value = sdhci_readl(host,
SDHCI_GLI_9750_WT); + wt_enable = FIELD_GET(SDHCI_GLI_9750_WT_EN, wt_value); + + if (wt_enable == GLI_9750_WT_EN_ON) + return; + + wt_value &= ~SDHCI_GLI_9750_WT_EN; + wt_value |= FIELD_PREP(SDHCI_GLI_9750_WT_EN, GLI_9750_WT_EN_ON); + + sdhci_writel(host, wt_value, SDHCI_GLI_9750_WT); +} + +static inline void gl9750_wt_off(struct sdhci_host *host) +{ + u32 wt_value; + u32 wt_enable; + + wt_value = sdhci_readl(host, SDHCI_GLI_9750_WT); + wt_enable = FIELD_GET(SDHCI_GLI_9750_WT_EN, wt_value); + + if (wt_enable == GLI_9750_WT_EN_OFF) + return; + + wt_value &= ~SDHCI_GLI_9750_WT_EN; + wt_value |= FIELD_PREP(SDHCI_GLI_9750_WT_EN, GLI_9750_WT_EN_OFF); + + sdhci_writel(host, wt_value, SDHCI_GLI_9750_WT); +} + +static void gli_set_9750(struct sdhci_host *host) +{ + u32 driving_value; + u32 pll_value; + u32 sw_ctrl_value; + u32 misc_value; + u32 parameter_value; + u32 control_value; + u16 ctrl2; + + gl9750_wt_on(host); + + driving_value = sdhci_readl(host, SDHCI_GLI_9750_DRIVING); + pll_value = sdhci_readl(host, SDHCI_GLI_9750_PLL); + sw_ctrl_value = sdhci_readl(host, SDHCI_GLI_9750_SW_CTRL); + misc_value = sdhci_readl(host, SDHCI_GLI_9750_MISC); + parameter_value = sdhci_readl(host, SDHCI_GLI_9750_TUNING_PARAMETERS); + control_value = sdhci_readl(host, SDHCI_GLI_9750_TUNING_CONTROL); + + driving_value &= ~(SDHCI_GLI_9750_DRIVING_1); + driving_value &= ~(SDHCI_GLI_9750_DRIVING_2); + driving_value |= FIELD_PREP(SDHCI_GLI_9750_DRIVING_1, + GLI_9750_DRIVING_1_VALUE); + driving_value |= FIELD_PREP(SDHCI_GLI_9750_DRIVING_2, + GLI_9750_DRIVING_2_VALUE); + sdhci_writel(host, driving_value, SDHCI_GLI_9750_DRIVING); + + sw_ctrl_value &= ~SDHCI_GLI_9750_SW_CTRL_4; + sw_ctrl_value |= FIELD_PREP(SDHCI_GLI_9750_SW_CTRL_4, + GLI_9750_SW_CTRL_4_VALUE); + sdhci_writel(host, sw_ctrl_value, SDHCI_GLI_9750_SW_CTRL); + + /* reset the tuning flow after reinit and before starting tuning */ + pll_value &= ~SDHCI_GLI_9750_PLL_TX2_INV; + pll_value &= ~SDHCI_GLI_9750_PLL_TX2_DLY; + pll_value |= FIELD_PREP(SDHCI_GLI_9750_PLL_TX2_INV, + GLI_9750_PLL_TX2_INV_VALUE); + pll_value |= FIELD_PREP(SDHCI_GLI_9750_PLL_TX2_DLY, + GLI_9750_PLL_TX2_DLY_VALUE); + + misc_value &= ~SDHCI_GLI_9750_MISC_TX1_INV; + misc_value &= ~SDHCI_GLI_9750_MISC_RX_INV; + misc_value &= ~SDHCI_GLI_9750_MISC_TX1_DLY; + misc_value |= FIELD_PREP(SDHCI_GLI_9750_MISC_TX1_INV, + GLI_9750_MISC_TX1_INV_VALUE); + misc_value |= FIELD_PREP(SDHCI_GLI_9750_MISC_RX_INV, + GLI_9750_MISC_RX_INV_VALUE); + misc_value |= FIELD_PREP(SDHCI_GLI_9750_MISC_TX1_DLY, + GLI_9750_MISC_TX1_DLY_VALUE); + + parameter_value &= ~SDHCI_GLI_9750_TUNING_PARAMETERS_RX_DLY; + parameter_value |= FIELD_PREP(SDHCI_GLI_9750_TUNING_PARAMETERS_RX_DLY, + GLI_9750_TUNING_PARAMETERS_RX_DLY_VALUE); + + control_value &= ~SDHCI_GLI_9750_TUNING_CONTROL_GLITCH_1; + control_value &= ~SDHCI_GLI_9750_TUNING_CONTROL_GLITCH_2; + control_value |= FIELD_PREP(SDHCI_GLI_9750_TUNING_CONTROL_GLITCH_1, + GLI_9750_TUNING_CONTROL_GLITCH_1_VALUE); + control_value |= FIELD_PREP(SDHCI_GLI_9750_TUNING_CONTROL_GLITCH_2, + GLI_9750_TUNING_CONTROL_GLITCH_2_VALUE); + + sdhci_writel(host, pll_value, SDHCI_GLI_9750_PLL); + sdhci_writel(host, misc_value, SDHCI_GLI_9750_MISC); + + /* disable tuned clk */ + ctrl2 = sdhci_readw(host, SDHCI_HOST_CONTROL2); + ctrl2 &= ~SDHCI_CTRL_TUNED_CLK; + sdhci_writew(host, ctrl2, SDHCI_HOST_CONTROL2); + + /* enable tuning parameters control */ + control_value &= ~SDHCI_GLI_9750_TUNING_CONTROL_EN; + control_value |= FIELD_PREP(SDHCI_GLI_9750_TUNING_CONTROL_EN, + GLI_9750_TUNING_CONTROL_EN_ON); + 
sdhci_writel(host, control_value, SDHCI_GLI_9750_TUNING_CONTROL); + + /* write tuning parameters */ + sdhci_writel(host, parameter_value, SDHCI_GLI_9750_TUNING_PARAMETERS); + + /* disable tuning parameters control */ + control_value &= ~SDHCI_GLI_9750_TUNING_CONTROL_EN; + control_value |= FIELD_PREP(SDHCI_GLI_9750_TUNING_CONTROL_EN, + GLI_9750_TUNING_CONTROL_EN_OFF); + sdhci_writel(host, control_value, SDHCI_GLI_9750_TUNING_CONTROL); + + /* clear tuned clk */ + ctrl2 = sdhci_readw(host, SDHCI_HOST_CONTROL2); + ctrl2 &= ~SDHCI_CTRL_TUNED_CLK; + sdhci_writew(host, ctrl2, SDHCI_HOST_CONTROL2); + + gl9750_wt_off(host); +} + +static void gli_set_9750_rx_inv(struct sdhci_host *host, bool b) +{ + u32 misc_value; + + gl9750_wt_on(host); + + misc_value = sdhci_readl(host, SDHCI_GLI_9750_MISC); + misc_value &= ~SDHCI_GLI_9750_MISC_RX_INV; + if (b) { + misc_value |= FIELD_PREP(SDHCI_GLI_9750_MISC_RX_INV, + GLI_9750_MISC_RX_INV_ON); + } else { + misc_value |= FIELD_PREP(SDHCI_GLI_9750_MISC_RX_INV, + GLI_9750_MISC_RX_INV_OFF); + } + sdhci_writel(host, misc_value, SDHCI_GLI_9750_MISC); + + gl9750_wt_off(host); +} + +static int __sdhci_execute_tuning_9750(struct sdhci_host *host, u32 opcode) +{ + int i; + int rx_inv; + + for (rx_inv = 0; rx_inv < 2; rx_inv++) { + gli_set_9750_rx_inv(host, !!rx_inv); + sdhci_start_tuning(host); + + for (i = 0; i < GLI_MAX_TUNING_LOOP; i++) { + u16 ctrl; + + sdhci_send_tuning(host, opcode); + + if (!host->tuning_done) { + sdhci_abort_tuning(host, opcode); + break; + } + + ctrl = sdhci_readw(host, SDHCI_HOST_CONTROL2); + if (!(ctrl & SDHCI_CTRL_EXEC_TUNING)) { + if (ctrl & SDHCI_CTRL_TUNED_CLK) + return 0; /* Success! */ + break; + } + } + } + if (!host->tuning_done) { + pr_info("%s: Tuning timeout, falling back to fixed sampling clock\n", + mmc_hostname(host->mmc)); + return -ETIMEDOUT; + } + + pr_info("%s: Tuning failed, falling back to fixed sampling clock\n", + mmc_hostname(host->mmc)); + sdhci_reset_tuning(host); + + return -EAGAIN; +} + +static int gl9750_execute_tuning(struct sdhci_host *host, u32 opcode) +{ + host->mmc->retune_period = 0; + if (host->tuning_mode == SDHCI_TUNING_MODE_1) + host->mmc->retune_period = host->tuning_count; + + gli_set_9750(host); + host->tuning_err = __sdhci_execute_tuning_9750(host, opcode); + sdhci_end_tuning(host); + + return 0; +} + +static int gli_probe_slot_gl9750(struct sdhci_pci_slot *slot) +{ + struct sdhci_host *host = slot->host; + + slot->host->mmc->caps2 |= MMC_CAP2_NO_SDIO; + sdhci_enable_v4_mode(host); + + return 0; +} + +static int gli_probe_slot_gl9755(struct sdhci_pci_slot *slot) +{ + struct sdhci_host *host = slot->host; + + slot->host->mmc->caps2 |= MMC_CAP2_NO_SDIO; + sdhci_enable_v4_mode(host); + + return 0; +} + +static void sdhci_gli_voltage_switch(struct sdhci_host *host) +{ + /* + * According to Section 3.6.1 signal voltage switch procedure in + * SD Host Controller Simplified Spec. 4.20, steps 6~8 are as + * follows: + * (6) Set 1.8V Signal Enable in the Host Control 2 register. + * (7) Wait 5ms. 1.8V voltage regulator shall be stable within this + * period. + * (8) If 1.8V Signal Enable is cleared by Host Controller, go to + * step (12). + * + * Wait 5ms after set 1.8V signal enable in Host Control 2 register + * to ensure 1.8V signal enable bit is set by GL9750/GL9755. 
+ */ + usleep_range(5000, 5500); +} + +static void sdhci_gl9750_reset(struct sdhci_host *host, u8 mask) +{ + sdhci_reset(host, mask); + gli_set_9750(host); +} + +static u32 sdhci_gl9750_readl(struct sdhci_host *host, int reg) +{ + u32 value; + + value = readl(host->ioaddr + reg); + if (unlikely(reg == SDHCI_MAX_CURRENT && !(value & 0xff))) + value |= 0xc8; + + return value; +} + +static const struct sdhci_ops sdhci_gl9755_ops = { + .set_clock = sdhci_set_clock, + .enable_dma = sdhci_pci_enable_dma, + .set_bus_width = sdhci_set_bus_width, + .reset = sdhci_reset, + .set_uhs_signaling = sdhci_set_uhs_signaling, + .voltage_switch = sdhci_gli_voltage_switch, +}; + +const struct sdhci_pci_fixes sdhci_gl9755 = { + .quirks = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC, + .quirks2 = SDHCI_QUIRK2_BROKEN_DDR50, + .probe_slot = gli_probe_slot_gl9755, + .ops = &sdhci_gl9755_ops, +}; + +static const struct sdhci_ops sdhci_gl9750_ops = { + .read_l = sdhci_gl9750_readl, + .set_clock = sdhci_set_clock, + .enable_dma = sdhci_pci_enable_dma, + .set_bus_width = sdhci_set_bus_width, + .reset = sdhci_gl9750_reset, + .set_uhs_signaling = sdhci_set_uhs_signaling, + .voltage_switch = sdhci_gli_voltage_switch, + .platform_execute_tuning = gl9750_execute_tuning, +}; + +const struct sdhci_pci_fixes sdhci_gl9750 = { + .quirks = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC, + .quirks2 = SDHCI_QUIRK2_BROKEN_DDR50, + .probe_slot = gli_probe_slot_gl9750, + .ops = &sdhci_gl9750_ops, +}; diff --git a/drivers/mmc/host/sdhci-pci.h b/drivers/mmc/host/sdhci-pci.h index 1abc9d47a4c0..558202fe64c6 100644 --- a/drivers/mmc/host/sdhci-pci.h +++ b/drivers/mmc/host/sdhci-pci.h @@ -68,6 +68,9 @@ #define PCI_DEVICE_ID_SYNOPSYS_DWC_MSHC 0xc202 +#define PCI_DEVICE_ID_GLI_9755 0x9755 +#define PCI_DEVICE_ID_GLI_9750 0x9750 + /* * PCI device class and mask */ @@ -188,5 +191,7 @@ int sdhci_pci_enable_dma(struct sdhci_host *host); extern const struct sdhci_pci_fixes sdhci_arasan; extern const struct sdhci_pci_fixes sdhci_snps; extern const struct sdhci_pci_fixes sdhci_o2; +extern const struct sdhci_pci_fixes sdhci_gl9750; +extern const struct sdhci_pci_fixes sdhci_gl9755; #endif /* __SDHCI_PCI_H */ From 78beef629fd95be4ed853b2d37b832f766bd96ca Mon Sep 17 00:00:00 2001 From: Navid Emamdoost Date: Thu, 26 Sep 2019 20:51:46 -0500 Subject: [PATCH 305/334] nfp: abm: fix memory leak in nfp_abm_u32_knode_replace In nfp_abm_u32_knode_replace if the allocation for match fails it should go to the error handling instead of returning. Updated other gotos to have correct errno returned, too. Signed-off-by: Navid Emamdoost Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/abm/cls.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/abm/cls.c b/drivers/net/ethernet/netronome/nfp/abm/cls.c index 23ebddfb9532..9f8a1f69c0c4 100644 --- a/drivers/net/ethernet/netronome/nfp/abm/cls.c +++ b/drivers/net/ethernet/netronome/nfp/abm/cls.c @@ -176,8 +176,10 @@ nfp_abm_u32_knode_replace(struct nfp_abm_link *alink, u8 mask, val; int err; - if (!nfp_abm_u32_check_knode(alink->abm, knode, proto, extack)) + if (!nfp_abm_u32_check_knode(alink->abm, knode, proto, extack)) { + err = -EOPNOTSUPP; goto err_delete; + } tos_off = proto == htons(ETH_P_IP) ? 
16 : 20; @@ -198,14 +200,18 @@ nfp_abm_u32_knode_replace(struct nfp_abm_link *alink, if ((iter->val & cmask) == (val & cmask) && iter->band != knode->res->classid) { NL_SET_ERR_MSG_MOD(extack, "conflict with already offloaded filter"); + err = -EOPNOTSUPP; goto err_delete; } } if (!match) { match = kzalloc(sizeof(*match), GFP_KERNEL); - if (!match) - return -ENOMEM; + if (!match) { + err = -ENOMEM; + goto err_delete; + } + list_add(&match->list, &alink->dscp_map); } match->handle = knode->handle; @@ -221,7 +227,7 @@ nfp_abm_u32_knode_replace(struct nfp_abm_link *alink, err_delete: nfp_abm_u32_knode_delete(alink, knode); - return -EOPNOTSUPP; + return err; } static int nfp_abm_setup_tc_block_cb(enum tc_setup_type type, From faeacb6ddb13b7a020b50b9246fe098653cfbd6e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 27 Sep 2019 10:40:39 +0100 Subject: [PATCH 306/334] net: tap: clean up an indentation issue There is a statement that is indented too deeply, remove the extraneous tab. Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/tap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/tap.c b/drivers/net/tap.c index dd614c2cd994..3ae70c7e6860 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -1200,7 +1200,7 @@ err_kfree: kfree_skb(skb); err: rcu_read_lock(); - tap = rcu_dereference(q->tap); + tap = rcu_dereference(q->tap); if (tap && tap->count_tx_dropped) tap->count_tx_dropped(tap); rcu_read_unlock(); From f15d9a992f901d4f22db868adf800844d1cac9f2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Sep 2019 15:22:55 +0200 Subject: [PATCH 307/334] iommu/amd: Remove domain->updated This struct member was used to track whether a domain change requires updates to the device-table and IOMMU cache flushes. The problem is, that access to this field is racy since locking in the common mapping code-paths has been eliminated. Move the updated field to the stack to get rid of all potential races and remove the field from the struct. Fixes: 92d420ec028d ("iommu/amd: Relax locking in dma_ops path") Reviewed-by: Filippo Sironi Reviewed-by: Jerry Snitselaar Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 49 +++++++++++++++++---------------- drivers/iommu/amd_iommu_types.h | 1 - 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 7bdce3b10f3d..042854bbc5bc 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1458,10 +1458,11 @@ static void free_pagetable(struct protection_domain *domain) * another level increases the size of the address space by 9 bits to a size up * to 64 bits. 
*/ -static void increase_address_space(struct protection_domain *domain, +static bool increase_address_space(struct protection_domain *domain, gfp_t gfp) { unsigned long flags; + bool ret = false; u64 *pte; spin_lock_irqsave(&domain->lock, flags); @@ -1478,19 +1479,21 @@ static void increase_address_space(struct protection_domain *domain, iommu_virt_to_phys(domain->pt_root)); domain->pt_root = pte; domain->mode += 1; - domain->updated = true; + + ret = true; out: spin_unlock_irqrestore(&domain->lock, flags); - return; + return ret; } static u64 *alloc_pte(struct protection_domain *domain, unsigned long address, unsigned long page_size, u64 **pte_page, - gfp_t gfp) + gfp_t gfp, + bool *updated) { int level, end_lvl; u64 *pte, *page; @@ -1498,7 +1501,7 @@ static u64 *alloc_pte(struct protection_domain *domain, BUG_ON(!is_power_of_2(page_size)); while (address > PM_LEVEL_SIZE(domain->mode)) - increase_address_space(domain, gfp); + *updated = increase_address_space(domain, gfp) || *updated; level = domain->mode - 1; pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; @@ -1530,7 +1533,7 @@ static u64 *alloc_pte(struct protection_domain *domain, for (i = 0; i < count; ++i) cmpxchg64(&lpte[i], __pte, 0ULL); - domain->updated = true; + *updated = true; continue; } @@ -1547,7 +1550,7 @@ static u64 *alloc_pte(struct protection_domain *domain, if (cmpxchg64(pte, __pte, __npte) != __pte) free_page((unsigned long)page); else if (IOMMU_PTE_PRESENT(__pte)) - domain->updated = true; + *updated = true; continue; } @@ -1656,28 +1659,29 @@ static int iommu_map_page(struct protection_domain *dom, gfp_t gfp) { struct page *freelist = NULL; + bool updated = false; u64 __pte, *pte; - int i, count; + int ret, i, count; BUG_ON(!IS_ALIGNED(bus_addr, page_size)); BUG_ON(!IS_ALIGNED(phys_addr, page_size)); + ret = -EINVAL; if (!(prot & IOMMU_PROT_MASK)) - return -EINVAL; + goto out; count = PAGE_SIZE_PTE_COUNT(page_size); - pte = alloc_pte(dom, bus_addr, page_size, NULL, gfp); + pte = alloc_pte(dom, bus_addr, page_size, NULL, gfp, &updated); - if (!pte) { - update_domain(dom); - return -ENOMEM; - } + ret = -ENOMEM; + if (!pte) + goto out; for (i = 0; i < count; ++i) freelist = free_clear_pte(&pte[i], pte[i], freelist); if (freelist != NULL) - dom->updated = true; + updated = true; if (count > 1) { __pte = PAGE_SIZE_PTE(__sme_set(phys_addr), page_size); @@ -1693,12 +1697,16 @@ static int iommu_map_page(struct protection_domain *dom, for (i = 0; i < count; ++i) pte[i] = __pte; - update_domain(dom); + ret = 0; + +out: + if (updated) + update_domain(dom); /* Everything flushed out, free pages now */ free_page_list(freelist); - return 0; + return ret; } static unsigned long iommu_unmap_page(struct protection_domain *dom, @@ -2399,15 +2407,10 @@ static void update_device_table(struct protection_domain *domain) static void update_domain(struct protection_domain *domain) { - if (!domain->updated) - return; - update_device_table(domain); domain_flush_devices(domain); domain_flush_tlb_pde(domain); - - domain->updated = false; } static int dir2prot(enum dma_data_direction direction) @@ -3333,7 +3336,6 @@ void amd_iommu_domain_direct_map(struct iommu_domain *dom) /* Update data structure */ domain->mode = PAGE_MODE_NONE; - domain->updated = true; /* Make changes visible to IOMMUs */ update_domain(domain); @@ -3379,7 +3381,6 @@ int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids) domain->glx = levels; domain->flags |= PD_IOMMUV2_MASK; - domain->updated = true; update_domain(domain); diff --git 
a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 9ac229e92b07..0186501ab971 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -475,7 +475,6 @@ struct protection_domain { int glx; /* Number of levels for GCR3 table */ u64 *gcr3_tbl; /* Guest CR3 table */ unsigned long flags; /* flags to find out type of domain */ - bool updated; /* complete domain flush required */ unsigned dev_cnt; /* devices assigned to this domain */ unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ }; From 3a11905b69eb026402448c750f97a0eadfa76b08 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Sep 2019 15:22:56 +0200 Subject: [PATCH 308/334] iommu/amd: Remove amd_iommu_devtable_lock The lock is not necessary because the device table does not contain shared state that needs protection. Locking is only needed on an individual entry basis, and that needs to happen on the iommu_dev_data level. Fixes: 92d420ec028d ("iommu/amd: Relax locking in dma_ops path") Reviewed-by: Filippo Sironi Reviewed-by: Jerry Snitselaar Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 042854bbc5bc..37a9c04fc728 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -70,7 +70,6 @@ */ #define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38)) -static DEFINE_SPINLOCK(amd_iommu_devtable_lock); static DEFINE_SPINLOCK(pd_bitmap_lock); /* List of all available dev_data structures */ @@ -2080,10 +2079,11 @@ static void do_detach(struct iommu_dev_data *dev_data) static int __attach_device(struct iommu_dev_data *dev_data, struct protection_domain *domain) { + unsigned long flags; int ret; /* lock domain */ - spin_lock(&domain->lock); + spin_lock_irqsave(&domain->lock, flags); ret = -EBUSY; if (dev_data->domain != NULL) @@ -2097,7 +2097,7 @@ static int __attach_device(struct iommu_dev_data *dev_data, out_unlock: /* ready */ - spin_unlock(&domain->lock); + spin_unlock_irqrestore(&domain->lock, flags); return ret; } @@ -2181,7 +2181,6 @@ static int attach_device(struct device *dev, { struct pci_dev *pdev; struct iommu_dev_data *dev_data; - unsigned long flags; int ret; dev_data = get_dev_data(dev); @@ -2209,9 +2208,7 @@ static int attach_device(struct device *dev, } skip_ats_check: - spin_lock_irqsave(&amd_iommu_devtable_lock, flags); ret = __attach_device(dev_data, domain); - spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags); /* * We might boot into a crash-kernel here. 
The crashed kernel @@ -2231,14 +2228,15 @@ skip_ats_check: static void __detach_device(struct iommu_dev_data *dev_data) { struct protection_domain *domain; + unsigned long flags; domain = dev_data->domain; - spin_lock(&domain->lock); + spin_lock_irqsave(&domain->lock, flags); do_detach(dev_data); - spin_unlock(&domain->lock); + spin_unlock_irqrestore(&domain->lock, flags); } /* @@ -2248,7 +2246,6 @@ static void detach_device(struct device *dev) { struct protection_domain *domain; struct iommu_dev_data *dev_data; - unsigned long flags; dev_data = get_dev_data(dev); domain = dev_data->domain; @@ -2262,10 +2259,7 @@ static void detach_device(struct device *dev) if (WARN_ON(!dev_data->domain)) return; - /* lock device table */ - spin_lock_irqsave(&amd_iommu_devtable_lock, flags); __detach_device(dev_data); - spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags); if (!dev_is_pci(dev)) return; @@ -2910,9 +2904,6 @@ int __init amd_iommu_init_dma_ops(void) static void cleanup_domain(struct protection_domain *domain) { struct iommu_dev_data *entry; - unsigned long flags; - - spin_lock_irqsave(&amd_iommu_devtable_lock, flags); while (!list_empty(&domain->dev_list)) { entry = list_first_entry(&domain->dev_list, @@ -2920,8 +2911,6 @@ static void cleanup_domain(struct protection_domain *domain) BUG_ON(!entry->domain); __detach_device(entry); } - - spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } static void protection_domain_free(struct protection_domain *domain) From f6c0bfce271b2dd613e8b8e009eefe89c1f788e8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Sep 2019 15:22:57 +0200 Subject: [PATCH 309/334] iommu/amd: Take domain->lock for complete attach/detach path The code-paths from which __attach_device() and __detach_device() are called also access and modify domain state, so take the domain lock there too. This allows us to get rid of the __detach_device() function.
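For illustration, the resulting shape of the attach path is roughly the following (a condensed sketch using the names from the diff below, not the exact code; the ATS/IOMMUv2 handling is elided):

static int attach_device(struct device *dev,
			 struct protection_domain *domain)
{
	struct iommu_dev_data *dev_data = get_dev_data(dev);
	unsigned long flags;
	int ret;

	/* One lock acquisition now covers the whole attach path. */
	spin_lock_irqsave(&domain->lock, flags);
	ret = __attach_device(dev_data, domain);	/* no locking inside */
	spin_unlock_irqrestore(&domain->lock, flags);

	return ret;
}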
Fixes: 92d420ec028d ("iommu/amd: Relax locking in dma_ops path") Reviewed-by: Filippo Sironi Reviewed-by: Jerry Snitselaar Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 65 ++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 39 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 37a9c04fc728..2919168577ff 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2079,27 +2079,13 @@ static void do_detach(struct iommu_dev_data *dev_data) static int __attach_device(struct iommu_dev_data *dev_data, struct protection_domain *domain) { - unsigned long flags; - int ret; - - /* lock domain */ - spin_lock_irqsave(&domain->lock, flags); - - ret = -EBUSY; if (dev_data->domain != NULL) - goto out_unlock; + return -EBUSY; /* Attach alias group root */ do_attach(dev_data, domain); - ret = 0; - -out_unlock: - - /* ready */ - spin_unlock_irqrestore(&domain->lock, flags); - - return ret; + return 0; } @@ -2181,8 +2167,11 @@ static int attach_device(struct device *dev, { struct pci_dev *pdev; struct iommu_dev_data *dev_data; + unsigned long flags; int ret; + spin_lock_irqsave(&domain->lock, flags); + dev_data = get_dev_data(dev); if (!dev_is_pci(dev)) @@ -2190,12 +2179,13 @@ static int attach_device(struct device *dev, pdev = to_pci_dev(dev); if (domain->flags & PD_IOMMUV2_MASK) { + ret = -EINVAL; if (!dev_data->passthrough) - return -EINVAL; + goto out; if (dev_data->iommu_v2) { if (pdev_iommuv2_enable(pdev) != 0) - return -EINVAL; + goto out; dev_data->ats.enabled = true; dev_data->ats.qdep = pci_ats_queue_depth(pdev); @@ -2219,24 +2209,10 @@ skip_ats_check: domain_flush_complete(domain); - return ret; -} - -/* - * Removes a device from a protection domain (unlocked) - */ -static void __detach_device(struct iommu_dev_data *dev_data) -{ - struct protection_domain *domain; - unsigned long flags; - - domain = dev_data->domain; - - spin_lock_irqsave(&domain->lock, flags); - - do_detach(dev_data); - +out: spin_unlock_irqrestore(&domain->lock, flags); + + return ret; } /* @@ -2246,10 +2222,13 @@ static void detach_device(struct device *dev) { struct protection_domain *domain; struct iommu_dev_data *dev_data; + unsigned long flags; dev_data = get_dev_data(dev); domain = dev_data->domain; + spin_lock_irqsave(&domain->lock, flags); + /* * First check if the device is still attached. It might already * be detached from its domain because the generic @@ -2257,12 +2236,12 @@ static void detach_device(struct device *dev) * our alias handling. 
*/ if (WARN_ON(!dev_data->domain)) - return; + goto out; - __detach_device(dev_data); + do_detach(dev_data); if (!dev_is_pci(dev)) - return; + goto out; if (domain->flags & PD_IOMMUV2_MASK && dev_data->iommu_v2) pdev_iommuv2_disable(to_pci_dev(dev)); @@ -2270,6 +2249,9 @@ static void detach_device(struct device *dev) pci_disable_ats(to_pci_dev(dev)); dev_data->ats.enabled = false; + +out: + spin_unlock_irqrestore(&domain->lock, flags); } static int amd_iommu_add_device(struct device *dev) @@ -2904,13 +2886,18 @@ int __init amd_iommu_init_dma_ops(void) static void cleanup_domain(struct protection_domain *domain) { struct iommu_dev_data *entry; + unsigned long flags; + + spin_lock_irqsave(&domain->lock, flags); while (!list_empty(&domain->dev_list)) { entry = list_first_entry(&domain->dev_list, struct iommu_dev_data, list); BUG_ON(!entry->domain); - __detach_device(entry); + do_detach(entry); } + + spin_unlock_irqrestore(&domain->lock, flags); } static void protection_domain_free(struct protection_domain *domain) From 45e528d9c479aeef2d3d1db1e619b243f91e324f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Sep 2019 15:22:58 +0200 Subject: [PATCH 310/334] iommu/amd: Check for busy devices earlier in attach_device() Check early in attach_device whether the device is already attached to a domain. This also simplifies the code path so that __attach_device() can be removed. Fixes: 92d420ec028d ("iommu/amd: Relax locking in dma_ops path") Reviewed-by: Filippo Sironi Reviewed-by: Jerry Snitselaar Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 2919168577ff..459247c32dc0 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2072,23 +2072,6 @@ static void do_detach(struct iommu_dev_data *dev_data) domain->dev_cnt -= 1; } -/* - * If a device is not yet associated with a domain, this function makes the - * device visible in the domain - */ -static int __attach_device(struct iommu_dev_data *dev_data, - struct protection_domain *domain) -{ - if (dev_data->domain != NULL) - return -EBUSY; - - /* Attach alias group root */ - do_attach(dev_data, domain); - - return 0; -} - - static void pdev_iommuv2_disable(struct pci_dev *pdev) { pci_disable_ats(pdev); @@ -2174,6 +2157,10 @@ static int attach_device(struct device *dev, dev_data = get_dev_data(dev); + ret = -EBUSY; + if (dev_data->domain != NULL) + goto out; + if (!dev_is_pci(dev)) goto skip_ats_check; @@ -2198,7 +2185,9 @@ static int attach_device(struct device *dev, } skip_ats_check: - ret = __attach_device(dev_data, domain); + ret = 0; + + do_attach(dev_data, domain); /* * We might boot into a crash-kernel here. The crashed kernel From ab7b2577f0d119052b98b8d913bad369ac2760eb Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Sep 2019 15:22:59 +0200 Subject: [PATCH 311/334] iommu/amd: Lock dev_data in attach/detach code paths Make sure that attaching and detaching a device can't race against each other, and protect the iommu_dev_data with a spin_lock in these code paths.
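The lock nesting that results from this and the previous patches is, schematically (a sketch assuming the field names introduced in the diff below):

	spin_lock_irqsave(&domain->lock, flags);	/* outer, IRQs off */
	spin_lock(&dev_data->lock);			/* inner, per device */
	/* ... attach or detach work ... */
	spin_unlock(&dev_data->lock);
	spin_unlock_irqrestore(&domain->lock, flags);

The inner lock can be a plain spin_lock() because interrupts are already disabled by the outer spin_lock_irqsave().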
Fixes: 92d420ec028d ("iommu/amd: Relax locking in dma_ops path") Reviewed-by: Filippo Sironi Reviewed-by: Jerry Snitselaar Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 9 +++++++++ drivers/iommu/amd_iommu_types.h | 3 +++ 2 files changed, 12 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 459247c32dc0..bac4e20a5919 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -201,6 +201,7 @@ static struct iommu_dev_data *alloc_dev_data(u16 devid) if (!dev_data) return NULL; + spin_lock_init(&dev_data->lock); dev_data->devid = devid; ratelimit_default_init(&dev_data->rs); @@ -2157,6 +2158,8 @@ static int attach_device(struct device *dev, dev_data = get_dev_data(dev); + spin_lock(&dev_data->lock); + ret = -EBUSY; if (dev_data->domain != NULL) goto out; @@ -2199,6 +2202,8 @@ skip_ats_check: domain_flush_complete(domain); out: + spin_unlock(&dev_data->lock); + spin_unlock_irqrestore(&domain->lock, flags); return ret; @@ -2218,6 +2223,8 @@ static void detach_device(struct device *dev) spin_lock_irqsave(&domain->lock, flags); + spin_lock(&dev_data->lock); + /* * First check if the device is still attached. It might already * be detached from its domain because the generic @@ -2240,6 +2247,8 @@ static void detach_device(struct device *dev) dev_data->ats.enabled = false; out: + spin_unlock(&dev_data->lock); + spin_unlock_irqrestore(&domain->lock, flags); } diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 0186501ab971..c9c1612d52e0 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -633,6 +633,9 @@ struct devid_map { * This struct contains device specific data for the IOMMU */ struct iommu_dev_data { + /*Protect against attach/detach races */ + spinlock_t lock; + struct list_head list; /* For domain->dev_list */ struct llist_node dev_data_list; /* For global dev_data_list */ struct protection_domain *domain; /* Domain the device is bound to */ From 2a78f9962565e53b78363eaf516eb052009e8020 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Sep 2019 15:23:00 +0200 Subject: [PATCH 312/334] iommu/amd: Lock code paths traversing protection_domain->dev_list The traversing of this list requires protection_domain->lock to be taken to avoid nasty races with attach/detach code. Make sure the lock is held on all code-paths traversing this list. 
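Schematically, every traversal of the list now follows the pattern below (a sketch; per_device_work() is a hypothetical stand-in for the flush or device-table update done at each call site in the diff):

	unsigned long flags;
	struct iommu_dev_data *dev_data;

	spin_lock_irqsave(&domain->lock, flags);
	list_for_each_entry(dev_data, &domain->dev_list, list)
		per_device_work(dev_data);
	spin_unlock_irqrestore(&domain->lock, flags);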
Reported-by: Filippo Sironi Fixes: 92d420ec028d ("iommu/amd: Relax locking in dma_ops path") Reviewed-by: Filippo Sironi Reviewed-by: Jerry Snitselaar Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index bac4e20a5919..9c26976a0f99 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1334,8 +1334,12 @@ static void domain_flush_np_cache(struct protection_domain *domain, dma_addr_t iova, size_t size) { if (unlikely(amd_iommu_np_cache)) { + unsigned long flags; + + spin_lock_irqsave(&domain->lock, flags); domain_flush_pages(domain, iova, size); domain_flush_complete(domain); + spin_unlock_irqrestore(&domain->lock, flags); } } @@ -1700,8 +1704,13 @@ static int iommu_map_page(struct protection_domain *dom, ret = 0; out: - if (updated) + if (updated) { + unsigned long flags; + + spin_lock_irqsave(&dom->lock, flags); update_domain(dom); + spin_unlock_irqrestore(&dom->lock, flags); + } /* Everything flushed out, free pages now */ free_page_list(freelist); @@ -1857,8 +1866,12 @@ static void free_gcr3_table(struct protection_domain *domain) static void dma_ops_domain_flush_tlb(struct dma_ops_domain *dom) { + unsigned long flags; + + spin_lock_irqsave(&dom->domain.lock, flags); domain_flush_tlb(&dom->domain); domain_flush_complete(&dom->domain); + spin_unlock_irqrestore(&dom->domain.lock, flags); } static void iova_domain_flush_tlb(struct iova_domain *iovad) @@ -2414,6 +2427,7 @@ static dma_addr_t __map_single(struct device *dev, { dma_addr_t offset = paddr & ~PAGE_MASK; dma_addr_t address, start, ret; + unsigned long flags; unsigned int pages; int prot = 0; int i; @@ -2451,8 +2465,10 @@ out_unmap: iommu_unmap_page(&dma_dom->domain, start, PAGE_SIZE); } + spin_lock_irqsave(&dma_dom->domain.lock, flags); domain_flush_tlb(&dma_dom->domain); domain_flush_complete(&dma_dom->domain); + spin_unlock_irqrestore(&dma_dom->domain.lock, flags); dma_ops_free_iova(dma_dom, address, pages); @@ -2481,8 +2497,12 @@ static void __unmap_single(struct dma_ops_domain *dma_dom, } if (amd_iommu_unmap_flush) { + unsigned long flags; + + spin_lock_irqsave(&dma_dom->domain.lock, flags); domain_flush_tlb(&dma_dom->domain); domain_flush_complete(&dma_dom->domain); + spin_unlock_irqrestore(&dma_dom->domain.lock, flags); dma_ops_free_iova(dma_dom, dma_addr, pages); } else { pages = __roundup_pow_of_two(pages); @@ -3246,9 +3266,12 @@ static bool amd_iommu_is_attach_deferred(struct iommu_domain *domain, static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) { struct protection_domain *dom = to_pdomain(domain); + unsigned long flags; + spin_lock_irqsave(&dom->lock, flags); domain_flush_tlb_pde(dom); domain_flush_complete(dom); + spin_unlock_irqrestore(&dom->lock, flags); } static void amd_iommu_iotlb_sync(struct iommu_domain *domain, From 127068abe85bf3dee50df51cb039a5a987a4a666 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 5 Sep 2019 20:24:12 +0100 Subject: [PATCH 313/334] i2c: qcom-geni: Disable DMA processing on the Lenovo Yoga C630 We have a production-level laptop (Lenovo Yoga C630) which is exhibiting a rather horrific bug. When I2C HID devices are being scanned for at boot-time the QCom Geni based I2C (Serial Engine) attempts to use DMA. When it does, the laptop reboots and the user never sees the OS. Attempts are being made to debug the reason for the spontaneous reboot. No luck so far, hence the requirement for this hot-fix. 
This workaround will be removed once we have a viable fix. Signed-off-by: Lee Jones Tested-by: Bjorn Andersson Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-qcom-geni.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-qcom-geni.c b/drivers/i2c/busses/i2c-qcom-geni.c index a89bfce5388e..17abf60c94ae 100644 --- a/drivers/i2c/busses/i2c-qcom-geni.c +++ b/drivers/i2c/busses/i2c-qcom-geni.c @@ -355,11 +355,13 @@ static int geni_i2c_rx_one_msg(struct geni_i2c_dev *gi2c, struct i2c_msg *msg, { dma_addr_t rx_dma; unsigned long time_left; - void *dma_buf; + void *dma_buf = NULL; struct geni_se *se = &gi2c->se; size_t len = msg->len; - dma_buf = i2c_get_dma_safe_msg_buf(msg, 32); + if (!of_machine_is_compatible("lenovo,yoga-c630")) + dma_buf = i2c_get_dma_safe_msg_buf(msg, 32); + if (dma_buf) geni_se_select_mode(se, GENI_SE_DMA); else @@ -394,11 +396,13 @@ static int geni_i2c_tx_one_msg(struct geni_i2c_dev *gi2c, struct i2c_msg *msg, { dma_addr_t tx_dma; unsigned long time_left; - void *dma_buf; + void *dma_buf = NULL; struct geni_se *se = &gi2c->se; size_t len = msg->len; - dma_buf = i2c_get_dma_safe_msg_buf(msg, 32); + if (!of_machine_is_compatible("lenovo,yoga-c630")) + dma_buf = i2c_get_dma_safe_msg_buf(msg, 32); + if (dma_buf) geni_se_select_mode(se, GENI_SE_DMA); else From a71e2ac1f32097fbb2beab098687a7a95c84543e Mon Sep 17 00:00:00 2001 From: Chris Brandt Date: Thu, 26 Sep 2019 07:19:09 -0500 Subject: [PATCH 314/334] i2c: riic: Clear NACK in tend isr The NACKF flag should be cleared in INTRIICNAKI interrupt processing as described in the HW manual. This issue shows up quickly when PREEMPT_RT is applied and a device is probed that is not plugged in (like a touchscreen controller). The result is endless interrupts that halt system boot. Fixes: 310c18a41450 ("i2c: riic: add driver") Cc: stable@vger.kernel.org Reported-by: Chien Nguyen Signed-off-by: Chris Brandt Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-riic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/busses/i2c-riic.c b/drivers/i2c/busses/i2c-riic.c index f31413fd9521..800414886f6b 100644 --- a/drivers/i2c/busses/i2c-riic.c +++ b/drivers/i2c/busses/i2c-riic.c @@ -202,6 +202,7 @@ static irqreturn_t riic_tend_isr(int irq, void *data) if (readb(riic->base + RIIC_ICSR2) & ICSR2_NACKF) { /* We got a NACKIE */ readb(riic->base + RIIC_ICDRR); /* dummy read */ + riic_clear_set_bit(riic, ICSR2_NACKF, 0, RIIC_ICSR2); riic->err = -ENXIO; } else if (riic->bytes_left) { return IRQ_NONE; From fd4b204a0971854c35795ab60b3673a5b57dfebc Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Fri, 27 Sep 2019 14:09:11 +0300 Subject: [PATCH 315/334] i2c: i801: Bring back Block Process Call support for certain platforms Commit b84398d6d7f9 ("i2c: i801: Use iTCO version 6 in Cannon Lake PCH and beyond") appears to have accidentally dropped Block Write-Block Read Process Call support for Intel Sunrisepoint, Lewisburg, Denverton and Kaby Lake. That support was added for the above and newer platforms by commit 315cd67c9453 ("i2c: i801: Add Block Write-Block Read Process Call support"), so bring it back for the above platforms.
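For reference, this feature bit is what feeds the functionality the adapter advertises to clients; the mapping is roughly (a sketch of how the driver's FEATURE_* bits translate to I2C_FUNC_* bits):

	if (priv->features & FEATURE_BLOCK_PROC)
		func |= I2C_FUNC_SMBUS_BLOCK_PROC_CALL;

so losing the flag silently removed Block Process Call from the functionality reported to client drivers.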
Fixes: b84398d6d7f9 ("i2c: i801: Use iTCO version 6 in Cannon Lake PCH and beyond") Signed-off-by: Jarkko Nikula Reviewed-by: Alexander Sverdlin Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-i801.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index c09791fb4929..f1c714acc280 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -1736,6 +1736,7 @@ static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id) case PCI_DEVICE_ID_INTEL_LEWISBURG_SSKU_SMBUS: case PCI_DEVICE_ID_INTEL_DNV_SMBUS: case PCI_DEVICE_ID_INTEL_KABYLAKE_PCH_H_SMBUS: + priv->features |= FEATURE_BLOCK_PROC; priv->features |= FEATURE_I2C_BLOCK_READ; priv->features |= FEATURE_IRQ; priv->features |= FEATURE_SMBUS_PEC; From 11af27f494086188620e7768e421894af93c126a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Ard=C3=B6?= Date: Fri, 6 Sep 2019 16:06:09 +0200 Subject: [PATCH 316/334] i2c: slave-eeprom: Add read only mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add read-only versions of all EEPROMs. These versions are read-only on the i2c side, but can be written from the sysfs side. Signed-off-by: Björn Ardö Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-slave-eeprom.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/i2c/i2c-slave-eeprom.c b/drivers/i2c/i2c-slave-eeprom.c index 92ff9991bae8..db9763cb4dae 100644 --- a/drivers/i2c/i2c-slave-eeprom.c +++ b/drivers/i2c/i2c-slave-eeprom.c @@ -33,11 +33,13 @@ struct eeprom_data { u16 address_mask; u8 num_address_bytes; u8 idx_write_cnt; + bool read_only; u8 buffer[]; }; #define I2C_SLAVE_BYTELEN GENMASK(15, 0) #define I2C_SLAVE_FLAG_ADDR16 BIT(16) +#define I2C_SLAVE_FLAG_RO BIT(17) #define I2C_SLAVE_DEVICE_MAGIC(_len, _flags) ((_flags) | (_len)) static int i2c_slave_eeprom_slave_cb(struct i2c_client *client, @@ -53,9 +55,11 @@ static int i2c_slave_eeprom_slave_cb(struct i2c_client *client, eeprom->buffer_idx = *val | (eeprom->buffer_idx << 8); eeprom->idx_write_cnt++; } else { - spin_lock(&eeprom->buffer_lock); - eeprom->buffer[eeprom->buffer_idx++ & eeprom->address_mask] = *val; - spin_unlock(&eeprom->buffer_lock); + if (!eeprom->read_only) { + spin_lock(&eeprom->buffer_lock); + eeprom->buffer[eeprom->buffer_idx++ & eeprom->address_mask] = *val; + spin_unlock(&eeprom->buffer_lock); + } } break; @@ -130,6 +134,7 @@ static int i2c_slave_eeprom_probe(struct i2c_client *client, const struct i2c_de eeprom->idx_write_cnt = 0; eeprom->num_address_bytes = flag_addr16 ? 
2 : 1; eeprom->address_mask = size - 1; + eeprom->read_only = FIELD_GET(I2C_SLAVE_FLAG_RO, id->driver_data); spin_lock_init(&eeprom->buffer_lock); i2c_set_clientdata(client, eeprom); @@ -165,8 +170,11 @@ static int i2c_slave_eeprom_remove(struct i2c_client *client) static const struct i2c_device_id i2c_slave_eeprom_id[] = { { "slave-24c02", I2C_SLAVE_DEVICE_MAGIC(2048 / 8, 0) }, + { "slave-24c02ro", I2C_SLAVE_DEVICE_MAGIC(2048 / 8, I2C_SLAVE_FLAG_RO) }, { "slave-24c32", I2C_SLAVE_DEVICE_MAGIC(32768 / 8, I2C_SLAVE_FLAG_ADDR16) }, + { "slave-24c32ro", I2C_SLAVE_DEVICE_MAGIC(32768 / 8, I2C_SLAVE_FLAG_ADDR16 | I2C_SLAVE_FLAG_RO) }, { "slave-24c64", I2C_SLAVE_DEVICE_MAGIC(65536 / 8, I2C_SLAVE_FLAG_ADDR16) }, + { "slave-24c64ro", I2C_SLAVE_DEVICE_MAGIC(65536 / 8, I2C_SLAVE_FLAG_ADDR16 | I2C_SLAVE_FLAG_RO) }, { } }; MODULE_DEVICE_TABLE(i2c, i2c_slave_eeprom_id); From ac79f78dab892fcdc11fda8af5cc5e80d09dca8a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Sep 2019 12:54:18 -0700 Subject: [PATCH 317/334] Revert "Revert "mm, thp: restore node-local hugepage allocations"" This reverts commit a8282608c88e08b1782141026eab61204c1e533f. The commit references the original intended semantic for MADV_HUGEPAGE which has subsequently taken on three unique purposes: - enables or disables thp for a range of memory depending on the system's config (is thp "enabled" set to "always" or "madvise"), - determines the synchronous compaction behavior for thp allocations at fault (is thp "defrag" set to "always", "defer+madvise", or "madvise"), and - reverts a previous MADV_NOHUGEPAGE (there is no madvise mode to only clear previous hugepage advice). These are the three purposes that currently exist in 5.2 and over the past several years that userspace has been written around. Adding a NUMA locality preference adds a fourth dimension to an already conflated advice mode. Based on the semantic that MADV_HUGEPAGE has provided over the past several years, there exist workloads that use the tunable based on these principles: specifically that the allocation should attempt to defragment a local node before falling back. It is agreed that remote hugepages typically (but not always) have a better access latency than remote native pages, although on Naples this is at parity for intersocket. The revert commit that this patch reverts allows hugepage allocation to immediately allocate remotely when local memory is fragmented. This is contrary to the semantic of MADV_HUGEPAGE over the past several years: that is, memory compaction should be attempted locally before falling back. The performance degradation of remote hugepages over local hugepages on Rome, for example, is 53.5% increased access latency. For this reason, the goal is to revert back to the 5.2 and previous behavior that would attempt local defragmentation before falling back. With the patch that is reverted by this patch, we see performance degradations at the tail because the allocator happily allocates the remote hugepage rather than even attempting to make a local hugepage available. zone_reclaim_mode is not a solution to this problem since it does not only impact hugepage allocations but rather changes the memory allocation strategy for *all* page allocations. Signed-off-by: David Rientjes Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Mel Gorman Cc: Vlastimil Babka Cc: Stefan Priebe - Profihost AG Cc: "Kirill A. 
Shutemov" Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 2 -- mm/huge_memory.c | 42 +++++++++++++++------------------------ mm/mempolicy.c | 2 +- 3 files changed, 17 insertions(+), 29 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index bac395f1d00a..5228c62af416 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -139,8 +139,6 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr); -struct mempolicy *get_vma_policy(struct vm_area_struct *vma, - unsigned long addr); bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index de1f15969e27..62f0d8e9d76b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -648,37 +648,27 @@ release: static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr) { const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); - gfp_t this_node = 0; - -#ifdef CONFIG_NUMA - struct mempolicy *pol; - /* - * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not - * specified, to express a general desire to stay on the current - * node for optimistic allocation attempts. If the defrag mode - * and/or madvise hint requires the direct reclaim then we prefer - * to fallback to other node rather than node reclaim because that - * can lead to excessive reclaim even though there is free memory - * on other nodes. We expect that NUMA preferences are specified - * by memory policies. - */ - pol = get_vma_policy(vma, addr); - if (pol->mode != MPOL_BIND) - this_node = __GFP_THISNODE; - mpol_cond_put(pol); -#endif + const gfp_t gfp_mask = GFP_TRANSHUGE_LIGHT | __GFP_THISNODE; + /* Always do synchronous compaction */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); + return GFP_TRANSHUGE | __GFP_THISNODE | + (vma_madvised ? 0 : __GFP_NORETRY); + + /* Kick kcompactd and fail quickly */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node; + return gfp_mask | __GFP_KSWAPD_RECLAIM; + + /* Synchronous compaction if madvised, otherwise kick kcompactd */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : - __GFP_KSWAPD_RECLAIM | this_node); + return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM); + + /* Only do synchronous compaction if madvised */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : - this_node); - return GFP_TRANSHUGE_LIGHT | this_node; + return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); + + return gfp_mask; } /* Caller must hold page table lock. */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 65e0874fce17..9c9877a43d58 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1734,7 +1734,7 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. 
*/ -struct mempolicy *get_vma_policy(struct vm_area_struct *vma, +static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = __get_vma_policy(vma, addr); From 19deb7695e072deaff025e03de40c61b525bd57e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Sep 2019 12:54:20 -0700 Subject: [PATCH 318/334] Revert "Revert "Revert "mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask"" This reverts commit 92717d429b38e4f9f934eed7e605cc42858f1839. Since commit a8282608c88e ("Revert "mm, thp: restore node-local hugepage allocations"") is reverted in this series, it is better to restore the previous 5.2 behavior between the thp allocation and the page allocator rather than to attempt any consolidation or cleanup for a policy that is now reverted. It's less risky during an rc cycle and subsequent patches in this series further modify the same policy that the pre-5.3 behavior implements. Consolidation and cleanup can be done subsequent to a sane default page allocation strategy, so this patch reverts a cleanup done on a strategy that is now reverted and thus is the least risky option. Signed-off-by: David Rientjes Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Mel Gorman Cc: Vlastimil Babka Cc: Stefan Priebe - Profihost AG Cc: "Kirill A. Shutemov" Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 12 ++++++++---- mm/huge_memory.c | 27 +++++++++++++-------------- mm/mempolicy.c | 32 +++++++++++++++++++++++++++++--- mm/shmem.c | 2 +- 4 files changed, 51 insertions(+), 22 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f33881688f42..fb07b503dc45 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -510,18 +510,22 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) } extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, - int node); + int node, bool hugepage); +#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ + alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true) #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_pages_vma(gfp_mask, order, vma, addr, node)\ +#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ + alloc_pages(gfp_mask, order) +#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) #define alloc_page_vma(gfp_mask, vma, addr) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id()) + alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, node) + alloc_pages_vma(gfp_mask, 0, vma, addr, node, false) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 62f0d8e9d76b..aec462cc5d46 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -645,30 +645,30 @@ release: * available * never: never stall for any thp allocation */ -static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr) +static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) { const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); - const gfp_t gfp_mask = GFP_TRANSHUGE_LIGHT | __GFP_THISNODE; /* Always do synchronous compaction */ if 
(test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE | __GFP_THISNODE | - (vma_madvised ? 0 : __GFP_NORETRY); + return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); /* Kick kcompactd and fail quickly */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - return gfp_mask | __GFP_KSWAPD_RECLAIM; + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; /* Synchronous compaction if madvised, otherwise kick kcompactd */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) - return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : - __GFP_KSWAPD_RECLAIM); + return GFP_TRANSHUGE_LIGHT | + (vma_madvised ? __GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM); /* Only do synchronous compaction if madvised */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) - return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); + return GFP_TRANSHUGE_LIGHT | + (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); - return gfp_mask; + return GFP_TRANSHUGE_LIGHT; } /* Caller must hold page table lock. */ @@ -740,8 +740,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) pte_free(vma->vm_mm, pgtable); return ret; } - gfp = alloc_hugepage_direct_gfpmask(vma, haddr); - page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id()); + gfp = alloc_hugepage_direct_gfpmask(vma); + page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1348,9 +1348,8 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) alloc: if (__transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { - huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr); - new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma, - haddr, numa_node_id()); + huge_gfp = alloc_hugepage_direct_gfpmask(vma); + new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); } else new_page = NULL; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9c9877a43d58..547cd403ed02 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1180,8 +1180,8 @@ static struct page *new_page(struct page *page, unsigned long start) } else if (PageTransHuge(page)) { struct page *thp; - thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma, - address, numa_node_id()); + thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address, + HPAGE_PMD_ORDER); if (!thp) return NULL; prep_transhuge_page(thp); @@ -2083,6 +2083,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual Address of the allocation. Must be inside the VMA. * @node: Which node to prefer for allocation (modulo policy). + * @hugepage: for hugepages try only the preferred node if possible * * This function allocates a page from the kernel page pool and applies * a NUMA policy associated with the VMA or the current process. 
@@ -2093,7 +2094,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, */ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, int node) + unsigned long addr, int node, bool hugepage) { struct mempolicy *pol; struct page *page; @@ -2111,6 +2112,31 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, goto out; } + if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { + int hpage_node = node; + + /* + * For hugepage allocation and non-interleave policy which + * allows the current node (or other explicitly preferred + * node) we only try to allocate from the current/preferred + * node and don't fall back to other nodes, as the cost of + * remote accesses would likely offset THP benefits. + * + * If the policy is interleave, or does not allow the current + * node in its nodemask, we allocate the standard way. + */ + if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL)) + hpage_node = pol->v.preferred_node; + + nmask = policy_nodemask(gfp, pol); + if (!nmask || node_isset(hpage_node, *nmask)) { + mpol_cond_put(pol); + page = __alloc_pages_node(hpage_node, + gfp | __GFP_THISNODE, order); + goto out; + } + } + nmask = policy_nodemask(gfp, pol); preferred_nid = policy_node(gfp, pol, node); page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask); diff --git a/mm/shmem.c b/mm/shmem.c index 2bed4761f279..626d8c74b973 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1466,7 +1466,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, shmem_pseudo_vma_init(&pvma, info, hindex); page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, - HPAGE_PMD_ORDER, &pvma, 0, numa_node_id()); + HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true); shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); From b39d0ee2632d2f4fb180e8e4eba33736283f23de Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Sep 2019 12:54:22 -0700 Subject: [PATCH 319/334] mm, page_alloc: avoid expensive reclaim when compaction may not succeed Memory compaction has a couple of significant drawbacks as the allocation order increases, specifically: - isolate_freepages() is responsible for finding free pages to use as migration targets and is implemented as a linear scan of memory starting at the end of a zone, - failing order-0 watermark checks in memory compaction does not account for how far below the watermarks the zone actually is: to enable migration, there must be *some* free memory available. Per the above, watermarks are not always sufficient if isolate_freepages() cannot find the free memory but it could require hundreds of MBs of reclaim to even reach this threshold (read: potentially very expensive reclaim with no indication compaction can be successful), and - if compaction at this order has failed recently so that it does not even run as a result of deferred compaction, looping through reclaim can often be pointless. For hugepage allocations, these are quite substantial drawbacks because these are very high order allocations (order-9 on x86) and falling back to doing reclaim can potentially be *very* expensive without any indication that compaction would even be successful. Reclaim itself is unlikely to free entire pageblocks and certainly no reliance should be put on it to do so in isolation (recall lumpy reclaim). This means we should avoid reclaim and simply fail hugepage allocation if compaction is deferred.
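Condensed, the check this adds to the allocator slow path is (the full hunk with its commentary follows below):

	if (order >= pageblock_order && (gfp_mask & __GFP_IO) &&
	    (compact_result == COMPACT_SKIPPED ||
	     compact_result == COMPACT_DEFERRED))
		goto nopage;	/* fail rather than reclaim */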
It is also not helpful to thrash a zone by doing excessive reclaim if compaction may not be able to access that memory. If order-0 watermarks fail and the allocation order is sufficiently large, it is likely better to fail the allocation rather than thrashing the zone. Signed-off-by: David Rientjes Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Mel Gorman Cc: Vlastimil Babka Cc: Stefan Priebe - Profihost AG Cc: "Kirill A. Shutemov" Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9c9194959271..87cbd92065e5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4458,6 +4458,28 @@ retry_cpuset: if (page) goto got_pg; + if (order >= pageblock_order && (gfp_mask & __GFP_IO)) { + /* + * If allocating entire pageblock(s) and compaction + * failed because all zones are below low watermarks + * or is prohibited because it recently failed at this + * order, fail immediately. + * + * Reclaim is + * - potentially very expensive because zones are far + * below their low watermarks or this is part of very + * bursty high order allocations, + * - not guaranteed to help because isolate_freepages() + * may not iterate over freed pages as part of its + * linear scan, and + * - unlikely to make entire pageblocks free on its + * own. + */ + if (compact_result == COMPACT_SKIPPED || + compact_result == COMPACT_DEFERRED) + goto nopage; + } + /* * Checks for costly allocations with __GFP_NORETRY, which * includes THP page fault allocations From 76e654cc91bbe627aa6067916f02a4d3ac041620 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Sep 2019 12:54:25 -0700 Subject: [PATCH 320/334] mm, page_alloc: allow hugepage fallback to remote nodes when madvised For systems configured to always try hard to allocate transparent hugepages (thp defrag setting of "always") or for memory that has been explicitly madvised to MADV_HUGEPAGE, it is often better to fallback to remote memory to allocate the hugepage if the local allocation fails first. The point is to allow the initial call to __alloc_pages_node() to attempt to defragment local memory to make a hugepage available, if possible, rather than immediately fallback to remote memory. Local hugepages will always have a better access latency than remote (huge)pages, so an attempt to make a hugepage available locally is always preferred. If memory compaction cannot be successful locally, however, it is likely better to fallback to remote memory. This could take on two forms: either allow immediate fallback to remote memory or do per-zone watermark checks. It would be possible to fallback only when per-zone watermarks fail for order-0 memory, since that would require local reclaim for all subsequent faults so remote huge allocation is likely better than thrashing the local zone for large workloads. In this case, it is assumed that because the system is configured to try hard to allocate hugepages or the vma is advised to explicitly want to try hard for hugepages that remote allocation is better when local allocation and memory compaction have both failed. Signed-off-by: David Rientjes Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Mel Gorman Cc: Vlastimil Babka Cc: Stefan Priebe - Profihost AG Cc: "Kirill A. 
Shutemov" Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 547cd403ed02..8caab1f81a52 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2133,6 +2133,17 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, mpol_cond_put(pol); page = __alloc_pages_node(hpage_node, gfp | __GFP_THISNODE, order); + + /* + * If hugepage allocations are configured to always + * synchronous compact or the vma has been madvised + * to prefer hugepage backing, retry allowing remote + * memory as well. + */ + if (!page && (gfp & __GFP_DIRECT_RECLAIM)) + page = __alloc_pages_node(hpage_node, + gfp | __GFP_NORETRY, order); + goto out; } } From d2aea95a1a4d195d939d16303700921be318a2b9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 28 Sep 2019 05:53:29 -0400 Subject: [PATCH 321/334] tracing/probe: Fix to check the difference of nr_args before adding probe Steven reported that a test triggered: ================================================================== BUG: KASAN: slab-out-of-bounds in trace_kprobe_create+0xa9e/0xe40 Read of size 8 at addr ffff8880c4f25a48 by task ftracetest/4798 CPU: 2 PID: 4798 Comm: ftracetest Not tainted 5.3.0-rc6-test+ #30 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016 Call Trace: dump_stack+0x7c/0xc0 ? trace_kprobe_create+0xa9e/0xe40 print_address_description+0x6c/0x332 ? trace_kprobe_create+0xa9e/0xe40 ? trace_kprobe_create+0xa9e/0xe40 __kasan_report.cold.6+0x1a/0x3b ? trace_kprobe_create+0xa9e/0xe40 kasan_report+0xe/0x12 trace_kprobe_create+0xa9e/0xe40 ? print_kprobe_event+0x280/0x280 ? match_held_lock+0x1b/0x240 ? find_held_lock+0xac/0xd0 ? fs_reclaim_release.part.112+0x5/0x20 ? lock_downgrade+0x350/0x350 ? kasan_unpoison_shadow+0x30/0x40 ? __kasan_kmalloc.constprop.6+0xc1/0xd0 ? trace_kprobe_create+0xe40/0xe40 ? trace_kprobe_create+0xe40/0xe40 create_or_delete_trace_kprobe+0x2e/0x60 trace_run_command+0xc3/0xe0 ? trace_panic_handler+0x20/0x20 ? kasan_unpoison_shadow+0x30/0x40 trace_parse_run_command+0xdc/0x163 vfs_write+0xe1/0x240 ksys_write+0xba/0x150 ? __ia32_sys_read+0x50/0x50 ? tracer_hardirqs_on+0x61/0x180 ? trace_hardirqs_off_caller+0x43/0x110 ? mark_held_locks+0x29/0xa0 ? do_syscall_64+0x14/0x260 do_syscall_64+0x68/0x260 Fix to check the difference of nr_args before adding probe on existing probes. This also may set the error log index bigger than the number of command parameters. In that case it sets the error position is next to the last parameter. Link: http://lkml.kernel.org/r/156966474783.3478.13217501608215769150.stgit@devnote2 Fixes: ca89bc071d5e ("tracing/kprobe: Add multi-probe per event support") Reported-by: Steven Rostedt (VMware) Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index baf58a3612c0..905b10af5d5c 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -178,6 +178,16 @@ void __trace_probe_log_err(int offset, int err_type) if (!command) return; + if (trace_probe_log.index >= trace_probe_log.argc) { + /** + * Set the error position is next to the last arg + space. + * Note that len includes the terminal null and the cursor + * appaers at pos + 1. 
+		 */
+		pos = len;
+		offset = 0;
+	}
+
 	/* And make a command string from argv array */
 	p = command;
 	for (i = 0; i < trace_probe_log.argc; i++) {
@@ -1084,6 +1094,12 @@ int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b)
 {
 	int i;
 
+	/* In case of more arguments */
+	if (a->nr_args < b->nr_args)
+		return a->nr_args + 1;
+	if (a->nr_args > b->nr_args)
+		return b->nr_args + 1;
+
 	for (i = 0; i < a->nr_args; i++) {
 		if ((b->nr_args <= i) ||
 		    ((a->args[i].type != b->args[i].type) ||

From 968e5170939662341242812b9c82ef51cf140a33 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor
Date: Thu, 26 Sep 2019 09:22:59 -0700
Subject: [PATCH 322/334] tracing: Fix clang -Wint-in-bool-context warnings in
 IF_ASSIGN macro

After r372664 in clang, the IF_ASSIGN macro causes a couple hundred
warnings along the lines of:

kernel/trace/trace_output.c:1331:2: warning: converting the enum constant
to a boolean [-Wint-in-bool-context]
kernel/trace/trace.h:409:3: note: expanded from macro 'trace_assign_type'
  IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry,
  ^
kernel/trace/trace.h:371:14: note: expanded from macro 'IF_ASSIGN'
  WARN_ON(id && (entry)->type != id); \
              ^
264 warnings generated.

This warning can catch issues with constructs like:

    if (state == A || B)

where the developer really meant:

    if (state == A || state == B)

This is currently the only occurrence of the warning in the kernel tree
across defconfig, allyesconfig, and allmodconfig for arm32, arm64, and
x86_64.  Add the implicit '!= 0' to the WARN_ON statement to fix the
warnings and find potential issues in the future.

Link: https://github.com/llvm/llvm-project/commit/28b38c277a2941e9e891b2db30652cfd962f070b
Link: https://github.com/ClangBuiltLinux/linux/issues/686
Link: http://lkml.kernel.org/r/20190926162258.466321-1-natechancellor@gmail.com

Reviewed-by: Nick Desaulniers
Signed-off-by: Nathan Chancellor
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/trace.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 26b0a08f3c7d..f801d154ff6a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -365,11 +365,11 @@ static inline struct trace_array *top_trace_array(void)
 	__builtin_types_compatible_p(typeof(var), type *)
 
 #undef IF_ASSIGN
-#define IF_ASSIGN(var, entry, etype, id)		\
-	if (FTRACE_CMP_TYPE(var, etype)) {		\
-		var = (typeof(var))(entry);		\
-		WARN_ON(id && (entry)->type != id);	\
-		break;					\
+#define IF_ASSIGN(var, entry, etype, id)			\
+	if (FTRACE_CMP_TYPE(var, etype)) {			\
+		var = (typeof(var))(entry);			\
+		WARN_ON(id != 0 && (entry)->type != id);	\
+		break;						\
 	}
 
 /* Will cause compile errors if type is not found. */

From 96c5c6e6a5b6db592acae039fed54b5c8844cd35 Mon Sep 17 00:00:00 2001
From: Navid Emamdoost
Date: Fri, 20 Sep 2019 17:57:59 -0500
Subject: [PATCH 323/334] tracing: Have error path in predicate_parse() free
 its allocated memory

In predicate_parse(), there is an error path that does not go to
out_free; instead, it returns directly, which leads to a memory leak.
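As an aside, a hedged illustration of the idiom this fix restores
(generic C with hypothetical names, not the kernel code): every error
path discovered after the allocations, including validation failures,
must funnel through the single cleanup label.

    #include <stdlib.h>

    struct parser { int *op_stack; int *inverts; };

    /* Hypothetical sketch of the goto-based cleanup idiom: an early
     * "return NULL" after the allocations would leak them, so any
     * validation failure jumps to out_free instead. */
    static struct parser *parser_new(int nr_parens, int nr_preds)
    {
            struct parser *p = calloc(1, sizeof(*p));

            if (!p)
                    return NULL;
            p->op_stack = calloc(nr_parens, sizeof(int));
            p->inverts = calloc(nr_preds, sizeof(int));
            if (!p->op_stack || !p->inverts)
                    goto out_free;
            if (nr_parens > 64)     /* a validation failure must also take */
                    goto out_free;  /* the cleanup path, not return directly */
            return p;

    out_free:
            free(p->op_stack);      /* free(NULL) is a no-op, so this is safe */
            free(p->inverts);
            free(p);
            return NULL;
    }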
Link: http://lkml.kernel.org/r/20190920225800.3870-1-navid.emamdoost@gmail.com

Signed-off-by: Navid Emamdoost
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/trace_events_filter.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index c773b8fb270c..c9a74f82b14a 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -452,8 +452,10 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,
 
 		switch (*next) {
 		case '(':					/* #2 */
-			if (top - op_stack > nr_parens)
-				return ERR_PTR(-EINVAL);
+			if (top - op_stack > nr_parens) {
+				ret = -EINVAL;
+				goto out_free;
+			}
 			*(++top) = invert;
 			continue;
 		case '!':					/* #3 */

From f7d6316fb43735ae8af969c2e582fbee85709483 Mon Sep 17 00:00:00 2001
From: Changbin Du
Date: Sat, 14 Sep 2019 18:32:15 +0800
Subject: [PATCH 324/334] mm, tracing: Print symbol name for call_site in
 trace events

To improve the readability of raw slab trace points, print the call_site
ip using '%pS'.  Then we can grep events with function names.

[002] ....   808.188897: kmem_cache_free: call_site=putname+0x47/0x50 ptr=00000000cef40c80
[002] ....   808.188898: kfree: call_site=security_cred_free+0x42/0x50 ptr=0000000062400820
[002] ....   808.188904: kmem_cache_free: call_site=put_cred_rcu+0x88/0xa0 ptr=0000000058d74ef8
[002] ....   808.188913: kmem_cache_alloc: call_site=prepare_creds+0x26/0x100 ptr=0000000058d74ef8 bytes_req=168 bytes_alloc=576 gfp_flags=GFP_KERNEL
[002] ....   808.188917: kmalloc: call_site=security_prepare_creds+0x77/0xa0 ptr=0000000062400820 bytes_req=8 bytes_alloc=336 gfp_flags=GFP_KERNEL|__GFP_ZERO
[002] ....   808.188920: kmem_cache_alloc: call_site=getname_flags+0x4f/0x1e0 ptr=00000000cef40c80 bytes_req=4096 bytes_alloc=4480 gfp_flags=GFP_KERNEL
[002] ....   808.188925: kmem_cache_free: call_site=putname+0x47/0x50 ptr=00000000cef40c80
[002] ....   808.188926: kfree: call_site=security_cred_free+0x42/0x50 ptr=0000000062400820
[002] ....   808.188931: kmem_cache_free: call_site=put_cred_rcu+0x88/0xa0 ptr=0000000058d74ef8

Link: http://lkml.kernel.org/r/20190914103215.23301-1-changbin.du@gmail.com

Signed-off-by: Changbin Du
Signed-off-by: Steven Rostedt (VMware)
---
 include/trace/events/kmem.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index eb57e3037deb..69e8bb8963db 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -35,8 +35,8 @@ DECLARE_EVENT_CLASS(kmem_alloc,
 		__entry->gfp_flags	= gfp_flags;
 	),
 
-	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s",
-		__entry->call_site,
+	TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s",
+		(void *)__entry->call_site,
 		__entry->ptr,
 		__entry->bytes_req,
 		__entry->bytes_alloc,
@@ -131,7 +131,8 @@ DECLARE_EVENT_CLASS(kmem_free,
 		__entry->ptr = ptr;
 	),
 
-	TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
+	TP_printk("call_site=%pS ptr=%p",
+		  (void *)__entry->call_site, __entry->ptr)
 );
 
 DEFINE_EVENT(kmem_free, kfree,

From 8ed4889eb83179dbc9a105cfed65cc42ecb61097 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Fri, 27 Sep 2019 11:10:22 -0400
Subject: [PATCH 325/334] selftests/ftrace: Fix same probe error test

The "same probe" selftest, which checks that adding the same probe twice
fails, doesn't actually add the same probe; the add therefore succeeds,
which makes the test fail.
Fixes: b78b94b82122 ("selftests/ftrace: Update kprobe event error testcase")
Signed-off-by: Steven Rostedt (VMware)
---
 .../selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc
index 8a4025e912cb..ef1e9bafb098 100644
--- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc
@@ -95,7 +95,7 @@ echo 'p:kprobes/testevent _do_fork abcd=\1' > kprobe_events
 check_error 'p:kprobes/testevent _do_fork ^bcd=\1'	# DIFF_ARG_TYPE
 check_error 'p:kprobes/testevent _do_fork ^abcd=\1:u8'	# DIFF_ARG_TYPE
 check_error 'p:kprobes/testevent _do_fork ^abcd=\"foo"' # DIFF_ARG_TYPE
-check_error '^p:kprobes/testevent _do_fork'		# SAME_PROBE
+check_error '^p:kprobes/testevent _do_fork abcd=\1'	# SAME_PROBE
 fi
 
 exit 0

From dc925a36060e8cef050a9d05c64dae1c30dc5027 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 25 Sep 2019 10:29:49 +0200
Subject: [PATCH 326/334] Documentation/process: Clarify disclosure rules

The role of the contact list provided by the disclosing party, and how it
affects the disclosure process and the ability to include experts in the
development process, is not really well explained.

Neither is it entirely clear when the disclosing party will be informed
about the fact that a developer who is not covered by an employer NDA
needs to be brought in and disclosed.

Better explain the role of the contact list and the information policy,
along with eventual conflict resolution.

Reported-by: Dave Hansen
Signed-off-by: Thomas Gleixner
Acked-by: Dave Hansen
Link: https://lore.kernel.org/r/alpine.DEB.2.21.1909251028390.10825@nanos.tec.linutronix.de
Signed-off-by: Greg Kroah-Hartman
---
 .../process/embargoed-hardware-issues.rst | 40 +++++++++++++++----
 1 file changed, 33 insertions(+), 7 deletions(-)

diff --git a/Documentation/process/embargoed-hardware-issues.rst b/Documentation/process/embargoed-hardware-issues.rst
index e57b9f39c69f..a3c3349046c4 100644
--- a/Documentation/process/embargoed-hardware-issues.rst
+++ b/Documentation/process/embargoed-hardware-issues.rst
@@ -143,6 +143,20 @@ via their employer, they cannot enter individual non-disclosure agreements
 in their role as Linux kernel developers. They will, however, agree to
 adhere to this documented process and the Memorandum of Understanding.
 
+The disclosing party should provide a list of contacts for all other
+entities who have already been, or should be, informed about the issue.
+This serves several purposes:
+
+ - The list of disclosed entities allows communication across the
+   industry, e.g. other OS vendors, HW vendors, etc.
+
+ - The disclosed entities can be contacted to name experts who should
+   participate in the mitigation development.
+
+ - If an expert who is required to handle an issue is employed by a
+   listed entity or is a member of a listed entity, then the response
+   teams can request the disclosure of that expert from that entity. This
+   ensures that the expert is also part of the entity's response team.
 
 Disclosure
 """"""""""
@@ -158,10 +172,7 @@ Mitigation development
 """"""""""""""""""""""
 
 The initial response team sets up an encrypted mailing-list or repurposes
The disclosing party should provide a list -of contacts for all other parties who have already been, or should be, -informed about the issue. The response team contacts these parties so they -can name experts who should be subscribed to the mailing-list. +an existing one if appropriate. Using a mailing-list is close to the normal Linux development process and has been successfully used in developing mitigations for various hardware @@ -175,9 +186,24 @@ development branch against the mainline kernel and backport branches for stable kernel versions as necessary. The initial response team will identify further experts from the Linux -kernel developer community as needed and inform the disclosing party about -their participation. Bringing in experts can happen at any time of the -development process and often needs to be handled in a timely manner. +kernel developer community as needed. Bringing in experts can happen at any +time of the development process and needs to be handled in a timely manner. + +If an expert is employed by or member of an entity on the disclosure list +provided by the disclosing party, then participation will be requested from +the relevant entity. + +If not, then the disclosing party will be informed about the experts +participation. The experts are covered by the Memorandum of Understanding +and the disclosing party is requested to acknowledge the participation. In +case that the disclosing party has a compelling reason to object, then this +objection has to be raised within five work days and resolved with the +incident team immediately. If the disclosing party does not react within +five work days this is taken as silent acknowledgement. + +After acknowledgement or resolution of an objection the expert is disclosed +by the incident team and brought into the development process. + Coordinated release """"""""""""""""""" From 50ee7529ec4500c88f8664560770a7a1b65db72b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 28 Sep 2019 16:53:52 -0700 Subject: [PATCH 327/334] random: try to actively add entropy rather than passively wait for it For 5.3 we had to revert a nice ext4 IO pattern improvement, because it caused a bootup regression due to lack of entropy at bootup together with arguably broken user space that was asking for secure random numbers when it really didn't need to. See commit 72dbcf721566 (Revert "ext4: make __ext4_get_inode_loc plug"). This aims to solve the issue by actively generating entropy noise using the CPU cycle counter when waiting for the random number generator to initialize. This only works when you have a high-frequency time stamp counter available, but that's the case on all modern x86 CPU's, and on most other modern CPU's too. What we do is to generate jitter entropy from the CPU cycle counter under a somewhat complex load: calling the scheduler while also guaranteeing a certain amount of timing noise by also triggering a timer. I'm sure we can tweak this, and that people will want to look at other alternatives, but there's been a number of papers written on jitter entropy, and this should really be fairly conservative by crediting one bit of entropy for every timer-induced jump in the cycle counter. Not because the timer itself would be all that unpredictable, but because the interaction between the timer and the loop is going to be. 
Even if (and perhaps particularly if) the timer actually happens on
another CPU, the cacheline interaction between the loop that reads the
cycle counter and the timer itself firing is going to add perturbations
to the cycle counter values that get mixed into the entropy pool.

As Thomas pointed out, with a modern out-of-order CPU, even quite simple
loops show a fair amount of hard-to-predict timing variability even in
the absence of external interrupts.  But this tries to take that further
by actually having a fairly complex interaction.

This is not going to solve the entropy issue for architectures that have
no CPU cycle counter, but it's not clear how (and if) that is solvable,
and the hardware in question is largely starting to be irrelevant.  And
by doing this we can at least avoid some of the even more contentious
approaches (like making the entropy waiting time out in order to avoid
the possibly unbounded waiting).

Cc: Ahmed Darwish
Cc: Thomas Gleixner
Cc: Theodore Ts'o
Cc: Nicholas Mc Guire
Cc: Andy Lutomirski
Cc: Kees Cook
Cc: Willy Tarreau
Cc: Alexander E. Patrakov
Cc: Lennart Poettering
Signed-off-by: Linus Torvalds
---
 drivers/char/random.c | 62 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 61 insertions(+), 1 deletion(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 5d5ea4ce1442..2fda6166c1dd 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1731,6 +1731,56 @@ void get_random_bytes(void *buf, int nbytes)
 }
 EXPORT_SYMBOL(get_random_bytes);
 
+
+/*
+ * Each time the timer fires, we expect that we got an unpredictable
+ * jump in the cycle counter. Even if the timer is running on another
+ * CPU, the timer activity will be touching the stack of the CPU that is
+ * generating entropy.
+ *
+ * Note that we don't re-arm the timer in the timer itself - we are
+ * happy to be scheduled away, since that just makes the load more
+ * complex, but we do not want the timer to keep ticking unless the
+ * entropy loop is running.
+ *
+ * So the re-arming always happens in the entropy loop itself.
+ */
+static void entropy_timer(struct timer_list *t)
+{
+	credit_entropy_bits(&input_pool, 1);
+}
+
+/*
+ * If we have an actual cycle counter, see if we can
+ * generate enough entropy with timing noise
+ */
+static void try_to_generate_entropy(void)
+{
+	struct {
+		unsigned long now;
+		struct timer_list timer;
+	} stack;
+
+	stack.now = random_get_entropy();
+
+	/* Slow counter - or none. Don't even bother */
+	if (stack.now == random_get_entropy())
+		return;
+
+	timer_setup_on_stack(&stack.timer, entropy_timer, 0);
+	while (!crng_ready()) {
+		if (!timer_pending(&stack.timer))
+			mod_timer(&stack.timer, jiffies+1);
+		mix_pool_bytes(&input_pool, &stack.now, sizeof(stack.now));
+		schedule();
+		stack.now = random_get_entropy();
+	}
+
+	del_timer_sync(&stack.timer);
+	destroy_timer_on_stack(&stack.timer);
+	mix_pool_bytes(&input_pool, &stack.now, sizeof(stack.now));
+}
+
 /*
  * Wait for the urandom pool to be seeded and thus guaranteed to supply
  * cryptographically secure random numbers. This applies to: the /dev/urandom
@@ -1745,7 +1795,17 @@ int wait_for_random_bytes(void)
 {
 	if (likely(crng_ready()))
 		return 0;
-	return wait_event_interruptible(crng_init_wait, crng_ready());
+
+	do {
+		int ret;
+		ret = wait_event_interruptible_timeout(crng_init_wait, crng_ready(), HZ);
+		if (ret)
+			return ret > 0 ? 0 : ret;
+
+		try_to_generate_entropy();
+	} while (!crng_ready());
+
+	return 0;
 }
 EXPORT_SYMBOL(wait_for_random_bytes);

From 02f03c4206c1b2a7451d3b3546f86c9c783eac13 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sun, 29 Sep 2019 17:59:23 -0700
Subject: [PATCH 328/334] Revert "Revert "ext4: make __ext4_get_inode_loc
 plug""

This reverts commit 72dbcf72156641fde4d8ea401e977341bfd35a05.

Instead of waiting forever for entropy that may just not happen, we now
try to actively generate entropy when required, and are thus hopefully
avoiding the problem that caused the nice ext4 IO pattern fix to be
reverted.

So revert the revert.

Cc: Ahmed S. Darwish
Cc: Ted Ts'o
Cc: Willy Tarreau
Cc: Alexander E. Patrakov
Signed-off-by: Linus Torvalds
---
 fs/ext4/inode.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 006b7a2070bf..420fe3deed39 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4586,6 +4586,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
 	struct buffer_head	*bh;
 	struct super_block	*sb = inode->i_sb;
 	ext4_fsblk_t		block;
+	struct blk_plug		plug;
 	int			inodes_per_block, inode_offset;
 
 	iloc->bh = NULL;
@@ -4674,6 +4675,7 @@ make_io:
 		 * If we need to do any I/O, try to pre-readahead extra
 		 * blocks from the inode table.
 		 */
+		blk_start_plug(&plug);
 		if (EXT4_SB(sb)->s_inode_readahead_blks) {
 			ext4_fsblk_t b, end, table;
 			unsigned num;
@@ -4704,6 +4706,7 @@ make_io:
 		get_bh(bh);
 		bh->b_end_io = end_buffer_read_sync;
 		submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
+		blk_finish_plug(&plug);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
 			EXT4_ERROR_INODE_BLOCK(inode, block,

From fdbdcddc2c93096e9b956de930d2d710a1342502 Mon Sep 17 00:00:00 2001
From: Mike Rapoport
Date: Wed, 28 Aug 2019 16:35:19 +0300
Subject: [PATCH 329/334] csky: Use generic free_initrd_mem()

The csky implementation of free_initrd_mem() is an open-coded version of
free_reserved_area() without poisoning.

Remove it and make csky use the generic version of free_initrd_mem().

Signed-off-by: Mike Rapoport
Signed-off-by: Guo Ren
---
 arch/csky/mm/init.c | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c
index eb0dc9e5065f..d4c2292ea46b 100644
--- a/arch/csky/mm/init.c
+++ b/arch/csky/mm/init.c
@@ -60,22 +60,6 @@ void __init mem_init(void)
 	mem_init_print_info(NULL);
 }
 
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-	if (start < end)
-		pr_info("Freeing initrd memory: %ldk freed\n",
-			(end - start) >> 10);
-
-	for (; start < end; start += PAGE_SIZE) {
-		ClearPageReserved(virt_to_page(start));
-		init_page_count(virt_to_page(start));
-		free_page(start);
-		totalram_pages_inc();
-	}
-}
-#endif
-
 extern char __init_begin[], __init_end[];
 
 void free_initmem(void)

From 48ede51fd94fe9251058fc85626b2aeb5cbb5884 Mon Sep 17 00:00:00 2001
From: Guo Ren
Date: Wed, 25 Sep 2019 19:56:16 +0800
Subject: [PATCH 330/334] csky: Add zero_fp fixup to fix perf backtrace panic

We need to set fp to zero to let backtrace know where the frame chain
ends.  This patch fixes a perf callchain panic: backtrace didn't know
what the end of the fp chain was.
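For context, a hedged sketch of why the terminator matters (illustrative
layout and names, not the real C-SKY unwinder): a frame-pointer walk can
only stop safely when it reaches a frame pointer of zero, which is what
the zero_fp macro below guarantees at kernel entry.

    #include <stdio.h>

    /* Hypothetical frame layout: each frame saves the caller's frame
     * pointer and a return address. */
    struct stackframe {
            unsigned long fp;       /* 0 marks the end of the chain */
            unsigned long lr;       /* return address */
    };

    static void walk_stackframe(unsigned long fp)
    {
            while (fp) {    /* terminates only because entry code zeroed fp */
                    const struct stackframe *frame =
                            (const struct stackframe *)fp;

                    printf("pc=%#lx\n", frame->lr);
                    fp = frame->fp;
            }
    }

    int main(void)
    {
            struct stackframe leaf = { 0, 0x100 };  /* fp == 0: end marker */
            struct stackframe mid = { (unsigned long)&leaf, 0x200 };

            walk_stackframe((unsigned long)&mid);
            return 0;
    }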
Signed-off-by: Guo Ren
Reported-by: Mao Han
---
 arch/csky/kernel/entry.S   | 50 +++++++++++++++++++++++---------------
 arch/csky/kernel/process.c |  2 +-
 2 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/arch/csky/kernel/entry.S b/arch/csky/kernel/entry.S
index a7e84ccccbd8..564dab2fabaa 100644
--- a/arch/csky/kernel/entry.S
+++ b/arch/csky/kernel/entry.S
@@ -17,6 +17,12 @@
 #define PTE_INDX_SHIFT 10
 #define _PGDIR_SHIFT 22
 
+.macro zero_fp
+#ifdef CONFIG_STACKTRACE
+	movi	r8, 0
+#endif
+.endm
+
 .macro tlbop_begin name, val0, val1, val2
 ENTRY(csky_\name)
 	mtcr	a3, ss2
@@ -96,6 +102,7 @@ ENTRY(csky_\name)
 	SAVE_ALL 0
 .endm
 .macro tlbop_end is_write
+	zero_fp
 	RD_MEH	a2
 	psrset	ee, ie
 	mov	a0, sp
@@ -120,6 +127,7 @@ tlbop_end 1
 
 ENTRY(csky_systemcall)
 	SAVE_ALL TRAP0_SIZE
+	zero_fp
 
 	psrset	ee, ie
 
@@ -136,9 +144,9 @@ ENTRY(csky_systemcall)
 	mov	r9, sp
 	bmaski	r10, THREAD_SHIFT
 	andn	r9, r10
-	ldw	r8, (r9, TINFO_FLAGS)
-	ANDI_R3	r8, (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT)
-	cmpnei	r8, 0
+	ldw	r12, (r9, TINFO_FLAGS)
+	ANDI_R3	r12, (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT)
+	cmpnei	r12, 0
 	bt	csky_syscall_trace
 #if defined(__CSKYABIV2__)
 	subi	sp, 8
@@ -180,7 +188,7 @@ csky_syscall_trace:
 
 ENTRY(ret_from_kernel_thread)
 	jbsr	schedule_tail
-	mov	a0, r8
+	mov	a0, r10
 	jsr	r9
 	jbsr	ret_from_exception
 
@@ -189,9 +197,9 @@ ENTRY(ret_from_fork)
 	mov	r9, sp
 	bmaski	r10, THREAD_SHIFT
 	andn	r9, r10
-	ldw	r8, (r9, TINFO_FLAGS)
-	ANDI_R3	r8, (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT)
-	cmpnei	r8, 0
+	ldw	r12, (r9, TINFO_FLAGS)
+	ANDI_R3	r12, (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT)
+	cmpnei	r12, 0
 	bf	ret_from_exception
 	mov	a0, sp			/* sp = pt_regs pointer */
 	jbsr	syscall_trace_exit
@@ -209,9 +217,9 @@ ret_from_exception:
 	bmaski	r10, THREAD_SHIFT
 	andn	r9, r10
 
-	ldw	r8, (r9, TINFO_FLAGS)
-	andi	r8, (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED)
-	cmpnei	r8, 0
+	ldw	r12, (r9, TINFO_FLAGS)
+	andi	r12, (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED)
+	cmpnei	r12, 0
 	bt	exit_work
 1:
 	RESTORE_ALL
@@ -220,11 +228,11 @@ exit_work:
 	lrw	syscallid, ret_from_exception
 	mov	lr, syscallid
 
-	btsti	r8, TIF_NEED_RESCHED
+	btsti	r12, TIF_NEED_RESCHED
 	bt	work_resched
 
 	mov	a0, sp
-	mov	a1, r8
+	mov	a1, r12
 	jmpi	do_notify_resume
 
 work_resched:
@@ -232,6 +240,7 @@ work_resched:
 
 ENTRY(csky_trap)
 	SAVE_ALL 0
+	zero_fp
 	psrset	ee
 	mov	a0, sp			/* Push Stack pointer arg */
 	jbsr	trap_c			/* Call C-level trap handler */
@@ -265,6 +274,7 @@ ENTRY(csky_get_tls)
 
 ENTRY(csky_irq)
 	SAVE_ALL 0
+	zero_fp
 	psrset	ee
 
 #ifdef CONFIG_PREEMPT
@@ -276,21 +286,21 @@ ENTRY(csky_irq)
 	 * Get task_struct->stack.preempt_count for current,
	 * and increase 1.
	 */
-	ldw	r8, (r9, TINFO_PREEMPT)
-	addi	r8, 1
-	stw	r8, (r9, TINFO_PREEMPT)
+	ldw	r12, (r9, TINFO_PREEMPT)
+	addi	r12, 1
+	stw	r12, (r9, TINFO_PREEMPT)
 #endif
 
 	mov	a0, sp
 	jbsr	csky_do_IRQ
 
 #ifdef CONFIG_PREEMPT
-	subi	r8, 1
-	stw	r8, (r9, TINFO_PREEMPT)
-	cmpnei	r8, 0
+	subi	r12, 1
+	stw	r12, (r9, TINFO_PREEMPT)
+	cmpnei	r12, 0
 	bt	2f
-	ldw	r8, (r9, TINFO_FLAGS)
-	btsti	r8, TIF_NEED_RESCHED
+	ldw	r12, (r9, TINFO_FLAGS)
+	btsti	r12, TIF_NEED_RESCHED
 	bf	2f
 1:
 	jbsr	preempt_schedule_irq	/* irq en/disable is done inside */
diff --git a/arch/csky/kernel/process.c b/arch/csky/kernel/process.c
index e555740c0be5..f320d9248a22 100644
--- a/arch/csky/kernel/process.c
+++ b/arch/csky/kernel/process.c
@@ -55,7 +55,7 @@ int copy_thread(unsigned long clone_flags,
 	if (unlikely(p->flags & PF_KTHREAD)) {
 		memset(childregs, 0, sizeof(struct pt_regs));
 		childstack->r15 = (unsigned long) ret_from_kernel_thread;
-		childstack->r8 = kthread_arg;
+		childstack->r10 = kthread_arg;
 		childstack->r9 = usp;
 		childregs->sr = mfcr("psr");
 	} else {

From 3a09d8e2893b2403a043890e5832966e8640feaf Mon Sep 17 00:00:00 2001
From: Mao Han
Date: Wed, 25 Sep 2019 17:23:02 +0800
Subject: [PATCH 331/334] csky: Fixup csky_pmu.max_period assignment

csky_pmu.max_period has type u64, but BIT() can only return a 32-bit
unsigned long on C-SKY, so the initialization of max_period will be
incorrect when count_width is bigger than 32.  Use BIT_ULL() instead.

Signed-off-by: Mao Han
Signed-off-by: Guo Ren
---
 arch/csky/kernel/perf_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/csky/kernel/perf_event.c b/arch/csky/kernel/perf_event.c
index 4c1a1934d76a..7570109cddc6 100644
--- a/arch/csky/kernel/perf_event.c
+++ b/arch/csky/kernel/perf_event.c
@@ -1306,7 +1306,7 @@ int csky_pmu_device_probe(struct platform_device *pdev,
 			 &csky_pmu.count_width)) {
 		csky_pmu.count_width = DEFAULT_COUNT_WIDTH;
 	}
-	csky_pmu.max_period = BIT(csky_pmu.count_width) - 1;
+	csky_pmu.max_period = BIT_ULL(csky_pmu.count_width) - 1;
 
 	csky_pmu.plat_device = pdev;

From a2139d3b4fd7ef26a363a1b1eb6cd55be2c1bcd1 Mon Sep 17 00:00:00 2001
From: Valentin Schneider
Date: Mon, 23 Sep 2019 15:36:14 +0100
Subject: [PATCH 332/334] csky: entry: Remove unneeded need_resched() loop

Since the enabling and disabling of IRQs within preempt_schedule_irq() is
contained in a need_resched() loop, we don't need the outer arch code
loop.
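A hedged C model of the redundancy being removed (plain stand-ins, not
kernel code): once the callee loops on need_resched() itself, the
caller's retry branch can never fire.

    #include <stdbool.h>
    #include <stdio.h>

    static int pending = 3;     /* pretend three wakeups arrive */

    static bool need_resched(void) { return pending > 0; }
    static void schedule_once(void) { pending--; printf("scheduled\n"); }

    /* Models preempt_schedule_irq(): rescheduling (with its IRQ
     * enable/disable) is contained in its own need_resched() loop. */
    static void preempt_schedule_irq_model(void)
    {
            do {
                    schedule_once();
            } while (need_resched());
    }

    int main(void)
    {
            preempt_schedule_irq_model();
            /* On return, need_resched() is already false, so an outer
             * "branch back and call again" loop, like the removed
             * assembly below, could never take the branch. */
            printf("need_resched=%d\n", need_resched());
            return 0;
    }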
Signed-off-by: Valentin Schneider
Signed-off-by: Guo Ren
---
 arch/csky/kernel/entry.S | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/arch/csky/kernel/entry.S b/arch/csky/kernel/entry.S
index 564dab2fabaa..a7a5b67df898 100644
--- a/arch/csky/kernel/entry.S
+++ b/arch/csky/kernel/entry.S
@@ -302,11 +302,7 @@ ENTRY(csky_irq)
 	ldw	r12, (r9, TINFO_FLAGS)
 	btsti	r12, TIF_NEED_RESCHED
 	bf	2f
-1:
 	jbsr	preempt_schedule_irq	/* irq en/disable is done inside */
-	ldw	r7, (r9, TINFO_FLAGS)	/* get new tasks TI_FLAGS */
-	btsti	r7, TIF_NEED_RESCHED
-	bt	1b			/* go again */
 #endif
 2:
 	jmpi	ret_from_exception

From 9af032a30172e119a5935f802b066631f8ded2d6 Mon Sep 17 00:00:00 2001
From: Krzysztof Wilczynski
Date: Tue, 3 Sep 2019 13:36:51 +0200
Subject: [PATCH 333/334] csky: Move static keyword to the front of
 declaration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move the static keyword to the front of the declaration of
csky_pmu_of_device_ids, and resolve the following compiler warning that
can be seen when building with warnings enabled (W=1):

arch/csky/kernel/perf_event.c:1340:1: warning: ‘static’ is not at
beginning of declaration [-Wold-style-declaration]

Signed-off-by: Krzysztof Wilczynski
Signed-off-by: Guo Ren
---
 arch/csky/kernel/perf_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/csky/kernel/perf_event.c b/arch/csky/kernel/perf_event.c
index 7570109cddc6..1a29f1157449 100644
--- a/arch/csky/kernel/perf_event.c
+++ b/arch/csky/kernel/perf_event.c
@@ -1337,7 +1337,7 @@ int csky_pmu_device_probe(struct platform_device *pdev,
 		return ret;
 	}
 
-const static struct of_device_id csky_pmu_of_device_ids[] = {
+static const struct of_device_id csky_pmu_of_device_ids[] = {
 	{.compatible = "csky,csky-pmu"},
 	{},
 };

From 54ecb8f7028c5eb3d740bb82b0f1d90f2df63c5c Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Mon, 30 Sep 2019 10:35:40 -0700
Subject: [PATCH 334/334] Linux 5.4-rc1

---
 Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index d456746da347..6f54f2f95743 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0
 VERSION = 5
-PATCHLEVEL = 3
+PATCHLEVEL = 4
 SUBLEVEL = 0
-EXTRAVERSION =
+EXTRAVERSION = -rc1
 NAME = Bobtail Squid
 
 # *DOCUMENTATION*