diff --git a/.clang-format b/.clang-format index badfc1ba440a..6cbd6ee51610 100644 --- a/.clang-format +++ b/.clang-format @@ -203,11 +203,13 @@ ForEachMacros: - 'for_each_matching_node' - 'for_each_matching_node_and_match' - 'for_each_member' - - 'for_each_memblock' + - 'for_each_mem_region' - 'for_each_memblock_type' - 'for_each_memcg_cache_index' - 'for_each_mem_pfn_range' + - '__for_each_mem_range' - 'for_each_mem_range' + - '__for_each_mem_range_rev' - 'for_each_mem_range_rev' - 'for_each_migratetype_order' - 'for_each_msi_entry' @@ -271,6 +273,7 @@ ForEachMacros: - 'for_each_registered_fb' - 'for_each_requested_gpio' - 'for_each_requested_gpio_in_range' + - 'for_each_reserved_mem_range' - 'for_each_reserved_mem_region' - 'for_each_rtd_codec_dais' - 'for_each_rtd_codec_dais_rollback' diff --git a/Documentation/ABI/obsolete/sysfs-selinux-checkreqprot b/Documentation/ABI/obsolete/sysfs-selinux-checkreqprot index 49ed9c8fd1e5..ed6b52ca210f 100644 --- a/Documentation/ABI/obsolete/sysfs-selinux-checkreqprot +++ b/Documentation/ABI/obsolete/sysfs-selinux-checkreqprot @@ -15,7 +15,7 @@ Description: actual protection), and Android and Linux distributions have been explicitly writing a "0" to /sys/fs/selinux/checkreqprot during initialization for some time. Support for setting checkreqprot to 1 - will be removed in a future kernel release, at which point the kernel + will be removed no sooner than June 2021, at which point the kernel will always cease using checkreqprot internally and will always check the actual protections being applied upon mmap/mprotect calls. The checkreqprot selinuxfs node will remain for backward compatibility diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index baa07b30845e..608d7c279396 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1259,6 +1259,10 @@ PAGE_SIZE multiple when read back. can show up in the middle. Don't rely on items remaining in a fixed position; use the keys to look up specific values! + If the entry has no per-node counter(or not show in the + mempry.numa_stat). We use 'npn'(non-per-node) as the tag + to indicate that it will not show in the mempry.numa_stat. + anon Amount of memory used in anonymous mappings such as brk(), sbrk(), and mmap(MAP_ANONYMOUS) @@ -1270,15 +1274,11 @@ PAGE_SIZE multiple when read back. kernel_stack Amount of memory allocated to kernel stacks. - slab - Amount of memory used for storing in-kernel data - structures. - - percpu + percpu(npn) Amount of memory used for storing per-cpu kernel data structures. - sock + sock(npn) Amount of memory used in network transmission buffers shmem @@ -1318,11 +1318,9 @@ PAGE_SIZE multiple when read back. Part of "slab" that cannot be reclaimed on memory pressure. - pgfault - Total number of page faults incurred - - pgmajfault - Number of major page faults incurred + slab(npn) + Amount of memory used for storing in-kernel data + structures. workingset_refault_anon Number of refaults of previously evicted anonymous pages. @@ -1348,37 +1346,68 @@ PAGE_SIZE multiple when read back. 
workingset_nodereclaim Number of times a shadow node has been reclaimed - pgrefill + pgfault(npn) + Total number of page faults incurred + + pgmajfault(npn) + Number of major page faults incurred + + pgrefill(npn) Amount of scanned pages (in an active LRU list) - pgscan + pgscan(npn) Amount of scanned pages (in an inactive LRU list) - pgsteal + pgsteal(npn) Amount of reclaimed pages - pgactivate + pgactivate(npn) Amount of pages moved to the active LRU list - pgdeactivate + pgdeactivate(npn) Amount of pages moved to the inactive LRU list - pglazyfree + pglazyfree(npn) Amount of pages postponed to be freed under memory pressure - pglazyfreed + pglazyfreed(npn) Amount of reclaimed lazyfree pages - thp_fault_alloc + thp_fault_alloc(npn) Number of transparent hugepages which were allocated to satisfy a page fault. This counter is not present when CONFIG_TRANSPARENT_HUGEPAGE is not set. - thp_collapse_alloc + thp_collapse_alloc(npn) Number of transparent hugepages which were allocated to allow collapsing an existing range of pages. This counter is not present when CONFIG_TRANSPARENT_HUGEPAGE is not set. + memory.numa_stat + A read-only nested-keyed file which exists on non-root cgroups. + + This breaks down the cgroup's memory footprint into different + types of memory, type-specific details, and other information + per node on the state of the memory management system. + + This is useful for providing visibility into the NUMA locality + information within an memcg since the pages are allowed to be + allocated from any physical node. One of the use case is evaluating + application performance by combining this information with the + application's CPU allocation. + + All memory amounts are in bytes. + + The output format of memory.numa_stat is:: + + type N0= N1= ... + + The entries are ordered to be human readable, and new entries + can show up in the middle. Don't rely on items remaining in a + fixed position; use the keys to look up specific values! + + The entries can refer to the memory.stat. + memory.swap.current A read-only single value file which exists on non-root cgroups. diff --git a/Documentation/admin-guide/kdump/gdbmacros.txt b/Documentation/admin-guide/kdump/gdbmacros.txt index 220d0a80ca2c..82aecdcae8a6 100644 --- a/Documentation/admin-guide/kdump/gdbmacros.txt +++ b/Documentation/admin-guide/kdump/gdbmacros.txt @@ -170,57 +170,82 @@ document trapinfo address the kernel panicked. 
end -define dump_log_idx - set $idx = $arg0 - if ($argc > 1) - set $prev_flags = $arg1 +define dump_record + set var $desc = $arg0 + set var $info = $arg1 + if ($argc > 2) + set var $prev_flags = $arg2 else - set $prev_flags = 0 - end - set $msg = ((struct printk_log *) (log_buf + $idx)) - set $prefix = 1 - set $newline = 1 - set $log = log_buf + $idx + sizeof(*$msg) - - # prev & LOG_CONT && !(msg->flags & LOG_PREIX) - if (($prev_flags & 8) && !($msg->flags & 4)) - set $prefix = 0 + set var $prev_flags = 0 end - # msg->flags & LOG_CONT - if ($msg->flags & 8) + set var $prefix = 1 + set var $newline = 1 + + set var $begin = $desc->text_blk_lpos.begin % (1U << prb->text_data_ring.size_bits) + set var $next = $desc->text_blk_lpos.next % (1U << prb->text_data_ring.size_bits) + + # handle data-less record + if ($begin & 1) + set var $text_len = 0 + set var $log = "" + else + # handle wrapping data block + if ($begin > $next) + set var $begin = 0 + end + + # skip over descriptor id + set var $begin = $begin + sizeof(long) + + # handle truncated message + if ($next - $begin < $info->text_len) + set var $text_len = $next - $begin + else + set var $text_len = $info->text_len + end + + set var $log = &prb->text_data_ring.data[$begin] + end + + # prev & LOG_CONT && !(info->flags & LOG_PREIX) + if (($prev_flags & 8) && !($info->flags & 4)) + set var $prefix = 0 + end + + # info->flags & LOG_CONT + if ($info->flags & 8) # (prev & LOG_CONT && !(prev & LOG_NEWLINE)) if (($prev_flags & 8) && !($prev_flags & 2)) - set $prefix = 0 + set var $prefix = 0 end - # (!(msg->flags & LOG_NEWLINE)) - if (!($msg->flags & 2)) - set $newline = 0 + # (!(info->flags & LOG_NEWLINE)) + if (!($info->flags & 2)) + set var $newline = 0 end end if ($prefix) - printf "[%5lu.%06lu] ", $msg->ts_nsec / 1000000000, $msg->ts_nsec % 1000000000 + printf "[%5lu.%06lu] ", $info->ts_nsec / 1000000000, $info->ts_nsec % 1000000000 end - if ($msg->text_len != 0) - eval "printf \"%%%d.%ds\", $log", $msg->text_len, $msg->text_len + if ($text_len) + eval "printf \"%%%d.%ds\", $log", $text_len, $text_len end if ($newline) printf "\n" end - if ($msg->dict_len > 0) - set $dict = $log + $msg->text_len - set $idx = 0 - set $line = 1 - while ($idx < $msg->dict_len) - if ($line) - printf " " - set $line = 0 - end - set $c = $dict[$idx] + + # handle dictionary data + + set var $dict = &$info->dev_info.subsystem[0] + set var $dict_len = sizeof($info->dev_info.subsystem) + if ($dict[0] != '\0') + printf " SUBSYSTEM=" + set var $idx = 0 + while ($idx < $dict_len) + set var $c = $dict[$idx] if ($c == '\0') - printf "\n" - set $line = 1 + loop_break else if ($c < ' ' || $c >= 127 || $c == '\\') printf "\\x%02x", $c @@ -228,33 +253,67 @@ define dump_log_idx printf "%c", $c end end - set $idx = $idx + 1 + set var $idx = $idx + 1 + end + printf "\n" + end + + set var $dict = &$info->dev_info.device[0] + set var $dict_len = sizeof($info->dev_info.device) + if ($dict[0] != '\0') + printf " DEVICE=" + set var $idx = 0 + while ($idx < $dict_len) + set var $c = $dict[$idx] + if ($c == '\0') + loop_break + else + if ($c < ' ' || $c >= 127 || $c == '\\') + printf "\\x%02x", $c + else + printf "%c", $c + end + end + set var $idx = $idx + 1 end printf "\n" end end -document dump_log_idx - Dump a single log given its index in the log buffer. The first - parameter is the index into log_buf, the second is optional and - specified the previous log buffer's flags, used for properly - formatting continued lines. +document dump_record + Dump a single record. 
The first parameter is the descriptor, + the second parameter is the info, the third parameter is + optional and specifies the previous record's flags, used for + properly formatting continued lines. end define dmesg - set $i = log_first_idx - set $end_idx = log_first_idx - set $prev_flags = 0 + # definitions from kernel/printk/printk_ringbuffer.h + set var $desc_committed = 1 + set var $desc_finalized = 2 + set var $desc_sv_bits = sizeof(long) * 8 + set var $desc_flags_shift = $desc_sv_bits - 2 + set var $desc_flags_mask = 3 << $desc_flags_shift + set var $id_mask = ~$desc_flags_mask + + set var $desc_count = 1U << prb->desc_ring.count_bits + set var $prev_flags = 0 + + set var $id = prb->desc_ring.tail_id.counter + set var $end_id = prb->desc_ring.head_id.counter while (1) - set $msg = ((struct printk_log *) (log_buf + $i)) - if ($msg->len == 0) - set $i = 0 - else - dump_log_idx $i $prev_flags - set $i = $i + $msg->len - set $prev_flags = $msg->flags + set var $desc = &prb->desc_ring.descs[$id % $desc_count] + set var $info = &prb->desc_ring.infos[$id % $desc_count] + + # skip non-committed record + set var $state = 3 & ($desc->state_var.counter >> $desc_flags_shift) + if ($state == $desc_committed || $state == $desc_finalized) + dump_record $desc $info $prev_flags + set var $prev_flags = $info->flags end - if ($i == $end_idx) + + set var $id = ($id + 1) & $id_mask + if ($id == $end_id) loop_break end end diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index 2baad0bfb09d..e44a6c01f336 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -189,50 +189,123 @@ from this. Free areas descriptor. User-space tools use this value to iterate the free_area ranges. MAX_ORDER is used by the zone buddy allocator. -log_first_idx -------------- +prb +--- -Index of the first record stored in the buffer log_buf. Used by -user-space tools to read the strings in the log_buf. +A pointer to the printk ringbuffer (struct printk_ringbuffer). This +may be pointing to the static boot ringbuffer or the dynamically +allocated ringbuffer, depending on when the the core dump occurred. +Used by user-space tools to read the active kernel log buffer. -log_buf -------- +printk_rb_static +---------------- -Console output is written to the ring buffer log_buf at index -log_first_idx. Used to get the kernel log. +A pointer to the static boot printk ringbuffer. If @prb has a +different value, this is useful for viewing the initial boot messages, +which may have been overwritten in the dynamically allocated +ringbuffer. -log_buf_len ------------ - -log_buf's length. - -clear_idx +clear_seq --------- -The index that the next printk() record to read after the last clear -command. It indicates the first record after the last SYSLOG_ACTION -_CLEAR, like issued by 'dmesg -c'. Used by user-space tools to dump -the dmesg log. +The sequence number of the printk() record after the last clear +command. It indicates the first record after the last +SYSLOG_ACTION_CLEAR, like issued by 'dmesg -c'. Used by user-space +tools to dump a subset of the dmesg log. -log_next_idx ------------- +printk_ringbuffer +----------------- -The index of the next record to store in the buffer log_buf. Used to -compute the index of the current buffer position. +The size of a printk_ringbuffer structure. This structure contains all +information required for accessing the various components of the +kernel log buffer. 
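The descriptor arithmetic performed by the dmesg macro above is the same arithmetic a user-space tool carries out with the values exported here. A minimal C sketch, assuming the tool has already read count_bits and a descriptor's state_var counter out of the dump (the helper names are illustrative, not kernel API)::

  #include <stdbool.h>
  #include <stdint.h>

  /* Constants mirroring kernel/printk/printk_ringbuffer.h, as used by
   * the dmesg macro: the top two bits of state_var hold the state. */
  #define DESC_SV_BITS      (sizeof(unsigned long) * 8)
  #define DESC_FLAGS_SHIFT  (DESC_SV_BITS - 2)
  #define DESC_COMMITTED    1
  #define DESC_FINALIZED    2

  /* Map a record id onto its slot in the descriptor array. */
  static inline uint64_t desc_index(uint64_t id, unsigned int count_bits)
  {
          return id % (1ULL << count_bits);
  }

  /* A record may be read only once it is committed or finalized. */
  static inline bool desc_readable(unsigned long state_var)
  {
          unsigned long state = 3 & (state_var >> DESC_FLAGS_SHIFT);

          return state == DESC_COMMITTED || state == DESC_FINALIZED;
  }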
-printk_log ----------- +(printk_ringbuffer, desc_ring|text_data_ring|dict_data_ring|fail) +----------------------------------------------------------------- -The size of a structure printk_log. Used to compute the size of -messages, and extract dmesg log. It encapsulates header information for -log_buf, such as timestamp, syslog level, etc. +Offsets for the various components of the printk ringbuffer. Used by +user-space tools to view the kernel log buffer without requiring the +declaration of the structure. -(printk_log, ts_nsec|len|text_len|dict_len) -------------------------------------------- +prb_desc_ring +------------- -It represents field offsets in struct printk_log. User space tools -parse it and check whether the values of printk_log's members have been -changed. +The size of the prb_desc_ring structure. This structure contains +information about the set of record descriptors. + +(prb_desc_ring, count_bits|descs|head_id|tail_id) +------------------------------------------------- + +Offsets for the fields describing the set of record descriptors. Used +by user-space tools to be able to traverse the descriptors without +requiring the declaration of the structure. + +prb_desc +-------- + +The size of the prb_desc structure. This structure contains +information about a single record descriptor. + +(prb_desc, info|state_var|text_blk_lpos|dict_blk_lpos) +------------------------------------------------------ + +Offsets for the fields describing a record descriptors. Used by +user-space tools to be able to read descriptors without requiring +the declaration of the structure. + +prb_data_blk_lpos +----------------- + +The size of the prb_data_blk_lpos structure. This structure contains +information about where the text or dictionary data (data block) is +located within the respective data ring. + +(prb_data_blk_lpos, begin|next) +------------------------------- + +Offsets for the fields describing the location of a data block. Used +by user-space tools to be able to locate data blocks without +requiring the declaration of the structure. + +printk_info +----------- + +The size of the printk_info structure. This structure contains all +the meta-data for a record. + +(printk_info, seq|ts_nsec|text_len|dict_len|caller_id) +------------------------------------------------------ + +Offsets for the fields providing the meta-data for a record. Used by +user-space tools to be able to read the information without requiring +the declaration of the structure. + +prb_data_ring +------------- + +The size of the prb_data_ring structure. This structure contains +information about a set of data blocks. + +(prb_data_ring, size_bits|data|head_lpos|tail_lpos) +--------------------------------------------------- + +Offsets for the fields describing a set of data blocks. Used by +user-space tools to be able to access the data blocks without +requiring the declaration of the structure. + +atomic_long_t +------------- + +The size of the atomic_long_t structure. Used by user-space tools to +be able to copy the full structure, regardless of its +architecture-specific implementation. + +(atomic_long_t, counter) +------------------------ + +Offset for the long value of an atomic_long_t variable. Used by +user-space tools to access the long value without requiring the +architecture-specific declaration. 
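Putting these entries together, a tool locates a field by starting from a structure's address, adding the exported offsets, and copying the exported size, with no structure declarations involved. A minimal sketch, assuming a hypothetical read_dump() helper supplied by the dump tool (not a kernel interface) and a little-endian dump::

  #include <stddef.h>
  #include <stdint.h>

  /* Hypothetical helper: copy 'len' bytes at dump address 'addr' into 'buf'. */
  extern int read_dump(uint64_t addr, void *buf, size_t len);

  /*
   * Read prb->desc_ring.tail_id.counter using only the exported offsets of
   * desc_ring within printk_ringbuffer, tail_id within prb_desc_ring and
   * counter within atomic_long_t, plus the exported size of atomic_long_t.
   */
  static int read_tail_id(uint64_t prb_addr, uint64_t off_desc_ring,
                          uint64_t off_tail_id, uint64_t off_counter,
                          size_t size_counter, uint64_t *tail_id)
  {
          uint64_t addr = prb_addr + off_desc_ring + off_tail_id + off_counter;
          uint64_t val = 0;

          if (size_counter > sizeof(val))
                  return -1;
          if (read_dump(addr, &val, size_counter))
                  return -1;
          *tail_id = val;
          return 0;
  }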
(free_area.free_list, MIGRATE_TYPES) ------------------------------------ diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst index 015a5f7d7854..f7b1c7462991 100644 --- a/Documentation/admin-guide/mm/hugetlbpage.rst +++ b/Documentation/admin-guide/mm/hugetlbpage.rst @@ -131,7 +131,7 @@ hugepages parameter is preceded by an invalid hugepagesz parameter, it will be ignored. default_hugepagesz - pecify the default huge page size. This parameter can + Specify the default huge page size. This parameter can only be specified once on the command line. default_hugepagesz can optionally be followed by the hugepages parameter to preallocate a specific number of huge pages of default size. The number of default diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 38fd5681fade..c09c9ca2ff1c 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -13,10 +13,10 @@ KASAN uses compile-time instrumentation to insert validity checks before every memory access, and therefore requires a compiler version that supports that. Generic KASAN is supported in both GCC and Clang. With GCC it requires version -8.3.0 or later. With Clang it requires version 7.0.0 or later, but detection of +8.3.0 or later. Any supported Clang version is compatible, but detection of out-of-bounds accesses for global variables is only supported since Clang 11. -Tag-based KASAN is only supported in Clang and requires version 7.0.0 or later. +Tag-based KASAN is only supported in Clang. Currently generic KASAN is supported for the x86_64, arm64, xtensa, s390 and riscv architectures, and tag-based KASAN is supported only for arm64. @@ -281,3 +281,73 @@ unmapped. This will require changes in arch-specific code. This allows ``VMAP_STACK`` support on x86, and can simplify support of architectures that do not have a fixed module region. + +CONFIG_KASAN_KUNIT_TEST & CONFIG_TEST_KASAN_MODULE +-------------------------------------------------- + +``CONFIG_KASAN_KUNIT_TEST`` utilizes the KUnit Test Framework for testing. +This means each test focuses on a small unit of functionality and +there are a few ways these tests can be run. + +Each test will print the KASAN report if an error is detected and then +print the number of the test and the status of the test: + +pass:: + + ok 28 - kmalloc_double_kzfree +or, if kmalloc failed:: + + # kmalloc_large_oob_right: ASSERTION FAILED at lib/test_kasan.c:163 + Expected ptr is not null, but is + not ok 4 - kmalloc_large_oob_right +or, if a KASAN report was expected, but not found:: + + # kmalloc_double_kzfree: EXPECTATION FAILED at lib/test_kasan.c:629 + Expected kasan_data->report_expected == kasan_data->report_found, but + kasan_data->report_expected == 1 + kasan_data->report_found == 0 + not ok 28 - kmalloc_double_kzfree + +All test statuses are tracked as they run and an overall status will +be printed at the end:: + + ok 1 - kasan + +or:: + + not ok 1 - kasan + +(1) Loadable Module +~~~~~~~~~~~~~~~~~~~~ + +With ``CONFIG_KUNIT`` enabled, ``CONFIG_KASAN_KUNIT_TEST`` can be built as +a loadable module and run on any architecture that supports KASAN +using something like insmod or modprobe. The module is called ``test_kasan``. + +(2) Built-In +~~~~~~~~~~~~~ + +With ``CONFIG_KUNIT`` built-in, ``CONFIG_KASAN_KUNIT_TEST`` can be built-in +on any architecure that supports KASAN. These and any other KUnit +tests enabled will run and print the results at boot as a late-init +call. 
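Regardless of how the tests are built, the defects they provoke are ordinary memory-safety bugs. A minimal sketch of the out-of-bounds pattern behind a test such as kmalloc_large_oob_right (illustrative only, not the actual lib/test_kasan.c code); with KASAN enabled the access is reported as a slab-out-of-bounds error::

  #include <linux/slab.h>

  static noinline char kmalloc_oob_read(size_t size)
  {
          char *ptr = kmalloc(size, GFP_KERNEL);
          char c;

          if (!ptr)
                  return 0;
          c = ptr[size];          /* one byte past the object: KASAN report */
          kfree(ptr);
          return c;
  }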
+ +(3) Using kunit_tool +~~~~~~~~~~~~~~~~~~~~~ + +With ``CONFIG_KUNIT`` and ``CONFIG_KASAN_KUNIT_TEST`` built-in, we can also +use kunit_tool to see the results of these along with other KUnit +tests in a more readable way. This will not print the KASAN reports +of tests that passed. Use `KUnit documentation `_ for more up-to-date +information on kunit_tool. + +.. _KUnit: https://www.kernel.org/doc/html/latest/dev-tools/kunit/index.html + +``CONFIG_TEST_KASAN_MODULE`` is a set of KASAN tests that could not be +converted to KUnit. These tests can be run only as a module with +``CONFIG_TEST_KASAN_MODULE`` built as a loadable module and +``CONFIG_KASAN`` built-in. The type of error expected and the +function being run is printed before the expression expected to give +an error. Then the error is printed, if found, and that test +should be interpretted to pass only if the error was the one expected +by the test. diff --git a/Documentation/dev-tools/kmemleak.rst b/Documentation/dev-tools/kmemleak.rst index a41a2d238af2..1c935f41cd3a 100644 --- a/Documentation/dev-tools/kmemleak.rst +++ b/Documentation/dev-tools/kmemleak.rst @@ -229,7 +229,7 @@ Testing with kmemleak-test To check if you have all set up to use kmemleak, you can use the kmemleak-test module, a module that deliberately leaks memory. Set CONFIG_DEBUG_KMEMLEAK_TEST -as module (it can't be used as bult-in) and boot the kernel with kmemleak +as module (it can't be used as built-in) and boot the kernel with kmemleak enabled. Load the module and perform a scan with:: # modprobe kmemleak-test diff --git a/Documentation/kbuild/makefiles.rst b/Documentation/kbuild/makefiles.rst index 58d513a0fa95..0d5dd5413af0 100644 --- a/Documentation/kbuild/makefiles.rst +++ b/Documentation/kbuild/makefiles.rst @@ -21,6 +21,7 @@ This document describes the Linux kernel Makefiles. --- 3.10 Special Rules --- 3.11 $(CC) support functions --- 3.12 $(LD) support functions + --- 3.13 Script Invocation === 4 Host Program support --- 4.1 Simple Host Program @@ -605,6 +606,25 @@ more details, with real examples. #Makefile LDFLAGS_vmlinux += $(call ld-option, -X) +3.13 Script invocation +---------------------- + + Make rules may invoke scripts to build the kernel. The rules shall + always provide the appropriate interpreter to execute the script. They + shall not rely on the execute bits being set, and shall not invoke the + script directly. For the convenience of manual script invocation, such + as invoking ./scripts/checkpatch.pl, it is recommended to set execute + bits on the scripts nonetheless. + + Kbuild provides variables $(CONFIG_SHELL), $(AWK), $(PERL), + $(PYTHON) and $(PYTHON3) to refer to interpreters for the respective + scripts. + + Example:: + + #Makefile + cmd_depmod = $(CONFIG_SHELL) $(srctree)/scripts/depmod.sh $(DEPMOD) \ + $(KERNELRELEASE) 4 Host Program support ====================== diff --git a/Documentation/vm/active_mm.rst b/Documentation/vm/active_mm.rst index c84471b180f8..6f8269c284ed 100644 --- a/Documentation/vm/active_mm.rst +++ b/Documentation/vm/active_mm.rst @@ -64,7 +64,7 @@ Active MM actually get cases where you have a address space that is _only_ used by lazy users. That is often a short-lived state, because once that thread gets scheduled away in favour of a real thread, the "zombie" mm gets - released because "mm_users" becomes zero. + released because "mm_count" becomes zero. Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any more. 
"init_mm" should be considered just a "lazy context when no other diff --git a/Documentation/x86/x86_64/boot-options.rst b/Documentation/x86/x86_64/boot-options.rst index 2b98efb5ba7f..324cefff92e7 100644 --- a/Documentation/x86/x86_64/boot-options.rst +++ b/Documentation/x86/x86_64/boot-options.rst @@ -173,6 +173,10 @@ NUMA numa=noacpi Don't parse the SRAT table for NUMA setup + numa=nohmat + Don't parse the HMAT table for NUMA setup, or soft-reserved memory + partitioning. + numa=fake=[MG] If given as a memory unit, fills all system RAM with nodes of size interleaved over physical nodes. diff --git a/MAINTAINERS b/MAINTAINERS index d817046a31ea..57c6346606cf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9734,8 +9734,8 @@ M: Catalin Marinas S: Maintained F: Documentation/dev-tools/kmemleak.rst F: include/linux/kmemleak.h -F: mm/kmemleak-test.c F: mm/kmemleak.c +F: samples/kmemleak/kmemleak-test.c KMOD KERNEL MODULE LOADER - USERMODE HELPER M: Luis Chamberlain @@ -13983,6 +13983,7 @@ PRINTK M: Petr Mladek M: Sergey Senozhatsky R: Steven Rostedt +R: John Ogness S: Maintained F: include/linux/printk.h F: kernel/printk/ @@ -15633,6 +15634,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/selinux.git F: Documentation/ABI/obsolete/sysfs-selinux-checkreqprot F: Documentation/ABI/obsolete/sysfs-selinux-disable F: Documentation/admin-guide/LSM/SELinux.rst +F: include/trace/events/avc.h F: include/uapi/linux/selinux_netlink.h F: scripts/selinux/ F: security/selinux/ diff --git a/Makefile b/Makefile index e2ffa835c999..925e44ca2451 100644 --- a/Makefile +++ b/Makefile @@ -926,15 +926,6 @@ KBUILD_CFLAGS += $(call cc-disable-warning, maybe-uninitialized) # disable invalid "can't wrap" optimizations for signed / pointers KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow) -# clang sets -fmerge-all-constants by default as optimization, but this -# is non-conforming behavior for C and in fact breaks the kernel, so we -# need to disable it here generally. -KBUILD_CFLAGS += $(call cc-option,-fno-merge-all-constants) - -# for gcc -fno-merge-all-constants disables everything, but it is fine -# to have actual conforming behavior enabled. -KBUILD_CFLAGS += $(call cc-option,-fmerge-constants) - # Make sure -fstack-check isn't enabled (like gentoo apparently did) KBUILD_CFLAGS += $(call cc-option,-fno-stack-check,) diff --git a/arch/Kconfig b/arch/Kconfig index 76ec3395b843..8519d9f42e33 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -450,10 +450,23 @@ config ARCH_WANT_OLD_COMPAT_IPC select ARCH_WANT_COMPAT_IPC_PARSE_VERSION bool -config HAVE_ARCH_SECCOMP_FILTER +config HAVE_ARCH_SECCOMP bool + help + An arch should select this symbol to support seccomp mode 1 (the fixed + syscall policy), and must provide an overrides for __NR_seccomp_sigreturn, + and compat syscalls if the asm-generic/seccomp.h defaults need adjustment: + - __NR_seccomp_read_32 + - __NR_seccomp_write_32 + - __NR_seccomp_exit_32 + - __NR_seccomp_sigreturn_32 + +config HAVE_ARCH_SECCOMP_FILTER + bool + select HAVE_ARCH_SECCOMP help An arch should select this symbol if it provides all of these things: + - all the requirements for HAVE_ARCH_SECCOMP - syscall_get_arch() - syscall_get_arguments() - syscall_rollback() @@ -464,6 +477,23 @@ config HAVE_ARCH_SECCOMP_FILTER results in the system call being skipped immediately. 
- seccomp syscall wired up +config SECCOMP + prompt "Enable seccomp to safely execute untrusted bytecode" + def_bool y + depends on HAVE_ARCH_SECCOMP + help + This kernel feature is useful for number crunching applications + that may need to handle untrusted bytecode during their + execution. By using pipes or other transports made available + to the process as file descriptors supporting the read/write + syscalls, it's possible to isolate those applications in their + own address space using seccomp. Once seccomp is enabled via + prctl(PR_SET_SECCOMP) or the seccomp() syscall, it cannot be + disabled and the task is only allowed to execute a few safe + syscalls defined by each seccomp mode. + + If unsure, say Y. + config SECCOMP_FILTER def_bool y depends on HAVE_ARCH_SECCOMP_FILTER && SECCOMP && NET diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index e67ef15c800f..3996b6572c3a 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -68,6 +68,7 @@ config ARM select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU select HAVE_ARCH_MMAP_RND_BITS if MMU + select HAVE_ARCH_SECCOMP select HAVE_ARCH_SECCOMP_FILTER if AEABI && !OABI_COMPAT select HAVE_ARCH_THREAD_STRUCT_WHITELIST select HAVE_ARCH_TRACEHOOK @@ -84,7 +85,7 @@ config ARM select HAVE_FAST_GUP if ARM_LPAE select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL select HAVE_FUNCTION_GRAPH_TRACER if !THUMB2_KERNEL && !CC_IS_CLANG - select HAVE_FUNCTION_TRACER if !XIP_KERNEL && (CC_IS_GCC || CLANG_VERSION >= 100000) + select HAVE_FUNCTION_TRACER if !XIP_KERNEL select HAVE_GCC_PLUGINS select HAVE_HW_BREAKPOINT if PERF_EVENTS && (CPU_V6 || CPU_V6K || CPU_V7) select HAVE_IDE if PCI || ISA || PCMCIA @@ -1618,20 +1619,6 @@ config UACCESS_WITH_MEMCPY However, if the CPU data cache is using a write-allocate mode, this option is unlikely to provide any performance gain. -config SECCOMP - bool - prompt "Enable seccomp to safely compute untrusted bytecode" - help - This kernel feature is useful for number crunching applications - that may need to compute untrusted bytecode during their - execution. By using pipes or other transports made available to - the process as file descriptors supporting the read/write - syscalls, it's possible to isolate those applications in - their own address space using seccomp. Once seccomp is - enabled via prctl(PR_SET_SECCOMP), it cannot be disabled - and the task is only allowed to execute a few safe syscalls - defined by each seccomp mode. 
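As a user-space illustration of what this help text promises, entering the fixed-policy mode 1 takes a single prctl() call, after which only read(), write(), _exit() and sigreturn() remain usable; a minimal sketch::

  #include <linux/seccomp.h>
  #include <stdio.h>
  #include <sys/prctl.h>
  #include <unistd.h>

  int main(void)
  {
          if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0)) {
                  perror("prctl(PR_SET_SECCOMP)");
                  return 1;
          }
          /* From here on, any syscall other than read(), write(),
           * _exit() and sigreturn() kills the task with SIGKILL. */
          write(1, "sandboxed\n", 10);
          _exit(0);
  }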
- config PARAVIRT bool "Enable paravirtualization code" help diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index 9415222b49ad..b8cbe03ad260 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -59,6 +59,7 @@ __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr) #ifdef CONFIG_ARM_LPAE struct page *page = virt_to_page(pmdp); + pgtable_pmd_page_dtor(page); tlb_remove_table(tlb, page); #endif } diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 9a6f6757cb7d..50b1803dfdbf 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -843,19 +843,25 @@ early_param("mem", early_mem); static void __init request_standard_resources(const struct machine_desc *mdesc) { - struct memblock_region *region; + phys_addr_t start, end, res_end; struct resource *res; + u64 i; kernel_code.start = virt_to_phys(_text); kernel_code.end = virt_to_phys(__init_begin - 1); kernel_data.start = virt_to_phys(_sdata); kernel_data.end = virt_to_phys(_end - 1); - for_each_memblock(memory, region) { - phys_addr_t start = __pfn_to_phys(memblock_region_memory_base_pfn(region)); - phys_addr_t end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1; + for_each_mem_range(i, &start, &end) { unsigned long boot_alias_start; + /* + * In memblock, end points to the first byte after the + * range while in resourses, end points to the last byte in + * the range. + */ + res_end = end - 1; + /* * Some systems have a special memory alias which is only * used for booting. We need to advertise this region to @@ -869,7 +875,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) __func__, sizeof(*res)); res->name = "System RAM (boot alias)"; res->start = boot_alias_start; - res->end = phys_to_idmap(end); + res->end = phys_to_idmap(res_end); res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; request_resource(&iomem_resource, res); } @@ -880,7 +886,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) sizeof(*res)); res->name = "System RAM"; res->start = start; - res->end = end; + res->end = res_end; res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; request_resource(&iomem_resource, res); diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 000c1b48e973..45f9d5ec2360 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -299,16 +299,14 @@ free_memmap(unsigned long start_pfn, unsigned long end_pfn) */ static void __init free_unused_memmap(void) { - unsigned long start, prev_end = 0; - struct memblock_region *reg; + unsigned long start, end, prev_end = 0; + int i; /* * This relies on each bank being in address order. * The banks are sorted previously in bootmem_init(). */ - for_each_memblock(memory, reg) { - start = memblock_region_memory_base_pfn(reg); - + for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) { #ifdef CONFIG_SPARSEMEM /* * Take care not to free memmap entries that don't exist @@ -336,8 +334,7 @@ static void __init free_unused_memmap(void) * memmap entries are valid from the bank end aligned to * MAX_ORDER_NR_PAGES. 
*/ - prev_end = ALIGN(memblock_region_memory_end_pfn(reg), - MAX_ORDER_NR_PAGES); + prev_end = ALIGN(end, MAX_ORDER_NR_PAGES); } #ifdef CONFIG_SPARSEMEM @@ -347,61 +344,29 @@ static void __init free_unused_memmap(void) #endif } -#ifdef CONFIG_HIGHMEM -static inline void free_area_high(unsigned long pfn, unsigned long end) -{ - for (; pfn < end; pfn++) - free_highmem_page(pfn_to_page(pfn)); -} -#endif - static void __init free_highpages(void) { #ifdef CONFIG_HIGHMEM unsigned long max_low = max_low_pfn; - struct memblock_region *mem, *res; + phys_addr_t range_start, range_end; + u64 i; /* set highmem page free */ - for_each_memblock(memory, mem) { - unsigned long start = memblock_region_memory_base_pfn(mem); - unsigned long end = memblock_region_memory_end_pfn(mem); + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, + &range_start, &range_end, NULL) { + unsigned long start = PHYS_PFN(range_start); + unsigned long end = PHYS_PFN(range_end); /* Ignore complete lowmem entries */ if (end <= max_low) continue; - if (memblock_is_nomap(mem)) - continue; - /* Truncate partial highmem entries */ if (start < max_low) start = max_low; - /* Find and exclude any reserved regions */ - for_each_memblock(reserved, res) { - unsigned long res_start, res_end; - - res_start = memblock_region_reserved_base_pfn(res); - res_end = memblock_region_reserved_end_pfn(res); - - if (res_end < start) - continue; - if (res_start < start) - res_start = start; - if (res_start > end) - res_start = end; - if (res_end > end) - res_end = end; - if (res_start != start) - free_area_high(start, res_start); - start = res_end; - if (start == end) - break; - } - - /* And now free anything which remains */ - if (start < end) - free_area_high(start, end); + for (; start < end; start++) + free_highmem_page(pfn_to_page(start)); } #endif } diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index c36f977b2ccb..698cc740c6b8 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -1154,9 +1154,8 @@ phys_addr_t arm_lowmem_limit __initdata = 0; void __init adjust_lowmem_bounds(void) { - phys_addr_t memblock_limit = 0; - u64 vmalloc_limit; - struct memblock_region *reg; + phys_addr_t block_start, block_end, memblock_limit = 0; + u64 vmalloc_limit, i; phys_addr_t lowmem_limit = 0; /* @@ -1172,26 +1171,18 @@ void __init adjust_lowmem_bounds(void) * The first usable region must be PMD aligned. 
Mark its start * as MEMBLOCK_NOMAP if it isn't */ - for_each_memblock(memory, reg) { - if (!memblock_is_nomap(reg)) { - if (!IS_ALIGNED(reg->base, PMD_SIZE)) { - phys_addr_t len; + for_each_mem_range(i, &block_start, &block_end) { + if (!IS_ALIGNED(block_start, PMD_SIZE)) { + phys_addr_t len; - len = round_up(reg->base, PMD_SIZE) - reg->base; - memblock_mark_nomap(reg->base, len); - } - break; + len = round_up(block_start, PMD_SIZE) - block_start; + memblock_mark_nomap(block_start, len); } + break; } - for_each_memblock(memory, reg) { - phys_addr_t block_start = reg->base; - phys_addr_t block_end = reg->base + reg->size; - - if (memblock_is_nomap(reg)) - continue; - - if (reg->base < vmalloc_limit) { + for_each_mem_range(i, &block_start, &block_end) { + if (block_start < vmalloc_limit) { if (block_end > lowmem_limit) /* * Compare as u64 to ensure vmalloc_limit does @@ -1440,19 +1431,15 @@ static void __init kmap_init(void) static void __init map_lowmem(void) { - struct memblock_region *reg; phys_addr_t kernel_x_start = round_down(__pa(KERNEL_START), SECTION_SIZE); phys_addr_t kernel_x_end = round_up(__pa(__init_end), SECTION_SIZE); + phys_addr_t start, end; + u64 i; /* Map all the lowmem memory banks. */ - for_each_memblock(memory, reg) { - phys_addr_t start = reg->base; - phys_addr_t end = start + reg->size; + for_each_mem_range(i, &start, &end) { struct map_desc map; - if (memblock_is_nomap(reg)) - continue; - if (end > arm_lowmem_limit) end = arm_lowmem_limit; if (start >= end) diff --git a/arch/arm/mm/pmsa-v7.c b/arch/arm/mm/pmsa-v7.c index 699fa2e88725..88950e41a3a9 100644 --- a/arch/arm/mm/pmsa-v7.c +++ b/arch/arm/mm/pmsa-v7.c @@ -231,12 +231,12 @@ static int __init allocate_region(phys_addr_t base, phys_addr_t size, void __init pmsav7_adjust_lowmem_bounds(void) { phys_addr_t specified_mem_size = 0, total_mem_size = 0; - struct memblock_region *reg; - bool first = true; phys_addr_t mem_start; phys_addr_t mem_end; + phys_addr_t reg_start, reg_end; unsigned int mem_max_regions; - int num, i; + int num; + u64 i; /* Free-up PMSAv7_PROBE_REGION */ mpu_min_region_order = __mpu_min_region_order(); @@ -262,20 +262,19 @@ void __init pmsav7_adjust_lowmem_bounds(void) mem_max_regions -= num; #endif - for_each_memblock(memory, reg) { - if (first) { + for_each_mem_range(i, ®_start, ®_end) { + if (i == 0) { phys_addr_t phys_offset = PHYS_OFFSET; /* * Initially only use memory continuous from * PHYS_OFFSET */ - if (reg->base != phys_offset) + if (reg_start != phys_offset) panic("First memory bank must be contiguous from PHYS_OFFSET"); - mem_start = reg->base; - mem_end = reg->base + reg->size; - specified_mem_size = reg->size; - first = false; + mem_start = reg_start; + mem_end = reg_end; + specified_mem_size = mem_end - mem_start; } else { /* * memblock auto merges contiguous blocks, remove @@ -283,8 +282,8 @@ void __init pmsav7_adjust_lowmem_bounds(void) * blocks separately while iterating) */ pr_notice("Ignoring RAM after %pa, memory at %pa ignored\n", - &mem_end, ®->base); - memblock_remove(reg->base, 0 - reg->base); + &mem_end, ®_start); + memblock_remove(reg_start, 0 - reg_start); break; } } diff --git a/arch/arm/mm/pmsa-v8.c b/arch/arm/mm/pmsa-v8.c index 0d7d5fb59247..2de019f7503e 100644 --- a/arch/arm/mm/pmsa-v8.c +++ b/arch/arm/mm/pmsa-v8.c @@ -94,20 +94,19 @@ static __init bool is_region_fixed(int number) void __init pmsav8_adjust_lowmem_bounds(void) { phys_addr_t mem_end; - struct memblock_region *reg; - bool first = true; + phys_addr_t reg_start, reg_end; + u64 i; - 
for_each_memblock(memory, reg) { - if (first) { + for_each_mem_range(i, ®_start, ®_end) { + if (i == 0) { phys_addr_t phys_offset = PHYS_OFFSET; /* * Initially only use memory continuous from * PHYS_OFFSET */ - if (reg->base != phys_offset) + if (reg_start != phys_offset) panic("First memory bank must be contiguous from PHYS_OFFSET"); - mem_end = reg->base + reg->size; - first = false; + mem_end = reg_end; } else { /* * memblock auto merges contiguous blocks, remove @@ -115,8 +114,8 @@ void __init pmsav8_adjust_lowmem_bounds(void) * blocks separately while iterating) */ pr_notice("Ignoring RAM after %pa, memory at %pa ignored\n", - &mem_end, ®->base); - memblock_remove(reg->base, 0 - reg->base); + &mem_end, ®_start); + memblock_remove(reg_start, 0 - reg_start); break; } } diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index 396797ffe2b1..d3ef975a0965 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -25,11 +25,12 @@ unsigned long xen_get_swiotlb_free_pages(unsigned int order) { - struct memblock_region *reg; + phys_addr_t base; gfp_t flags = __GFP_NOWARN|__GFP_KSWAPD_RECLAIM; + u64 i; - for_each_memblock(memory, reg) { - if (reg->base < (phys_addr_t)0xffffffff) { + for_each_mem_range(i, &base, NULL) { + if (base < (phys_addr_t)0xffffffff) { if (IS_ENABLED(CONFIG_ZONE_DMA32)) flags |= __GFP_DMA32; else diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 339615417085..e9c2dbb109fb 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1041,19 +1041,6 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK config CC_HAVE_SHADOW_CALL_STACK def_bool $(cc-option, -fsanitize=shadow-call-stack -ffixed-x18) -config SECCOMP - bool "Enable seccomp to safely compute untrusted bytecode" - help - This kernel feature is useful for number crunching applications - that may need to compute untrusted bytecode during their - execution. By using pipes or other transports made available to - the process as file descriptors supporting the read/write - syscalls, it's possible to isolate those applications in - their own address space using seccomp. Once seccomp is - enabled via prctl(PR_SET_SECCOMP), it cannot be disabled - and the task is only allowed to execute a few safe syscalls - defined by each seccomp mode. 
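The conversion repeated throughout these hunks is mechanical: the region-pointer walk over memblock.memory becomes an index-based walk over half-open [start, end) physical ranges, for_each_mem_range() already skips MEMBLOCK_NOMAP regions, and any resource-style inclusive end has to subtract one. A schematic before/after, with do_something() standing in for the per-range body::

  #include <linux/memblock.h>

  /* Before: walk struct memblock_region and derive the bounds by hand. */
  static void __init walk_memory_old(void)
  {
          struct memblock_region *reg;

          for_each_memblock(memory, reg) {
                  phys_addr_t start = reg->base;
                  phys_addr_t end = reg->base + reg->size;

                  if (memblock_is_nomap(reg))
                          continue;
                  do_something(start, end - 1);   /* inclusive end */
          }
  }

  /* After: for_each_mem_range() hands back [start, end) directly and
   * skips MEMBLOCK_NOMAP regions itself. */
  static void __init walk_memory_new(void)
  {
          phys_addr_t start, end;
          u64 i;

          for_each_mem_range(i, &start, &end)
                  do_something(start, end - 1);   /* inclusive end */
  }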
- config PARAVIRT bool "Enable paravirtualization code" help @@ -1612,8 +1599,6 @@ config ARM64_BTI_KERNEL depends on CC_HAS_BRANCH_PROT_PAC_RET_BTI # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94697 depends on !CC_IS_GCC || GCC_VERSION >= 100100 - # https://reviews.llvm.org/rGb8ae3fdfa579dbf366b1bb1cbfdbf8c51db7fa55 - depends on !CC_IS_CLANG || CLANG_VERSION >= 100001 depends on !(CC_IS_CLANG && GCOV_KERNEL) depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_REGS) help diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c index 361a1143e09e..5b0e67b93cdc 100644 --- a/arch/arm64/kernel/machine_kexec_file.c +++ b/arch/arm64/kernel/machine_kexec_file.c @@ -215,8 +215,7 @@ static int prepare_elf_headers(void **addr, unsigned long *sz) phys_addr_t start, end; nr_ranges = 1; /* for exclusion of crashkernel region */ - for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, - MEMBLOCK_NONE, &start, &end, NULL) + for_each_mem_range(i, &start, &end) nr_ranges++; cmem = kmalloc(struct_size(cmem, ranges, nr_ranges), GFP_KERNEL); @@ -225,8 +224,7 @@ static int prepare_elf_headers(void **addr, unsigned long *sz) cmem->max_nr_ranges = nr_ranges; cmem->nr_ranges = 0; - for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, - MEMBLOCK_NONE, &start, &end, NULL) { + for_each_mem_range(i, &start, &end) { cmem->ranges[cmem->nr_ranges].start = start; cmem->ranges[cmem->nr_ranges].end = end - 1; cmem->nr_ranges++; diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 53acbeca4f57..133257ffd859 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -217,7 +217,7 @@ static void __init request_standard_resources(void) if (!standard_resources) panic("%s: Failed to allocate %zu bytes\n", __func__, res_size); - for_each_memblock(memory, region) { + for_each_mem_region(region) { res = &standard_resources[i++]; if (memblock_is_nomap(region)) { res->name = "reserved"; @@ -257,7 +257,7 @@ static int __init reserve_memblock_reserved_regions(void) if (!memblock_is_region_reserved(mem->start, mem_size)) continue; - for_each_reserved_mem_region(j, &r_start, &r_end) { + for_each_reserved_mem_range(j, &r_start, &r_end) { resource_size_t start, end; start = max(PFN_PHYS(PFN_DOWN(r_start)), mem->start); diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index 45d5cfe46429..04021a93171c 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -43,13 +43,6 @@ ifneq ($(c-gettimeofday-y),) CFLAGS_vgettimeofday.o += -include $(c-gettimeofday-y) endif -# Clang versions less than 8 do not support -mcmodel=tiny -ifeq ($(CONFIG_CC_IS_CLANG), y) - ifeq ($(shell test $(CONFIG_CLANG_VERSION) -lt 80000; echo $$?),0) - CFLAGS_REMOVE_vgettimeofday.o += -mcmodel=tiny - endif -endif - # Disable gcov profiling for VDSO code GCOV_PROFILE := n diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 481d22c32a2e..f0bf86d81622 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -471,12 +471,10 @@ static inline void free_memmap(unsigned long start_pfn, unsigned long end_pfn) */ static void __init free_unused_memmap(void) { - unsigned long start, prev_end = 0; - struct memblock_region *reg; - - for_each_memblock(memory, reg) { - start = __phys_to_pfn(reg->base); + unsigned long start, end, prev_end = 0; + int i; + for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) { #ifdef CONFIG_SPARSEMEM /* * Take care not to free memmap entries that don't exist due @@ -496,8 +494,7 @@ 
static void __init free_unused_memmap(void) * memmap entries are valid from the bank end aligned to * MAX_ORDER_NR_PAGES. */ - prev_end = ALIGN(__phys_to_pfn(reg->base + reg->size), - MAX_ORDER_NR_PAGES); + prev_end = ALIGN(end, MAX_ORDER_NR_PAGES); } #ifdef CONFIG_SPARSEMEM diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 7291b26ce788..b24e43d20667 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -212,8 +212,8 @@ void __init kasan_init(void) { u64 kimg_shadow_start, kimg_shadow_end; u64 mod_shadow_start, mod_shadow_end; - struct memblock_region *reg; - int i; + phys_addr_t pa_start, pa_end; + u64 i; kimg_shadow_start = (u64)kasan_mem_to_shadow(_text) & PAGE_MASK; kimg_shadow_end = PAGE_ALIGN((u64)kasan_mem_to_shadow(_end)); @@ -246,9 +246,9 @@ void __init kasan_init(void) kasan_populate_early_shadow((void *)mod_shadow_end, (void *)kimg_shadow_start); - for_each_memblock(memory, reg) { - void *start = (void *)__phys_to_virt(reg->base); - void *end = (void *)__phys_to_virt(reg->base + reg->size); + for_each_mem_range(i, &pa_start, &pa_end) { + void *start = (void *)__phys_to_virt(pa_start); + void *end = (void *)__phys_to_virt(pa_end); if (start >= end) break; diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 087a844b4d26..beff3ad8c7f8 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -473,8 +473,9 @@ static void __init map_mem(pgd_t *pgdp) { phys_addr_t kernel_start = __pa_symbol(_text); phys_addr_t kernel_end = __pa_symbol(__init_begin); - struct memblock_region *reg; + phys_addr_t start, end; int flags = 0; + u64 i; if (rodata_full || debug_pagealloc_enabled()) flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; @@ -493,15 +494,9 @@ static void __init map_mem(pgd_t *pgdp) #endif /* map all the memory banks */ - for_each_memblock(memory, reg) { - phys_addr_t start = reg->base; - phys_addr_t end = start + reg->size; - + for_each_mem_range(i, &start, &end) { if (start >= end) break; - if (memblock_is_nomap(reg)) - continue; - /* * The linear map must allow allocation tags reading/writing * if MTE is present. Otherwise, it has the same attributes as diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c index 676deb220b99..a8303bc6b62a 100644 --- a/arch/arm64/mm/numa.c +++ b/arch/arm64/mm/numa.c @@ -354,7 +354,7 @@ static int __init numa_register_nodes(void) struct memblock_region *mblk; /* Check that valid nid is set to memblks */ - for_each_memblock(memory, mblk) { + for_each_mem_region(mblk) { int mblk_nid = memblock_get_region_node(mblk); if (mblk_nid == NUMA_NO_NODE || mblk_nid >= MAX_NUMNODES) { @@ -427,19 +427,16 @@ out_free_distance: */ static int __init dummy_numa_init(void) { + phys_addr_t start = memblock_start_of_DRAM(); + phys_addr_t end = memblock_end_of_DRAM(); int ret; - struct memblock_region *mblk; if (numa_off) pr_info("NUMA disabled\n"); /* Forced off on command line. 
*/ - pr_info("Faking a node at [mem %#018Lx-%#018Lx]\n", - memblock_start_of_DRAM(), memblock_end_of_DRAM() - 1); - - for_each_memblock(memory, mblk) { - ret = numa_add_memblk(0, mblk->base, mblk->base + mblk->size); - if (!ret) - continue; + pr_info("Faking a node at [mem %#018Lx-%#018Lx]\n", start, end - 1); + ret = numa_add_memblk(0, start, end); + if (ret) { pr_err("NUMA init failed\n"); return ret; } diff --git a/arch/c6x/kernel/setup.c b/arch/c6x/kernel/setup.c index 8ef35131f999..9254c3b794a5 100644 --- a/arch/c6x/kernel/setup.c +++ b/arch/c6x/kernel/setup.c @@ -287,7 +287,8 @@ notrace void __init machine_init(unsigned long dt_ptr) void __init setup_arch(char **cmdline_p) { - struct memblock_region *reg; + phys_addr_t start, end; + u64 i; printk(KERN_INFO "Initializing kernel\n"); @@ -351,9 +352,9 @@ void __init setup_arch(char **cmdline_p) disable_caching(ram_start, ram_end - 1); /* Set caching of external RAM used by Linux */ - for_each_memblock(memory, reg) - enable_caching(CACHE_REGION_START(reg->base), - CACHE_REGION_START(reg->base + reg->size - 1)); + for_each_mem_range(i, &start, &end) + enable_caching(CACHE_REGION_START(start), + CACHE_REGION_START(end - 1)); #ifdef CONFIG_BLK_DEV_INITRD /* diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 3d5afb5f5685..7f424c85772c 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -309,16 +309,3 @@ endmenu source "arch/csky/Kconfig.platforms" source "kernel/Kconfig.hz" - -config SECCOMP - bool "Enable seccomp to safely compute untrusted bytecode" - help - This kernel feature is useful for number crunching applications - that may need to compute untrusted bytecode during their - execution. By using pipes or other transports made available to - the process as file descriptors supporting the read/write - syscalls, it's possible to isolate those applications in - their own address space using seccomp. Once seccomp is - enabled via prctl(PR_SET_SECCOMP), it cannot be disabled - and the task is only allowed to execute a few safe syscalls - defined by each seccomp mode. diff --git a/arch/h8300/kernel/setup.c b/arch/h8300/kernel/setup.c index 28ac88358a89..0281f92eea3d 100644 --- a/arch/h8300/kernel/setup.c +++ b/arch/h8300/kernel/setup.c @@ -74,17 +74,15 @@ static void __init bootmem_init(void) memory_end = memory_start = 0; /* Find main memory where is the kernel */ - for_each_memblock(memory, region) { - memory_start = region->base; - memory_end = region->base + region->size; - } + memory_start = memblock_start_of_DRAM(); + memory_end = memblock_end_of_DRAM(); if (!memory_end) panic("No memory!"); /* setup bootmem globals (we use no_bootmem, but mm still depends on this) */ min_low_pfn = PFN_UP(memory_start); - max_low_pfn = PFN_DOWN(memblock_end_of_DRAM()); + max_low_pfn = PFN_DOWN(memory_end); max_pfn = max_low_pfn; memblock_reserve(__pa(_stext), _end - _stext); diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index d262ac0c8714..37bd6a5f38fb 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -26,6 +26,7 @@ config MICROBLAZE select GENERIC_SCHED_CLOCK select HAVE_ARCH_HASH select HAVE_ARCH_KGDB + select HAVE_ARCH_SECCOMP select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE @@ -120,23 +121,6 @@ config CMDLINE_FORCE Set this to have arguments from the default kernel command string override those passed by the boot loader. 
-config SECCOMP - bool "Enable seccomp to safely compute untrusted bytecode" - depends on PROC_FS - default y - help - This kernel feature is useful for number crunching applications - that may need to compute untrusted bytecode during their - execution. By using pipes or other transports made available to - the process as file descriptors supporting the read/write - syscalls, it's possible to isolate those applications in - their own address space using seccomp. Once seccomp is - enabled via /proc//seccomp, it cannot be disabled - and the task is only allowed to execute a few safe syscalls - defined by each seccomp mode. - - If unsure, say Y. Only embedded should say N here. - endmenu menu "Kernel features" diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 3344d4a1fe89..0902c459c385 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -108,15 +108,15 @@ static void __init paging_init(void) void __init setup_memory(void) { - struct memblock_region *reg; - #ifndef CONFIG_MMU u32 kernel_align_start, kernel_align_size; + phys_addr_t start, end; + u64 i; /* Find main memory where is the kernel */ - for_each_memblock(memory, reg) { - memory_start = (u32)reg->base; - lowmem_size = reg->size; + for_each_mem_range(i, &start, &end) { + memory_start = start; + lowmem_size = end - start; if ((memory_start <= (u32)_text) && ((u32)_text <= (memory_start + lowmem_size - 1))) { memory_size = lowmem_size; @@ -164,17 +164,6 @@ void __init setup_memory(void) pr_info("%s: max_low_pfn: %#lx\n", __func__, max_low_pfn); pr_info("%s: max_pfn: %#lx\n", __func__, max_pfn); - /* Add active regions with valid PFNs */ - for_each_memblock(memory, reg) { - unsigned long start_pfn, end_pfn; - - start_pfn = memblock_region_memory_base_pfn(reg); - end_pfn = memblock_region_memory_end_pfn(reg); - memblock_set_node(start_pfn << PAGE_SHIFT, - (end_pfn - start_pfn) << PAGE_SHIFT, - &memblock.memory, 0); - } - paging_init(); } diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index cff19225da3d..440614dc9de2 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -3006,23 +3006,6 @@ config PHYSICAL_START specified in the "crashkernel=YM@XM" command line boot parameter passed to the panic-ed kernel). -config SECCOMP - bool "Enable seccomp to safely compute untrusted bytecode" - depends on PROC_FS - default y - help - This kernel feature is useful for number crunching applications - that may need to compute untrusted bytecode during their - execution. By using pipes or other transports made available to - the process as file descriptors supporting the read/write - syscalls, it's possible to isolate those applications in - their own address space using seccomp. Once seccomp is - enabled via /proc//seccomp, it cannot be disabled - and the task is only allowed to execute a few safe syscalls - defined by each seccomp mode. - - If unsure, say Y. Only embedded should say N here. 
- config MIPS_O32_FP64_SUPPORT bool "Support for O32 binaries using 64-bit FP" if !CPU_MIPSR6 depends on 32BIT || MIPS32_O32 diff --git a/arch/mips/cavium-octeon/dma-octeon.c b/arch/mips/cavium-octeon/dma-octeon.c index 14ea680d180e..ad1aecc4b401 100644 --- a/arch/mips/cavium-octeon/dma-octeon.c +++ b/arch/mips/cavium-octeon/dma-octeon.c @@ -190,25 +190,25 @@ char *octeon_swiotlb; void __init plat_swiotlb_setup(void) { - struct memblock_region *mem; + phys_addr_t start, end; phys_addr_t max_addr; phys_addr_t addr_size; size_t swiotlbsize; unsigned long swiotlb_nslabs; + u64 i; max_addr = 0; addr_size = 0; - for_each_memblock(memory, mem) { + for_each_mem_range(i, &start, &end) { /* These addresses map low for PCI. */ - if (mem->base > 0x410000000ull && !OCTEON_IS_OCTEON2()) + if (start > 0x410000000ull && !OCTEON_IS_OCTEON2()) continue; - addr_size += mem->size; - - if (max_addr < mem->base + mem->size) - max_addr = mem->base + mem->size; + addr_size += (end - start); + if (max_addr < end) + max_addr = end; } swiotlbsize = PAGE_SIZE; diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index bf5f5acab0a8..335bd188b8b4 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -300,8 +300,9 @@ static void __init bootmem_init(void) static void __init bootmem_init(void) { - struct memblock_region *mem; phys_addr_t ramstart, ramend; + phys_addr_t start, end; + u64 i; ramstart = memblock_start_of_DRAM(); ramend = memblock_end_of_DRAM(); @@ -338,18 +339,13 @@ static void __init bootmem_init(void) min_low_pfn = ARCH_PFN_OFFSET; max_pfn = PFN_DOWN(ramend); - for_each_memblock(memory, mem) { - unsigned long start = memblock_region_memory_base_pfn(mem); - unsigned long end = memblock_region_memory_end_pfn(mem); - + for_each_mem_range(i, &start, &end) { /* * Skip highmem here so we get an accurate max_low_pfn if low * memory stops short of high memory. * If the region overlaps HIGHMEM_START, end is clipped so * max_pfn excludes the highmem portion. 
*/ - if (memblock_is_nomap(mem)) - continue; if (start >= PFN_DOWN(HIGHMEM_START)) continue; if (end > PFN_DOWN(HIGHMEM_START)) @@ -450,13 +446,12 @@ early_param("memmap", early_parse_memmap); unsigned long setup_elfcorehdr, setup_elfcorehdr_size; static int __init early_parse_elfcorehdr(char *p) { - struct memblock_region *mem; + phys_addr_t start, end; + u64 i; setup_elfcorehdr = memparse(p, &p); - for_each_memblock(memory, mem) { - unsigned long start = mem->base; - unsigned long end = start + mem->size; + for_each_mem_range(i, &start, &end) { if (setup_elfcorehdr >= start && setup_elfcorehdr < end) { /* * Reserve from the elf core header to the end of @@ -720,7 +715,8 @@ static void __init arch_mem_init(char **cmdline_p) static void __init resource_init(void) { - struct memblock_region *region; + phys_addr_t start, end; + u64 i; if (UNCAC_BASE != IO_BASE) return; @@ -732,9 +728,7 @@ static void __init resource_init(void) bss_resource.start = __pa_symbol(&__bss_start); bss_resource.end = __pa_symbol(&__bss_stop) - 1; - for_each_memblock(memory, region) { - phys_addr_t start = PFN_PHYS(memblock_region_memory_base_pfn(region)); - phys_addr_t end = PFN_PHYS(memblock_region_memory_end_pfn(region)) - 1; + for_each_mem_range(i, &start, &end) { struct resource *res; res = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES); @@ -743,7 +737,12 @@ static void __init resource_init(void) sizeof(struct resource)); res->start = start; - res->end = end; + /* + * In memblock, end points to the first byte after the + * range while in resourses, end points to the last byte in + * the range. + */ + res->end = end - 1; res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; res->name = "System RAM"; diff --git a/arch/mips/netlogic/xlp/setup.c b/arch/mips/netlogic/xlp/setup.c index 1a0fc5b62ba4..6e3102bcd2f1 100644 --- a/arch/mips/netlogic/xlp/setup.c +++ b/arch/mips/netlogic/xlp/setup.c @@ -70,7 +70,7 @@ static void nlm_fixup_mem(void) const int pref_backup = 512; struct memblock_region *mem; - for_each_memblock(memory, mem) { + for_each_mem_region(mem) { memblock_remove(mem->base + mem->size - pref_backup, pref_backup); } diff --git a/arch/nds32/kernel/setup.c b/arch/nds32/kernel/setup.c index a066efbe53c0..c356e484dcab 100644 --- a/arch/nds32/kernel/setup.c +++ b/arch/nds32/kernel/setup.c @@ -249,12 +249,8 @@ static void __init setup_memory(void) memory_end = memory_start = 0; /* Find main memory where is the kernel */ - for_each_memblock(memory, region) { - memory_start = region->base; - memory_end = region->base + region->size; - pr_info("%s: Memory: 0x%x-0x%x\n", __func__, - memory_start, memory_end); - } + memory_start = memblock_start_of_DRAM(); + memory_end = memblock_end_of_DRAM(); if (!memory_end) { panic("No memory!"); diff --git a/arch/openrisc/kernel/setup.c b/arch/openrisc/kernel/setup.c index 13c87f1f872b..2416a9f91533 100644 --- a/arch/openrisc/kernel/setup.c +++ b/arch/openrisc/kernel/setup.c @@ -48,17 +48,12 @@ static void __init setup_memory(void) unsigned long ram_start_pfn; unsigned long ram_end_pfn; phys_addr_t memory_start, memory_end; - struct memblock_region *region; memory_end = memory_start = 0; /* Find main memory where is the kernel, we assume its the only one */ - for_each_memblock(memory, region) { - memory_start = region->base; - memory_end = region->base + region->size; - printk(KERN_INFO "%s: Memory: 0x%x-0x%x\n", __func__, - memory_start, memory_end); - } + memory_start = memblock_start_of_DRAM(); + memory_end = memblock_end_of_DRAM(); if (!memory_end) { panic("No 
memory!"); diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index 3d7c79c7745d..8348feaaf46e 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -64,6 +64,7 @@ extern const char _s_kernel_ro[], _e_kernel_ro[]; */ static void __init map_ram(void) { + phys_addr_t start, end; unsigned long v, p, e; pgprot_t prot; pgd_t *pge; @@ -71,6 +72,7 @@ static void __init map_ram(void) pud_t *pue; pmd_t *pme; pte_t *pte; + u64 i; /* These mark extents of read-only kernel pages... * ...from vmlinux.lds.S */ @@ -78,9 +80,9 @@ static void __init map_ram(void) v = PAGE_OFFSET; - for_each_memblock(memory, region) { - p = (u32) region->base & PAGE_MASK; - e = p + (u32) region->size; + for_each_mem_range(i, &start, &end) { + p = (u32) start & PAGE_MASK; + e = (u32) end; v = (u32) __va(p); pge = pgd_offset_k(v); diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 3b0f53dd70bc..cd4afe1e7a6c 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -378,19 +378,3 @@ endmenu source "drivers/parisc/Kconfig" - -config SECCOMP - def_bool y - prompt "Enable seccomp to safely compute untrusted bytecode" - help - This kernel feature is useful for number crunching applications - that may need to compute untrusted bytecode during their - execution. By using pipes or other transports made available to - the process as file descriptors supporting the read/write - syscalls, it's possible to isolate those applications in - their own address space using seccomp. Once seccomp is - enabled via prctl(PR_SET_SECCOMP), it cannot be disabled - and the task is only allowed to execute a few safe syscalls - defined by each seccomp mode. - - If unsure, say Y. Only embedded should say N here. diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 592036103493..1f0bd7e223f5 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -946,23 +946,6 @@ config ARCH_WANTS_FREEZER_CONTROL source "kernel/power/Kconfig" -config SECCOMP - bool "Enable seccomp to safely compute untrusted bytecode" - depends on PROC_FS - default y - help - This kernel feature is useful for number crunching applications - that may need to compute untrusted bytecode during their - execution. By using pipes or other transports made available to - the process as file descriptors supporting the read/write - syscalls, it's possible to isolate those applications in - their own address space using seccomp. Once seccomp is - enabled via /proc//seccomp, it cannot be disabled - and the task is only allowed to execute a few safe syscalls - defined by each seccomp mode. - - If unsure, say Y. Only embedded should say N here. 
- config PPC_MEM_KEYS prompt "PowerPC Memory Protection Keys" def_bool y diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 10ebb4bf71ad..5cdf4168a61a 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -191,13 +191,13 @@ int is_fadump_active(void) */ static bool is_fadump_mem_area_contiguous(u64 d_start, u64 d_end) { - struct memblock_region *reg; + phys_addr_t reg_start, reg_end; bool ret = false; - u64 start, end; + u64 i, start, end; - for_each_memblock(memory, reg) { - start = max_t(u64, d_start, reg->base); - end = min_t(u64, d_end, (reg->base + reg->size)); + for_each_mem_range(i, ®_start, ®_end) { + start = max_t(u64, d_start, reg_start); + end = min_t(u64, d_end, reg_end); if (d_start < end) { /* Memory hole from d_start to start */ if (start > d_start) @@ -422,34 +422,34 @@ static int __init add_boot_mem_regions(unsigned long mstart, static int __init fadump_get_boot_mem_regions(void) { - unsigned long base, size, cur_size, hole_size, last_end; + unsigned long size, cur_size, hole_size, last_end; unsigned long mem_size = fw_dump.boot_memory_size; - struct memblock_region *reg; + phys_addr_t reg_start, reg_end; int ret = 1; + u64 i; fw_dump.boot_mem_regs_cnt = 0; last_end = 0; hole_size = 0; cur_size = 0; - for_each_memblock(memory, reg) { - base = reg->base; - size = reg->size; - hole_size += (base - last_end); + for_each_mem_range(i, ®_start, ®_end) { + size = reg_end - reg_start; + hole_size += (reg_start - last_end); if ((cur_size + size) >= mem_size) { size = (mem_size - cur_size); - ret = add_boot_mem_regions(base, size); + ret = add_boot_mem_regions(reg_start, size); break; } mem_size -= size; cur_size += size; - ret = add_boot_mem_regions(base, size); + ret = add_boot_mem_regions(reg_start, size); if (!ret) break; - last_end = base + size; + last_end = reg_end; } fw_dump.boot_mem_top = PAGE_ALIGN(fw_dump.boot_memory_size + hole_size); @@ -985,9 +985,8 @@ static int fadump_init_elfcore_header(char *bufp) */ static int fadump_setup_crash_memory_ranges(void) { - struct memblock_region *reg; - u64 start, end; - int i, ret; + u64 i, start, end; + int ret; pr_debug("Setup crash memory ranges.\n"); crash_mrange_info.mem_range_cnt = 0; @@ -1005,10 +1004,7 @@ static int fadump_setup_crash_memory_ranges(void) return ret; } - for_each_memblock(memory, reg) { - start = (u64)reg->base; - end = start + (u64)reg->size; - + for_each_mem_range(i, &start, &end) { /* * skip the memory chunk that is already added * (0 through boot_memory_top). 
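
The fadump conversion above relies on the half-open convention: a region's size is end - start, and the hole before it is start minus the previous region's end. A small sketch under the same convention; the helper and its out-parameters are illustrative, not from the patch.

#include <linux/init.h>
#include <linux/memblock.h>

/* Sum usable RAM and the holes between regions, as the fadump code does. */
static void __init example_account_ram(phys_addr_t *ram, phys_addr_t *holes)
{
        phys_addr_t start, end, last_end = 0;
        u64 i;

        *ram = 0;
        *holes = 0;

        for_each_mem_range(i, &start, &end) {
                *ram += end - start;            /* half-open: no +1 needed */
                *holes += start - last_end;     /* regions are sorted and disjoint */
                last_end = end;
        }
}
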
@@ -1242,14 +1238,17 @@ static void fadump_free_reserved_memory(unsigned long start_pfn, */ static void fadump_release_reserved_area(u64 start, u64 end) { + unsigned long reg_spfn, reg_epfn; u64 tstart, tend, spfn, epfn; - struct memblock_region *reg; + int i; spfn = PHYS_PFN(start); epfn = PHYS_PFN(end); - for_each_memblock(memory, reg) { - tstart = max_t(u64, spfn, memblock_region_memory_base_pfn(reg)); - tend = min_t(u64, epfn, memblock_region_memory_end_pfn(reg)); + + for_each_mem_pfn_range(i, MAX_NUMNODES, ®_spfn, ®_epfn, NULL) { + tstart = max_t(u64, spfn, reg_spfn); + tend = min_t(u64, epfn, reg_epfn); + if (tstart < tend) { fadump_free_reserved_memory(tstart, tend); @@ -1684,12 +1683,10 @@ int __init fadump_reserve_mem(void) /* Preserve everything above the base address */ static void __init fadump_reserve_crash_area(u64 base) { - struct memblock_region *reg; - u64 mstart, msize; + u64 i, mstart, mend, msize; - for_each_memblock(memory, reg) { - mstart = reg->base; - msize = reg->size; + for_each_mem_range(i, &mstart, &mend) { + msize = mend - mstart; if ((mstart + msize) < base) continue; diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c index 53bb71e3a2e1..c69bcf9b547a 100644 --- a/arch/powerpc/kexec/file_load_64.c +++ b/arch/powerpc/kexec/file_load_64.c @@ -138,15 +138,13 @@ out: */ static int get_crash_memory_ranges(struct crash_mem **mem_ranges) { - struct memblock_region *reg; + phys_addr_t base, end; struct crash_mem *tmem; + u64 i; int ret; - for_each_memblock(memory, reg) { - u64 base, size; - - base = (u64)reg->base; - size = (u64)reg->size; + for_each_mem_range(i, &base, &end) { + u64 size = end - base; /* Skip backup memory region, which needs a separate entry */ if (base == BACKUP_SRC_START) { @@ -250,8 +248,7 @@ static int __locate_mem_hole_top_down(struct kexec_buf *kbuf, phys_addr_t start, end; u64 i; - for_each_mem_range_rev(i, &memblock.memory, NULL, NUMA_NO_NODE, - MEMBLOCK_NONE, &start, &end, NULL) { + for_each_mem_range_rev(i, &start, &end) { /* * memblock uses [start, end) convention while it is * [start, end] here. Fix the off-by-one to have the @@ -350,8 +347,7 @@ static int __locate_mem_hole_bottom_up(struct kexec_buf *kbuf, phys_addr_t start, end; u64 i; - for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, - MEMBLOCK_NONE, &start, &end, NULL) { + for_each_mem_range(i, &start, &end) { /* * memblock uses [start, end) convention while it is * [start, end] here. Fix the off-by-one to have the diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 073617ce83e0..8f58dd20b362 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -95,23 +95,15 @@ EXPORT_SYMBOL_GPL(kvm_free_hpt_cma); void __init kvm_cma_reserve(void) { unsigned long align_size; - struct memblock_region *reg; - phys_addr_t selected_size = 0; + phys_addr_t selected_size; /* * We need CMA reservation only when we are in HV mode */ if (!cpu_has_feature(CPU_FTR_HVMODE)) return; - /* - * We cannot use memblock_phys_mem_size() here, because - * memblock_analyze() has not been called yet. 
- */ - for_each_memblock(memory, reg) - selected_size += memblock_region_memory_end_pfn(reg) - - memblock_region_memory_base_pfn(reg); - selected_size = (selected_size * kvm_cma_resv_ratio / 100) << PAGE_SHIFT; + selected_size = PAGE_ALIGN(memblock_phys_mem_size() * kvm_cma_resv_ratio / 100); if (selected_size) { pr_info("%s: reserving %ld MiB for global area\n", __func__, (unsigned long)selected_size / SZ_1M); diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 7705d5557239..84e5a2dc8be5 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -687,9 +687,9 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm) struct kvmppc_uvmem_page_pvt *pvt; unsigned long pfn_last, pfn_first; - pfn_first = kvmppc_uvmem_pgmap.res.start >> PAGE_SHIFT; + pfn_first = kvmppc_uvmem_pgmap.range.start >> PAGE_SHIFT; pfn_last = pfn_first + - (resource_size(&kvmppc_uvmem_pgmap.res) >> PAGE_SHIFT); + (range_len(&kvmppc_uvmem_pgmap.range) >> PAGE_SHIFT); spin_lock(&kvmppc_uvmem_bitmap_lock); bit = find_first_zero_bit(kvmppc_uvmem_bitmap, @@ -1007,7 +1007,7 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf) static void kvmppc_uvmem_page_free(struct page *page) { unsigned long pfn = page_to_pfn(page) - - (kvmppc_uvmem_pgmap.res.start >> PAGE_SHIFT); + (kvmppc_uvmem_pgmap.range.start >> PAGE_SHIFT); struct kvmppc_uvmem_page_pvt *pvt; spin_lock(&kvmppc_uvmem_bitmap_lock); @@ -1170,7 +1170,9 @@ int kvmppc_uvmem_init(void) } kvmppc_uvmem_pgmap.type = MEMORY_DEVICE_PRIVATE; - kvmppc_uvmem_pgmap.res = *res; + kvmppc_uvmem_pgmap.range.start = res->start; + kvmppc_uvmem_pgmap.range.end = res->end; + kvmppc_uvmem_pgmap.nr_range = 1; kvmppc_uvmem_pgmap.ops = &kvmppc_uvmem_ops; /* just one global instance: */ kvmppc_uvmem_pgmap.owner = &kvmppc_uvmem_pgmap; @@ -1205,7 +1207,7 @@ void kvmppc_uvmem_free(void) return; memunmap_pages(&kvmppc_uvmem_pgmap); - release_mem_region(kvmppc_uvmem_pgmap.res.start, - resource_size(&kvmppc_uvmem_pgmap.res)); + release_mem_region(kvmppc_uvmem_pgmap.range.start, + range_len(&kvmppc_uvmem_pgmap.range)); kfree(kvmppc_uvmem_bitmap); } diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index c663e7ba801f..b830adee51f5 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -7,7 +7,7 @@ * * SMP scalability work: * Copyright (C) 2001 Anton Blanchard , IBM - * + * * Module name: htab.c * * Description: @@ -867,8 +867,8 @@ static void __init htab_initialize(void) unsigned long table; unsigned long pteg_count; unsigned long prot; - unsigned long base = 0, size = 0; - struct memblock_region *reg; + phys_addr_t base = 0, size = 0, end; + u64 i; DBG(" -> htab_initialize()\n"); @@ -884,7 +884,7 @@ static void __init htab_initialize(void) /* * Calculate the required size of the htab. We want the number of * PTEGs to equal one half the number of real pages. 
- */ + */ htab_size_bytes = htab_get_table_size(); pteg_count = htab_size_bytes >> 7; @@ -894,7 +894,7 @@ static void __init htab_initialize(void) firmware_has_feature(FW_FEATURE_PS3_LV1)) { /* Using a hypervisor which owns the htab */ htab_address = NULL; - _SDR1 = 0; + _SDR1 = 0; #ifdef CONFIG_FA_DUMP /* * If firmware assisted dump is active firmware preserves @@ -960,9 +960,9 @@ static void __init htab_initialize(void) #endif /* CONFIG_DEBUG_PAGEALLOC */ /* create bolted the linear mapping in the hash table */ - for_each_memblock(memory, reg) { - base = (unsigned long)__va(reg->base); - size = reg->size; + for_each_mem_range(i, &base, &end) { + size = end - base; + base = (unsigned long)__va(base); DBG("creating mapping for region: %lx..%lx (prot: %lx)\n", base, size, prot); diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index d5f0c10d752a..cc72666e891a 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -329,7 +329,8 @@ static int __meminit create_physical_mapping(unsigned long start, static void __init radix_init_pgtable(void) { unsigned long rts_field; - struct memblock_region *reg; + phys_addr_t start, end; + u64 i; /* We don't support slb for radix */ mmu_slb_size = 0; @@ -337,20 +338,19 @@ static void __init radix_init_pgtable(void) /* * Create the linear mapping */ - for_each_memblock(memory, reg) { + for_each_mem_range(i, &start, &end) { /* * The memblock allocator is up at this point, so the * page tables will be allocated within the range. No * need or a node (which we don't have yet). */ - if ((reg->base + reg->size) >= RADIX_VMALLOC_START) { + if (end >= RADIX_VMALLOC_START) { pr_warn("Outside the supported range\n"); continue; } - WARN_ON(create_physical_mapping(reg->base, - reg->base + reg->size, + WARN_ON(create_physical_mapping(start, end, radix_mem_block_size, -1, PAGE_KERNEL)); } diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c index fb294046e00e..26fda3203320 100644 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -138,11 +138,11 @@ void __init kasan_mmu_init(void) void __init kasan_init(void) { - struct memblock_region *reg; + phys_addr_t base, end; + u64 i; - for_each_memblock(memory, reg) { - phys_addr_t base = reg->base; - phys_addr_t top = min(base + reg->size, total_lowmem); + for_each_mem_range(i, &base, &end) { + phys_addr_t top = min(end, total_lowmem); int ret; if (base >= top) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 42e25874f5a8..5e2e7c0a8f1a 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -184,15 +184,16 @@ void __init initmem_init(void) /* mark pages that don't exist as nosave */ static int __init mark_nonram_nosave(void) { - struct memblock_region *reg, *prev = NULL; + unsigned long spfn, epfn, prev = 0; + int i; - for_each_memblock(memory, reg) { - if (prev && - memblock_region_memory_end_pfn(prev) < memblock_region_memory_base_pfn(reg)) - register_nosave_region(memblock_region_memory_end_pfn(prev), - memblock_region_memory_base_pfn(reg)); - prev = reg; + for_each_mem_pfn_range(i, MAX_NUMNODES, &spfn, &epfn, NULL) { + if (prev && prev < spfn) + register_nosave_region(prev, spfn); + + prev = epfn; } + return 0; } #else /* CONFIG_NEED_MULTIPLE_NODES */ @@ -584,20 +585,24 @@ void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, */ static int __init add_system_ram_resources(void) { - struct memblock_region *reg; 
+ phys_addr_t start, end; + u64 i; - for_each_memblock(memory, reg) { + for_each_mem_range(i, &start, &end) { struct resource *res; - unsigned long base = reg->base; - unsigned long size = reg->size; res = kzalloc(sizeof(struct resource), GFP_KERNEL); WARN_ON(!res); if (res) { res->name = "System RAM"; - res->start = base; - res->end = base + size - 1; + res->start = start; + /* + * In memblock, end points to the first byte after + * the range while in resourses, end points to the + * last byte in the range. + */ + res->end = end - 1; res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; WARN_ON(request_resource(&iomem_resource, res) < 0); } diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 1f61fa2148b5..f4e20d8e6c02 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -804,17 +804,14 @@ static void __init setup_nonnuma(void) unsigned long total_ram = memblock_phys_mem_size(); unsigned long start_pfn, end_pfn; unsigned int nid = 0; - struct memblock_region *reg; + int i; printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", top_of_ram, total_ram); printk(KERN_DEBUG "Memory hole size: %ldMB\n", (top_of_ram - total_ram) >> 20); - for_each_memblock(memory, reg) { - start_pfn = memblock_region_memory_base_pfn(reg); - end_pfn = memblock_region_memory_end_pfn(reg); - + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { fake_numa_create_new_node(end_pfn, &nid); memblock_set_node(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn - start_pfn), diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 6eb4eab79385..079159e97bca 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -123,11 +123,11 @@ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) void __init mapin_ram(void) { - struct memblock_region *reg; + phys_addr_t base, end; + u64 i; - for_each_memblock(memory, reg) { - phys_addr_t base = reg->base; - phys_addr_t top = min(base + reg->size, total_lowmem); + for_each_mem_range(i, &base, &end) { + phys_addr_t top = min(end, total_lowmem); if (base >= top) continue; diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 7766e1289468..b7821ac36d28 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -334,19 +334,6 @@ menu "Kernel features" source "kernel/Kconfig.hz" -config SECCOMP - bool "Enable seccomp to safely compute untrusted bytecode" - help - This kernel feature is useful for number crunching applications - that may need to compute untrusted bytecode during their - execution. By using pipes or other transports made available to - the process as file descriptors supporting the read/write - syscalls, it's possible to isolate those applications in - their own address space using seccomp. Once seccomp is - enabled via prctl(PR_SET_SECCOMP), it cannot be disabled - and the task is only allowed to execute a few safe syscalls - defined by each seccomp mode. 
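
The comments added above call out the off-by-one between memblock, where end is the first byte after a range, and struct resource, where end is the last byte in it. A hedged sketch of registering one such range as "System RAM"; the helper name is invented and error handling is kept to the minimum.

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/ioport.h>
#include <linux/slab.h>

/* Register one memblock range as a resource; note the end - 1 conversion. */
static int __init example_add_ram_resource(phys_addr_t start, phys_addr_t end)
{
        struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);

        if (!res)
                return -ENOMEM;

        res->name  = "System RAM";
        res->start = start;     /* first byte of the range */
        res->end   = end - 1;   /* struct resource uses an inclusive end */
        res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;

        return request_resource(&iomem_resource, res);
}

The same end - 1 conversion appears in the mips, powerpc and s390 resource hunks of this series.
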
- config RISCV_SBI_V01 bool "SBI v0.1 support" default y diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index f750e012dbe5..1dc89303b679 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -145,21 +145,21 @@ static phys_addr_t dtb_early_pa __initdata; void __init setup_bootmem(void) { - struct memblock_region *reg; phys_addr_t mem_size = 0; phys_addr_t total_mem = 0; - phys_addr_t mem_start, end = 0; + phys_addr_t mem_start, start, end = 0; phys_addr_t vmlinux_end = __pa_symbol(&_end); phys_addr_t vmlinux_start = __pa_symbol(&_start); + u64 i; /* Find the memory region containing the kernel */ - for_each_memblock(memory, reg) { - end = reg->base + reg->size; + for_each_mem_range(i, &start, &end) { + phys_addr_t size = end - start; if (!total_mem) - mem_start = reg->base; - if (reg->base <= vmlinux_start && vmlinux_end <= end) - BUG_ON(reg->size == 0); - total_mem = total_mem + reg->size; + mem_start = start; + if (start <= vmlinux_start && vmlinux_end <= end) + BUG_ON(size == 0); + total_mem = total_mem + size; } /* @@ -191,15 +191,6 @@ void __init setup_bootmem(void) early_init_fdt_scan_reserved_mem(); memblock_allow_resize(); memblock_dump_all(); - - for_each_memblock(memory, reg) { - unsigned long start_pfn = memblock_region_memory_base_pfn(reg); - unsigned long end_pfn = memblock_region_memory_end_pfn(reg); - - memblock_set_node(PFN_PHYS(start_pfn), - PFN_PHYS(end_pfn - start_pfn), - &memblock.memory, 0); - } } #ifdef CONFIG_MMU @@ -464,7 +455,7 @@ static void __init setup_vm_final(void) { uintptr_t va, map_size; phys_addr_t pa, start, end; - struct memblock_region *reg; + u64 i; /* Set mmu_enabled flag */ mmu_enabled = true; @@ -475,14 +466,9 @@ static void __init setup_vm_final(void) PGDIR_SIZE, PAGE_TABLE); /* Map all memory banks */ - for_each_memblock(memory, reg) { - start = reg->base; - end = start + reg->size; - + for_each_mem_range(i, &start, &end) { if (start >= end) break; - if (memblock_is_nomap(reg)) - continue; if (start <= __pa(PAGE_OFFSET) && __pa(PAGE_OFFSET) < end) start = __pa(PAGE_OFFSET); @@ -545,7 +531,7 @@ static void __init resource_init(void) { struct memblock_region *region; - for_each_memblock(memory, region) { + for_each_mem_region(region) { struct resource *res; res = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES); diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 87b4ab3d3c77..12ddd1f6bf70 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -85,16 +85,16 @@ static void __init populate(void *start, void *end) void __init kasan_init(void) { - struct memblock_region *reg; - unsigned long i; + phys_addr_t _start, _end; + u64 i; kasan_populate_early_shadow((void *)KASAN_SHADOW_START, (void *)kasan_mem_to_shadow((void *) VMALLOC_END)); - for_each_memblock(memory, reg) { - void *start = (void *)__va(reg->base); - void *end = (void *)__va(reg->base + reg->size); + for_each_mem_range(i, &_start, &_end) { + void *start = (void *)_start; + void *end = (void *)_end; if (start >= end) break; diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 0a3899386a51..d509bf23ef78 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -792,23 +792,6 @@ config CRASH_DUMP endmenu -config SECCOMP - def_bool y - prompt "Enable seccomp to safely compute untrusted bytecode" - depends on PROC_FS - help - This kernel feature is useful for number crunching applications - that may need to compute untrusted bytecode during their - execution. 
By using pipes or other transports made available to - the process as file descriptors supporting the read/write - syscalls, it's possible to isolate those applications in - their own address space using seccomp. Once seccomp is - enabled via /proc//seccomp, it cannot be disabled - and the task is only allowed to execute a few safe syscalls - defined by each seccomp mode. - - If unsure, say Y. - config CCW def_bool y diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index c2c1b4e723ea..d44e522c569b 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -484,8 +484,9 @@ static struct resource __initdata *standard_resources[] = { static void __init setup_resources(void) { struct resource *res, *std_res, *sub_res; - struct memblock_region *reg; + phys_addr_t start, end; int j; + u64 i; code_resource.start = (unsigned long) _text; code_resource.end = (unsigned long) _etext - 1; @@ -494,7 +495,7 @@ static void __init setup_resources(void) bss_resource.start = (unsigned long) __bss_start; bss_resource.end = (unsigned long) __bss_stop - 1; - for_each_memblock(memory, reg) { + for_each_mem_range(i, &start, &end) { res = memblock_alloc(sizeof(*res), 8); if (!res) panic("%s: Failed to allocate %zu bytes align=0x%x\n", @@ -502,8 +503,13 @@ static void __init setup_resources(void) res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; res->name = "System RAM"; - res->start = reg->base; - res->end = reg->base + reg->size - 1; + res->start = start; + /* + * In memblock, end points to the first byte after the + * range while in resourses, end points to the last byte in + * the range. + */ + res->end = end - 1; request_resource(&iomem_resource, res); for (j = 0; j < ARRAY_SIZE(standard_resources); j++) { @@ -776,8 +782,8 @@ static void __init memblock_add_mem_detect_info(void) unsigned long start, end; int i; - memblock_dbg("physmem info source: %s (%hhd)\n", - get_mem_info_source(), mem_detect.info_source); + pr_debug("physmem info source: %s (%hhd)\n", + get_mem_info_source(), mem_detect.info_source); /* keep memblock lists close to the kernel */ memblock_set_bottom_up(true); for_each_mem_detect_block(i, &start, &end) { @@ -819,14 +825,15 @@ static void __init reserve_kernel(void) static void __init setup_memory(void) { - struct memblock_region *reg; + phys_addr_t start, end; + u64 i; /* * Init storage key for present memory */ - for_each_memblock(memory, reg) { - storage_key_init_range(reg->base, reg->base + reg->size); - } + for_each_mem_range(i, &start, &end) + storage_key_init_range(start, end); + psw_set_key(PAGE_DEFAULT_KEY); /* Only cosmetics */ diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index fc141893d028..567c69f3069e 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -183,9 +183,9 @@ static void mark_kernel_pgd(void) void __init cmma_init_nodat(void) { - struct memblock_region *reg; struct page *page; unsigned long start, end, ix; + int i; if (cmma_flag < 2) return; @@ -193,9 +193,7 @@ void __init cmma_init_nodat(void) mark_kernel_pgd(); /* Set all kernel pages not used for page tables to stable/no-dat */ - for_each_memblock(memory, reg) { - start = memblock_region_memory_base_pfn(reg); - end = memblock_region_memory_end_pfn(reg); + for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) { page = pfn_to_page(start); for (ix = start; ix < end; ix++, page++) { if (__test_and_clear_bit(PG_arch_1, &page->flags)) diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index eddf71c22875..b239f2ba93b0 100644 --- 
a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -555,10 +555,11 @@ int vmem_add_mapping(unsigned long start, unsigned long size) */ void __init vmem_map_init(void) { - struct memblock_region *reg; + phys_addr_t base, end; + u64 i; - for_each_memblock(memory, reg) - vmem_add_range(reg->base, reg->size); + for_each_mem_range(i, &base, &end) + vmem_add_range(base, end - base); __set_memory((unsigned long)_stext, (unsigned long)(_etext - _stext) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index d20927128fce..18278152c91c 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -600,22 +600,6 @@ config PHYSICAL_START where the fail safe kernel needs to run at a different address than the panic-ed kernel. -config SECCOMP - bool "Enable seccomp to safely compute untrusted bytecode" - depends on PROC_FS - help - This kernel feature is useful for number crunching applications - that may need to compute untrusted bytecode during their - execution. By using pipes or other transports made available to - the process as file descriptors supporting the read/write - syscalls, it's possible to isolate those applications in - their own address space using seccomp. Once seccomp is - enabled via prctl, it cannot be disabled and the task is only - allowed to execute a few safe syscalls defined by each seccomp - mode. - - If unsure, say N. - config SMP bool "Symmetric multi-processing support" depends on SYS_SUPPORTS_SMP diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 4735176ab811..3348e0c4d769 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -226,15 +226,12 @@ void __init allocate_pgdat(unsigned int nid) static void __init do_init_bootmem(void) { - struct memblock_region *reg; + unsigned long start_pfn, end_pfn; + int i; /* Add active regions with valid PFNs. */ - for_each_memblock(memory, reg) { - unsigned long start_pfn, end_pfn; - start_pfn = memblock_region_memory_base_pfn(reg); - end_pfn = memblock_region_memory_end_pfn(reg); + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) __add_active_range(0, start_pfn, end_pfn); - } /* All of system RAM sits in node 0 for the non-NUMA case */ allocate_pgdat(0); diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 91ed1104b7f4..096530eac8e1 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -23,6 +23,7 @@ config SPARC select HAVE_OPROFILE select HAVE_ARCH_KGDB if !SMP || SPARC64 select HAVE_ARCH_TRACEHOOK + select HAVE_ARCH_SECCOMP if SPARC64 select HAVE_EXIT_THREAD select HAVE_PCI select SYSCTL_EXCEPTION_TRACE @@ -227,23 +228,6 @@ config EARLYFB help Say Y here to enable a faster early framebuffer boot console. -config SECCOMP - bool "Enable seccomp to safely compute untrusted bytecode" - depends on SPARC64 && PROC_FS - default y - help - This kernel feature is useful for number crunching applications - that may need to compute untrusted bytecode during their - execution. By using pipes or other transports made available to - the process as file descriptors supporting the read/write - syscalls, it's possible to isolate those applications in - their own address space using seccomp. Once seccomp is - enabled via /proc//seccomp, it cannot be disabled - and the task is only allowed to execute a few safe syscalls - defined by each seccomp mode. - - If unsure, say Y. Only embedded should say N here. 
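
Several hunks above (s390 page-states, sh bootmem, the earlier powerpc numa change) prefer for_each_mem_pfn_range() when the caller thinks in page frames rather than bytes. A minimal sketch with an invented helper name.

#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/numa.h>

/* Count usable pages by walking PFN ranges rather than byte ranges. */
static unsigned long __init example_count_present_pages(void)
{
        unsigned long start_pfn, end_pfn, pages = 0;
        int i;

        for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL)
                pages += end_pfn - start_pfn;

        return pages;
}
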
- config HOTPLUG_CPU bool "Support for hot-pluggable CPUs" depends on SPARC64 && SMP diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index fad6d3129904..96edf64d4fb3 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -1192,18 +1192,14 @@ int of_node_to_nid(struct device_node *dp) static void __init add_node_ranges(void) { - struct memblock_region *reg; + phys_addr_t start, end; unsigned long prev_max; + u64 i; memblock_resized: prev_max = memblock.memory.max; - for_each_memblock(memory, reg) { - unsigned long size = reg->size; - unsigned long start, end; - - start = reg->base; - end = start + size; + for_each_mem_range(i, &start, &end) { while (start < end) { unsigned long this_end; int nid; @@ -1211,7 +1207,7 @@ memblock_resized: this_end = memblock_nid_range(start, end, &nid); numadbg("Setting memblock NUMA node nid[%d] " - "start[%lx] end[%lx]\n", + "start[%llx] end[%lx]\n", nid, start, this_end); memblock_set_node(start, this_end - start, diff --git a/arch/um/Kconfig b/arch/um/Kconfig index eb51fec75948..d49f471b02e3 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -173,22 +173,6 @@ config PGTABLE_LEVELS default 3 if 3_LEVEL_PGTABLES default 2 -config SECCOMP - def_bool y - prompt "Enable seccomp to safely compute untrusted bytecode" - help - This kernel feature is useful for number crunching applications - that may need to compute untrusted bytecode during their - execution. By using pipes or other transports made available to - the process as file descriptors supporting the read/write - syscalls, it's possible to isolate those applications in - their own address space using seccomp. Once seccomp is - enabled via prctl(PR_SET_SECCOMP), it cannot be disabled - and the task is only allowed to execute a few safe syscalls - defined by each seccomp mode. - - If unsure, say Y. - config UML_TIME_TRAVEL_SUPPORT bool prompt "Support time-travel mode (e.g. for test execution)" diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 835d93006bd6..255084c65138 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1970,22 +1970,6 @@ config EFI_MIXED If unsure, say N. -config SECCOMP - def_bool y - prompt "Enable seccomp to safely compute untrusted bytecode" - help - This kernel feature is useful for number crunching applications - that may need to compute untrusted bytecode during their - execution. By using pipes or other transports made available to - the process as file descriptors supporting the read/write - syscalls, it's possible to isolate those applications in - their own address space using seccomp. Once seccomp is - enabled via prctl(PR_SET_SECCOMP), it cannot be disabled - and the task is only allowed to execute a few safe syscalls - defined by each seccomp mode. - - If unsure, say Y. Only embedded should say N here. - source "kernel/Kconfig.hz" config KEXEC diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index c8862696a47b..7d0394f4ebf9 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -5,15 +5,6 @@ #include "pgtable.h" #include "../string.h" -/* - * __force_order is used by special_insns.h asm code to force instruction - * serialization. - * - * It is not referenced from the code, but GCC < 5 with -fPIE would fail - * due to an undefined symbol. Define it to make these ancient GCCs work. 
- */ -unsigned long __force_order; - #define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */ #define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */ diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index bbfde3d2662f..0aecc0b629e0 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -3,6 +3,7 @@ #define _ASM_X86_NUMA_H #include +#include #include #include @@ -77,7 +78,12 @@ void debug_cpumask_set_cpu(int cpu, int node, bool enable); #ifdef CONFIG_NUMA_EMU #define FAKE_NODE_MIN_SIZE ((u64)32 << 20) #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) -void numa_emu_cmdline(char *); +int numa_emu_cmdline(char *str); +#else /* CONFIG_NUMA_EMU */ +static inline int numa_emu_cmdline(char *str) +{ + return -EINVAL; +} #endif /* CONFIG_NUMA_EMU */ #endif /* _ASM_X86_NUMA_H */ diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 94624fb06fac..cc177b4431ae 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -11,45 +11,47 @@ #include /* - * Volatile isn't enough to prevent the compiler from reordering the - * read/write functions for the control registers and messing everything up. - * A memory clobber would solve the problem, but would prevent reordering of - * all loads stores around it, which can hurt performance. Solution is to - * use a variable and mimic reads and writes to it to enforce serialization + * The compiler should not reorder volatile asm statements with respect to each + * other: they should execute in program order. However GCC 4.9.x and 5.x have + * a bug (which was fixed in 8.1, 7.3 and 6.5) where they might reorder + * volatile asm. The write functions are not affected since they have memory + * clobbers preventing reordering. To prevent reads from being reordered with + * respect to writes, use a dummy memory operand. */ -extern unsigned long __force_order; + +#define __FORCE_ORDER "m"(*(unsigned int *)0x1000UL) void native_write_cr0(unsigned long val); static inline unsigned long native_read_cr0(void) { unsigned long val; - asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order)); + asm volatile("mov %%cr0,%0\n\t" : "=r" (val) : __FORCE_ORDER); return val; } static __always_inline unsigned long native_read_cr2(void) { unsigned long val; - asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order)); + asm volatile("mov %%cr2,%0\n\t" : "=r" (val) : __FORCE_ORDER); return val; } static __always_inline void native_write_cr2(unsigned long val) { - asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order)); + asm volatile("mov %0,%%cr2": : "r" (val) : "memory"); } static inline unsigned long __native_read_cr3(void) { unsigned long val; - asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); + asm volatile("mov %%cr3,%0\n\t" : "=r" (val) : __FORCE_ORDER); return val; } static inline void native_write_cr3(unsigned long val) { - asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order)); + asm volatile("mov %0,%%cr3": : "r" (val) : "memory"); } static inline unsigned long native_read_cr4(void) @@ -64,10 +66,10 @@ static inline unsigned long native_read_cr4(void) asm volatile("1: mov %%cr4, %0\n" "2:\n" _ASM_EXTABLE(1b, 2b) - : "=r" (val), "=m" (__force_order) : "0" (0)); + : "=r" (val) : "0" (0), __FORCE_ORDER); #else /* CR4 always exists on x86_64. 
*/ - asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order)); + asm volatile("mov %%cr4,%0\n\t" : "=r" (val) : __FORCE_ORDER); #endif return val; } diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index aa60c239931b..477c503f2753 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -418,7 +418,7 @@ do { \ "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %[efault],%[errout]\n" \ - " xor"itype" %[output],%[output]\n" \ + " xorl %k[output],%k[output]\n" \ " jmp 2b\n" \ ".previous\n" \ _ASM_EXTABLE_UA(1b, 3b) \ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7824fc62c7cd..c51158914ea2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -360,7 +360,7 @@ void native_write_cr0(unsigned long val) unsigned long bits_missing = 0; set_register: - asm volatile("mov %0,%%cr0": "+r" (val), "+m" (__force_order)); + asm volatile("mov %0,%%cr0": "+r" (val) : : "memory"); if (static_branch_likely(&cr_pinning)) { if (unlikely((val & X86_CR0_WP) != X86_CR0_WP)) { @@ -379,7 +379,7 @@ void native_write_cr4(unsigned long val) unsigned long bits_changed = 0; set_register: - asm volatile("mov %0,%%cr4": "+r" (val), "+m" (cr4_pinned_bits)); + asm volatile("mov %0,%%cr4": "+r" (val) : : "memory"); if (static_branch_likely(&cr_pinning)) { if (unlikely((val & cr4_pinned_mask) != cr4_pinned_bits)) { diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 983cd53ed4c9..22aad412f965 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -305,6 +305,20 @@ static int __init cpcompare(const void *a, const void *b) return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr); } +static bool e820_nomerge(enum e820_type type) +{ + /* + * These types may indicate distinct platform ranges aligned to + * numa node, protection domain, performance domain, or other + * boundaries. Do not merge them. + */ + if (type == E820_TYPE_PRAM) + return true; + if (type == E820_TYPE_SOFT_RESERVED) + return true; + return false; +} + int __init e820__update_table(struct e820_table *table) { struct e820_entry *entries = table->entries; @@ -380,7 +394,7 @@ int __init e820__update_table(struct e820_table *table) } /* Continue building up new map based on this information: */ - if (current_type != last_type || current_type == E820_TYPE_PRAM) { + if (current_type != last_type || e820_nomerge(current_type)) { if (last_type != 0) { new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr; /* Move forward only if the new size was non-zero: */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index fa16b906ea3f..210e878c4c0d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -264,16 +264,12 @@ static void __init relocate_initrd(void) u64 area_size = PAGE_ALIGN(ramdisk_size); /* We need to move the initrd down into directly mapped mem */ - relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), - area_size, PAGE_SIZE); - + relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0, + PFN_PHYS(max_pfn_mapped)); if (!relocated_ramdisk) panic("Cannot find place for new RAMDISK of size %lld\n", ramdisk_size); - /* Note: this includes all the mem currently occupied by - the initrd, we rely on that fact to keep the data intact. 
*/ - memblock_reserve(relocated_ramdisk, area_size); initrd_start = relocated_ramdisk + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", @@ -300,13 +296,13 @@ static void __init early_reserve_initrd(void) memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); } + static void __init reserve_initrd(void) { /* Assume only end is not page aligned */ u64 ramdisk_image = get_ramdisk_image(); u64 ramdisk_size = get_ramdisk_size(); u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - u64 mapped_size; if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size) @@ -314,12 +310,6 @@ static void __init reserve_initrd(void) initrd_start = 0; - mapped_size = memblock_mem_size(max_pfn_mapped); - if (ramdisk_size >= (mapped_size>>1)) - panic("initrd too large to handle, " - "disabling initrd (%lld needed, %lld available)\n", - ramdisk_size, mapped_size>>1); - printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image, ramdisk_end - 1); @@ -431,13 +421,13 @@ static int __init reserve_crashkernel_low(void) { #ifdef CONFIG_X86_64 unsigned long long base, low_base = 0, low_size = 0; - unsigned long total_low_mem; + unsigned long low_mem_limit; int ret; - total_low_mem = memblock_mem_size(1UL << (32 - PAGE_SHIFT)); + low_mem_limit = min(memblock_phys_mem_size(), CRASH_ADDR_LOW_MAX); /* crashkernel=Y,low */ - ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); + ret = parse_crashkernel_low(boot_command_line, low_mem_limit, &low_size, &base); if (ret) { /* * two parts from kernel/dma/swiotlb.c: @@ -455,23 +445,17 @@ static int __init reserve_crashkernel_low(void) return 0; } - low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN); + low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX); if (!low_base) { pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", (unsigned long)(low_size >> 20)); return -ENOMEM; } - ret = memblock_reserve(low_base, low_size); - if (ret) { - pr_err("%s: Error reserving crashkernel low memblock.\n", __func__); - return ret; - } - - pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", + pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (low RAM limit: %ldMB)\n", (unsigned long)(low_size >> 20), (unsigned long)(low_base >> 20), - (unsigned long)(total_low_mem >> 20)); + (unsigned long)(low_mem_limit >> 20)); crashk_low_res.start = low_base; crashk_low_res.end = low_base + low_size - 1; @@ -515,13 +499,13 @@ static void __init reserve_crashkernel(void) * unless "crashkernel=size[KMG],high" is specified. 
*/ if (!high) - crash_base = memblock_find_in_range(CRASH_ALIGN, - CRASH_ADDR_LOW_MAX, - crash_size, CRASH_ALIGN); + crash_base = memblock_phys_alloc_range(crash_size, + CRASH_ALIGN, CRASH_ALIGN, + CRASH_ADDR_LOW_MAX); if (!crash_base) - crash_base = memblock_find_in_range(CRASH_ALIGN, - CRASH_ADDR_HIGH_MAX, - crash_size, CRASH_ALIGN); + crash_base = memblock_phys_alloc_range(crash_size, + CRASH_ALIGN, CRASH_ALIGN, + CRASH_ADDR_HIGH_MAX); if (!crash_base) { pr_info("crashkernel reservation failed - No suitable area found.\n"); return; @@ -529,19 +513,13 @@ static void __init reserve_crashkernel(void) } else { unsigned long long start; - start = memblock_find_in_range(crash_base, - crash_base + crash_size, - crash_size, 1 << 20); + start = memblock_phys_alloc_range(crash_size, SZ_1M, crash_base, + crash_base + crash_size); if (start != crash_base) { pr_info("crashkernel reservation failed - memory is in use.\n"); return; } } - ret = memblock_reserve(crash_base, crash_size); - if (ret) { - pr_err("%s: Error reserving crashkernel memblock.\n", __func__); - return; - } if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) { memblock_free(crash_base, crash_size); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index aa76ec2d359b..9df94e0aaee1 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -37,14 +37,12 @@ static __init int numa_setup(char *opt) return -EINVAL; if (!strncmp(opt, "off", 3)) numa_off = 1; -#ifdef CONFIG_NUMA_EMU if (!strncmp(opt, "fake=", 5)) - numa_emu_cmdline(opt + 5); -#endif -#ifdef CONFIG_ACPI_NUMA + return numa_emu_cmdline(opt + 5); if (!strncmp(opt, "noacpi", 6)) - acpi_numa = -1; -#endif + disable_srat(); + if (!strncmp(opt, "nohmat", 6)) + disable_hmat(); return 0; } early_param("numa", numa_setup); @@ -516,7 +514,7 @@ static void __init numa_clear_kernel_node_hotplug(void) * memory ranges, because quirks such as trim_snb_memory() * reserve specific pages for Sandy Bridge graphics. ] */ - for_each_memblock(reserved, mb_region) { + for_each_reserved_mem_region(mb_region) { int nid = memblock_get_region_node(mb_region); if (nid != MAX_NUMNODES) @@ -919,7 +917,6 @@ int phys_to_target_node(phys_addr_t start) return meminfo_to_nid(&numa_reserved_meminfo, start); } -EXPORT_SYMBOL_GPL(phys_to_target_node); int memory_add_physaddr_to_nid(u64 start) { diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 683cd12f4793..87d77cc52f86 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -13,9 +13,10 @@ static int emu_nid_to_phys[MAX_NUMNODES]; static char *emu_cmdline __initdata; -void __init numa_emu_cmdline(char *str) +int __init numa_emu_cmdline(char *str) { emu_cmdline = str; + return 0; } static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 41485a8a6dcf..b1418a6c0e90 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1300,7 +1300,7 @@ asmlinkage __visible void __init xen_start_kernel(void) * any NUMA information the kernel tries to get from ACPI will * be meaningless. Prevent it from trying. */ - acpi_numa = -1; + disable_srat(); #endif WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv)); diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index e997e0119c02..d8a29dc5a284 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -217,20 +217,6 @@ config HOTPLUG_CPU Say N if you want to disable CPU hotplug. 
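
The x86 setup.c hunks above collapse the memblock_find_in_range() plus memblock_reserve() pair into a single memblock_phys_alloc_range() call, which both finds and reserves the area. A sketch of the new pattern; the helper name, the 1 MiB alignment and the below-4 GiB limit are illustrative only.

#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/printk.h>
#include <linux/sizes.h>

/*
 * Reserve an aligned buffer below 4 GiB. One call both finds and
 * reserves the range; a zero return means no suitable area was found.
 */
static phys_addr_t __init example_reserve_low_buffer(phys_addr_t size)
{
        phys_addr_t base;

        /* Replaces memblock_find_in_range() + memblock_reserve(). */
        base = memblock_phys_alloc_range(size, SZ_1M, 0, SZ_4G);
        if (!base)
                pr_warn("no %pa byte area available below 4 GiB\n", &size);

        return base;
}
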
-config SECCOMP - bool - prompt "Enable seccomp to safely compute untrusted bytecode" - help - This kernel feature is useful for number crunching applications - that may need to compute untrusted bytecode during their - execution. By using pipes or other transports made available to - the process as file descriptors supporting the read/write - syscalls, it's possible to isolate those applications in - their own address space using seccomp. Once seccomp is - enabled via prctl(PR_SET_SECCOMP), it cannot be disabled - and the task is only allowed to execute a few safe syscalls - defined by each seccomp mode. - config FAST_SYSCALL_XTENSA bool "Enable fast atomic syscalls" default n diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index a05b306cf371..ad9d59d93f39 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -79,67 +79,32 @@ void __init zones_init(void) free_area_init(max_zone_pfn); } -#ifdef CONFIG_HIGHMEM -static void __init free_area_high(unsigned long pfn, unsigned long end) -{ - for (; pfn < end; pfn++) - free_highmem_page(pfn_to_page(pfn)); -} - static void __init free_highpages(void) { +#ifdef CONFIG_HIGHMEM unsigned long max_low = max_low_pfn; - struct memblock_region *mem, *res; + phys_addr_t range_start, range_end; + u64 i; - reset_all_zones_managed_pages(); /* set highmem page free */ - for_each_memblock(memory, mem) { - unsigned long start = memblock_region_memory_base_pfn(mem); - unsigned long end = memblock_region_memory_end_pfn(mem); + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, + &range_start, &range_end, NULL) { + unsigned long start = PHYS_PFN(range_start); + unsigned long end = PHYS_PFN(range_end); /* Ignore complete lowmem entries */ if (end <= max_low) continue; - if (memblock_is_nomap(mem)) - continue; - /* Truncate partial highmem entries */ if (start < max_low) start = max_low; - /* Find and exclude any reserved regions */ - for_each_memblock(reserved, res) { - unsigned long res_start, res_end; - - res_start = memblock_region_reserved_base_pfn(res); - res_end = memblock_region_reserved_end_pfn(res); - - if (res_end < start) - continue; - if (res_start < start) - res_start = start; - if (res_start > end) - res_start = end; - if (res_end > end) - res_end = end; - if (res_start != start) - free_area_high(start, res_start); - start = res_end; - if (start == end) - break; - } - - /* And now free anything which remains */ - if (start < end) - free_area_high(start, end); + for (; start < end; start++) + free_highmem_page(pfn_to_page(start)); } -} -#else -static void __init free_highpages(void) -{ -} #endif +} /* * Initialize memory pages. 
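
The xtensa free_highpages() rewrite above can drop its nested reserved-region exclusion loop because for_each_free_mem_range() already yields only free memory (memory minus reserved). A small sketch mirroring that conversion, with an invented helper name; partial pages at range edges are ignored just as there.

#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/numa.h>
#include <linux/pfn.h>

/* Walk only the free (not reserved) parts of memory, in PFNs. */
static unsigned long __init example_count_free_pages(void)
{
        phys_addr_t range_start, range_end;
        unsigned long pages = 0;
        u64 i;

        for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE,
                                &range_start, &range_end, NULL)
                pages += PHYS_PFN(range_end) - PHYS_PFN(range_start);

        return pages;
}
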
diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 2c32cfb72370..134bcb40b2af 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -24,8 +24,15 @@ #include #include #include +#include static u8 hmat_revision; +static int hmat_disable __initdata; + +void __init disable_hmat(void) +{ + hmat_disable = 1; +} static LIST_HEAD(targets); static LIST_HEAD(initiators); @@ -634,66 +641,6 @@ static void hmat_register_target_perf(struct memory_target *target) node_set_perf_attrs(mem_nid, &target->hmem_attrs, 0); } -static void hmat_register_target_device(struct memory_target *target, - struct resource *r) -{ - /* define a clean / non-busy resource for the platform device */ - struct resource res = { - .start = r->start, - .end = r->end, - .flags = IORESOURCE_MEM, - }; - struct platform_device *pdev; - struct memregion_info info; - int rc, id; - - rc = region_intersects(res.start, resource_size(&res), IORESOURCE_MEM, - IORES_DESC_SOFT_RESERVED); - if (rc != REGION_INTERSECTS) - return; - - id = memregion_alloc(GFP_KERNEL); - if (id < 0) { - pr_err("memregion allocation failure for %pr\n", &res); - return; - } - - pdev = platform_device_alloc("hmem", id); - if (!pdev) { - pr_err("hmem device allocation failure for %pr\n", &res); - goto out_pdev; - } - - pdev->dev.numa_node = acpi_map_pxm_to_online_node(target->memory_pxm); - info = (struct memregion_info) { - .target_node = acpi_map_pxm_to_node(target->memory_pxm), - }; - rc = platform_device_add_data(pdev, &info, sizeof(info)); - if (rc < 0) { - pr_err("hmem memregion_info allocation failure for %pr\n", &res); - goto out_pdev; - } - - rc = platform_device_add_resources(pdev, &res, 1); - if (rc < 0) { - pr_err("hmem resource allocation failure for %pr\n", &res); - goto out_resource; - } - - rc = platform_device_add(pdev); - if (rc < 0) { - dev_err(&pdev->dev, "device add failed for %pr\n", &res); - goto out_resource; - } - - return; - -out_resource: - put_device(&pdev->dev); -out_pdev: - memregion_free(id); -} - static void hmat_register_target_devices(struct memory_target *target) { struct resource *res; @@ -705,8 +652,11 @@ static void hmat_register_target_devices(struct memory_target *target) if (!IS_ENABLED(CONFIG_DEV_DAX_HMEM)) return; - for (res = target->memregions.child; res; res = res->sibling) - hmat_register_target_device(target, res); + for (res = target->memregions.child; res; res = res->sibling) { + int target_nid = acpi_map_pxm_to_node(target->memory_pxm); + + hmem_register_device(target_nid, res); + } } static void hmat_register_target(struct memory_target *target) @@ -814,7 +764,7 @@ static __init int hmat_init(void) enum acpi_hmat_type i; acpi_status status; - if (srat_disabled()) + if (srat_disabled() || hmat_disable) return 0; status = acpi_get_table(ACPI_SIG_SRAT, 0, &tbl); diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index 15bbaab8500b..1b0ae0a1959b 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -27,7 +27,12 @@ static int node_to_pxm_map[MAX_NUMNODES] = { [0 ... 
MAX_NUMNODES - 1] = PXM_INVAL }; unsigned char acpi_srat_revision __initdata; -int acpi_numa __initdata; +static int acpi_numa __initdata; + +void __init disable_srat(void) +{ + acpi_numa = -1; +} int pxm_to_node(int pxm) { @@ -163,7 +168,7 @@ static int __init slit_valid(struct acpi_table_slit *slit) void __init bad_srat(void) { pr_err("SRAT: SRAT not used.\n"); - acpi_numa = -1; + disable_srat(); } int __init srat_disabled(void) diff --git a/drivers/base/core.c b/drivers/base/core.c index 6c77182a139d..281aa509ab5e 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -3324,7 +3324,7 @@ struct device *device_find_child_by_name(struct device *parent, klist_iter_init(&parent->p->klist_children, &i); while ((child = next_device(&i))) - if (!strcmp(dev_name(child), name) && get_device(child)) + if (sysfs_streq(dev_name(child), name) && get_device(child)) break; klist_iter_exit(&i); return child; @@ -4061,22 +4061,21 @@ void device_shutdown(void) */ #ifdef CONFIG_PRINTK -static int -create_syslog_header(const struct device *dev, char *hdr, size_t hdrlen) +static void +set_dev_info(const struct device *dev, struct dev_printk_info *dev_info) { const char *subsys; - size_t pos = 0; + + memset(dev_info, 0, sizeof(*dev_info)); if (dev->class) subsys = dev->class->name; else if (dev->bus) subsys = dev->bus->name; else - return 0; + return; - pos += snprintf(hdr + pos, hdrlen - pos, "SUBSYSTEM=%s", subsys); - if (pos >= hdrlen) - goto overflow; + strscpy(dev_info->subsystem, subsys, sizeof(dev_info->subsystem)); /* * Add device identifier DEVICE=: @@ -4092,41 +4091,28 @@ create_syslog_header(const struct device *dev, char *hdr, size_t hdrlen) c = 'b'; else c = 'c'; - pos++; - pos += snprintf(hdr + pos, hdrlen - pos, - "DEVICE=%c%u:%u", - c, MAJOR(dev->devt), MINOR(dev->devt)); + + snprintf(dev_info->device, sizeof(dev_info->device), + "%c%u:%u", c, MAJOR(dev->devt), MINOR(dev->devt)); } else if (strcmp(subsys, "net") == 0) { struct net_device *net = to_net_dev(dev); - pos++; - pos += snprintf(hdr + pos, hdrlen - pos, - "DEVICE=n%u", net->ifindex); + snprintf(dev_info->device, sizeof(dev_info->device), + "n%u", net->ifindex); } else { - pos++; - pos += snprintf(hdr + pos, hdrlen - pos, - "DEVICE=+%s:%s", subsys, dev_name(dev)); + snprintf(dev_info->device, sizeof(dev_info->device), + "+%s:%s", subsys, dev_name(dev)); } - - if (pos >= hdrlen) - goto overflow; - - return pos; - -overflow: - dev_WARN(dev, "device/subsystem name too long"); - return 0; } int dev_vprintk_emit(int level, const struct device *dev, const char *fmt, va_list args) { - char hdr[128]; - size_t hdrlen; + struct dev_printk_info dev_info; - hdrlen = create_syslog_header(dev, hdr, sizeof(hdr)); + set_dev_info(dev, &dev_info); - return vprintk_emit(0, level, hdrlen ? hdr : NULL, hdrlen, fmt, args); + return vprintk_emit(0, level, &dev_info, fmt, args); } EXPORT_SYMBOL(dev_vprintk_emit); diff --git a/drivers/bus/mvebu-mbus.c b/drivers/bus/mvebu-mbus.c index 5b2a11a88951..2519ceede64b 100644 --- a/drivers/bus/mvebu-mbus.c +++ b/drivers/bus/mvebu-mbus.c @@ -610,23 +610,23 @@ static unsigned int armada_xp_mbus_win_remap_offset(int win) static void __init mvebu_mbus_find_bridge_hole(uint64_t *start, uint64_t *end) { - struct memblock_region *r; - uint64_t s = 0; + phys_addr_t reg_start, reg_end; + uint64_t i, s = 0; - for_each_memblock(memory, r) { + for_each_mem_range(i, ®_start, ®_end) { /* * This part of the memory is above 4 GB, so we don't * care for the MBus bridge hole. 
*/ - if (r->base >= 0x100000000ULL) + if (reg_start >= 0x100000000ULL) continue; /* * The MBus bridge hole is at the end of the RAM under * the 4 GB limit. */ - if (r->base + r->size > s) - s = r->base + r->size; + if (reg_end > s) + s = reg_end; } *start = s; diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index 3b6c06f07326..567428e10b7b 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -35,6 +35,7 @@ config DEV_DAX_PMEM config DEV_DAX_HMEM tristate "HMEM DAX: direct access to 'specific purpose' memory" depends on EFI_SOFT_RESERVE + select NUMA_KEEP_MEMINFO if (NUMA && X86) default DEV_DAX help EFI 2.8 platforms, and others, may advertise 'specific purpose' @@ -48,6 +49,11 @@ config DEV_DAX_HMEM Say M if unsure. +config DEV_DAX_HMEM_DEVICES + depends on NUMA_KEEP_MEMINFO # for phys_to_target_node() + depends on DEV_DAX_HMEM && DAX=y + def_bool y + config DEV_DAX_KMEM tristate "KMEM DAX: volatile-use of persistent memory" default DEV_DAX diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile index 80065b38b3c4..9d4ba672d305 100644 --- a/drivers/dax/Makefile +++ b/drivers/dax/Makefile @@ -2,11 +2,10 @@ obj-$(CONFIG_DAX) += dax.o obj-$(CONFIG_DEV_DAX) += device_dax.o obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o -obj-$(CONFIG_DEV_DAX_HMEM) += dax_hmem.o dax-y := super.o dax-y += bus.o device_dax-y := device.o -dax_hmem-y := hmem.o obj-y += pmem/ +obj-y += hmem/ diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index df238c8b6ef2..27513d311242 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "dax-private.h" #include "bus.h" @@ -130,10 +131,63 @@ ATTRIBUTE_GROUPS(dax_drv); static int dax_bus_match(struct device *dev, struct device_driver *drv); +static bool is_static(struct dax_region *dax_region) +{ + return (dax_region->res.flags & IORESOURCE_DAX_STATIC) != 0; +} + +static u64 dev_dax_size(struct dev_dax *dev_dax) +{ + u64 size = 0; + int i; + + device_lock_assert(&dev_dax->dev); + + for (i = 0; i < dev_dax->nr_range; i++) + size += range_len(&dev_dax->ranges[i].range); + + return size; +} + +static int dax_bus_probe(struct device *dev) +{ + struct dax_device_driver *dax_drv = to_dax_drv(dev->driver); + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; + int rc; + + if (dev_dax_size(dev_dax) == 0 || dev_dax->id < 0) + return -ENXIO; + + rc = dax_drv->probe(dev_dax); + + if (rc || is_static(dax_region)) + return rc; + + /* + * Track new seed creation only after successful probe of the + * previous seed. 
+ */ + if (dax_region->seed == dev) + dax_region->seed = NULL; + + return 0; +} + +static int dax_bus_remove(struct device *dev) +{ + struct dax_device_driver *dax_drv = to_dax_drv(dev->driver); + struct dev_dax *dev_dax = to_dev_dax(dev); + + return dax_drv->remove(dev_dax); +} + static struct bus_type dax_bus_type = { .name = "dax", .uevent = dax_bus_uevent, .match = dax_bus_match, + .probe = dax_bus_probe, + .remove = dax_bus_remove, .drv_groups = dax_drv_groups, }; @@ -176,18 +230,269 @@ static ssize_t region_size_show(struct device *dev, static struct device_attribute dev_attr_region_size = __ATTR(size, 0444, region_size_show, NULL); -static ssize_t align_show(struct device *dev, +static ssize_t region_align_show(struct device *dev, struct device_attribute *attr, char *buf) { struct dax_region *dax_region = dev_get_drvdata(dev); return sprintf(buf, "%u\n", dax_region->align); } -static DEVICE_ATTR_RO(align); +static struct device_attribute dev_attr_region_align = + __ATTR(align, 0400, region_align_show, NULL); + +#define for_each_dax_region_resource(dax_region, res) \ + for (res = (dax_region)->res.child; res; res = res->sibling) + +static unsigned long long dax_region_avail_size(struct dax_region *dax_region) +{ + resource_size_t size = resource_size(&dax_region->res); + struct resource *res; + + device_lock_assert(dax_region->dev); + + for_each_dax_region_resource(dax_region, res) + size -= resource_size(res); + return size; +} + +static ssize_t available_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + unsigned long long size; + + device_lock(dev); + size = dax_region_avail_size(dax_region); + device_unlock(dev); + + return sprintf(buf, "%llu\n", size); +} +static DEVICE_ATTR_RO(available_size); + +static ssize_t seed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + struct device *seed; + ssize_t rc; + + if (is_static(dax_region)) + return -EINVAL; + + device_lock(dev); + seed = dax_region->seed; + rc = sprintf(buf, "%s\n", seed ? dev_name(seed) : ""); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(seed); + +static ssize_t create_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + struct device *youngest; + ssize_t rc; + + if (is_static(dax_region)) + return -EINVAL; + + device_lock(dev); + youngest = dax_region->youngest; + rc = sprintf(buf, "%s\n", youngest ? dev_name(youngest) : ""); + device_unlock(dev); + + return rc; +} + +static ssize_t create_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + unsigned long long avail; + ssize_t rc; + int val; + + if (is_static(dax_region)) + return -EINVAL; + + rc = kstrtoint(buf, 0, &val); + if (rc) + return rc; + if (val != 1) + return -EINVAL; + + device_lock(dev); + avail = dax_region_avail_size(dax_region); + if (avail == 0) + rc = -ENOSPC; + else { + struct dev_dax_data data = { + .dax_region = dax_region, + .size = 0, + .id = -1, + }; + struct dev_dax *dev_dax = devm_create_dev_dax(&data); + + if (IS_ERR(dev_dax)) + rc = PTR_ERR(dev_dax); + else { + /* + * In support of crafting multiple new devices + * simultaneously multiple seeds can be created, + * but only the first one that has not been + * successfully bound is tracked as the region + * seed. 
+ */ + if (!dax_region->seed) + dax_region->seed = &dev_dax->dev; + dax_region->youngest = &dev_dax->dev; + rc = len; + } + } + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RW(create); + +void kill_dev_dax(struct dev_dax *dev_dax) +{ + struct dax_device *dax_dev = dev_dax->dax_dev; + struct inode *inode = dax_inode(dax_dev); + + kill_dax(dax_dev); + unmap_mapping_range(inode->i_mapping, 0, 0, 1); +} +EXPORT_SYMBOL_GPL(kill_dev_dax); + +static void free_dev_dax_ranges(struct dev_dax *dev_dax) +{ + struct dax_region *dax_region = dev_dax->region; + int i; + + device_lock_assert(dax_region->dev); + for (i = 0; i < dev_dax->nr_range; i++) { + struct range *range = &dev_dax->ranges[i].range; + + __release_region(&dax_region->res, range->start, + range_len(range)); + } + dev_dax->nr_range = 0; +} + +static void unregister_dev_dax(void *dev) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + + dev_dbg(dev, "%s\n", __func__); + + kill_dev_dax(dev_dax); + free_dev_dax_ranges(dev_dax); + device_del(dev); + put_device(dev); +} + +/* a return value >= 0 indicates this invocation invalidated the id */ +static int __free_dev_dax_id(struct dev_dax *dev_dax) +{ + struct dax_region *dax_region = dev_dax->region; + struct device *dev = &dev_dax->dev; + int rc = dev_dax->id; + + device_lock_assert(dev); + + if (is_static(dax_region) || dev_dax->id < 0) + return -1; + ida_free(&dax_region->ida, dev_dax->id); + dev_dax->id = -1; + return rc; +} + +static int free_dev_dax_id(struct dev_dax *dev_dax) +{ + struct device *dev = &dev_dax->dev; + int rc; + + device_lock(dev); + rc = __free_dev_dax_id(dev_dax); + device_unlock(dev); + return rc; +} + +static ssize_t delete_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + struct dev_dax *dev_dax; + struct device *victim; + bool do_del = false; + int rc; + + if (is_static(dax_region)) + return -EINVAL; + + victim = device_find_child_by_name(dax_region->dev, buf); + if (!victim) + return -ENXIO; + + device_lock(dev); + device_lock(victim); + dev_dax = to_dev_dax(victim); + if (victim->driver || dev_dax_size(dev_dax)) + rc = -EBUSY; + else { + /* + * Invalidate the device so it does not become active + * again, but always preserve device-id-0 so that + * /sys/bus/dax/ is guaranteed to be populated while any + * dax_region is registered. 
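 * Deletion is requested by writing the victim's name, e.g.
 * 'echo dax0.1 > .../dax_region/delete' (name illustrative). The id-0
 * device itself is refused with -EBUSY; its capacity is instead
 * released by writing 0 to its 'size' attribute.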
+ */ + if (dev_dax->id > 0) { + do_del = __free_dev_dax_id(dev_dax) >= 0; + rc = len; + if (dax_region->seed == victim) + dax_region->seed = NULL; + if (dax_region->youngest == victim) + dax_region->youngest = NULL; + } else + rc = -EBUSY; + } + device_unlock(victim); + + /* won the race to invalidate the device, clean it up */ + if (do_del) + devm_release_action(dev, unregister_dev_dax, victim); + device_unlock(dev); + put_device(victim); + + return rc; +} +static DEVICE_ATTR_WO(delete); + +static umode_t dax_region_visible(struct kobject *kobj, struct attribute *a, + int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct dax_region *dax_region = dev_get_drvdata(dev); + + if (is_static(dax_region)) + if (a == &dev_attr_available_size.attr + || a == &dev_attr_create.attr + || a == &dev_attr_seed.attr + || a == &dev_attr_delete.attr) + return 0; + return a->mode; +} static struct attribute *dax_region_attributes[] = { + &dev_attr_available_size.attr, &dev_attr_region_size.attr, - &dev_attr_align.attr, + &dev_attr_region_align.attr, + &dev_attr_create.attr, + &dev_attr_seed.attr, + &dev_attr_delete.attr, &dev_attr_id.attr, NULL, }; @@ -195,6 +500,7 @@ static struct attribute *dax_region_attributes[] = { static const struct attribute_group dax_region_attribute_group = { .name = "dax_region", .attrs = dax_region_attributes, + .is_visible = dax_region_visible, }; static const struct attribute_group *dax_region_attribute_groups[] = { @@ -226,8 +532,8 @@ static void dax_region_unregister(void *region) } struct dax_region *alloc_dax_region(struct device *parent, int region_id, - struct resource *res, int target_node, unsigned int align, - unsigned long long pfn_flags) + struct range *range, int target_node, unsigned int align, + unsigned long flags) { struct dax_region *dax_region; @@ -241,8 +547,8 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, return NULL; } - if (!IS_ALIGNED(res->start, align) - || !IS_ALIGNED(resource_size(res), align)) + if (!IS_ALIGNED(range->start, align) + || !IS_ALIGNED(range_len(range), align)) return NULL; dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); @@ -250,13 +556,18 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, return NULL; dev_set_drvdata(parent, dax_region); - memcpy(&dax_region->res, res, sizeof(*res)); - dax_region->pfn_flags = pfn_flags; kref_init(&dax_region->kref); dax_region->id = region_id; dax_region->align = align; dax_region->dev = parent; dax_region->target_node = target_node; + ida_init(&dax_region->ida); + dax_region->res = (struct resource) { + .start = range->start, + .end = range->end, + .flags = IORESOURCE_MEM | flags, + }; + if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) { kfree(dax_region); return NULL; @@ -269,15 +580,601 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, } EXPORT_SYMBOL_GPL(alloc_dax_region); +static void dax_mapping_release(struct device *dev) +{ + struct dax_mapping *mapping = to_dax_mapping(dev); + struct dev_dax *dev_dax = to_dev_dax(dev->parent); + + ida_free(&dev_dax->ida, mapping->id); + kfree(mapping); +} + +static void unregister_dax_mapping(void *data) +{ + struct device *dev = data; + struct dax_mapping *mapping = to_dax_mapping(dev); + struct dev_dax *dev_dax = to_dev_dax(dev->parent); + struct dax_region *dax_region = dev_dax->region; + + dev_dbg(dev, "%s\n", __func__); + + device_lock_assert(dax_region->dev); + + dev_dax->ranges[mapping->range_id].mapping = NULL; + 
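	/*
	 * Dropping range_id to -1 below makes get_dax_range() return NULL,
	 * so the mapping's start/end/page_offset attributes report -ENXIO
	 * instead of reading a stale range once teardown has started.
	 */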
mapping->range_id = -1; + + device_del(dev); + put_device(dev); +} + +static struct dev_dax_range *get_dax_range(struct device *dev) +{ + struct dax_mapping *mapping = to_dax_mapping(dev); + struct dev_dax *dev_dax = to_dev_dax(dev->parent); + struct dax_region *dax_region = dev_dax->region; + + device_lock(dax_region->dev); + if (mapping->range_id < 0) { + device_unlock(dax_region->dev); + return NULL; + } + + return &dev_dax->ranges[mapping->range_id]; +} + +static void put_dax_range(struct dev_dax_range *dax_range) +{ + struct dax_mapping *mapping = dax_range->mapping; + struct dev_dax *dev_dax = to_dev_dax(mapping->dev.parent); + struct dax_region *dax_region = dev_dax->region; + + device_unlock(dax_region->dev); +} + +static ssize_t start_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax_range *dax_range; + ssize_t rc; + + dax_range = get_dax_range(dev); + if (!dax_range) + return -ENXIO; + rc = sprintf(buf, "%#llx\n", dax_range->range.start); + put_dax_range(dax_range); + + return rc; +} +static DEVICE_ATTR(start, 0400, start_show, NULL); + +static ssize_t end_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax_range *dax_range; + ssize_t rc; + + dax_range = get_dax_range(dev); + if (!dax_range) + return -ENXIO; + rc = sprintf(buf, "%#llx\n", dax_range->range.end); + put_dax_range(dax_range); + + return rc; +} +static DEVICE_ATTR(end, 0400, end_show, NULL); + +static ssize_t pgoff_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax_range *dax_range; + ssize_t rc; + + dax_range = get_dax_range(dev); + if (!dax_range) + return -ENXIO; + rc = sprintf(buf, "%#lx\n", dax_range->pgoff); + put_dax_range(dax_range); + + return rc; +} +static DEVICE_ATTR(page_offset, 0400, pgoff_show, NULL); + +static struct attribute *dax_mapping_attributes[] = { + &dev_attr_start.attr, + &dev_attr_end.attr, + &dev_attr_page_offset.attr, + NULL, +}; + +static const struct attribute_group dax_mapping_attribute_group = { + .attrs = dax_mapping_attributes, +}; + +static const struct attribute_group *dax_mapping_attribute_groups[] = { + &dax_mapping_attribute_group, + NULL, +}; + +static struct device_type dax_mapping_type = { + .release = dax_mapping_release, + .groups = dax_mapping_attribute_groups, +}; + +static int devm_register_dax_mapping(struct dev_dax *dev_dax, int range_id) +{ + struct dax_region *dax_region = dev_dax->region; + struct dax_mapping *mapping; + struct device *dev; + int rc; + + device_lock_assert(dax_region->dev); + + if (dev_WARN_ONCE(&dev_dax->dev, !dax_region->dev->driver, + "region disabled\n")) + return -ENXIO; + + mapping = kzalloc(sizeof(*mapping), GFP_KERNEL); + if (!mapping) + return -ENOMEM; + mapping->range_id = range_id; + mapping->id = ida_alloc(&dev_dax->ida, GFP_KERNEL); + if (mapping->id < 0) { + kfree(mapping); + return -ENOMEM; + } + dev_dax->ranges[range_id].mapping = mapping; + dev = &mapping->dev; + device_initialize(dev); + dev->parent = &dev_dax->dev; + dev->type = &dax_mapping_type; + dev_set_name(dev, "mapping%d", mapping->id); + rc = device_add(dev); + if (rc) { + put_device(dev); + return rc; + } + + rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_mapping, + dev); + if (rc) + return rc; + return 0; +} + +static int alloc_dev_dax_range(struct dev_dax *dev_dax, u64 start, + resource_size_t size) +{ + struct dax_region *dax_region = dev_dax->region; + struct resource *res = &dax_region->res; + struct device *dev = &dev_dax->dev; + struct 
dev_dax_range *ranges; + unsigned long pgoff = 0; + struct resource *alloc; + int i, rc; + + device_lock_assert(dax_region->dev); + + /* handle the seed alloc special case */ + if (!size) { + if (dev_WARN_ONCE(dev, dev_dax->nr_range, + "0-size allocation must be first\n")) + return -EBUSY; + /* nr_range == 0 is elsewhere special cased as 0-size device */ + return 0; + } + + ranges = krealloc(dev_dax->ranges, sizeof(*ranges) + * (dev_dax->nr_range + 1), GFP_KERNEL); + if (!ranges) + return -ENOMEM; + + alloc = __request_region(res, start, size, dev_name(dev), 0); + if (!alloc) { + /* + * If this was an empty set of ranges nothing else + * will release @ranges, so do it now. + */ + if (!dev_dax->nr_range) { + kfree(ranges); + ranges = NULL; + } + dev_dax->ranges = ranges; + return -ENOMEM; + } + + for (i = 0; i < dev_dax->nr_range; i++) + pgoff += PHYS_PFN(range_len(&ranges[i].range)); + dev_dax->ranges = ranges; + ranges[dev_dax->nr_range++] = (struct dev_dax_range) { + .pgoff = pgoff, + .range = { + .start = alloc->start, + .end = alloc->end, + }, + }; + + dev_dbg(dev, "alloc range[%d]: %pa:%pa\n", dev_dax->nr_range - 1, + &alloc->start, &alloc->end); + /* + * A dev_dax instance must be registered before mapping device + * children can be added. Defer to devm_create_dev_dax() to add + * the initial mapping device. + */ + if (!device_is_registered(&dev_dax->dev)) + return 0; + + rc = devm_register_dax_mapping(dev_dax, dev_dax->nr_range - 1); + if (rc) { + dev_dbg(dev, "delete range[%d]: %pa:%pa\n", dev_dax->nr_range - 1, + &alloc->start, &alloc->end); + dev_dax->nr_range--; + __release_region(res, alloc->start, resource_size(alloc)); + return rc; + } + + return 0; +} + +static int adjust_dev_dax_range(struct dev_dax *dev_dax, struct resource *res, resource_size_t size) +{ + int last_range = dev_dax->nr_range - 1; + struct dev_dax_range *dax_range = &dev_dax->ranges[last_range]; + struct dax_region *dax_region = dev_dax->region; + bool is_shrink = resource_size(res) > size; + struct range *range = &dax_range->range; + struct device *dev = &dev_dax->dev; + int rc; + + device_lock_assert(dax_region->dev); + + if (dev_WARN_ONCE(dev, !size, "deletion is handled by dev_dax_shrink\n")) + return -EINVAL; + + rc = adjust_resource(res, range->start, size); + if (rc) + return rc; + + *range = (struct range) { + .start = range->start, + .end = range->start + size - 1, + }; + + dev_dbg(dev, "%s range[%d]: %#llx:%#llx\n", is_shrink ? "shrink" : "extend", + last_range, (unsigned long long) range->start, + (unsigned long long) range->end); + + return 0; +} + static ssize_t size_show(struct device *dev, struct device_attribute *attr, char *buf) { struct dev_dax *dev_dax = to_dev_dax(dev); - unsigned long long size = resource_size(&dev_dax->region->res); + unsigned long long size; + + device_lock(dev); + size = dev_dax_size(dev_dax); + device_unlock(dev); return sprintf(buf, "%llu\n", size); } -static DEVICE_ATTR_RO(size); + +static bool alloc_is_aligned(struct dev_dax *dev_dax, resource_size_t size) +{ + /* + * The minimum mapping granularity for a device instance is a + * single subsection, unless the arch says otherwise. 
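 * e.g. with the typical 2 MiB memremap_compat_align(), even a device
 * whose 'align' is PAGE_SIZE must size its allocations in 2 MiB
 * multiples; some architectures report a larger compat alignment.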
+ */ + return IS_ALIGNED(size, max_t(unsigned long, dev_dax->align, memremap_compat_align())); +} + +static int dev_dax_shrink(struct dev_dax *dev_dax, resource_size_t size) +{ + resource_size_t to_shrink = dev_dax_size(dev_dax) - size; + struct dax_region *dax_region = dev_dax->region; + struct device *dev = &dev_dax->dev; + int i; + + for (i = dev_dax->nr_range - 1; i >= 0; i--) { + struct range *range = &dev_dax->ranges[i].range; + struct dax_mapping *mapping = dev_dax->ranges[i].mapping; + struct resource *adjust = NULL, *res; + resource_size_t shrink; + + shrink = min_t(u64, to_shrink, range_len(range)); + if (shrink >= range_len(range)) { + devm_release_action(dax_region->dev, + unregister_dax_mapping, &mapping->dev); + __release_region(&dax_region->res, range->start, + range_len(range)); + dev_dax->nr_range--; + dev_dbg(dev, "delete range[%d]: %#llx:%#llx\n", i, + (unsigned long long) range->start, + (unsigned long long) range->end); + to_shrink -= shrink; + if (!to_shrink) + break; + continue; + } + + for_each_dax_region_resource(dax_region, res) + if (strcmp(res->name, dev_name(dev)) == 0 + && res->start == range->start) { + adjust = res; + break; + } + + if (dev_WARN_ONCE(dev, !adjust || i != dev_dax->nr_range - 1, + "failed to find matching resource\n")) + return -ENXIO; + return adjust_dev_dax_range(dev_dax, adjust, range_len(range) + - shrink); + } + return 0; +} + +/* + * Only allow adjustments that preserve the relative pgoff of existing + * allocations. I.e. the dev_dax->ranges array is ordered by increasing pgoff. + */ +static bool adjust_ok(struct dev_dax *dev_dax, struct resource *res) +{ + struct dev_dax_range *last; + int i; + + if (dev_dax->nr_range == 0) + return false; + if (strcmp(res->name, dev_name(&dev_dax->dev)) != 0) + return false; + last = &dev_dax->ranges[dev_dax->nr_range - 1]; + if (last->range.start != res->start || last->range.end != res->end) + return false; + for (i = 0; i < dev_dax->nr_range - 1; i++) { + struct dev_dax_range *dax_range = &dev_dax->ranges[i]; + + if (dax_range->pgoff > last->pgoff) + return false; + } + + return true; +} + +static ssize_t dev_dax_resize(struct dax_region *dax_region, + struct dev_dax *dev_dax, resource_size_t size) +{ + resource_size_t avail = dax_region_avail_size(dax_region), to_alloc; + resource_size_t dev_size = dev_dax_size(dev_dax); + struct resource *region_res = &dax_region->res; + struct device *dev = &dev_dax->dev; + struct resource *res, *first; + resource_size_t alloc = 0; + int rc; + + if (dev->driver) + return -EBUSY; + if (size == dev_size) + return 0; + if (size > dev_size && size - dev_size > avail) + return -ENOSPC; + if (size < dev_size) + return dev_dax_shrink(dev_dax, size); + + to_alloc = size - dev_size; + if (dev_WARN_ONCE(dev, !alloc_is_aligned(dev_dax, to_alloc), + "resize of %pa misaligned\n", &to_alloc)) + return -ENXIO; + + /* + * Expand the device into the unused portion of the region. This + * may involve adjusting the end of an existing resource, or + * allocating a new resource. 
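 * Growth is first-fit across the region's free space: before the
 * first allocation, in gaps between allocations, or after the last
 * one. When the free space directly follows this device's own last
 * range, that resource is extended in place (see adjust_ok());
 * otherwise a new range and mapping instance are appended, which can
 * leave the device dis-contiguous.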
+ */ +retry: + first = region_res->child; + if (!first) + return alloc_dev_dax_range(dev_dax, dax_region->res.start, to_alloc); + + rc = -ENOSPC; + for (res = first; res; res = res->sibling) { + struct resource *next = res->sibling; + + /* space at the beginning of the region */ + if (res == first && res->start > dax_region->res.start) { + alloc = min(res->start - dax_region->res.start, to_alloc); + rc = alloc_dev_dax_range(dev_dax, dax_region->res.start, alloc); + break; + } + + alloc = 0; + /* space between allocations */ + if (next && next->start > res->end + 1) + alloc = min(next->start - (res->end + 1), to_alloc); + + /* space at the end of the region */ + if (!alloc && !next && res->end < region_res->end) + alloc = min(region_res->end - res->end, to_alloc); + + if (!alloc) + continue; + + if (adjust_ok(dev_dax, res)) { + rc = adjust_dev_dax_range(dev_dax, res, resource_size(res) + alloc); + break; + } + rc = alloc_dev_dax_range(dev_dax, res->end + 1, alloc); + break; + } + if (rc) + return rc; + to_alloc -= alloc; + if (to_alloc) + goto retry; + return 0; +} + +static ssize_t size_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + ssize_t rc; + unsigned long long val; + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; + + rc = kstrtoull(buf, 0, &val); + if (rc) + return rc; + + if (!alloc_is_aligned(dev_dax, val)) { + dev_dbg(dev, "%s: size: %lld misaligned\n", __func__, val); + return -EINVAL; + } + + device_lock(dax_region->dev); + if (!dax_region->dev->driver) { + device_unlock(dax_region->dev); + return -ENXIO; + } + device_lock(dev); + rc = dev_dax_resize(dax_region, dev_dax, val); + device_unlock(dev); + device_unlock(dax_region->dev); + + return rc == 0 ? len : rc; +} +static DEVICE_ATTR_RW(size); + +static ssize_t range_parse(const char *opt, size_t len, struct range *range) +{ + unsigned long long addr = 0; + char *start, *end, *str; + ssize_t rc = EINVAL; + + str = kstrdup(opt, GFP_KERNEL); + if (!str) + return rc; + + end = str; + start = strsep(&end, "-"); + if (!start || !end) + goto err; + + rc = kstrtoull(start, 16, &addr); + if (rc) + goto err; + range->start = addr; + + rc = kstrtoull(end, 16, &addr); + if (rc) + goto err; + range->end = addr; + +err: + kfree(str); + return rc; +} + +static ssize_t mapping_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; + size_t to_alloc; + struct range r; + ssize_t rc; + + rc = range_parse(buf, len, &r); + if (rc) + return rc; + + rc = -ENXIO; + device_lock(dax_region->dev); + if (!dax_region->dev->driver) { + device_unlock(dax_region->dev); + return rc; + } + device_lock(dev); + + to_alloc = range_len(&r); + if (alloc_is_aligned(dev_dax, to_alloc)) + rc = alloc_dev_dax_range(dev_dax, r.start, to_alloc); + device_unlock(dev); + device_unlock(dax_region->dev); + + return rc == 0 ? 
len : rc; +} +static DEVICE_ATTR_WO(mapping); + +static ssize_t align_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + + return sprintf(buf, "%d\n", dev_dax->align); +} + +static ssize_t dev_dax_validate_align(struct dev_dax *dev_dax) +{ + resource_size_t dev_size = dev_dax_size(dev_dax); + struct device *dev = &dev_dax->dev; + int i; + + if (dev_size > 0 && !alloc_is_aligned(dev_dax, dev_size)) { + dev_dbg(dev, "%s: align %u invalid for size %pa\n", + __func__, dev_dax->align, &dev_size); + return -EINVAL; + } + + for (i = 0; i < dev_dax->nr_range; i++) { + size_t len = range_len(&dev_dax->ranges[i].range); + + if (!alloc_is_aligned(dev_dax, len)) { + dev_dbg(dev, "%s: align %u invalid for range %d\n", + __func__, dev_dax->align, i); + return -EINVAL; + } + } + + return 0; +} + +static ssize_t align_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; + unsigned long val, align_save; + ssize_t rc; + + rc = kstrtoul(buf, 0, &val); + if (rc) + return -ENXIO; + + if (!dax_align_valid(val)) + return -EINVAL; + + device_lock(dax_region->dev); + if (!dax_region->dev->driver) { + device_unlock(dax_region->dev); + return -ENXIO; + } + + device_lock(dev); + if (dev->driver) { + rc = -EBUSY; + goto out_unlock; + } + + align_save = dev_dax->align; + dev_dax->align = val; + rc = dev_dax_validate_align(dev_dax); + if (rc) + dev_dax->align = align_save; +out_unlock: + device_unlock(dev); + device_unlock(dax_region->dev); + return rc == 0 ? len : rc; +} +static DEVICE_ATTR_RW(align); static int dev_dax_target_node(struct dev_dax *dev_dax) { @@ -295,19 +1192,19 @@ static ssize_t target_node_show(struct device *dev, } static DEVICE_ATTR_RO(target_node); -static unsigned long long dev_dax_resource(struct dev_dax *dev_dax) -{ - struct dax_region *dax_region = dev_dax->region; - - return dax_region->res.start; -} - static ssize_t resource_show(struct device *dev, struct device_attribute *attr, char *buf) { struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; + unsigned long long start; - return sprintf(buf, "%#llx\n", dev_dax_resource(dev_dax)); + if (dev_dax->nr_range < 1) + start = dax_region->res.start; + else + start = dev_dax->ranges[0].range.start; + + return sprintf(buf, "%#llx\n", start); } static DEVICE_ATTR(resource, 0400, resource_show, NULL); @@ -333,18 +1230,26 @@ static umode_t dev_dax_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = container_of(kobj, struct device, kobj); struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; if (a == &dev_attr_target_node.attr && dev_dax_target_node(dev_dax) < 0) return 0; if (a == &dev_attr_numa_node.attr && !IS_ENABLED(CONFIG_NUMA)) return 0; + if (a == &dev_attr_mapping.attr && is_static(dax_region)) + return 0; + if ((a == &dev_attr_align.attr || + a == &dev_attr_size.attr) && is_static(dax_region)) + return 0444; return a->mode; } static struct attribute *dev_dax_attributes[] = { &dev_attr_modalias.attr, &dev_attr_size.attr, + &dev_attr_mapping.attr, &dev_attr_target_node.attr, + &dev_attr_align.attr, &dev_attr_resource.attr, &dev_attr_numa_node.attr, NULL, @@ -360,24 +1265,17 @@ static const struct attribute_group *dax_attribute_groups[] = { NULL, }; -void kill_dev_dax(struct dev_dax *dev_dax) -{ - struct dax_device *dax_dev = 
dev_dax->dax_dev; - struct inode *inode = dax_inode(dax_dev); - - kill_dax(dax_dev); - unmap_mapping_range(inode->i_mapping, 0, 0, 1); -} -EXPORT_SYMBOL_GPL(kill_dev_dax); - static void dev_dax_release(struct device *dev) { struct dev_dax *dev_dax = to_dev_dax(dev); struct dax_region *dax_region = dev_dax->region; struct dax_device *dax_dev = dev_dax->dax_dev; - dax_region_put(dax_region); put_dax(dax_dev); + free_dev_dax_id(dev_dax); + dax_region_put(dax_region); + kfree(dev_dax->ranges); + kfree(dev_dax->pgmap); kfree(dev_dax); } @@ -386,35 +1284,61 @@ static const struct device_type dev_dax_type = { .groups = dax_attribute_groups, }; -static void unregister_dev_dax(void *dev) -{ - struct dev_dax *dev_dax = to_dev_dax(dev); - - dev_dbg(dev, "%s\n", __func__); - - kill_dev_dax(dev_dax); - device_del(dev); - put_device(dev); -} - -struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id, - struct dev_pagemap *pgmap, enum dev_dax_subsys subsys) +struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data) { + struct dax_region *dax_region = data->dax_region; struct device *parent = dax_region->dev; struct dax_device *dax_dev; struct dev_dax *dev_dax; struct inode *inode; struct device *dev; - int rc = -ENOMEM; - - if (id < 0) - return ERR_PTR(-EINVAL); + int rc; dev_dax = kzalloc(sizeof(*dev_dax), GFP_KERNEL); if (!dev_dax) return ERR_PTR(-ENOMEM); - memcpy(&dev_dax->pgmap, pgmap, sizeof(*pgmap)); + if (is_static(dax_region)) { + if (dev_WARN_ONCE(parent, data->id < 0, + "dynamic id specified to static region\n")) { + rc = -EINVAL; + goto err_id; + } + + dev_dax->id = data->id; + } else { + if (dev_WARN_ONCE(parent, data->id >= 0, + "static id specified to dynamic region\n")) { + rc = -EINVAL; + goto err_id; + } + + rc = ida_alloc(&dax_region->ida, GFP_KERNEL); + if (rc < 0) + goto err_id; + dev_dax->id = rc; + } + + dev_dax->region = dax_region; + dev = &dev_dax->dev; + device_initialize(dev); + dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id); + + rc = alloc_dev_dax_range(dev_dax, dax_region->res.start, data->size); + if (rc) + goto err_range; + + if (data->pgmap) { + dev_WARN_ONCE(parent, !is_static(dax_region), + "custom dev_pagemap requires a static dax_region\n"); + + dev_dax->pgmap = kmemdup(data->pgmap, + sizeof(struct dev_pagemap), GFP_KERNEL); + if (!dev_dax->pgmap) { + rc = -ENOMEM; + goto err_pgmap; + } + } /* * No 'host' or dax_operations since there is no access to this @@ -423,30 +1347,26 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id, dax_dev = alloc_dax(dev_dax, NULL, NULL, DAXDEV_F_SYNC); if (IS_ERR(dax_dev)) { rc = PTR_ERR(dax_dev); - goto err; + goto err_alloc_dax; } /* a device_dax instance is dead while the driver is not attached */ kill_dax(dax_dev); - /* from here on we're committed to teardown via dax_dev_release() */ - dev = &dev_dax->dev; - device_initialize(dev); - dev_dax->dax_dev = dax_dev; - dev_dax->region = dax_region; dev_dax->target_node = dax_region->target_node; + dev_dax->align = dax_region->align; + ida_init(&dev_dax->ida); kref_get(&dax_region->kref); inode = dax_inode(dax_dev); dev->devt = inode->i_rdev; - if (subsys == DEV_DAX_BUS) + if (data->subsys == DEV_DAX_BUS) dev->bus = &dax_bus_type; else dev->class = dax_class; dev->parent = parent; dev->type = &dev_dax_type; - dev_set_name(dev, "dax%d.%d", dax_region->id, id); rc = device_add(dev); if (rc) { @@ -459,14 +1379,27 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id, if (rc) return ERR_PTR(rc); + /* 
register mapping device for the initial allocation range */ + if (dev_dax->nr_range && range_len(&dev_dax->ranges[0].range)) { + rc = devm_register_dax_mapping(dev_dax, 0); + if (rc) + return ERR_PTR(rc); + } + return dev_dax; - err: +err_alloc_dax: + kfree(dev_dax->pgmap); +err_pgmap: + free_dev_dax_ranges(dev_dax); +err_range: + free_dev_dax_id(dev_dax); +err_id: kfree(dev_dax); return ERR_PTR(rc); } -EXPORT_SYMBOL_GPL(__devm_create_dev_dax); +EXPORT_SYMBOL_GPL(devm_create_dev_dax); static int match_always_count; diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h index 9e4eba67e8b9..72b92f95509f 100644 --- a/drivers/dax/bus.h +++ b/drivers/dax/bus.h @@ -3,29 +3,33 @@ #ifndef __DAX_BUS_H__ #define __DAX_BUS_H__ #include +#include struct dev_dax; struct resource; struct dax_device; struct dax_region; void dax_region_put(struct dax_region *dax_region); + +#define IORESOURCE_DAX_STATIC (1UL << 0) struct dax_region *alloc_dax_region(struct device *parent, int region_id, - struct resource *res, int target_node, unsigned int align, - unsigned long long flags); + struct range *range, int target_node, unsigned int align, + unsigned long flags); enum dev_dax_subsys { - DEV_DAX_BUS, + DEV_DAX_BUS = 0, /* zeroed dev_dax_data picks this by default */ DEV_DAX_CLASS, }; -struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id, - struct dev_pagemap *pgmap, enum dev_dax_subsys subsys); +struct dev_dax_data { + struct dax_region *dax_region; + struct dev_pagemap *pgmap; + enum dev_dax_subsys subsys; + resource_size_t size; + int id; +}; -static inline struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, - int id, struct dev_pagemap *pgmap) -{ - return __devm_create_dev_dax(dax_region, id, pgmap, DEV_DAX_BUS); -} +struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data); /* to be deleted when DEV_DAX_CLASS is removed */ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys); @@ -34,6 +38,8 @@ struct dax_device_driver { struct device_driver drv; struct list_head ids; int match_always; + int (*probe)(struct dev_dax *dev); + int (*remove)(struct dev_dax *dev); }; int __dax_driver_register(struct dax_device_driver *dax_drv, @@ -44,7 +50,7 @@ void dax_driver_unregister(struct dax_device_driver *dax_drv); void kill_dev_dax(struct dev_dax *dev_dax); #if IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT) -int dev_dax_probe(struct device *dev); +int dev_dax_probe(struct dev_dax *dev_dax); #endif /* diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h index 16850d5388ab..1c974b7caae6 100644 --- a/drivers/dax/dax-private.h +++ b/drivers/dax/dax-private.h @@ -7,6 +7,7 @@ #include #include +#include /* private routines between core files */ struct dax_device; @@ -22,8 +23,10 @@ void dax_bus_exit(void); * @kref: to pin while other agents have a need to do lookups * @dev: parent device backing this region * @align: allocation and mapping alignment for child dax devices - * @res: physical address range of the region - * @pfn_flags: identify whether the pfns are paged back or not + * @ida: instance id allocator + * @res: resource tree to track instance allocations + * @seed: allow userspace to find the first unbound seed device + * @youngest: allow userspace to find the most recently created device */ struct dax_region { int id; @@ -31,8 +34,16 @@ struct dax_region { struct kref kref; struct device *dev; unsigned int align; + struct ida ida; struct resource res; - unsigned long long pfn_flags; + struct device *seed; + struct device *youngest; +}; + +struct 
dax_mapping { + struct device dev; + int range_id; + int id; }; /** @@ -41,22 +52,57 @@ struct dax_region { * @region - parent region * @dax_dev - core dax functionality * @target_node: effective numa node if dev_dax memory range is onlined + * @id: ida allocated id + * @ida: mapping id allocator * @dev - device core * @pgmap - pgmap for memmap setup / lifetime (driver owned) - * @dax_mem_res: physical address range of hotadded DAX memory - * @dax_mem_name: name for hotadded DAX memory via add_memory_driver_managed() + * @nr_range: size of @ranges + * @ranges: resource-span + pgoff tuples for the instance */ struct dev_dax { struct dax_region *region; struct dax_device *dax_dev; + unsigned int align; int target_node; + int id; + struct ida ida; struct device dev; - struct dev_pagemap pgmap; - struct resource *dax_kmem_res; + struct dev_pagemap *pgmap; + int nr_range; + struct dev_dax_range { + unsigned long pgoff; + struct range range; + struct dax_mapping *mapping; + } *ranges; }; static inline struct dev_dax *to_dev_dax(struct device *dev) { return container_of(dev, struct dev_dax, dev); } + +static inline struct dax_mapping *to_dax_mapping(struct device *dev) +{ + return container_of(dev, struct dax_mapping, dev); +} + +phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, unsigned long size); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline bool dax_align_valid(unsigned long align) +{ + if (align == PUD_SIZE && IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) + return true; + if (align == PMD_SIZE && has_transparent_hugepage()) + return true; + if (align == PAGE_SIZE) + return true; + return false; +} +#else +static inline bool dax_align_valid(unsigned long align) +{ + return align == PAGE_SIZE; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 1e89513f3c59..25e0b84a4296 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -17,7 +17,6 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, const char *func) { - struct dax_region *dax_region = dev_dax->region; struct device *dev = &dev_dax->dev; unsigned long mask; @@ -32,7 +31,7 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, return -EINVAL; } - mask = dax_region->align - 1; + mask = dev_dax->align - 1; if (vma->vm_start & mask || vma->vm_end & mask) { dev_info_ratelimited(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n", @@ -41,14 +40,6 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, return -EINVAL; } - if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV - && (vma->vm_flags & VM_DONTCOPY) == 0) { - dev_info_ratelimited(dev, - "%s: %s: fail, dax range requires MADV_DONTFORK\n", - current->comm, func); - return -EINVAL; - } - if (!vma_is_dax(vma)) { dev_info_ratelimited(dev, "%s: %s: fail, vma is not DAX capable\n", @@ -63,15 +54,22 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, unsigned long size) { - struct resource *res = &dev_dax->region->res; - phys_addr_t phys; + int i; - phys = pgoff * PAGE_SIZE + res->start; - if (phys >= res->start && phys <= res->end) { - if (phys + size - 1 <= res->end) + for (i = 0; i < dev_dax->nr_range; i++) { + struct dev_dax_range *dax_range = &dev_dax->ranges[i]; + struct range *range = &dax_range->range; + unsigned long long pgoff_end; + phys_addr_t phys; + + pgoff_end = dax_range->pgoff + 
PHYS_PFN(range_len(range)) - 1; + if (pgoff < dax_range->pgoff || pgoff > pgoff_end) + continue; + phys = PFN_PHYS(pgoff - dax_range->pgoff) + range->start; + if (phys + size - 1 <= range->end) return phys; + break; } - return -1; } @@ -79,21 +77,19 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf, pfn_t *pfn) { struct device *dev = &dev_dax->dev; - struct dax_region *dax_region; phys_addr_t phys; unsigned int fault_size = PAGE_SIZE; if (check_vma(dev_dax, vmf->vma, __func__)) return VM_FAULT_SIGBUS; - dax_region = dev_dax->region; - if (dax_region->align > PAGE_SIZE) { + if (dev_dax->align > PAGE_SIZE) { dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n", - dax_region->align, fault_size); + dev_dax->align, fault_size); return VM_FAULT_SIGBUS; } - if (fault_size != dax_region->align) + if (fault_size != dev_dax->align) return VM_FAULT_SIGBUS; phys = dax_pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE); @@ -102,7 +98,7 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax, return VM_FAULT_SIGBUS; } - *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); + *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); return vmf_insert_mixed(vmf->vma, vmf->address, *pfn); } @@ -112,7 +108,6 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, { unsigned long pmd_addr = vmf->address & PMD_MASK; struct device *dev = &dev_dax->dev; - struct dax_region *dax_region; phys_addr_t phys; pgoff_t pgoff; unsigned int fault_size = PMD_SIZE; @@ -120,22 +115,15 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, if (check_vma(dev_dax, vmf->vma, __func__)) return VM_FAULT_SIGBUS; - dax_region = dev_dax->region; - if (dax_region->align > PMD_SIZE) { + if (dev_dax->align > PMD_SIZE) { dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n", - dax_region->align, fault_size); + dev_dax->align, fault_size); return VM_FAULT_SIGBUS; } - /* dax pmd mappings require pfn_t_devmap() */ - if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { - dev_dbg(dev, "region lacks devmap flags\n"); + if (fault_size < dev_dax->align) return VM_FAULT_SIGBUS; - } - - if (fault_size < dax_region->align) - return VM_FAULT_SIGBUS; - else if (fault_size > dax_region->align) + else if (fault_size > dev_dax->align) return VM_FAULT_FALLBACK; /* if we are outside of the VMA */ @@ -150,7 +138,7 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, return VM_FAULT_SIGBUS; } - *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); + *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); return vmf_insert_pfn_pmd(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); } @@ -161,7 +149,6 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, { unsigned long pud_addr = vmf->address & PUD_MASK; struct device *dev = &dev_dax->dev; - struct dax_region *dax_region; phys_addr_t phys; pgoff_t pgoff; unsigned int fault_size = PUD_SIZE; @@ -170,22 +157,15 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, if (check_vma(dev_dax, vmf->vma, __func__)) return VM_FAULT_SIGBUS; - dax_region = dev_dax->region; - if (dax_region->align > PUD_SIZE) { + if (dev_dax->align > PUD_SIZE) { dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n", - dax_region->align, fault_size); + dev_dax->align, fault_size); return VM_FAULT_SIGBUS; } - /* dax pud mappings require pfn_t_devmap() */ - if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { - dev_dbg(dev, "region lacks devmap flags\n"); + if (fault_size < dev_dax->align) return VM_FAULT_SIGBUS; - } - - if (fault_size < 
dax_region->align) - return VM_FAULT_SIGBUS; - else if (fault_size > dax_region->align) + else if (fault_size > dev_dax->align) return VM_FAULT_FALLBACK; /* if we are outside of the VMA */ @@ -200,7 +180,7 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, return VM_FAULT_SIGBUS; } - *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); + *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); return vmf_insert_pfn_pud(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); } @@ -280,9 +260,8 @@ static int dev_dax_split(struct vm_area_struct *vma, unsigned long addr) { struct file *filp = vma->vm_file; struct dev_dax *dev_dax = filp->private_data; - struct dax_region *dax_region = dev_dax->region; - if (!IS_ALIGNED(addr, dax_region->align)) + if (!IS_ALIGNED(addr, dev_dax->align)) return -EINVAL; return 0; } @@ -291,9 +270,8 @@ static unsigned long dev_dax_pagesize(struct vm_area_struct *vma) { struct file *filp = vma->vm_file; struct dev_dax *dev_dax = filp->private_data; - struct dax_region *dax_region = dev_dax->region; - return dax_region->align; + return dev_dax->align; } static const struct vm_operations_struct dax_vm_ops = { @@ -332,13 +310,11 @@ static unsigned long dax_get_unmapped_area(struct file *filp, { unsigned long off, off_end, off_align, len_align, addr_align, align; struct dev_dax *dev_dax = filp ? filp->private_data : NULL; - struct dax_region *dax_region; if (!dev_dax || addr) goto out; - dax_region = dev_dax->region; - align = dax_region->align; + align = dev_dax->align; off = pgoff << PAGE_SHIFT; off_end = off + len; off_align = round_up(off, align); @@ -412,25 +388,45 @@ static void dev_dax_kill(void *dev_dax) kill_dev_dax(dev_dax); } -int dev_dax_probe(struct device *dev) +int dev_dax_probe(struct dev_dax *dev_dax) { - struct dev_dax *dev_dax = to_dev_dax(dev); struct dax_device *dax_dev = dev_dax->dax_dev; - struct resource *res = &dev_dax->region->res; + struct device *dev = &dev_dax->dev; + struct dev_pagemap *pgmap; struct inode *inode; struct cdev *cdev; void *addr; - int rc; + int rc, i; - /* 1:1 map region resource range to device-dax instance range */ - if (!devm_request_mem_region(dev, res->start, resource_size(res), - dev_name(dev))) { - dev_warn(dev, "could not reserve region %pR\n", res); - return -EBUSY; + pgmap = dev_dax->pgmap; + if (dev_WARN_ONCE(dev, pgmap && dev_dax->nr_range > 1, + "static pgmap / multi-range device conflict\n")) + return -EINVAL; + + if (!pgmap) { + pgmap = devm_kzalloc(dev, sizeof(*pgmap) + sizeof(struct range) + * (dev_dax->nr_range - 1), GFP_KERNEL); + if (!pgmap) + return -ENOMEM; + pgmap->nr_range = dev_dax->nr_range; } - dev_dax->pgmap.type = MEMORY_DEVICE_GENERIC; - addr = devm_memremap_pages(dev, &dev_dax->pgmap); + for (i = 0; i < dev_dax->nr_range; i++) { + struct range *range = &dev_dax->ranges[i].range; + + if (!devm_request_mem_region(dev, range->start, + range_len(range), dev_name(dev))) { + dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n", + i, range->start, range->end); + return -EBUSY; + } + /* don't update the range for static pgmap */ + if (!dev_dax->pgmap) + pgmap->ranges[i] = *range; + } + + pgmap->type = MEMORY_DEVICE_GENERIC; + addr = devm_memremap_pages(dev, pgmap); if (IS_ERR(addr)) return PTR_ERR(addr); @@ -456,17 +452,15 @@ int dev_dax_probe(struct device *dev) } EXPORT_SYMBOL_GPL(dev_dax_probe); -static int dev_dax_remove(struct device *dev) +static int dev_dax_remove(struct dev_dax *dev_dax) { /* all probe actions are unwound by devm */ return 0; } static struct dax_device_driver device_dax_driver 
= { - .drv = { - .probe = dev_dax_probe, - .remove = dev_dax_remove, - }, + .probe = dev_dax_probe, + .remove = dev_dax_remove, .match_always = 1, }; diff --git a/drivers/dax/hmem/Makefile b/drivers/dax/hmem/Makefile new file mode 100644 index 000000000000..57377b4c3d47 --- /dev/null +++ b/drivers/dax/hmem/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_DEV_DAX_HMEM) += dax_hmem.o +obj-$(CONFIG_DEV_DAX_HMEM_DEVICES) += device_hmem.o + +device_hmem-y := device.o +dax_hmem-y := hmem.o diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c new file mode 100644 index 000000000000..cb6401c9e9a4 --- /dev/null +++ b/drivers/dax/hmem/device.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +static bool nohmem; +module_param_named(disable, nohmem, bool, 0444); + +void hmem_register_device(int target_nid, struct resource *r) +{ + /* define a clean / non-busy resource for the platform device */ + struct resource res = { + .start = r->start, + .end = r->end, + .flags = IORESOURCE_MEM, + }; + struct platform_device *pdev; + struct memregion_info info; + int rc, id; + + if (nohmem) + return; + + rc = region_intersects(res.start, resource_size(&res), IORESOURCE_MEM, + IORES_DESC_SOFT_RESERVED); + if (rc != REGION_INTERSECTS) + return; + + id = memregion_alloc(GFP_KERNEL); + if (id < 0) { + pr_err("memregion allocation failure for %pr\n", &res); + return; + } + + pdev = platform_device_alloc("hmem", id); + if (!pdev) { + pr_err("hmem device allocation failure for %pr\n", &res); + goto out_pdev; + } + + pdev->dev.numa_node = numa_map_to_online_node(target_nid); + info = (struct memregion_info) { + .target_node = target_nid, + }; + rc = platform_device_add_data(pdev, &info, sizeof(info)); + if (rc < 0) { + pr_err("hmem memregion_info allocation failure for %pr\n", &res); + goto out_pdev; + } + + rc = platform_device_add_resources(pdev, &res, 1); + if (rc < 0) { + pr_err("hmem resource allocation failure for %pr\n", &res); + goto out_resource; + } + + rc = platform_device_add(pdev); + if (rc < 0) { + dev_err(&pdev->dev, "device add failed for %pr\n", &res); + goto out_resource; + } + + return; + +out_resource: + put_device(&pdev->dev); +out_pdev: + memregion_free(id); +} + +static __init int hmem_register_one(struct resource *res, void *data) +{ + /* + * If the resource is not a top-level resource it was already + * assigned to a device by the HMAT parsing. + */ + if (res->parent != &iomem_resource) { + pr_info("HMEM: skip %pr, already claimed\n", res); + return 0; + } + + hmem_register_device(phys_to_target_node(res->start), res); + + return 0; +} + +static __init int hmem_init(void) +{ + walk_iomem_res_desc(IORES_DESC_SOFT_RESERVED, + IORESOURCE_MEM, 0, -1, NULL, hmem_register_one); + return 0; +} + +/* + * As this is a fallback for address ranges unclaimed by the ACPI HMAT + * parsing it must be at an initcall level greater than hmat_init(). 
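 * (hmat_init() is a device_initcall(), so the late_initcall() used
 * here runs after any HMAT-driven registration has had its chance.)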
+ */ +late_initcall(hmem_init); diff --git a/drivers/dax/hmem.c b/drivers/dax/hmem/hmem.c similarity index 69% rename from drivers/dax/hmem.c rename to drivers/dax/hmem/hmem.c index fe7214daf62e..1bf040dbc834 100644 --- a/drivers/dax/hmem.c +++ b/drivers/dax/hmem/hmem.c @@ -3,30 +3,39 @@ #include #include #include -#include "bus.h" +#include "../bus.h" + +static bool region_idle; +module_param_named(region_idle, region_idle, bool, 0644); static int dax_hmem_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; - struct dev_pagemap pgmap = { }; struct dax_region *dax_region; struct memregion_info *mri; + struct dev_dax_data data; struct dev_dax *dev_dax; struct resource *res; + struct range range; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res) return -ENOMEM; mri = dev->platform_data; - memcpy(&pgmap.res, res, sizeof(*res)); - - dax_region = alloc_dax_region(dev, pdev->id, res, mri->target_node, - PMD_SIZE, PFN_DEV|PFN_MAP); + range.start = res->start; + range.end = res->end; + dax_region = alloc_dax_region(dev, pdev->id, &range, mri->target_node, + PMD_SIZE, 0); if (!dax_region) return -ENOMEM; - dev_dax = devm_create_dev_dax(dax_region, 0, &pgmap); + data = (struct dev_dax_data) { + .dax_region = dax_region, + .id = -1, + .size = region_idle ? 0 : resource_size(res), + }; + dev_dax = devm_create_dev_dax(&data); if (IS_ERR(dev_dax)) return PTR_ERR(dev_dax); diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c index 275aa5f87399..6c933f2b604e 100644 --- a/drivers/dax/kmem.c +++ b/drivers/dax/kmem.c @@ -19,17 +19,28 @@ static const char *kmem_name; /* Set if any memory will remain added when the driver will be unloaded. */ static bool any_hotremove_failed; -int dev_dax_kmem_probe(struct device *dev) +static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r) { - struct dev_dax *dev_dax = to_dev_dax(dev); - struct resource *res = &dev_dax->region->res; - resource_size_t kmem_start; - resource_size_t kmem_size; - resource_size_t kmem_end; - struct resource *new_res; - const char *new_res_name; + struct dev_dax_range *dax_range = &dev_dax->ranges[i]; + struct range *range = &dax_range->range; + + /* memory-block align the hotplug range */ + r->start = ALIGN(range->start, memory_block_size_bytes()); + r->end = ALIGN_DOWN(range->end + 1, memory_block_size_bytes()) - 1; + if (r->start >= r->end) { + r->start = range->start; + r->end = range->end; + return -ENOSPC; + } + return 0; +} + +static int dev_dax_kmem_probe(struct dev_dax *dev_dax) +{ + struct device *dev = &dev_dax->dev; + int i, mapped = 0; + char *res_name; int numa_node; - int rc; /* * Ensure good NUMA information for the persistent memory. 
@@ -39,68 +50,80 @@ int dev_dax_kmem_probe(struct device *dev) */ numa_node = dev_dax->target_node; if (numa_node < 0) { - dev_warn(dev, "rejecting DAX region %pR with invalid node: %d\n", - res, numa_node); + dev_warn(dev, "rejecting DAX region with invalid node: %d\n", + numa_node); return -EINVAL; } - /* Hotplug starting at the beginning of the next block: */ - kmem_start = ALIGN(res->start, memory_block_size_bytes()); - - kmem_size = resource_size(res); - /* Adjust the size down to compensate for moving up kmem_start: */ - kmem_size -= kmem_start - res->start; - /* Align the size down to cover only complete blocks: */ - kmem_size &= ~(memory_block_size_bytes() - 1); - kmem_end = kmem_start + kmem_size; - - new_res_name = kstrdup(dev_name(dev), GFP_KERNEL); - if (!new_res_name) + res_name = kstrdup(dev_name(dev), GFP_KERNEL); + if (!res_name) return -ENOMEM; - /* Region is permanently reserved if hotremove fails. */ - new_res = request_mem_region(kmem_start, kmem_size, new_res_name); - if (!new_res) { - dev_warn(dev, "could not reserve region [%pa-%pa]\n", - &kmem_start, &kmem_end); - kfree(new_res_name); - return -EBUSY; + for (i = 0; i < dev_dax->nr_range; i++) { + struct resource *res; + struct range range; + int rc; + + rc = dax_kmem_range(dev_dax, i, &range); + if (rc) { + dev_info(dev, "mapping%d: %#llx-%#llx too small after alignment\n", + i, range.start, range.end); + continue; + } + + /* Region is permanently reserved if hotremove fails. */ + res = request_mem_region(range.start, range_len(&range), res_name); + if (!res) { + dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve region\n", + i, range.start, range.end); + /* + * Once some memory has been onlined we can't + * assume that it can be un-onlined safely. + */ + if (mapped) + continue; + kfree(res_name); + return -EBUSY; + } + + /* + * Set flags appropriate for System RAM. Leave ..._BUSY clear + * so that add_memory() can add a child resource. Do not + * inherit flags from the parent since it may set new flags + * unknown to us that will break add_memory() below. + */ + res->flags = IORESOURCE_SYSTEM_RAM; + + /* + * Ensure that future kexec'd kernels will not treat + * this as RAM automatically. + */ + rc = add_memory_driver_managed(numa_node, range.start, + range_len(&range), kmem_name); + + if (rc) { + dev_warn(dev, "mapping%d: %#llx-%#llx memory add failed\n", + i, range.start, range.end); + release_mem_region(range.start, range_len(&range)); + if (mapped) + continue; + kfree(res_name); + return rc; + } + mapped++; } - /* - * Set flags appropriate for System RAM. Leave ..._BUSY clear - * so that add_memory() can add a child resource. Do not - * inherit flags from the parent since it may set new flags - * unknown to us that will break add_memory() below. - */ - new_res->flags = IORESOURCE_SYSTEM_RAM; - - /* - * Ensure that future kexec'd kernels will not treat this as RAM - * automatically. 
- */ - rc = add_memory_driver_managed(numa_node, new_res->start, - resource_size(new_res), kmem_name); - if (rc) { - release_resource(new_res); - kfree(new_res); - kfree(new_res_name); - return rc; - } - dev_dax->dax_kmem_res = new_res; + dev_set_drvdata(dev, res_name); return 0; } #ifdef CONFIG_MEMORY_HOTREMOVE -static int dev_dax_kmem_remove(struct device *dev) +static int dev_dax_kmem_remove(struct dev_dax *dev_dax) { - struct dev_dax *dev_dax = to_dev_dax(dev); - struct resource *res = dev_dax->dax_kmem_res; - resource_size_t kmem_start = res->start; - resource_size_t kmem_size = resource_size(res); - const char *res_name = res->name; - int rc; + int i, success = 0; + struct device *dev = &dev_dax->dev; + const char *res_name = dev_get_drvdata(dev); /* * We have one shot for removing memory, if some memory blocks were not @@ -108,25 +131,36 @@ static int dev_dax_kmem_remove(struct device *dev) * there is no way to hotremove this memory until reboot because device * unbind will succeed even if we return failure. */ - rc = remove_memory(dev_dax->target_node, kmem_start, kmem_size); - if (rc) { + for (i = 0; i < dev_dax->nr_range; i++) { + struct range range; + int rc; + + rc = dax_kmem_range(dev_dax, i, &range); + if (rc) + continue; + + rc = remove_memory(dev_dax->target_node, range.start, + range_len(&range)); + if (rc == 0) { + release_mem_region(range.start, range_len(&range)); + success++; + continue; + } any_hotremove_failed = true; dev_err(dev, - "DAX region %pR cannot be hotremoved until the next reboot\n", - res); - return rc; + "mapping%d: %#llx-%#llx cannot be hotremoved until the next reboot\n", + i, range.start, range.end); } - /* Release and free dax resources */ - release_resource(res); - kfree(res); - kfree(res_name); - dev_dax->dax_kmem_res = NULL; + if (success >= dev_dax->nr_range) { + kfree(res_name); + dev_set_drvdata(dev, NULL); + } return 0; } #else -static int dev_dax_kmem_remove(struct device *dev) +static int dev_dax_kmem_remove(struct dev_dax *dev_dax) { /* * Without hotremove purposely leak the request_mem_region() for the @@ -141,10 +175,8 @@ static int dev_dax_kmem_remove(struct device *dev) #endif /* CONFIG_MEMORY_HOTREMOVE */ static struct dax_device_driver device_dax_kmem_driver = { - .drv = { - .probe = dev_dax_kmem_probe, - .remove = dev_dax_kmem_remove, - }, + .probe = dev_dax_kmem_probe, + .remove = dev_dax_kmem_remove, }; static int __init dax_kmem_init(void) diff --git a/drivers/dax/pmem/compat.c b/drivers/dax/pmem/compat.c index d7b15e6f30c5..863c114fd88c 100644 --- a/drivers/dax/pmem/compat.c +++ b/drivers/dax/pmem/compat.c @@ -22,7 +22,7 @@ static int dax_pmem_compat_probe(struct device *dev) return -ENOMEM; device_lock(&dev_dax->dev); - rc = dev_dax_probe(&dev_dax->dev); + rc = dev_dax_probe(dev_dax); device_unlock(&dev_dax->dev); devres_close_group(&dev_dax->dev, dev_dax); diff --git a/drivers/dax/pmem/core.c b/drivers/dax/pmem/core.c index 2bedf8414fff..62b26bfceab1 100644 --- a/drivers/dax/pmem/core.c +++ b/drivers/dax/pmem/core.c @@ -9,11 +9,12 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys) { - struct resource res; + struct range range; int rc, id, region_id; resource_size_t offset; struct nd_pfn_sb *pfn_sb; struct dev_dax *dev_dax; + struct dev_dax_data data; struct nd_namespace_io *nsio; struct dax_region *dax_region; struct dev_pagemap pgmap = { }; @@ -49,16 +50,23 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys) if (rc != 2) return ERR_PTR(-EINVAL); - /* adjust the 
dax_region resource to the start of data */ - memcpy(&res, &pgmap.res, sizeof(res)); - res.start += offset; - dax_region = alloc_dax_region(dev, region_id, &res, + /* adjust the dax_region range to the start of data */ + range = pgmap.range; + range.start += offset, + dax_region = alloc_dax_region(dev, region_id, &range, nd_region->target_node, le32_to_cpu(pfn_sb->align), - PFN_DEV|PFN_MAP); + IORESOURCE_DAX_STATIC); if (!dax_region) return ERR_PTR(-ENOMEM); - dev_dax = __devm_create_dev_dax(dax_region, id, &pgmap, subsys); + data = (struct dev_dax_data) { + .dax_region = dax_region, + .id = id, + .pgmap = &pgmap, + .subsys = subsys, + .size = range_len(&range), + }; + dev_dax = devm_create_dev_dax(&data); /* child dev_dax instances now own the lifetime of the dax_region */ dax_region_put(dax_region); diff --git a/drivers/firmware/efi/x86_fake_mem.c b/drivers/firmware/efi/x86_fake_mem.c index e5d6d5a1b240..0bafcc1bb0f6 100644 --- a/drivers/firmware/efi/x86_fake_mem.c +++ b/drivers/firmware/efi/x86_fake_mem.c @@ -38,7 +38,7 @@ void __init efi_fake_memmap_early(void) m_start = mem->range.start; m_end = mem->range.end; for_each_efi_memory_desc(md) { - u64 start, end; + u64 start, end, size; if (md->type != EFI_CONVENTIONAL_MEMORY) continue; @@ -58,11 +58,17 @@ void __init efi_fake_memmap_early(void) */ start = max(start, m_start); end = min(end, m_end); + size = end - start + 1; if (end <= start) continue; - e820__range_update(start, end - start + 1, E820_TYPE_RAM, - E820_TYPE_SOFT_RESERVED); + + /* + * Ensure each efi_fake_mem instance results in + * a unique e820 resource + */ + e820__range_remove(start, size, E820_TYPE_RAM, 1); + e820__range_add(start, size, E820_TYPE_SOFT_RESERVED); e820__update_table(e820_table); } } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c index 38113d3c0138..75e8b71c18b9 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c @@ -258,8 +258,8 @@ shmem_writeback(struct drm_i915_gem_object *obj) for (i = 0; i < obj->base.size >> PAGE_SHIFT; i++) { struct page *page; - page = find_lock_entry(mapping, i); - if (!page || xa_is_value(page)) + page = find_lock_page(mapping, i); + if (!page) continue; if (!page_mapped(page) && clear_page_dirty_for_io(page)) { diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 4e8112fde3e6..a13c6215bba8 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -101,7 +101,7 @@ unsigned long nouveau_dmem_page_addr(struct page *page) { struct nouveau_dmem_chunk *chunk = nouveau_page_to_chunk(page); unsigned long off = (page_to_pfn(page) << PAGE_SHIFT) - - chunk->pagemap.res.start; + chunk->pagemap.range.start; return chunk->bo->offset + off; } @@ -249,7 +249,9 @@ nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct page **ppage) chunk->drm = drm; chunk->pagemap.type = MEMORY_DEVICE_PRIVATE; - chunk->pagemap.res = *res; + chunk->pagemap.range.start = res->start; + chunk->pagemap.range.end = res->end; + chunk->pagemap.nr_range = 1; chunk->pagemap.ops = &nouveau_dmem_pagemap_ops; chunk->pagemap.owner = drm->dev; @@ -273,7 +275,7 @@ nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct page **ppage) list_add(&chunk->list, &drm->dmem->chunks); mutex_unlock(&drm->dmem->mutex); - pfn_first = chunk->pagemap.res.start >> PAGE_SHIFT; + pfn_first = chunk->pagemap.range.start >> PAGE_SHIFT; page = pfn_to_page(pfn_first); spin_lock(&drm->dmem->lock); for (i = 
0; i < DMEM_CHUNK_NPAGES - 1; ++i, ++page) { @@ -294,8 +296,7 @@ out_bo_unpin: out_bo_free: nouveau_bo_ref(NULL, &chunk->bo); out_release: - release_mem_region(chunk->pagemap.res.start, - resource_size(&chunk->pagemap.res)); + release_mem_region(chunk->pagemap.range.start, range_len(&chunk->pagemap.range)); out_free: kfree(chunk); out: @@ -382,8 +383,8 @@ nouveau_dmem_fini(struct nouveau_drm *drm) nouveau_bo_ref(NULL, &chunk->bo); list_del(&chunk->list); memunmap_pages(&chunk->pagemap); - release_mem_region(chunk->pagemap.res.start, - resource_size(&chunk->pagemap.res)); + release_mem_region(chunk->pagemap.range.start, + range_len(&chunk->pagemap.range)); kfree(chunk); } diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 0418071a3724..46d885575601 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2198,7 +2198,7 @@ static bool gic_check_reserved_range(phys_addr_t addr, unsigned long size) addr_end = addr + size - 1; - for_each_reserved_mem_region(i, &start, &end) { + for_each_reserved_mem_range(i, &start, &end) { if (addr >= start && addr_end <= end) return true; } diff --git a/drivers/nvdimm/badrange.c b/drivers/nvdimm/badrange.c index b9eeefa27e3a..aaf6e215a8c6 100644 --- a/drivers/nvdimm/badrange.c +++ b/drivers/nvdimm/badrange.c @@ -211,7 +211,7 @@ static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len) } static void badblocks_populate(struct badrange *badrange, - struct badblocks *bb, const struct resource *res) + struct badblocks *bb, const struct range *range) { struct badrange_entry *bre; @@ -222,34 +222,34 @@ static void badblocks_populate(struct badrange *badrange, u64 bre_end = bre->start + bre->length - 1; /* Discard intervals with no intersection */ - if (bre_end < res->start) + if (bre_end < range->start) continue; - if (bre->start > res->end) + if (bre->start > range->end) continue; /* Deal with any overlap after start of the namespace */ - if (bre->start >= res->start) { + if (bre->start >= range->start) { u64 start = bre->start; u64 len; - if (bre_end <= res->end) + if (bre_end <= range->end) len = bre->length; else - len = res->start + resource_size(res) + len = range->start + range_len(range) - bre->start; - __add_badblock_range(bb, start - res->start, len); + __add_badblock_range(bb, start - range->start, len); continue; } /* * Deal with overlap for badrange starting before * the namespace. 
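 * The overlap is clipped at the namespace start, so the resulting
 * badblocks entry always begins at offset 0.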
*/ - if (bre->start < res->start) { + if (bre->start < range->start) { u64 len; - if (bre_end < res->end) - len = bre->start + bre->length - res->start; + if (bre_end < range->end) + len = bre->start + bre->length - range->start; else - len = resource_size(res); + len = range_len(range); __add_badblock_range(bb, 0, len); } } @@ -267,7 +267,7 @@ static void badblocks_populate(struct badrange *badrange, * and add badblocks entries for all matching sub-ranges */ void nvdimm_badblocks_populate(struct nd_region *nd_region, - struct badblocks *bb, const struct resource *res) + struct badblocks *bb, const struct range *range) { struct nvdimm_bus *nvdimm_bus; @@ -279,7 +279,7 @@ void nvdimm_badblocks_populate(struct nd_region *nd_region, nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); nvdimm_bus_lock(&nvdimm_bus->dev); - badblocks_populate(&nvdimm_bus->badrange, bb, res); + badblocks_populate(&nvdimm_bus->badrange, bb, range); nvdimm_bus_unlock(&nvdimm_bus->dev); } EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate); diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c index 22d865ba6353..5a7c80053c62 100644 --- a/drivers/nvdimm/claim.c +++ b/drivers/nvdimm/claim.c @@ -303,13 +303,16 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns, int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio, resource_size_t size) { - struct resource *res = &nsio->res; struct nd_namespace_common *ndns = &nsio->common; + struct range range = { + .start = nsio->res.start, + .end = nsio->res.end, + }; nsio->size = size; - if (!devm_request_mem_region(dev, res->start, size, + if (!devm_request_mem_region(dev, range.start, size, dev_name(&ndns->dev))) { - dev_warn(dev, "could not reserve region %pR\n", res); + dev_warn(dev, "could not reserve region %pR\n", &nsio->res); return -EBUSY; } @@ -317,9 +320,9 @@ int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio, if (devm_init_badblocks(dev, &nsio->bb)) return -ENOMEM; nvdimm_badblocks_populate(to_nd_region(ndns->dev.parent), &nsio->bb, - &nsio->res); + &range); - nsio->addr = devm_memremap(dev, res->start, size, ARCH_MEMREMAP_PMEM); + nsio->addr = devm_memremap(dev, range.start, size, ARCH_MEMREMAP_PMEM); return PTR_ERR_OR_ZERO(nsio->addr); } diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 72740108ba42..696b55556d4d 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -377,8 +377,9 @@ int nvdimm_namespace_detach_btt(struct nd_btt *nd_btt); const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, char *name); unsigned int pmem_sector_size(struct nd_namespace_common *ndns); +struct range; void nvdimm_badblocks_populate(struct nd_region *nd_region, - struct badblocks *bb, const struct resource *res); + struct badblocks *bb, const struct range *range); int devm_namespace_enable(struct device *dev, struct nd_namespace_common *ndns, resource_size_t size); void devm_namespace_disable(struct device *dev, diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 3e11ef8d3f5b..b499df630d4d 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -672,7 +672,7 @@ static unsigned long init_altmap_reserve(resource_size_t base) static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) { - struct resource *res = &pgmap->res; + struct range *range = &pgmap->range; struct vmem_altmap *altmap = &pgmap->altmap; struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; u64 offset = le64_to_cpu(pfn_sb->dataoff); @@ -689,16 +689,17 @@ static int __nvdimm_setup_pfn(struct 
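The two overlap branches in badblocks_populate() are both instances of clipping a badrange entry to the namespace range. A compact restatement of that arithmetic, useful for checking the offset/length math (a sketch only; badrange_overlap() is not a helper in this patch, and struct range comes from include/linux/range.h as extended later in the series):

/*
 * Sketch: intersection of a badrange entry [bre_start, bre_start + bre_len - 1]
 * with *range. *off is the byte offset of the overlap from range->start and is
 * only meaningful when *len is non-zero.
 */
static void badrange_overlap(const struct range *range, u64 bre_start,
			     u64 bre_len, u64 *off, u64 *len)
{
	u64 bre_end = bre_start + bre_len - 1;
	u64 first = max(bre_start, range->start);
	u64 last = min(bre_end, range->end);

	if (first > last) {
		*len = 0;	/* no intersection */
		return;
	}
	*off = first - range->start;
	*len = last - first + 1;
}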
nd_pfn *nd_pfn, struct dev_pagemap *pgmap) .end_pfn = PHYS_PFN(end), }; - memcpy(res, &nsio->res, sizeof(*res)); - res->start += start_pad; - res->end -= end_trunc; - + *range = (struct range) { + .start = nsio->res.start + start_pad, + .end = nsio->res.end - end_trunc, + }; + pgmap->nr_range = 1; if (nd_pfn->mode == PFN_MODE_RAM) { if (offset < reserve) return -EINVAL; nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns); } else if (nd_pfn->mode == PFN_MODE_PMEM) { - nd_pfn->npfns = PHYS_PFN((resource_size(res) - offset)); + nd_pfn->npfns = PHYS_PFN((range_len(range) - offset)); if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns) dev_info(&nd_pfn->dev, "number of pfns truncated from %lld to %ld\n", diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index c86a0ceaece6..875076b0ea6c 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -375,7 +375,7 @@ static int pmem_attach_disk(struct device *dev, struct nd_region *nd_region = to_nd_region(dev->parent); int nid = dev_to_node(dev), fua; struct resource *res = &nsio->res; - struct resource bb_res; + struct range bb_range; struct nd_pfn *nd_pfn = NULL; struct dax_device *dax_dev; struct nd_pfn_sb *pfn_sb; @@ -434,24 +434,27 @@ static int pmem_attach_disk(struct device *dev, pfn_sb = nd_pfn->pfn_sb; pmem->data_offset = le64_to_cpu(pfn_sb->dataoff); pmem->pfn_pad = resource_size(res) - - resource_size(&pmem->pgmap.res); + range_len(&pmem->pgmap.range); pmem->pfn_flags |= PFN_MAP; - memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res)); - bb_res.start += pmem->data_offset; + bb_range = pmem->pgmap.range; + bb_range.start += pmem->data_offset; } else if (pmem_should_map_pages(dev)) { - memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res)); + pmem->pgmap.range.start = res->start; + pmem->pgmap.range.end = res->end; + pmem->pgmap.nr_range = 1; pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; pmem->pgmap.ops = &fsdax_pagemap_ops; addr = devm_memremap_pages(dev, &pmem->pgmap); pmem->pfn_flags |= PFN_MAP; - memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res)); + bb_range = pmem->pgmap.range; } else { if (devm_add_action_or_reset(dev, pmem_release_queue, &pmem->pgmap)) return -ENOMEM; addr = devm_memremap(dev, pmem->phys_addr, pmem->size, ARCH_MEMREMAP_PMEM); - memcpy(&bb_res, &nsio->res, sizeof(bb_res)); + bb_range.start = res->start; + bb_range.end = res->end; } if (IS_ERR(addr)) @@ -480,7 +483,7 @@ static int pmem_attach_disk(struct device *dev, / 512); if (devm_init_badblocks(dev, &pmem->bb)) return -ENOMEM; - nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_res); + nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_range); disk->bb = &pmem->bb; if (is_nvdimm_sync(nd_region)) @@ -591,8 +594,8 @@ static void nd_pmem_notify(struct device *dev, enum nvdimm_event event) resource_size_t offset = 0, end_trunc = 0; struct nd_namespace_common *ndns; struct nd_namespace_io *nsio; - struct resource res; struct badblocks *bb; + struct range range; struct kernfs_node *bb_state; if (event != NVDIMM_REVALIDATE_POISON) @@ -628,9 +631,9 @@ static void nd_pmem_notify(struct device *dev, enum nvdimm_event event) nsio = to_nd_namespace_io(&ndns->dev); } - res.start = nsio->res.start + offset; - res.end = nsio->res.end - end_trunc; - nvdimm_badblocks_populate(nd_region, bb, &res); + range.start = nsio->res.start + offset; + range.end = nsio->res.end - end_trunc; + nvdimm_badblocks_populate(nd_region, bb, &range); if (bb_state) sysfs_notify_dirent(bb_state); } diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index 0f6978e72e7c..bfce87ed72ab 
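The same conversion shows up in pmem, device-dax, nouveau and p2pdma: a struct resource (or a pair of physical addresses) is folded into the single range now carried by struct dev_pagemap. A minimal sketch of the idiom, assuming the struct range and range_len() helpers added to include/linux/range.h later in this patch (the two helper names below are illustrative, not part of the series):

/* Populate a single-range dev_pagemap from an existing resource. */
static void pgmap_set_range(struct dev_pagemap *pgmap,
			    const struct resource *res)
{
	pgmap->range = (struct range) {
		.start = res->start,
		.end = res->end,
	};
	pgmap->nr_range = 1;	/* exactly one contiguous range */
}

/* range_len() takes over from resource_size() once only the range is kept. */
static u64 pgmap_size(const struct dev_pagemap *pgmap)
{
	return range_len(&pgmap->range);	/* end - start + 1 */
}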
100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -35,7 +35,10 @@ static int nd_region_probe(struct device *dev) return rc; if (is_memory(&nd_region->dev)) { - struct resource ndr_res; + struct range range = { + .start = nd_region->ndr_start, + .end = nd_region->ndr_start + nd_region->ndr_size - 1, + }; if (devm_init_badblocks(dev, &nd_region->bb)) return -ENODEV; @@ -44,9 +47,7 @@ static int nd_region_probe(struct device *dev) if (!nd_region->bb_state) dev_warn(&nd_region->dev, "'badblocks' notification disabled\n"); - ndr_res.start = nd_region->ndr_start; - ndr_res.end = nd_region->ndr_start + nd_region->ndr_size - 1; - nvdimm_badblocks_populate(nd_region, &nd_region->bb, &ndr_res); + nvdimm_badblocks_populate(nd_region, &nd_region->bb, &range); } rc = nd_region_register_namespaces(nd_region, &err); @@ -121,14 +122,16 @@ static void nd_region_notify(struct device *dev, enum nvdimm_event event) { if (event == NVDIMM_REVALIDATE_POISON) { struct nd_region *nd_region = to_nd_region(dev); - struct resource res; if (is_memory(&nd_region->dev)) { - res.start = nd_region->ndr_start; - res.end = nd_region->ndr_start + - nd_region->ndr_size - 1; + struct range range = { + .start = nd_region->ndr_start, + .end = nd_region->ndr_start + + nd_region->ndr_size - 1, + }; + nvdimm_badblocks_populate(nd_region, - &nd_region->bb, &res); + &nd_region->bb, &range); if (nd_region->bb_state) sysfs_notify_dirent(nd_region->bb_state); } diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index f357f9a32b3a..9d53c16b7329 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -185,9 +185,9 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, return -ENOMEM; pgmap = &p2p_pgmap->pgmap; - pgmap->res.start = pci_resource_start(pdev, bar) + offset; - pgmap->res.end = pgmap->res.start + size - 1; - pgmap->res.flags = pci_resource_flags(pdev, bar); + pgmap->range.start = pci_resource_start(pdev, bar) + offset; + pgmap->range.end = pgmap->range.start + size - 1; + pgmap->nr_range = 1; pgmap->type = MEMORY_DEVICE_PCI_P2PDMA; p2p_pgmap->provider = pdev; @@ -202,13 +202,13 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, error = gen_pool_add_owner(pdev->p2pdma->pool, (unsigned long)addr, pci_bus_address(pdev, bar) + offset, - resource_size(&pgmap->res), dev_to_node(&pdev->dev), + range_len(&pgmap->range), dev_to_node(&pdev->dev), pgmap->ref); if (error) goto pages_free; - pci_info(pdev, "added peer-to-peer DMA memory %pR\n", - &pgmap->res); + pci_info(pdev, "added peer-to-peer DMA memory %#llx-%#llx\n", + pgmap->range.start, pgmap->range.end); return 0; diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index c08512fcea90..834b7c13ef3d 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -36,18 +36,10 @@ enum virtio_mem_mb_state { VIRTIO_MEM_MB_STATE_OFFLINE, /* Partially plugged, fully added to Linux, offline. */ VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL, - /* Fully plugged, fully added to Linux, online (!ZONE_MOVABLE). */ + /* Fully plugged, fully added to Linux, online. */ VIRTIO_MEM_MB_STATE_ONLINE, - /* Partially plugged, fully added to Linux, online (!ZONE_MOVABLE). */ + /* Partially plugged, fully added to Linux, online. */ VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL, - /* - * Fully plugged, fully added to Linux, online (ZONE_MOVABLE). - * We are not allowed to allocate (unplug) parts of this block that - * are not movable (similar to gigantic pages). 
We will never allow - * to online OFFLINE_PARTIAL to ZONE_MOVABLE (as they would contain - * unmovable parts). - */ - VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE, VIRTIO_MEM_MB_STATE_COUNT }; @@ -526,21 +518,10 @@ static bool virtio_mem_owned_mb(struct virtio_mem *vm, unsigned long mb_id) } static int virtio_mem_notify_going_online(struct virtio_mem *vm, - unsigned long mb_id, - enum zone_type zone) + unsigned long mb_id) { switch (virtio_mem_mb_get_state(vm, mb_id)) { case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL: - /* - * We won't allow to online a partially plugged memory block - * to the MOVABLE zone - it would contain unmovable parts. - */ - if (zone == ZONE_MOVABLE) { - dev_warn_ratelimited(&vm->vdev->dev, - "memory block has holes, MOVABLE not supported\n"); - return NOTIFY_BAD; - } - return NOTIFY_OK; case VIRTIO_MEM_MB_STATE_OFFLINE: return NOTIFY_OK; default: @@ -560,7 +541,6 @@ static void virtio_mem_notify_offline(struct virtio_mem *vm, VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL); break; case VIRTIO_MEM_MB_STATE_ONLINE: - case VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE: virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_OFFLINE); break; @@ -579,24 +559,17 @@ static void virtio_mem_notify_offline(struct virtio_mem *vm, virtio_mem_retry(vm); } -static void virtio_mem_notify_online(struct virtio_mem *vm, unsigned long mb_id, - enum zone_type zone) +static void virtio_mem_notify_online(struct virtio_mem *vm, unsigned long mb_id) { unsigned long nb_offline; switch (virtio_mem_mb_get_state(vm, mb_id)) { case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL: - BUG_ON(zone == ZONE_MOVABLE); virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL); break; case VIRTIO_MEM_MB_STATE_OFFLINE: - if (zone == ZONE_MOVABLE) - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE); - else - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_ONLINE); + virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_ONLINE); break; default: BUG(); @@ -675,7 +648,6 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, const unsigned long start = PFN_PHYS(mhp->start_pfn); const unsigned long size = PFN_PHYS(mhp->nr_pages); const unsigned long mb_id = virtio_mem_phys_to_mb_id(start); - enum zone_type zone; int rc = NOTIFY_OK; if (!virtio_mem_overlaps_range(vm, start, size)) @@ -717,8 +689,7 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, break; } vm->hotplug_active = true; - zone = page_zonenum(pfn_to_page(mhp->start_pfn)); - rc = virtio_mem_notify_going_online(vm, mb_id, zone); + rc = virtio_mem_notify_going_online(vm, mb_id); break; case MEM_OFFLINE: virtio_mem_notify_offline(vm, mb_id); @@ -726,8 +697,7 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, mutex_unlock(&vm->hotplug_mutex); break; case MEM_ONLINE: - zone = page_zonenum(pfn_to_page(mhp->start_pfn)); - virtio_mem_notify_online(vm, mb_id, zone); + virtio_mem_notify_online(vm, mb_id); vm->hotplug_active = false; mutex_unlock(&vm->hotplug_mutex); break; @@ -1906,8 +1876,7 @@ static void virtio_mem_remove(struct virtio_device *vdev) if (vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] || vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL] || vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE] || - vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL] || - vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE]) { + vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL]) { dev_warn(&vdev->dev, "device still has system memory added\n"); } else { virtio_mem_delete_resource(vm); diff --git 
a/drivers/xen/unpopulated-alloc.c b/drivers/xen/unpopulated-alloc.c index 3b98dc921426..8c512ea550bb 100644 --- a/drivers/xen/unpopulated-alloc.c +++ b/drivers/xen/unpopulated-alloc.c @@ -18,27 +18,38 @@ static unsigned int list_count; static int fill_list(unsigned int nr_pages) { struct dev_pagemap *pgmap; + struct resource *res; void *vaddr; unsigned int i, alloc_pages = round_up(nr_pages, PAGES_PER_SECTION); - int ret; + int ret = -ENOMEM; + + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) + return -ENOMEM; pgmap = kzalloc(sizeof(*pgmap), GFP_KERNEL); if (!pgmap) - return -ENOMEM; + goto err_pgmap; pgmap->type = MEMORY_DEVICE_GENERIC; - pgmap->res.name = "Xen scratch"; - pgmap->res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res->name = "Xen scratch"; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - ret = allocate_resource(&iomem_resource, &pgmap->res, + ret = allocate_resource(&iomem_resource, res, alloc_pages * PAGE_SIZE, 0, -1, PAGES_PER_SECTION * PAGE_SIZE, NULL, NULL); if (ret < 0) { pr_err("Cannot allocate new IOMEM resource\n"); - kfree(pgmap); - return ret; + goto err_resource; } + pgmap->range = (struct range) { + .start = res->start, + .end = res->end, + }; + pgmap->nr_range = 1; + pgmap->owner = res; + #ifdef CONFIG_XEN_HAVE_PVMMU /* * memremap will build page tables for the new memory so @@ -50,14 +61,13 @@ static int fill_list(unsigned int nr_pages) * conflict with any devices. */ if (!xen_feature(XENFEAT_auto_translated_physmap)) { - xen_pfn_t pfn = PFN_DOWN(pgmap->res.start); + xen_pfn_t pfn = PFN_DOWN(res->start); for (i = 0; i < alloc_pages; i++) { if (!set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY)) { pr_warn("set_phys_to_machine() failed, no memory added\n"); - release_resource(&pgmap->res); - kfree(pgmap); - return -ENOMEM; + ret = -ENOMEM; + goto err_memremap; } } } @@ -66,9 +76,8 @@ static int fill_list(unsigned int nr_pages) vaddr = memremap_pages(pgmap, NUMA_NO_NODE); if (IS_ERR(vaddr)) { pr_err("Cannot remap memory range\n"); - release_resource(&pgmap->res); - kfree(pgmap); - return PTR_ERR(vaddr); + ret = PTR_ERR(vaddr); + goto err_memremap; } for (i = 0; i < alloc_pages; i++) { @@ -80,6 +89,14 @@ static int fill_list(unsigned int nr_pages) } return 0; + +err_memremap: + release_resource(res); +err_resource: + kfree(pgmap); +err_pgmap: + kfree(res); + return ret; } /** diff --git a/fs/fs_parser.c b/fs/fs_parser.c index ab53e42a874a..68b0148f4bb8 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -189,7 +189,7 @@ out: } EXPORT_SYMBOL(fs_lookup_param); -int fs_param_bad_value(struct p_log *log, struct fs_parameter *param) +static int fs_param_bad_value(struct p_log *log, struct fs_parameter *param) { return inval_plog(log, "Bad value for '%s'", param->key); } diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 9bb9f0952b18..caf563981532 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1810,6 +1810,12 @@ int ntfs_read_inode_mount(struct inode *vi) brelse(bh); } + if (le32_to_cpu(m->bytes_allocated) != vol->mft_record_size) { + ntfs_error(sb, "Incorrect mft record size %u in superblock, should be %u.", + le32_to_cpu(m->bytes_allocated), vol->mft_record_size); + goto err_out; + } + /* Apply the mst fixups. */ if (post_read_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size)) { /* FIXME: Try to use the $MFTMirr now. 
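The reworked fill_list() above is a textbook use of centralized unwinding: each acquisition gets a cleanup label, later failures jump to the label for the most recent acquisition, and control falls through the remaining labels in reverse order. A stripped-down sketch of the shape (do_the_real_work() is a hypothetical placeholder):

static int setup_two_objects(void)
{
	struct resource *res;
	struct dev_pagemap *pgmap;
	int ret;

	res = kzalloc(sizeof(*res), GFP_KERNEL);
	if (!res)
		return -ENOMEM;

	pgmap = kzalloc(sizeof(*pgmap), GFP_KERNEL);
	if (!pgmap) {
		ret = -ENOMEM;
		goto err_pgmap;
	}

	ret = do_the_real_work(res, pgmap);	/* hypothetical helper */
	if (ret)
		goto err_work;

	return 0;

err_work:
	kfree(pgmap);
err_pgmap:
	kfree(res);
	return ret;
}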
*/ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 4c1b90442d6f..78710788c237 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6013,7 +6013,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out; } - /* Appending truncate log(TA) and and flushing truncate log(TF) are + /* Appending truncate log(TA) and flushing truncate log(TF) are * two separated transactions. They can be both committed but not * checkpointed. If crash occurs then, both two transaction will be * replayed with several already released to global bitmap clusters. @@ -7654,8 +7654,10 @@ out_mutex: * main_bm related locks for avoiding the current IO starve, then go to * trim the next group */ - if (ret >= 0 && group <= last_group) + if (ret >= 0 && group <= last_group) { + cond_resched(); goto next_group; + } out: range->len = trimmed * sb->s_blocksize; return ret; diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 720e9f94957e..fc8252a28cb1 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -677,7 +677,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, /* * Under certain conditions, the window slide code * might have reduced the number of bits available or - * disabled the the local alloc entirely. Re-check + * disabled the local alloc entirely. Re-check * here and return -ENOSPC if necessary. */ status = -ENOSPC; diff --git a/fs/proc/base.c b/fs/proc/base.c index 3d8f449c726d..991730608907 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1056,7 +1056,6 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) { - static DEFINE_MUTEX(oom_adj_mutex); struct mm_struct *mm = NULL; struct task_struct *task; int err = 0; @@ -1096,7 +1095,7 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) struct task_struct *p = find_lock_task_mm(task); if (p) { - if (atomic_read(&p->mm->mm_users) > 1) { + if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) { mm = p->mm; mmgrab(mm); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 12e91f367110..7f15809b65c3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -577,16 +577,10 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, page = device_private_entry_to_page(swpent); } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap && pte_none(*pte))) { - page = find_get_entry(vma->vm_file->f_mapping, + page = xa_load(&vma->vm_file->f_mapping->i_pages, linear_page_index(vma, addr)); - if (!page) - return; - if (xa_is_value(page)) mss->swap += PAGE_SIZE; - else - put_page(page); - return; } @@ -784,9 +778,21 @@ static const struct mm_walk_ops smaps_shmem_walk_ops = { .pte_hole = smaps_pte_hole, }; +/* + * Gather mem stats from @vma with the indicated beginning + * address @start, and keep them in @mss. + * + * Use vm_start of @vma as the beginning address if @start is 0. 
+ */ static void smap_gather_stats(struct vm_area_struct *vma, - struct mem_size_stats *mss) + struct mem_size_stats *mss, unsigned long start) { + const struct mm_walk_ops *ops = &smaps_walk_ops; + + /* Invalid start */ + if (start >= vma->vm_end) + return; + #ifdef CONFIG_SHMEM /* In case of smaps_rollup, reset the value from previous vma */ mss->check_shmem_swap = false; @@ -803,18 +809,20 @@ static void smap_gather_stats(struct vm_area_struct *vma, */ unsigned long shmem_swapped = shmem_swap_usage(vma); - if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || - !(vma->vm_flags & VM_WRITE)) { + if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) || + !(vma->vm_flags & VM_WRITE))) { mss->swap += shmem_swapped; } else { mss->check_shmem_swap = true; - walk_page_vma(vma, &smaps_shmem_walk_ops, mss); - return; + ops = &smaps_shmem_walk_ops; } } #endif /* mmap_lock is held in m_start */ - walk_page_vma(vma, &smaps_walk_ops, mss); + if (!start) + walk_page_vma(vma, ops, mss); + else + walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss); } #define SEQ_PUT_DEC(str, val) \ @@ -866,7 +874,7 @@ static int show_smap(struct seq_file *m, void *v) memset(&mss, 0, sizeof(mss)); - smap_gather_stats(vma, &mss); + smap_gather_stats(vma, &mss, 0); show_map_vma(m, vma); if (vma_get_anon_name(vma)) { @@ -919,9 +927,73 @@ static int show_smaps_rollup(struct seq_file *m, void *v) hold_task_mempolicy(priv); - for (vma = priv->mm->mmap; vma; vma = vma->vm_next) { - smap_gather_stats(vma, &mss); + for (vma = priv->mm->mmap; vma;) { + smap_gather_stats(vma, &mss, 0); last_vma_end = vma->vm_end; + + /* + * Release mmap_lock temporarily if someone wants to + * access it for write request. + */ + if (mmap_lock_is_contended(mm)) { + mmap_read_unlock(mm); + ret = mmap_read_lock_killable(mm); + if (ret) { + release_task_mempolicy(priv); + goto out_put_mm; + } + + /* + * After dropping the lock, there are four cases to + * consider. See the following example for explanation. + * + * +------+------+-----------+ + * | VMA1 | VMA2 | VMA3 | + * +------+------+-----------+ + * | | | | + * 4k 8k 16k 400k + * + * Suppose we drop the lock after reading VMA2 due to + * contention, then we get: + * + * last_vma_end = 16k + * + * 1) VMA2 is freed, but VMA3 exists: + * + * find_vma(mm, 16k - 1) will return VMA3. + * In this case, just continue from VMA3. + * + * 2) VMA2 still exists: + * + * find_vma(mm, 16k - 1) will return VMA2. + * Iterate the loop like the original one. + * + * 3) No more VMAs can be found: + * + * find_vma(mm, 16k - 1) will return NULL. + * No more things to do, just break. + * + * 4) (last_vma_end - 1) is the middle of a vma (VMA'): + * + * find_vma(mm, 16k - 1) will return VMA' whose range + * contains last_vma_end. + * Iterate VMA' from last_vma_end. 
+ */ + vma = find_vma(mm, last_vma_end - 1); + /* Case 3 above */ + if (!vma) + break; + + /* Case 1 above */ + if (vma->vm_start >= last_vma_end) + continue; + + /* Case 4 above */ + if (vma->vm_end > last_vma_end) + smap_gather_stats(vma, &mss, last_vma_end); + } + /* Case 2 above */ + vma = vma->vm_next; } show_vma_header_prefix(m, priv->mm->mmap->vm_start, diff --git a/fs/xattr.c b/fs/xattr.c index 535af3d6c868..d6bf5a7e2420 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -232,15 +232,15 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, } /** - * __vfs_setxattr_locked: set an extended attribute while holding the inode + * __vfs_setxattr_locked - set an extended attribute while holding the inode * lock * - * @dentry - object to perform setxattr on - * @name - xattr name to set - * @value - value to set @name to - * @size - size of @value - * @flags - flags to pass into filesystem operations - * @delegated_inode - on return, will contain an inode pointer that + * @dentry: object to perform setxattr on + * @name: xattr name to set + * @value: value to set @name to + * @size: size of @value + * @flags: flags to pass into filesystem operations + * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. */ int @@ -443,12 +443,12 @@ __vfs_removexattr(struct dentry *dentry, const char *name) EXPORT_SYMBOL(__vfs_removexattr); /** - * __vfs_removexattr_locked: set an extended attribute while holding the inode + * __vfs_removexattr_locked - set an extended attribute while holding the inode * lock * - * @dentry - object to perform setxattr on - * @name - name of xattr to remove - * @delegated_inode - on return, will contain an inode pointer that + * @dentry: object to perform setxattr on + * @name: name of xattr to remove + * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. */ int diff --git a/include/acpi/acpi_numa.h b/include/acpi/acpi_numa.h index fdebcfc6c8df..0e9302285f14 100644 --- a/include/acpi/acpi_numa.h +++ b/include/acpi/acpi_numa.h @@ -17,10 +17,22 @@ extern int pxm_to_node(int); extern int node_to_pxm(int); extern int acpi_map_pxm_to_node(int); extern unsigned char acpi_srat_revision; -extern int acpi_numa __initdata; +extern void disable_srat(void); extern void bad_srat(void); extern int srat_disabled(void); +#else /* CONFIG_ACPI_NUMA */ +static inline void disable_srat(void) +{ +} #endif /* CONFIG_ACPI_NUMA */ + +#ifdef CONFIG_ACPI_HMAT +extern void disable_hmat(void); +#else /* CONFIG_ACPI_HMAT */ +static inline void disable_hmat(void) +{ +} +#endif /* CONFIG_ACPI_HMAT */ #endif /* __ACP_NUMA_H */ diff --git a/include/kunit/test.h b/include/kunit/test.h index 59f3144f009a..3391f38389f8 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -224,6 +224,11 @@ struct kunit { struct list_head resources; /* Protected by lock. 
*/ }; +static inline void kunit_set_failure(struct kunit *test) +{ + WRITE_ONCE(test->success, false); +} + void kunit_init_test(struct kunit *test, const char *name, char *log); int kunit_run_tests(struct kunit_suite *suite); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 64ae25c59d55..cfa8c0015863 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -709,6 +709,8 @@ static inline u64 acpi_arch_get_root_pointer(void) #define ACPI_HANDLE_FWNODE(fwnode) (NULL) #define ACPI_DEVICE_CLASS(_cls, _msk) .cls = (0), .cls_msk = (0), +#include + struct fwnode_handle; static inline bool acpi_dev_found(const char *hid) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 25a521d299c1..1de5a1151ee7 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -29,9 +29,6 @@ enum compact_result { /* compaction didn't start as it was deferred due to past failures */ COMPACT_DEFERRED, - /* compaction not active last round */ - COMPACT_INACTIVE = COMPACT_DEFERRED, - /* For more detailed tracepoint output - internal to compaction */ COMPACT_NO_SUITABLE_PAGE, /* compaction should continue to another pageblock */ diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index cee0c728d39a..230604e7f057 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -3,6 +3,14 @@ #error "Please don't include directly, include instead." #endif +#define CLANG_VERSION (__clang_major__ * 10000 \ + + __clang_minor__ * 100 \ + + __clang_patchlevel__) + +#if CLANG_VERSION < 100001 +# error Sorry, your version of Clang is too old - please use 10.0.1 or newer. +#endif + /* Compiler specific definitions for Clang compiler */ /* same as gcc, this was present in clang-2.6 so we can assume it works diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 7a3769040d7d..d1e3c6896b71 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -12,7 +12,7 @@ /* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145 */ #if GCC_VERSION < 40900 -# error Sorry, your compiler is too old - please upgrade it. +# error Sorry, your version of GCC is too old - please use 4.9 or newer. 
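Both compiler checks rely on packing major/minor/patchlevel into one comparable integer, so a minimum version becomes a single preprocessor comparison (10.0.1 encodes as 10*10000 + 0*100 + 1 = 100001). The same shape works for any versioned dependency; the FOO_* names below are hypothetical, not from this patch:

/* assumes minor and patchlevel stay below 100 */
#define FOO_VERSION(major, minor, patch) \
	((major) * 10000 + (minor) * 100 + (patch))

/* e.g. 2.4.0 -> 20400 and 2.3.9 -> 20309, so the comparison orders correctly */
#if defined(FOO_CURRENT_VERSION) && FOO_CURRENT_VERSION < FOO_VERSION(2, 4, 0)
# error Sorry, your version of foo is too old - please use 2.4.0 or newer.
#endif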
#endif /* Optimization barrier */ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 92ef163a7479..ac45f6d40d39 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -155,7 +155,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, extern typeof(sym) sym; \ static const unsigned long __kentry_##sym \ __used \ - __section("___kentry" "+" #sym ) \ + __attribute__((__section__("___kentry+" #sym))) \ = (unsigned long)&sym; #endif diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 6594dbc34a37..206bde8308b2 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -55,6 +55,9 @@ phys_addr_t paddr_vmcoreinfo_note(void); #define VMCOREINFO_OFFSET(name, field) \ vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ (unsigned long)offsetof(struct name, field)) +#define VMCOREINFO_TYPE_OFFSET(name, field) \ + vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ + (unsigned long)offsetof(name, field)) #define VMCOREINFO_LENGTH(name, value) \ vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value) #define VMCOREINFO_NUMBER(name) \ diff --git a/include/linux/dax.h b/include/linux/dax.h index 43b39ab9de1a..4ec0bbf86205 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -238,4 +238,12 @@ static inline bool dax_mapping(struct address_space *mapping) return mapping->host && IS_DAX(mapping->host); } +#ifdef CONFIG_DEV_DAX_HMEM_DEVICES +void hmem_register_device(int target_nid, struct resource *r); +#else +static inline void hmem_register_device(int target_nid, struct resource *r) +{ +} +#endif + #endif diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index e7e45f0cc7da..2915f56ad421 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -2,9 +2,9 @@ #ifndef __LINUX_DEBUG_LOCKING_H #define __LINUX_DEBUG_LOCKING_H -#include #include #include +#include struct task_struct; diff --git a/include/linux/dev_printk.h b/include/linux/dev_printk.h index 3028b644b4fb..6f009559ee54 100644 --- a/include/linux/dev_printk.h +++ b/include/linux/dev_printk.h @@ -21,6 +21,14 @@ struct device; +#define PRINTK_INFO_SUBSYSTEM_LEN 16 +#define PRINTK_INFO_DEVICE_LEN 48 + +struct dev_printk_info { + char subsystem[PRINTK_INFO_SUBSYSTEM_LEN]; + char device[PRINTK_INFO_DEVICE_LEN]; +}; + #ifdef CONFIG_PRINTK __printf(3, 0) __cold diff --git a/include/linux/export.h b/include/linux/export.h index b962df3c5dd3..fa8043dedb7b 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -130,7 +130,7 @@ struct kernel_symbol { * discarded in the final link stage. 
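The difference between the two OFFSET macros in crash_core.h is only in how the type name is spelled: VMCOREINFO_OFFSET() pastes the struct keyword in front of its first argument, while the new VMCOREINFO_TYPE_OFFSET() takes the type name verbatim, which is what typedef'd types need. A usage sketch (as it might appear in crash_save_vmcoreinfo_init() or an arch hook):

	/* existing form, only works for plain struct tags */
	VMCOREINFO_OFFSET(page, flags);			/* offsetof(struct page, flags) */

	/* new form, usable with typedefs such as atomic_long_t */
	VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter);	/* offsetof(atomic_long_t, counter) */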
*/ #define __ksym_marker(sym) \ - static int __ksym_marker_##sym[0] __section(".discard.ksym") __used + static int __ksym_marker_##sym[0] __section(.discard.ksym) __used #define __EXPORT_SYMBOL(sym, sec, ns) \ __ksym_marker(sym); \ diff --git a/include/linux/fs.h b/include/linux/fs.h index 2d5c3cd86d31..32e6f72de7eb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2604,6 +2604,10 @@ extern bool is_bad_inode(struct inode *); unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end); +void invalidate_mapping_pagevec(struct address_space *mapping, + pgoff_t start, pgoff_t end, + unsigned long *nr_pagevec); + static inline void invalidate_remote_inode(struct inode *inode) { if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 67a0774e080b..07e481993ef5 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -238,7 +238,9 @@ struct vm_area_struct; * %__GFP_FOO flags as necessary. * * %GFP_ATOMIC users can not sleep and need the allocation to succeed. A lower - * watermark is applied to allow access to "atomic reserves" + * watermark is applied to allow access to "atomic reserves". + * The current implementation doesn't support NMI and few other strict + * non-preemptive contexts (e.g. raw_spin_lock). The same applies to %GFP_NOWAIT. * * %GFP_KERNEL is typical for kernel-internal allocations. The caller requires * %ZONE_NORMAL or a lower zone for direct access but can direct reclaim. @@ -560,8 +562,6 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) #define alloc_page_vma(gfp_mask, vma, addr) \ alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) -#define alloc_page_vma_node(gfp_mask, vma, addr, node) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, node, false) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 8a8bc46a2432..0365aa97f8e7 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -38,9 +38,6 @@ extern int zap_huge_pmd(struct mmu_gather *tlb, extern int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr); -extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, - unsigned char *vec); extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd); diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 087fba34b209..30d343b4a40a 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -14,6 +14,12 @@ struct task_struct; #include #include +/* kasan_data struct is used in KUnit tests for KASAN expected failures */ +struct kunit_kasan_expectation { + bool report_expected; + bool report_found; +}; + extern unsigned char kasan_early_shadow_page[PAGE_SIZE]; extern pte_t kasan_early_shadow_pte[PTRS_PER_PTE]; extern pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD]; diff --git a/include/linux/kernel.h b/include/linux/kernel.h index c25b8e41c0ea..e4aa29b1ad62 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -526,7 +526,6 @@ extern unsigned int sysctl_oops_all_cpu_backtrace; #endif /* CONFIG_SMP */ extern void bust_spinlocks(int yes); -extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ extern int 
panic_timeout; extern unsigned long panic_print; extern int panic_on_oops; diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 9d925db0d355..ef131255cedc 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -86,7 +86,6 @@ struct memblock { }; extern struct memblock memblock; -extern int memblock_debug; #ifndef CONFIG_ARCH_KEEP_MEMBLOCK #define __init_memblock __meminit @@ -98,9 +97,6 @@ void memblock_discard(void); static inline void memblock_discard(void) {} #endif -#define memblock_dbg(fmt, ...) \ - if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) - phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align); void memblock_allow_resize(void); @@ -136,9 +132,6 @@ void __next_mem_range_rev(u64 *idx, int nid, enum memblock_flags flags, struct memblock_type *type_b, phys_addr_t *out_start, phys_addr_t *out_end, int *out_nid); -void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, - phys_addr_t *out_end); - void __memblock_free_late(phys_addr_t base, phys_addr_t size); #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP @@ -166,7 +159,7 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, #endif /* CONFIG_HAVE_MEMBLOCK_PHYS_MAP */ /** - * for_each_mem_range - iterate through memblock areas from type_a and not + * __for_each_mem_range - iterate through memblock areas from type_a and not * included in type_b. Or just type_a if type_b is NULL. * @i: u64 used as loop variable * @type_a: ptr to memblock_type to iterate @@ -177,7 +170,7 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL */ -#define for_each_mem_range(i, type_a, type_b, nid, flags, \ +#define __for_each_mem_range(i, type_a, type_b, nid, flags, \ p_start, p_end, p_nid) \ for (i = 0, __next_mem_range(&i, nid, flags, type_a, type_b, \ p_start, p_end, p_nid); \ @@ -186,7 +179,7 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, p_start, p_end, p_nid)) /** - * for_each_mem_range_rev - reverse iterate through memblock areas from + * __for_each_mem_range_rev - reverse iterate through memblock areas from * type_a and not included in type_b. Or just type_a if type_b is NULL. * @i: u64 used as loop variable * @type_a: ptr to memblock_type to iterate @@ -197,17 +190,38 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL */ -#define for_each_mem_range_rev(i, type_a, type_b, nid, flags, \ - p_start, p_end, p_nid) \ +#define __for_each_mem_range_rev(i, type_a, type_b, nid, flags, \ + p_start, p_end, p_nid) \ for (i = (u64)ULLONG_MAX, \ - __next_mem_range_rev(&i, nid, flags, type_a, type_b,\ + __next_mem_range_rev(&i, nid, flags, type_a, type_b, \ p_start, p_end, p_nid); \ i != (u64)ULLONG_MAX; \ __next_mem_range_rev(&i, nid, flags, type_a, type_b, \ p_start, p_end, p_nid)) /** - * for_each_reserved_mem_region - iterate over all reserved memblock areas + * for_each_mem_range - iterate through memory areas. 
+ * @i: u64 used as loop variable + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + */ +#define for_each_mem_range(i, p_start, p_end) \ + __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, \ + MEMBLOCK_NONE, p_start, p_end, NULL) + +/** + * for_each_mem_range_rev - reverse iterate through memblock areas from + * type_a and not included in type_b. Or just type_a if type_b is NULL. + * @i: u64 used as loop variable + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + */ +#define for_each_mem_range_rev(i, p_start, p_end) \ + __for_each_mem_range_rev(i, &memblock.memory, NULL, NUMA_NO_NODE, \ + MEMBLOCK_NONE, p_start, p_end, NULL) + +/** + * for_each_reserved_mem_range - iterate over all reserved memblock areas * @i: u64 used as loop variable * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL @@ -215,10 +229,9 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, * Walks over reserved areas of memblock. Available as soon as memblock * is initialized. */ -#define for_each_reserved_mem_region(i, p_start, p_end) \ - for (i = 0UL, __next_reserved_mem_region(&i, p_start, p_end); \ - i != (u64)ULLONG_MAX; \ - __next_reserved_mem_region(&i, p_start, p_end)) +#define for_each_reserved_mem_range(i, p_start, p_end) \ + __for_each_mem_range(i, &memblock.reserved, NULL, NUMA_NO_NODE, \ + MEMBLOCK_NONE, p_start, p_end, NULL) static inline bool memblock_is_hotpluggable(struct memblock_region *m) { @@ -311,8 +324,8 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask); * soon as memblock is initialized. 
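With the wrappers above, generic users of the memory and reserved walks no longer name memblock types, nid or flags; those stay internal behind the double-underscore variants. A usage sketch based on the new signatures (the printing is only illustrative):

/* Walk every memory range and every reserved range known to memblock. */
static void __init dump_memblock_ranges(void)
{
	phys_addr_t start, end;
	u64 i;

	for_each_mem_range(i, &start, &end)
		pr_info("memory  : %pa .. %pa\n", &start, &end);

	for_each_reserved_mem_range(i, &start, &end)
		pr_info("reserved: %pa .. %pa\n", &start, &end);
}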
*/ #define for_each_free_mem_range(i, nid, flags, p_start, p_end, p_nid) \ - for_each_mem_range(i, &memblock.memory, &memblock.reserved, \ - nid, flags, p_start, p_end, p_nid) + __for_each_mem_range(i, &memblock.memory, &memblock.reserved, \ + nid, flags, p_start, p_end, p_nid) /** * for_each_free_mem_range_reverse - rev-iterate through free memblock areas @@ -328,8 +341,8 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask); */ #define for_each_free_mem_range_reverse(i, nid, flags, p_start, p_end, \ p_nid) \ - for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ - nid, flags, p_start, p_end, p_nid) + __for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ + nid, flags, p_start, p_end, p_nid) int memblock_set_node(phys_addr_t base, phys_addr_t size, struct memblock_type *type, int nid); @@ -464,7 +477,6 @@ static inline bool memblock_bottom_up(void) phys_addr_t memblock_phys_mem_size(void); phys_addr_t memblock_reserved_size(void); -phys_addr_t memblock_mem_size(unsigned long limit_pfn); phys_addr_t memblock_start_of_DRAM(void); phys_addr_t memblock_end_of_DRAM(void); void memblock_enforce_memory_limit(phys_addr_t memory_limit); @@ -476,13 +488,7 @@ bool memblock_is_region_memory(phys_addr_t base, phys_addr_t size); bool memblock_is_reserved(phys_addr_t addr); bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size); -extern void __memblock_dump_all(void); - -static inline void memblock_dump_all(void) -{ - if (memblock_debug) - __memblock_dump_all(); -} +void memblock_dump_all(void); /** * memblock_set_current_limit - Set the current allocation limit to allow @@ -547,15 +553,23 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo return PFN_UP(reg->base + reg->size); } -#define for_each_memblock(memblock_type, region) \ - for (region = memblock.memblock_type.regions; \ - region < (memblock.memblock_type.regions + memblock.memblock_type.cnt); \ +/** + * for_each_mem_region - itereate over memory regions + * @region: loop variable + */ +#define for_each_mem_region(region) \ + for (region = memblock.memory.regions; \ + region < (memblock.memory.regions + memblock.memory.cnt); \ region++) -#define for_each_memblock_type(i, memblock_type, rgn) \ - for (i = 0, rgn = &memblock_type->regions[0]; \ - i < memblock_type->cnt; \ - i++, rgn = &memblock_type->regions[i]) +/** + * for_each_reserved_mem_region - itereate over reserved memory regions + * @region: loop variable + */ +#define for_each_reserved_mem_region(region) \ + for (region = memblock.reserved.regions; \ + region < (memblock.reserved.regions + memblock.reserved.cnt); \ + region++) extern void *alloc_large_system_hash(const char *tablename, unsigned long bucketsize, diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d0b036123c6a..6ef4a552e09d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -215,13 +215,16 @@ struct mem_cgroup { struct mem_cgroup_id id; /* Accounted resources */ - struct page_counter memory; - struct page_counter swap; + struct page_counter memory; /* Both v1 & v2 */ + + union { + struct page_counter swap; /* v2 only */ + struct page_counter memsw; /* v1 only */ + }; /* Legacy consumer-oriented counters */ - struct page_counter memsw; - struct page_counter kmem; - struct page_counter tcpmem; + struct page_counter kmem; /* v1 only */ + struct page_counter tcpmem; /* v1 only */ /* Range enforcement for interrupt charges */ struct work_struct high_work; diff --git 
a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 375515803cd8..c0faa7a30c46 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -149,15 +149,6 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, struct mhp_params *params); #endif /* ARCH_HAS_ADD_PAGES */ -#ifdef CONFIG_NUMA -extern int memory_add_physaddr_to_nid(u64 start); -#else -static inline int memory_add_physaddr_to_nid(u64 start) -{ - return 0; -} -#endif - #ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION /* * For supporting node-hotadd, we have to allocate a new pgdat. @@ -284,6 +275,20 @@ static inline bool movable_node_is_enabled(void) } #endif /* ! CONFIG_MEMORY_HOTPLUG */ +#ifdef CONFIG_NUMA +extern int memory_add_physaddr_to_nid(u64 start); +extern int phys_to_target_node(u64 start); +#else +static inline int memory_add_physaddr_to_nid(u64 start) +{ + return 0; +} +static inline int phys_to_target_node(u64 start) +{ + return 0; +} +#endif + #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT) /* * pgdat resizing functions diff --git a/include/linux/memremap.h b/include/linux/memremap.h index e5862746751b..79c49e7f5c30 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MEMREMAP_H_ #define _LINUX_MEMREMAP_H_ +#include #include #include @@ -93,7 +94,6 @@ struct dev_pagemap_ops { /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings * @altmap: pre-allocated/reserved memory for vmemmap allocations - * @res: physical address range covered by @ref * @ref: reference count that pins the devm_memremap_pages() mapping * @internal_ref: internal reference if @ref is not provided by the caller * @done: completion for @internal_ref @@ -103,10 +103,12 @@ struct dev_pagemap_ops { * @owner: an opaque pointer identifying the entity that manages this * instance. Used by various helpers to make sure that no * foreign ZONE_DEVICE memory is accessed. 
+ * @nr_range: number of ranges to be mapped + * @range: range to be mapped when nr_range == 1 + * @ranges: array of ranges to be mapped when nr_range > 1 */ struct dev_pagemap { struct vmem_altmap altmap; - struct resource res; struct percpu_ref *ref; struct percpu_ref internal_ref; struct completion done; @@ -114,6 +116,11 @@ struct dev_pagemap { unsigned int flags; const struct dev_pagemap_ops *ops; void *owner; + int nr_range; + union { + struct range range; + struct range ranges[0]; + }; }; static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap) diff --git a/include/linux/mm.h b/include/linux/mm.h index d10eb89befbf..51a7a10e520c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -791,7 +791,7 @@ static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) extern void kvfree(const void *addr); extern void kvfree_sensitive(const void *addr, size_t len); -static inline int head_mapcount(struct page *head) +static inline int head_compound_mapcount(struct page *head) { return atomic_read(compound_mapcount_ptr(head)) + 1; } @@ -805,7 +805,7 @@ static inline int compound_mapcount(struct page *page) { VM_BUG_ON_PAGE(!PageCompound(page), page); page = compound_head(page); - return head_mapcount(page); + return head_compound_mapcount(page); } /* @@ -918,7 +918,7 @@ static inline bool hpage_pincount_available(struct page *page) return PageCompound(page) && compound_order(page) > 1; } -static inline int head_pincount(struct page *head) +static inline int head_compound_pincount(struct page *head) { return atomic_read(compound_pincount_ptr(head)); } @@ -927,7 +927,7 @@ static inline int compound_pincount(struct page *page) { VM_BUG_ON_PAGE(!hpage_pincount_available(page), page); page = compound_head(page); - return head_pincount(page); + return head_compound_pincount(page); } static inline void set_compound_order(struct page *page, unsigned int order) @@ -1653,8 +1653,8 @@ struct mmu_notifier_range; void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); -int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma, struct vm_area_struct *new); +int +copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); int follow_pte_pmd(struct mm_struct *mm, unsigned long address, struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); @@ -2255,7 +2255,7 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) return ptlock_ptr(pmd_to_page(pmd)); } -static inline bool pgtable_pmd_page_ctor(struct page *page) +static inline bool pmd_ptlock_init(struct page *page) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE page->pmd_huge_pte = NULL; @@ -2263,7 +2263,7 @@ static inline bool pgtable_pmd_page_ctor(struct page *page) return ptlock_init(page); } -static inline void pgtable_pmd_page_dtor(struct page *page) +static inline void pmd_ptlock_free(struct page *page) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE VM_BUG_ON_PAGE(page->pmd_huge_pte, page); @@ -2280,8 +2280,8 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) return &mm->page_table_lock; } -static inline bool pgtable_pmd_page_ctor(struct page *page) { return true; } -static inline void pgtable_pmd_page_dtor(struct page *page) {} +static inline bool pmd_ptlock_init(struct page *page) { return true; } +static inline void pmd_ptlock_free(struct page *page) {} #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) @@ -2294,6 +2294,22 @@ static inline 
spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) return ptl; } +static inline bool pgtable_pmd_page_ctor(struct page *page) +{ + if (!pmd_ptlock_init(page)) + return false; + __SetPageTable(page); + inc_zone_page_state(page, NR_PAGETABLE); + return true; +} + +static inline void pgtable_pmd_page_dtor(struct page *page) +{ + pmd_ptlock_free(page); + __ClearPageTable(page); + dec_zone_page_state(page, NR_PAGETABLE); +} + /* * No scalability reason to split PUD locks yet, but follow the same pattern * as the PMD locks to make it easier if we decide to. The VM should not be diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 0707671851a8..18e7eae9b5ba 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -87,4 +87,9 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm) VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); } +static inline int mmap_lock_is_contended(struct mm_struct *mm) +{ + return rwsem_is_contended(&mm->mmap_lock); +} + #endif /* _LINUX_MMAP_LOCK_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8624f2d2099f..366fcc0474dd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -394,6 +394,41 @@ enum zone_type { */ ZONE_HIGHMEM, #endif + /* + * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains + * movable pages with few exceptional cases described below. Main use + * cases for ZONE_MOVABLE are to make memory offlining/unplug more + * likely to succeed, and to locally limit unmovable allocations - e.g., + * to increase the number of THP/huge pages. Notable special cases are: + * + * 1. Pinned pages: (long-term) pinning of movable pages might + * essentially turn such pages unmovable. Memory offlining might + * retry a long time. + * 2. memblock allocations: kernelcore/movablecore setups might create + * situations where ZONE_MOVABLE contains unmovable allocations + * after boot. Memory offlining and allocations fail early. + * 3. Memory holes: kernelcore/movablecore setups might create very rare + * situations where ZONE_MOVABLE contains memory holes after boot, + * for example, if we have sections that are only partially + * populated. Memory offlining and allocations fail early. + * 4. PG_hwpoison pages: while poisoned pages can be skipped during + * memory offlining, such pages cannot be allocated. + * 5. Unmovable PG_offline pages: in paravirtualized environments, + * hotplugged memory blocks might only partially be managed by the + * buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The + * parts not manged by the buddy are unmovable PG_offline pages. In + * some cases (virtio-mem), such pages can be skipped during + * memory offlining, however, cannot be moved/allocated. These + * techniques might use alloc_contig_range() to hide previously + * exposed pages from the buddy again (e.g., to implement some sort + * of memory unplug in virtio-mem). + * + * In general, no unmovable allocations that degrade memory offlining + * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range()) + * have to expect that migrating pages in ZONE_MOVABLE can fail (even + * if has_unmovable_pages() states that there are no unmovable pages, + * there can be false negatives). 
+ */ ZONE_MOVABLE, #ifdef CONFIG_ZONE_DEVICE ZONE_DEVICE, @@ -1079,7 +1114,7 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, z = next_zones_zonelist(++z, highidx, nodemask), \ zone = zonelist_zone(z)) -#define for_next_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ +#define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \ for (zone = z->zone; \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ diff --git a/include/linux/numa.h b/include/linux/numa.h index a42df804679e..8cb33ccfb671 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -23,22 +23,11 @@ #ifdef CONFIG_NUMA /* Generic implementation available */ int numa_map_to_online_node(int node); - -/* - * Optional architecture specific implementation, users need a "depends - * on $ARCH" - */ -int phys_to_target_node(phys_addr_t addr); #else static inline int numa_map_to_online_node(int node) { return NUMA_NO_NODE; } - -static inline int phys_to_target_node(phys_addr_t addr) -{ - return NUMA_NO_NODE; -} #endif #endif /* _LINUX_NUMA_H */ diff --git a/include/linux/oom.h b/include/linux/oom.h index f022f581ac29..2db9a1432511 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -55,6 +55,7 @@ struct oom_control { }; extern struct mutex oom_lock; +extern struct mutex oom_adj_mutex; static inline void set_current_oom_origin(void) { diff --git a/include/linux/overflow.h b/include/linux/overflow.h index 93fcef105061..f1c4e7b56bd9 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -43,6 +43,16 @@ #define is_non_negative(a) ((a) > 0 || (a) == 0) #define is_negative(a) (!(is_non_negative(a))) +/* + * Allows for effectively applying __must_check to a macro so we can have + * both the type-agnostic benefits of the macros while also being able to + * enforce that the return value is, in fact, checked. + */ +static inline bool __must_check __must_check_overflow(bool overflow) +{ + return unlikely(overflow); +} + #ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW /* * For simplicity and code hygiene, the fallback code below insists on @@ -52,32 +62,32 @@ * alias for __builtin_add_overflow, but add type checks similar to * below. 
*/ -#define check_add_overflow(a, b, d) ({ \ +#define check_add_overflow(a, b, d) __must_check_overflow(({ \ typeof(a) __a = (a); \ typeof(b) __b = (b); \ typeof(d) __d = (d); \ (void) (&__a == &__b); \ (void) (&__a == __d); \ __builtin_add_overflow(__a, __b, __d); \ -}) +})) -#define check_sub_overflow(a, b, d) ({ \ +#define check_sub_overflow(a, b, d) __must_check_overflow(({ \ typeof(a) __a = (a); \ typeof(b) __b = (b); \ typeof(d) __d = (d); \ (void) (&__a == &__b); \ (void) (&__a == __d); \ __builtin_sub_overflow(__a, __b, __d); \ -}) +})) -#define check_mul_overflow(a, b, d) ({ \ +#define check_mul_overflow(a, b, d) __must_check_overflow(({ \ typeof(a) __a = (a); \ typeof(b) __b = (b); \ typeof(d) __d = (d); \ (void) (&__a == &__b); \ (void) (&__a == __d); \ __builtin_mul_overflow(__a, __b, __d); \ -}) +})) #else @@ -190,21 +200,20 @@ }) -#define check_add_overflow(a, b, d) \ +#define check_add_overflow(a, b, d) __must_check_overflow( \ __builtin_choose_expr(is_signed_type(typeof(a)), \ __signed_add_overflow(a, b, d), \ - __unsigned_add_overflow(a, b, d)) + __unsigned_add_overflow(a, b, d))) -#define check_sub_overflow(a, b, d) \ +#define check_sub_overflow(a, b, d) __must_check_overflow( \ __builtin_choose_expr(is_signed_type(typeof(a)), \ __signed_sub_overflow(a, b, d), \ - __unsigned_sub_overflow(a, b, d)) + __unsigned_sub_overflow(a, b, d))) -#define check_mul_overflow(a, b, d) \ +#define check_mul_overflow(a, b, d) __must_check_overflow( \ __builtin_choose_expr(is_signed_type(typeof(a)), \ __signed_mul_overflow(a, b, d), \ - __unsigned_mul_overflow(a, b, d)) - + __unsigned_mul_overflow(a, b, d))) #endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */ @@ -227,7 +236,7 @@ * '*d' will hold the results of the attempted shift, but is not * considered "safe for use" if false is returned. */ -#define check_shl_overflow(a, s, d) ({ \ +#define check_shl_overflow(a, s, d) __must_check_overflow(({ \ typeof(a) _a = a; \ typeof(s) _s = s; \ typeof(d) _d = d; \ @@ -237,7 +246,7 @@ *_d = (_a_full << _to_shift); \ (_to_shift != _s || is_negative(*_d) || is_negative(_a) || \ (*_d >> _to_shift) != _a); \ -}) +})) /** * array_size() - Calculate size of 2-dimensional array. diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 276140c94f4a..38ded408bd4c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -167,7 +167,7 @@ enum pageflags { PG_slob_free = PG_private, /* Compound pages. Stored in first tail page's flags */ - PG_double_map = PG_private_2, + PG_double_map = PG_workingset, /* non-lru isolated movable page */ PG_isolated = PG_reclaim, @@ -235,6 +235,9 @@ static inline void page_init_poison(struct page *page, size_t size) * * PF_NO_COMPOUND: * the page flag is not relevant for compound pages. + * + * PF_SECOND: + * the page flag is stored in the first tail page. */ #define PF_POISONED_CHECK(page) ({ \ VM_BUG_ON_PGFLAGS(PagePoisoned(page), page); \ @@ -250,6 +253,9 @@ static inline void page_init_poison(struct page *page, size_t size) #define PF_NO_COMPOUND(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \ PF_POISONED_CHECK(page); }) +#define PF_SECOND(page, enforce) ({ \ + VM_BUG_ON_PGFLAGS(!PageHead(page), page); \ + PF_POISONED_CHECK(&page[1]); }) /* * Macros to create function definitions for page flags @@ -688,42 +694,15 @@ static inline int PageTransTail(struct page *page) * * See also __split_huge_pmd_locked() and page_remove_anon_compound_rmap(). 
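Routing the helpers through __must_check_overflow() turns an ignored result into a compiler warning; the intended pattern is always to branch on the returned boolean and refuse to proceed on overflow. A minimal sketch (calc_buffer_size() is illustrative, not from this patch):

/* Overflow-checked size calculation: fail instead of silently wrapping. */
static int calc_buffer_size(size_t nmemb, size_t elem_size, size_t hdr,
			    size_t *out)
{
	size_t bytes;

	if (check_mul_overflow(nmemb, elem_size, &bytes))
		return -EOVERFLOW;
	if (check_add_overflow(bytes, hdr, out))
		return -EOVERFLOW;

	return 0;
}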
*/ -static inline int PageDoubleMap(struct page *page) -{ - return PageHead(page) && test_bit(PG_double_map, &page[1].flags); -} - -static inline void SetPageDoubleMap(struct page *page) -{ - VM_BUG_ON_PAGE(!PageHead(page), page); - set_bit(PG_double_map, &page[1].flags); -} - -static inline void ClearPageDoubleMap(struct page *page) -{ - VM_BUG_ON_PAGE(!PageHead(page), page); - clear_bit(PG_double_map, &page[1].flags); -} -static inline int TestSetPageDoubleMap(struct page *page) -{ - VM_BUG_ON_PAGE(!PageHead(page), page); - return test_and_set_bit(PG_double_map, &page[1].flags); -} - -static inline int TestClearPageDoubleMap(struct page *page) -{ - VM_BUG_ON_PAGE(!PageHead(page), page); - return test_and_clear_bit(PG_double_map, &page[1].flags); -} - +PAGEFLAG(DoubleMap, double_map, PF_SECOND) + TESTSCFLAG(DoubleMap, double_map, PF_SECOND) #else TESTPAGEFLAG_FALSE(TransHuge) TESTPAGEFLAG_FALSE(TransCompound) TESTPAGEFLAG_FALSE(TransCompoundMap) TESTPAGEFLAG_FALSE(TransTail) PAGEFLAG_FALSE(DoubleMap) - TESTSETFLAG_FALSE(DoubleMap) - TESTCLEARFLAG_FALSE(DoubleMap) + TESTSCFLAG_FALSE(DoubleMap) #endif /* @@ -888,6 +867,7 @@ static inline int page_has_private(struct page *page) #undef PF_ONLY_HEAD #undef PF_NO_TAIL #undef PF_NO_COMPOUND +#undef PF_SECOND #endif /* !__GENERATING_BOUNDS_H */ #endif /* PAGE_FLAGS_H */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 434c9c34aeb6..1a3554f5d992 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -279,6 +279,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, #define FGP_NOFS 0x00000010 #define FGP_NOWAIT 0x00000020 #define FGP_FOR_MMAP 0x00000040 +#define FGP_HEAD 0x00000080 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, int fgp_flags, gfp_t cache_gfp_mask); @@ -310,18 +311,37 @@ static inline struct page *find_get_page_flags(struct address_space *mapping, * @mapping: the address_space to search * @offset: the page index * - * Looks up the page cache slot at @mapping & @offset. If there is a + * Looks up the page cache entry at @mapping & @offset. If there is a * page cache page, it is returned locked and with an increased * refcount. * - * Otherwise, %NULL is returned. - * - * find_lock_page() may sleep. + * Context: May sleep. + * Return: A struct page or %NULL if there is no page in the cache for this + * index. */ static inline struct page *find_lock_page(struct address_space *mapping, - pgoff_t offset) + pgoff_t index) { - return pagecache_get_page(mapping, offset, FGP_LOCK, 0); + return pagecache_get_page(mapping, index, FGP_LOCK, 0); +} + +/** + * find_lock_head - Locate, pin and lock a pagecache page. + * @mapping: The address_space to search. + * @offset: The page index. + * + * Looks up the page cache entry at @mapping & @offset. If there is a + * page cache page, its head page is returned locked and with an increased + * refcount. + * + * Context: May sleep. + * Return: A struct page which is !PageTail, or %NULL if there is no page + * in the cache for this index. + */ +static inline struct page *find_lock_head(struct address_space *mapping, + pgoff_t index) +{ + return pagecache_get_page(mapping, index, FGP_LOCK | FGP_HEAD, 0); } /** @@ -372,6 +392,15 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, mapping_gfp_mask(mapping)); } +/* Does this page contain this index? 
*/ +static inline bool thp_contains(struct page *head, pgoff_t index) +{ + /* HugeTLBfs indexes the page cache in units of hpage_size */ + if (PageHuge(head)) + return head->index == index; + return page_index(head) == (index & ~(thp_nr_pages(head) - 1UL)); +} + /* * Given the page we found in the page cache, return the page corresponding * to this index in the file @@ -385,8 +414,6 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index) return head + (index & (thp_nr_pages(head) - 1)); } -struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); -struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); unsigned find_get_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices); diff --git a/include/linux/printk.h b/include/linux/printk.h index 34c1a7be3e01..78479633ccfc 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -12,6 +12,8 @@ extern const char linux_banner[]; extern const char linux_proc_banner[]; +extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ + #define PRINTK_MAX_SINGLE_HEADER_LEN 2 static inline int printk_get_level(const char *buffer) @@ -159,10 +161,12 @@ static inline void printk_nmi_direct_enter(void) { } static inline void printk_nmi_direct_exit(void) { } #endif /* PRINTK_NMI */ +struct dev_printk_info; + #ifdef CONFIG_PRINTK -asmlinkage __printf(5, 0) +asmlinkage __printf(4, 0) int vprintk_emit(int facility, int level, - const char *dict, size_t dictlen, + const struct dev_printk_info *dev_info, const char *fmt, va_list args); asmlinkage __printf(1, 0) diff --git a/include/linux/range.h b/include/linux/range.h index d1fbeb664012..274681cc3154 100644 --- a/include/linux/range.h +++ b/include/linux/range.h @@ -1,12 +1,18 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RANGE_H #define _LINUX_RANGE_H +#include struct range { u64 start; u64 end; }; +static inline u64 range_len(const struct range *range) +{ + return range->end - range->start + 1; +} + int add_range(struct range *range, int az, int nr_range, u64 start, u64 end); diff --git a/include/linux/sched.h b/include/linux/sched.h index e2772c61ff63..9e3b3dec20a3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1213,6 +1213,10 @@ struct task_struct { #endif #endif +#if IS_ENABLED(CONFIG_KUNIT) + struct kunit *kunit_test; +#endif + #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack: */ int curr_ret_stack; diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index ecdc6542070f..dfd82eab2902 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -72,6 +72,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ #define MMF_OOM_VICTIM 25 /* mm is the oom victim */ #define MMF_OOM_REAP_QUEUED 26 /* mm was queued for oom_reaper */ +#define MMF_MULTIPROCESS 27 /* mm is shared between processes */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ diff --git a/include/linux/slab.h b/include/linux/slab.h index 24df2393ec03..9e155cc83b8a 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -279,7 +279,7 @@ static inline void __check_heap_object(const void *ptr, unsigned long n, #define KMALLOC_MAX_SIZE (1UL << KMALLOC_SHIFT_MAX) /* Maximum size for which we actually use a slab cache */ #define 
KMALLOC_MAX_CACHE_SIZE (1UL << KMALLOC_SHIFT_HIGH) -/* Maximum order allocatable via the slab allocagtor */ +/* Maximum order allocatable via the slab allocator */ #define KMALLOC_MAX_ORDER (KMALLOC_SHIFT_MAX - PAGE_SHIFT) /* diff --git a/include/linux/swap.h b/include/linux/swap.h index 4340a7b6e7a1..667935c0dbd4 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -170,7 +170,7 @@ enum { SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ SWP_BLKDEV = (1 << 6), /* its a block device */ SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */ - SWP_FS = (1 << 8), /* swap file goes through fs */ + SWP_FS_OPS = (1 << 8), /* swapfile operations go through fs */ SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */ SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ @@ -340,7 +340,6 @@ extern void lru_note_cost_page(struct page *); extern void lru_cache_add(struct page *); extern void lru_add_page_tail(struct page *page, struct page *page_tail, struct lruvec *lruvec, struct list_head *head); -extern void activate_page(struct page *); extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); @@ -427,6 +426,7 @@ extern void free_pages_and_swap_cache(struct page **, int); extern struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr); +struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index); extern struct page *read_swap_cache_async(swp_entry_t, gfp_t, struct vm_area_struct *vma, unsigned long addr, bool do_poll); @@ -570,6 +570,12 @@ static inline struct page *lookup_swap_cache(swp_entry_t swp, return NULL; } +static inline +struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) +{ + return find_get_page(mapping, index); +} + static inline int add_to_swap(struct page *page) { return 0; diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h index e36b200c2a77..347f1a304190 100644 --- a/include/linux/swap_slots.h +++ b/include/linux/swap_slots.h @@ -23,7 +23,7 @@ struct swap_slots_cache { void disable_swap_slots_cache_lock(void); void reenable_swap_slots_cache_unlock(void); -int enable_swap_slots_cache(void); +void enable_swap_slots_cache(void); int free_swap_slot(swp_entry_t entry); extern bool swap_slot_cache_enabled; diff --git a/include/trace/events/avc.h b/include/trace/events/avc.h new file mode 100644 index 000000000000..b55fda2e0773 --- /dev/null +++ b/include/trace/events/avc.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Authors: Thiébaud Weksteen + * Peter Enderborg + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM avc + +#if !defined(_TRACE_SELINUX_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SELINUX_H + +#include + +TRACE_EVENT(selinux_audited, + + TP_PROTO(struct selinux_audit_data *sad, + char *scontext, + char *tcontext, + const char *tclass + ), + + TP_ARGS(sad, scontext, tcontext, tclass), + + TP_STRUCT__entry( + __field(u32, requested) + __field(u32, denied) + __field(u32, audited) + __field(int, result) + __string(scontext, scontext) + __string(tcontext, tcontext) + __string(tclass, tclass) + ), + + TP_fast_assign( + __entry->requested = sad->requested; + __entry->denied = sad->denied; + __entry->audited = sad->audited; + __entry->result = sad->result; + __assign_str(tcontext, tcontext); + __assign_str(scontext, scontext); + __assign_str(tclass, tclass); + ), 
+ + TP_printk("requested=0x%x denied=0x%x audited=0x%x result=%d scontext=%s tcontext=%s tclass=%s", + __entry->requested, __entry->denied, __entry->audited, __entry->result, + __get_str(scontext), __get_str(tcontext), __get_str(tclass) + ) +); + +#endif + +/* This part must be outside protection */ +#include diff --git a/init/Kconfig b/init/Kconfig index 71aefec2e2aa..fbe876b137a1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -682,7 +682,8 @@ config IKHEADERS config LOG_BUF_SHIFT int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" - range 12 25 + range 12 25 if !H8300 + range 12 19 if H8300 default 17 depends on PRINTK help diff --git a/kernel/audit.c b/kernel/audit.c index 7efaece534a9..68cee3bc8cfe 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -123,9 +123,9 @@ static u32 audit_backlog_limit = 64; static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; /* The identity of the user shutting down the audit system. */ -kuid_t audit_sig_uid = INVALID_UID; -pid_t audit_sig_pid = -1; -u32 audit_sig_sid = 0; +static kuid_t audit_sig_uid = INVALID_UID; +static pid_t audit_sig_pid = -1; +static u32 audit_sig_sid; /* Records can be lost in several ways: 0) [suppressed in audit_alloc] @@ -934,8 +934,7 @@ static void audit_free_reply(struct audit_reply *reply) if (!reply) return; - if (reply->skb) - kfree_skb(reply->skb); + kfree_skb(reply->skb); if (reply->net) put_net(reply->net); kfree(reply); diff --git a/kernel/audit.h b/kernel/audit.h index ddc22878433d..3b9c0945225a 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -327,10 +327,6 @@ static inline int audit_signal_info_syscall(struct task_struct *t) extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len); -extern pid_t audit_sig_pid; -extern kuid_t audit_sig_uid; -extern u32 audit_sig_sid; - extern int audit_filter(int msgtype, unsigned int listtype); extern void audit_ctl_lock(void); diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index cff7e60968b9..0369fd5fda8f 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -73,16 +73,7 @@ early_param("cma", early_cma); static phys_addr_t __init __maybe_unused cma_early_percent_memory(void) { - struct memblock_region *reg; - unsigned long total_pages = 0; - - /* - * We cannot use memblock_phys_mem_size() here, because - * memblock_analyze() has not been called yet. 
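Returning to the new include/trace/events/avc.h above: TRACE_EVENT(selinux_audited, ...) generates a trace_selinux_audited() call whose signature is fixed by TP_PROTO. A hedged sketch of a call site (the wrapper function and variable names are illustrative, not part of this patch):

/* Exactly one .c file must define CREATE_TRACE_POINTS before including
 * the event header so that the tracepoint gets instantiated. */
static void avc_audit_trace(struct selinux_audit_data *sad,
			    char *scontext, char *tcontext,
			    const char *tclass)
{
	/* Fires only when the tracepoint is enabled. */
	trace_selinux_audited(sad, scontext, tcontext, tclass);
}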
- */ - for_each_memblock(memory, reg) - total_pages += memblock_region_memory_end_pfn(reg) - - memblock_region_memory_base_pfn(reg); + unsigned long total_pages = PHYS_PFN(memblock_phys_mem_size()); return (total_pages * CONFIG_CMA_SIZE_PERCENTAGE / 100) << PAGE_SHIFT; } diff --git a/kernel/fork.c b/kernel/fork.c index d5957161d91d..ead9672b69c1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -561,7 +561,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, atomic_dec(&inode->i_writecount); i_mmap_lock_write(mapping); if (tmp->vm_flags & VM_SHARED) - atomic_inc(&mapping->i_mmap_writable); + mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ vma_interval_tree_insert_after(tmp, mpnt, @@ -592,7 +592,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(mm, oldmm, mpnt, tmp); + retval = copy_page_range(tmp, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); @@ -1814,6 +1814,25 @@ static __always_inline void delayed_free_task(struct task_struct *tsk) free_task(tsk); } +static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) +{ + /* Skip if kernel thread */ + if (!tsk->mm) + return; + + /* Skip if spawning a thread or using vfork */ + if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM) + return; + + /* We need to synchronize with __set_oom_adj */ + mutex_lock(&oom_adj_mutex); + set_bit(MMF_MULTIPROCESS, &tsk->mm->flags); + /* Update the values in case they were changed after copy_signal */ + tsk->signal->oom_score_adj = current->signal->oom_score_adj; + tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min; + mutex_unlock(&oom_adj_mutex); +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -2292,6 +2311,8 @@ static __latent_entropy struct task_struct *copy_process( trace_task_newtask(p, clone_flags); uprobe_copy_process(p, clone_flags); + copy_oom_score_adj(clone_flags, p); + return p; bad_fork_cancel_cgroup: diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 4d052fc6bcde..eee3dc9b60a9 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -2,3 +2,4 @@ obj-y = printk.o obj-$(CONFIG_PRINTK) += printk_safe.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o +obj-$(CONFIG_PRINTK) += printk_ringbuffer.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 660f9a6bf73a..3a8fd491758c 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -14,9 +14,9 @@ extern raw_spinlock_t logbuf_lock; -__printf(5, 0) +__printf(4, 0) int vprintk_store(int facility, int level, - const char *dict, size_t dictlen, + const struct dev_printk_info *dev_info, const char *fmt, va_list args); __printf(1, 0) int vprintk_default(const char *fmt, va_list args); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9b75f6bfc333..fe64a49344bf 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -55,6 +55,7 @@ #define CREATE_TRACE_POINTS #include +#include "printk_ringbuffer.h" #include "console_cmdline.h" #include "braille.h" #include "internal.h" @@ -294,30 +295,22 @@ enum con_msg_format_flags { static int console_msg_format = MSG_FORMAT_DEFAULT; /* - * The printk log buffer consists of a chain of concatenated variable - * length records. Every record starts with a record header, containing - * the overall length of the record. 
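To make the cma_early_percent_memory() simplification above concrete, here is a worked example; the memory size, PAGE_SHIFT=12 and CONFIG_CMA_SIZE_PERCENTAGE=10 are illustrative assumptions:

/*
 * With 4 GiB of memblock memory:
 *
 *   total_pages = PHYS_PFN(memblock_phys_mem_size())
 *               = (4ULL << 30) >> 12                  = 1048576 pages
 *   reserved    = (total_pages * 10 / 100) << 12
 *               = 104857 pages << 12                  ~= 409 MiB
 *
 * which is effectively what the removed for_each_memblock() loop computed
 * by summing the per-region PFN ranges.
 */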
+ * The printk log buffer consists of a sequenced collection of records, each + * containing variable length message text. Every record also contains its + * own meta-data (@info). * - * The heads to the first and last entry in the buffer, as well as the - * sequence numbers of these entries are maintained when messages are - * stored. + * Every record meta-data carries the timestamp in microseconds, as well as + * the standard userspace syslog level and syslog facility. The usual kernel + * messages use LOG_KERN; userspace-injected messages always carry a matching + * syslog facility, by default LOG_USER. The origin of every message can be + * reliably determined that way. * - * If the heads indicate available messages, the length in the header - * tells the start next message. A length == 0 for the next message - * indicates a wrap-around to the beginning of the buffer. + * The human readable log message of a record is available in @text, the + * length of the message text in @text_len. The stored message is not + * terminated. * - * Every record carries the monotonic timestamp in microseconds, as well as - * the standard userspace syslog level and syslog facility. The usual - * kernel messages use LOG_KERN; userspace-injected messages always carry - * a matching syslog facility, by default LOG_USER. The origin of every - * message can be reliably determined that way. - * - * The human readable log message directly follows the message header. The - * length of the message text is stored in the header, the stored message - * is not terminated. - * - * Optionally, a message can carry a dictionary of properties (key/value pairs), - * to provide userspace with a machine-readable message context. + * Optionally, a record can carry a dictionary of properties (key/value + * pairs), to provide userspace with a machine-readable message context. * * Examples for well-defined, commonly used property names are: * DEVICE=b12:8 device identifier @@ -327,25 +320,22 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; * +sound:card0 subsystem:devname * SUBSYSTEM=pci driver-core subsystem name * - * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value - * follows directly after a '=' character. Every property is terminated by - * a '\0' character. The last property is not terminated. + * Valid characters in property names are [a-zA-Z0-9.-_]. Property names + * and values are terminated by a '\0' character. 
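The dev_printk_info mentioned above is the structured replacement for those free-form properties: in this series the dev_*() helpers hand a filled-in struct dev_printk_info to vprintk_emit() instead of appending a dictionary. A purely illustrative producer (driver, device and message are invented):

#include <linux/pci.h>

/* Illustrative only: a dev_err() here ends up as a record whose
 * info.dev_info carries SUBSYSTEM=pci and DEVICE=+pci:<bdf>. */
static void report_link_error(struct pci_dev *pdev)
{
	dev_err(&pdev->dev, "link training failed\n");
}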
* - * Example of a message structure: - * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec - * 0008 34 00 record is 52 bytes long - * 000a 0b 00 text is 11 bytes long - * 000c 1f 00 dictionary is 23 bytes long - * 000e 03 00 LOG_KERN (facility) LOG_ERR (level) - * 0010 69 74 27 73 20 61 20 6c "it's a l" - * 69 6e 65 "ine" - * 001b 44 45 56 49 43 "DEVIC" - * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D" - * 52 49 56 45 52 3d 62 75 "RIVER=bu" - * 67 "g" - * 0032 00 00 00 padding to next message header + * Example of record values: + * record.text_buf = "it's a line" (unterminated) + * record.info.seq = 56 + * record.info.ts_nsec = 36863 + * record.info.text_len = 11 + * record.info.facility = 0 (LOG_KERN) + * record.info.flags = 0 + * record.info.level = 3 (LOG_ERR) + * record.info.caller_id = 299 (task 299) + * record.info.dev_info.subsystem = "pci" (terminated) + * record.info.dev_info.device = "+pci:0000:00:01.0" (terminated) * - * The 'struct printk_log' buffer header must never be directly exported to + * The 'struct printk_info' buffer must never be directly exported to * userspace, it is a kernel-private implementation detail that might * need to be changed in the future, when the requirements change. * @@ -365,23 +355,6 @@ enum log_flags { LOG_CONT = 8, /* text is a fragment of a continuation line */ }; -struct printk_log { - u64 ts_nsec; /* timestamp in nanoseconds */ - u16 len; /* length of entire record */ - u16 text_len; /* length of text buffer */ - u16 dict_len; /* length of dictionary buffer */ - u8 facility; /* syslog facility */ - u8 flags:5; /* internal record flags */ - u8 level:3; /* syslog level */ -#ifdef CONFIG_PRINTK_CALLER - u32 caller_id; /* thread id or processor id */ -#endif -} -#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS -__packed __aligned(4) -#endif -; - /* * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken * within the scheduler's rq lock. It must be released before calling @@ -421,26 +394,16 @@ DEFINE_RAW_SPINLOCK(logbuf_lock); DECLARE_WAIT_QUEUE_HEAD(log_wait); /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; -static u32 syslog_idx; static size_t syslog_partial; static bool syslog_time; -/* index and sequence number of the first record stored in the buffer */ -static u64 log_first_seq; -static u32 log_first_idx; - -/* index and sequence number of the next record to store in the buffer */ -static u64 log_next_seq; -static u32 log_next_idx; - /* the next printk record to write to the console */ static u64 console_seq; -static u32 console_idx; static u64 exclusive_console_stop_seq; +static unsigned long console_dropped; /* the next printk record to read after the last 'clear' command */ static u64 clear_seq; -static u32 clear_idx; #ifdef CONFIG_PRINTK_CALLER #define PREFIX_MAX 48 @@ -453,13 +416,30 @@ static u32 clear_idx; #define LOG_FACILITY(v) ((v) >> 3 & 0xff) /* record buffer */ -#define LOG_ALIGN __alignof__(struct printk_log) +#define LOG_ALIGN __alignof__(unsigned long) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) #define LOG_BUF_LEN_MAX (u32)(1 << 31) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; +/* + * Define the average message size. This only affects the number of + * descriptors that will be available. Underestimating is better than + * overestimating (too many available descriptors is better than not enough). 
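For illustration, the example record above would be rendered on /dev/kmsg and the extended consoles (by info_print_ext_header() and msg_print_ext_body() further down in this patch) roughly as follows, assuming CONFIG_PRINTK_CALLER is disabled:

3,56,36,-;it's a line
 SUBSYSTEM=pci
 DEVICE=+pci:0000:00:01.0

i.e. "(facility << 3) | level, seq, timestamp in microseconds, cont flag", then the message text, then one space-prefixed line per dev_info property.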
+ */ +#define PRB_AVGBITS 5 /* 32 character average length */ + +#if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS +#error CONFIG_LOG_BUF_SHIFT value too small. +#endif +_DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS, + PRB_AVGBITS, &__log_buf[0]); + +static struct printk_ringbuffer printk_rb_dynamic; + +static struct printk_ringbuffer *prb = &printk_rb_static; + /* * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before * per_cpu_areas are initialised. This variable is set to true when @@ -484,108 +464,6 @@ u32 log_buf_len_get(void) return log_buf_len; } -/* human readable text of the record */ -static char *log_text(const struct printk_log *msg) -{ - return (char *)msg + sizeof(struct printk_log); -} - -/* optional key/value pair dictionary attached to the record */ -static char *log_dict(const struct printk_log *msg) -{ - return (char *)msg + sizeof(struct printk_log) + msg->text_len; -} - -/* get record by index; idx must point to valid msg */ -static struct printk_log *log_from_idx(u32 idx) -{ - struct printk_log *msg = (struct printk_log *)(log_buf + idx); - - /* - * A length == 0 record is the end of buffer marker. Wrap around and - * read the message at the start of the buffer. - */ - if (!msg->len) - return (struct printk_log *)log_buf; - return msg; -} - -/* get next record; idx must point to valid msg */ -static u32 log_next(u32 idx) -{ - struct printk_log *msg = (struct printk_log *)(log_buf + idx); - - /* length == 0 indicates the end of the buffer; wrap */ - /* - * A length == 0 record is the end of buffer marker. Wrap around and - * read the message at the start of the buffer as *this* one, and - * return the one after that. - */ - if (!msg->len) { - msg = (struct printk_log *)log_buf; - return msg->len; - } - return idx + msg->len; -} - -/* - * Check whether there is enough free space for the given message. - * - * The same values of first_idx and next_idx mean that the buffer - * is either empty or full. - * - * If the buffer is empty, we must respect the position of the indexes. - * They cannot be reset to the beginning of the buffer. - */ -static int logbuf_has_space(u32 msg_size, bool empty) -{ - u32 free; - - if (log_next_idx > log_first_idx || empty) - free = max(log_buf_len - log_next_idx, log_first_idx); - else - free = log_first_idx - log_next_idx; - - /* - * We need space also for an empty header that signalizes wrapping - * of the buffer. - */ - return free >= msg_size + sizeof(struct printk_log); -} - -static int log_make_free_space(u32 msg_size) -{ - while (log_first_seq < log_next_seq && - !logbuf_has_space(msg_size, false)) { - /* drop old messages until we have enough contiguous space */ - log_first_idx = log_next(log_first_idx); - log_first_seq++; - } - - if (clear_seq < log_first_seq) { - clear_seq = log_first_seq; - clear_idx = log_first_idx; - } - - /* sequence numbers are equal, so the log buffer is empty */ - if (logbuf_has_space(msg_size, log_first_seq == log_next_seq)) - return 0; - - return -ENOMEM; -} - -/* compute the message size including the padding bytes */ -static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) -{ - u32 size; - - size = sizeof(struct printk_log) + text_len + dict_len; - *pad_len = (-size) & (LOG_ALIGN - 1); - size += *pad_len; - - return size; -} - /* * Define how much of the log buffer we could take at maximum. The value * must be greater than two. 
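A quick sizing note on PRB_AVGBITS above, using the default CONFIG_LOG_BUF_SHIFT of 17 from init/Kconfig:

/*
 *   text data:    1 << 17            = 128 KiB   (__log_buf)
 *   descriptors:  1 << (17 - 5)      = 4096
 *   average:      128 KiB / 4096     = 32 bytes of message text per record
 */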
Note that only half of the buffer is available @@ -594,84 +472,69 @@ static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) #define MAX_LOG_TAKE_PART 4 static const char trunc_msg[] = ""; -static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len, - u16 *dict_len, u32 *pad_len) +static void truncate_msg(u16 *text_len, u16 *trunc_msg_len) { /* * The message should not take the whole buffer. Otherwise, it might * get removed too soon. */ u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART; + if (*text_len > max_text_len) *text_len = max_text_len; - /* enable the warning message */ + + /* enable the warning message (if there is room) */ *trunc_msg_len = strlen(trunc_msg); - /* disable the "dict" completely */ - *dict_len = 0; - /* compute the size again, count also the warning message */ - return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len); + if (*text_len >= *trunc_msg_len) + *text_len -= *trunc_msg_len; + else + *trunc_msg_len = 0; } /* insert record into the buffer, discard old ones, update heads */ static int log_store(u32 caller_id, int facility, int level, enum log_flags flags, u64 ts_nsec, - const char *dict, u16 dict_len, + const struct dev_printk_info *dev_info, const char *text, u16 text_len) { - struct printk_log *msg; - u32 size, pad_len; + struct prb_reserved_entry e; + struct printk_record r; u16 trunc_msg_len = 0; - /* number of '\0' padding bytes to next message */ - size = msg_used_size(text_len, dict_len, &pad_len); + prb_rec_init_wr(&r, text_len); - if (log_make_free_space(size)) { + if (!prb_reserve(&e, prb, &r)) { /* truncate the message if it is too long for empty buffer */ - size = truncate_msg(&text_len, &trunc_msg_len, - &dict_len, &pad_len); + truncate_msg(&text_len, &trunc_msg_len); + prb_rec_init_wr(&r, text_len + trunc_msg_len); /* survive when the log buffer is too small for trunc_msg */ - if (log_make_free_space(size)) + if (!prb_reserve(&e, prb, &r)) return 0; } - if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { - /* - * This message + an additional empty header does not fit - * at the end of the buffer. Add an empty header with len == 0 - * to signify a wrap around. 
- */ - memset(log_buf + log_next_idx, 0, sizeof(struct printk_log)); - log_next_idx = 0; - } - /* fill message */ - msg = (struct printk_log *)(log_buf + log_next_idx); - memcpy(log_text(msg), text, text_len); - msg->text_len = text_len; - if (trunc_msg_len) { - memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len); - msg->text_len += trunc_msg_len; - } - memcpy(log_dict(msg), dict, dict_len); - msg->dict_len = dict_len; - msg->facility = facility; - msg->level = level & 7; - msg->flags = flags & 0x1f; + memcpy(&r.text_buf[0], text, text_len); + if (trunc_msg_len) + memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); + r.info->text_len = text_len + trunc_msg_len; + r.info->facility = facility; + r.info->level = level & 7; + r.info->flags = flags & 0x1f; if (ts_nsec > 0) - msg->ts_nsec = ts_nsec; + r.info->ts_nsec = ts_nsec; else - msg->ts_nsec = local_clock(); -#ifdef CONFIG_PRINTK_CALLER - msg->caller_id = caller_id; -#endif - memset(log_dict(msg) + dict_len, 0, pad_len); - msg->len = size; + r.info->ts_nsec = local_clock(); + r.info->caller_id = caller_id; + if (dev_info) + memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); /* insert message */ - log_next_idx += msg->len; - log_next_seq++; + if ((flags & LOG_CONT) || !(flags & LOG_NEWLINE)) + prb_commit(&e); + else + prb_final_commit(&e); - return msg->text_len; + return (text_len + trunc_msg_len); } int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); @@ -723,13 +586,13 @@ static void append_char(char **pp, char *e, char c) *(*pp)++ = c; } -static ssize_t msg_print_ext_header(char *buf, size_t size, - struct printk_log *msg, u64 seq) +static ssize_t info_print_ext_header(char *buf, size_t size, + struct printk_info *info) { - u64 ts_usec = msg->ts_nsec; + u64 ts_usec = info->ts_nsec; char caller[20]; #ifdef CONFIG_PRINTK_CALLER - u32 id = msg->caller_id; + u32 id = info->caller_id; snprintf(caller, sizeof(caller), ",caller=%c%u", id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); @@ -740,13 +603,13 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, do_div(ts_usec, 1000); return scnprintf(buf, size, "%u,%llu,%llu,%c%s;", - (msg->facility << 3) | msg->level, seq, ts_usec, - msg->flags & LOG_CONT ? 'c' : '-', caller); + (info->facility << 3) | info->level, info->seq, + ts_usec, info->flags & LOG_CONT ? 
'c' : '-', caller); } -static ssize_t msg_print_ext_body(char *buf, size_t size, - char *dict, size_t dict_len, - char *text, size_t text_len) +static ssize_t msg_add_ext_text(char *buf, size_t size, + const char *text, size_t text_len, + unsigned char endc) { char *p = buf, *e = buf + size; size_t i; @@ -760,45 +623,56 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, else append_char(&p, e, c); } - append_char(&p, e, '\n'); - - if (dict_len) { - bool line = true; - - for (i = 0; i < dict_len; i++) { - unsigned char c = dict[i]; - - if (line) { - append_char(&p, e, ' '); - line = false; - } - - if (c == '\0') { - append_char(&p, e, '\n'); - line = true; - continue; - } - - if (c < ' ' || c >= 127 || c == '\\') { - p += scnprintf(p, e - p, "\\x%02x", c); - continue; - } - - append_char(&p, e, c); - } - append_char(&p, e, '\n'); - } + append_char(&p, e, endc); return p - buf; } +static ssize_t msg_add_dict_text(char *buf, size_t size, + const char *key, const char *val) +{ + size_t val_len = strlen(val); + ssize_t len; + + if (!val_len) + return 0; + + len = msg_add_ext_text(buf, size, "", 0, ' '); /* dict prefix */ + len += msg_add_ext_text(buf + len, size - len, key, strlen(key), '='); + len += msg_add_ext_text(buf + len, size - len, val, val_len, '\n'); + + return len; +} + +static ssize_t msg_print_ext_body(char *buf, size_t size, + char *text, size_t text_len, + struct dev_printk_info *dev_info) +{ + ssize_t len; + + len = msg_add_ext_text(buf, size, text, text_len, '\n'); + + if (!dev_info) + goto out; + + len += msg_add_dict_text(buf + len, size - len, "SUBSYSTEM", + dev_info->subsystem); + len += msg_add_dict_text(buf + len, size - len, "DEVICE", + dev_info->device); +out: + return len; +} + /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { u64 seq; - u32 idx; struct ratelimit_state rs; struct mutex lock; char buf[CONSOLE_EXT_LOG_MAX]; + + struct printk_info info; + char text_buf[CONSOLE_EXT_LOG_MAX]; + struct printk_record record; }; static __printf(3, 4) __cold @@ -808,7 +682,7 @@ int devkmsg_emit(int facility, int level, const char *fmt, ...) 
int r; va_start(args, fmt); - r = vprintk_emit(facility, level, NULL, 0, fmt, args); + r = vprintk_emit(facility, level, NULL, fmt, args); va_end(args); return r; @@ -881,7 +755,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct devkmsg_user *user = file->private_data; - struct printk_log *msg; + struct printk_record *r = &user->record; size_t len; ssize_t ret; @@ -893,7 +767,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, return ret; logbuf_lock_irq(); - while (user->seq == log_next_seq) { + if (!prb_read_valid(prb, user->seq, r)) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; logbuf_unlock_irq(); @@ -902,30 +776,26 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, logbuf_unlock_irq(); ret = wait_event_interruptible(log_wait, - user->seq != log_next_seq); + prb_read_valid(prb, user->seq, r)); if (ret) goto out; logbuf_lock_irq(); } - if (user->seq < log_first_seq) { + if (user->seq < prb_first_valid_seq(prb)) { /* our last seen message is gone, return error and reset */ - user->idx = log_first_idx; - user->seq = log_first_seq; + user->seq = prb_first_valid_seq(prb); ret = -EPIPE; logbuf_unlock_irq(); goto out; } - msg = log_from_idx(user->idx); - len = msg_print_ext_header(user->buf, sizeof(user->buf), - msg, user->seq); + len = info_print_ext_header(user->buf, sizeof(user->buf), r->info); len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len, - log_dict(msg), msg->dict_len, - log_text(msg), msg->text_len); + &r->text_buf[0], r->info->text_len, + &r->info->dev_info); - user->idx = log_next(user->idx); - user->seq++; + user->seq = r->info->seq + 1; logbuf_unlock_irq(); if (len > count) { @@ -965,8 +835,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) switch (whence) { case SEEK_SET: /* the first record */ - user->idx = log_first_idx; - user->seq = log_first_seq; + user->seq = prb_first_valid_seq(prb); break; case SEEK_DATA: /* @@ -974,13 +843,11 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) * like issued by 'dmesg -c'. Reading /dev/kmsg itself * changes no global state, and does not clear anything. 
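The devkmsg_read() conversion above is the canonical reader pattern for the new ring buffer; condensed into a hypothetical dump_all_records() sketch (locking, blocking and error handling omitted, names as used in this patch):

static void dump_all_records(void)
{
	static char text[CONSOLE_EXT_LOG_MAX];
	struct printk_info info;
	struct printk_record r;
	u64 seq = 0;

	prb_rec_init_rd(&r, &info, text, sizeof(text));

	while (prb_read_valid(prb, seq, &r)) {
		if (r.info->seq != seq) {
			/* records seq .. r.info->seq - 1 were overwritten */
		}
		/* consume r.text_buf[0 .. r.info->text_len) here */
		seq = r.info->seq + 1;
	}
}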
*/ - user->idx = clear_idx; user->seq = clear_seq; break; case SEEK_END: /* after the last record */ - user->idx = log_next_idx; - user->seq = log_next_seq; + user->seq = prb_next_seq(prb); break; default: ret = -EINVAL; @@ -1000,9 +867,9 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); logbuf_lock_irq(); - if (user->seq < log_next_seq) { + if (prb_read_valid(prb, user->seq, NULL)) { /* return error when data has vanished underneath us */ - if (user->seq < log_first_seq) + if (user->seq < prb_first_valid_seq(prb)) ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; else ret = EPOLLIN|EPOLLRDNORM; @@ -1037,9 +904,11 @@ static int devkmsg_open(struct inode *inode, struct file *file) mutex_init(&user->lock); + prb_rec_init_rd(&user->record, &user->info, + &user->text_buf[0], sizeof(user->text_buf)); + logbuf_lock_irq(); - user->idx = log_first_idx; - user->seq = log_first_seq; + user->seq = prb_first_valid_seq(prb); logbuf_unlock_irq(); file->private_data = user; @@ -1080,23 +949,58 @@ const struct file_operations kmsg_fops = { */ void log_buf_vmcoreinfo_setup(void) { - VMCOREINFO_SYMBOL(log_buf); - VMCOREINFO_SYMBOL(log_buf_len); - VMCOREINFO_SYMBOL(log_first_idx); - VMCOREINFO_SYMBOL(clear_idx); - VMCOREINFO_SYMBOL(log_next_idx); + struct dev_printk_info *dev_info = NULL; + + VMCOREINFO_SYMBOL(prb); + VMCOREINFO_SYMBOL(printk_rb_static); + VMCOREINFO_SYMBOL(clear_seq); + /* - * Export struct printk_log size and field offsets. User space tools can + * Export struct size and field offsets. User space tools can * parse it and detect any changes to structure down the line. */ - VMCOREINFO_STRUCT_SIZE(printk_log); - VMCOREINFO_OFFSET(printk_log, ts_nsec); - VMCOREINFO_OFFSET(printk_log, len); - VMCOREINFO_OFFSET(printk_log, text_len); - VMCOREINFO_OFFSET(printk_log, dict_len); -#ifdef CONFIG_PRINTK_CALLER - VMCOREINFO_OFFSET(printk_log, caller_id); -#endif + + VMCOREINFO_STRUCT_SIZE(printk_ringbuffer); + VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring); + VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring); + VMCOREINFO_OFFSET(printk_ringbuffer, fail); + + VMCOREINFO_STRUCT_SIZE(prb_desc_ring); + VMCOREINFO_OFFSET(prb_desc_ring, count_bits); + VMCOREINFO_OFFSET(prb_desc_ring, descs); + VMCOREINFO_OFFSET(prb_desc_ring, infos); + VMCOREINFO_OFFSET(prb_desc_ring, head_id); + VMCOREINFO_OFFSET(prb_desc_ring, tail_id); + + VMCOREINFO_STRUCT_SIZE(prb_desc); + VMCOREINFO_OFFSET(prb_desc, state_var); + VMCOREINFO_OFFSET(prb_desc, text_blk_lpos); + + VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos); + VMCOREINFO_OFFSET(prb_data_blk_lpos, begin); + VMCOREINFO_OFFSET(prb_data_blk_lpos, next); + + VMCOREINFO_STRUCT_SIZE(printk_info); + VMCOREINFO_OFFSET(printk_info, seq); + VMCOREINFO_OFFSET(printk_info, ts_nsec); + VMCOREINFO_OFFSET(printk_info, text_len); + VMCOREINFO_OFFSET(printk_info, caller_id); + VMCOREINFO_OFFSET(printk_info, dev_info); + + VMCOREINFO_STRUCT_SIZE(dev_printk_info); + VMCOREINFO_OFFSET(dev_printk_info, subsystem); + VMCOREINFO_LENGTH(printk_info_subsystem, sizeof(dev_info->subsystem)); + VMCOREINFO_OFFSET(dev_printk_info, device); + VMCOREINFO_LENGTH(printk_info_device, sizeof(dev_info->device)); + + VMCOREINFO_STRUCT_SIZE(prb_data_ring); + VMCOREINFO_OFFSET(prb_data_ring, size_bits); + VMCOREINFO_OFFSET(prb_data_ring, data); + VMCOREINFO_OFFSET(prb_data_ring, head_lpos); + VMCOREINFO_OFFSET(prb_data_ring, tail_lpos); + + VMCOREINFO_SIZE(atomic_long_t); + VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter); } #endif @@ -1174,11 +1078,46 @@ static 
void __init set_percpu_data_ready(void) __printk_percpu_data_ready = true; } +static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, + struct printk_record *r) +{ + struct prb_reserved_entry e; + struct printk_record dest_r; + + prb_rec_init_wr(&dest_r, r->info->text_len); + + if (!prb_reserve(&e, rb, &dest_r)) + return 0; + + memcpy(&dest_r.text_buf[0], &r->text_buf[0], r->info->text_len); + dest_r.info->text_len = r->info->text_len; + dest_r.info->facility = r->info->facility; + dest_r.info->level = r->info->level; + dest_r.info->flags = r->info->flags; + dest_r.info->ts_nsec = r->info->ts_nsec; + dest_r.info->caller_id = r->info->caller_id; + memcpy(&dest_r.info->dev_info, &r->info->dev_info, sizeof(dest_r.info->dev_info)); + + prb_final_commit(&e); + + return prb_record_text_space(&e); +} + +static char setup_text_buf[LOG_LINE_MAX] __initdata; + void __init setup_log_buf(int early) { + struct printk_info *new_infos; + unsigned int new_descs_count; + struct prb_desc *new_descs; + struct printk_info info; + struct printk_record r; + size_t new_descs_size; + size_t new_infos_size; unsigned long flags; char *new_log_buf; unsigned int free; + u64 seq; /* * Some archs call setup_log_buf() multiple times - first is very @@ -1197,24 +1136,75 @@ void __init setup_log_buf(int early) if (!new_log_buf_len) return; - new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN); - if (unlikely(!new_log_buf)) { - pr_err("log_buf_len: %lu bytes not available\n", - new_log_buf_len); + new_descs_count = new_log_buf_len >> PRB_AVGBITS; + if (new_descs_count == 0) { + pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len); return; } + new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN); + if (unlikely(!new_log_buf)) { + pr_err("log_buf_len: %lu text bytes not available\n", + new_log_buf_len); + return; + } + + new_descs_size = new_descs_count * sizeof(struct prb_desc); + new_descs = memblock_alloc(new_descs_size, LOG_ALIGN); + if (unlikely(!new_descs)) { + pr_err("log_buf_len: %zu desc bytes not available\n", + new_descs_size); + goto err_free_log_buf; + } + + new_infos_size = new_descs_count * sizeof(struct printk_info); + new_infos = memblock_alloc(new_infos_size, LOG_ALIGN); + if (unlikely(!new_infos)) { + pr_err("log_buf_len: %zu info bytes not available\n", + new_infos_size); + goto err_free_descs; + } + + prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf)); + + prb_init(&printk_rb_dynamic, + new_log_buf, ilog2(new_log_buf_len), + new_descs, ilog2(new_descs_count), + new_infos); + logbuf_lock_irqsave(flags); + log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; - free = __LOG_BUF_LEN - log_next_idx; - memcpy(log_buf, __log_buf, __LOG_BUF_LEN); + + free = __LOG_BUF_LEN; + prb_for_each_record(0, &printk_rb_static, seq, &r) + free -= add_to_rb(&printk_rb_dynamic, &r); + + /* + * This is early enough that everything is still running on the + * boot CPU and interrupts are disabled. So no new messages will + * appear during the transition to the dynamic buffer. 
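To put numbers on the setup_log_buf() allocations above, take an illustrative log_buf_len=1M boot parameter:

/*
 *   new_log_buf_len = 1 MiB                          (text data)
 *   new_descs_count = 1 MiB >> PRB_AVGBITS = 32768   (records trackable at once)
 *   new_descs_size  = 32768 * sizeof(struct prb_desc)
 *   new_infos_size  = 32768 * sizeof(struct printk_info)
 */

All three allocations come from memblock, and all of them must succeed before the static records are migrated and prb is switched to the dynamic ring.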
+ */ + prb = &printk_rb_dynamic; + logbuf_unlock_irqrestore(flags); + if (seq != prb_next_seq(&printk_rb_static)) { + pr_err("dropped %llu messages\n", + prb_next_seq(&printk_rb_static) - seq); + } + pr_info("log_buf_len: %u bytes\n", log_buf_len); pr_info("early log buf free: %u(%u%%)\n", free, (free * 100) / __LOG_BUF_LEN); + return; + +err_free_descs: + memblock_free(__pa(new_descs), new_descs_size); +err_free_log_buf: + memblock_free(__pa(new_log_buf), new_log_buf_len); } static bool __read_mostly ignore_loglevel; @@ -1321,18 +1311,18 @@ static size_t print_caller(u32 id, char *buf) #define print_caller(id, buf) 0 #endif -static size_t print_prefix(const struct printk_log *msg, bool syslog, - bool time, char *buf) +static size_t info_print_prefix(const struct printk_info *info, bool syslog, + bool time, char *buf) { size_t len = 0; if (syslog) - len = print_syslog((msg->facility << 3) | msg->level, buf); + len = print_syslog((info->facility << 3) | info->level, buf); if (time) - len += print_time(msg->ts_nsec, buf + len); + len += print_time(info->ts_nsec, buf + len); - len += print_caller(msg->caller_id, buf + len); + len += print_caller(info->caller_id, buf + len); if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) { buf[len++] = ' '; @@ -1342,72 +1332,150 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog, return len; } -static size_t msg_print_text(const struct printk_log *msg, bool syslog, - bool time, char *buf, size_t size) +/* + * Prepare the record for printing. The text is shifted within the given + * buffer to avoid a need for another one. The following operations are + * done: + * + * - Add prefix for each line. + * - Add the trailing newline that has been removed in vprintk_store(). + * - Drop truncated lines that do not longer fit into the buffer. + * + * Return: The length of the updated/prepared text, including the added + * prefixes and the newline. The dropped line(s) are not counted. + */ +static size_t record_print_text(struct printk_record *r, bool syslog, + bool time) { - const char *text = log_text(msg); - size_t text_size = msg->text_len; - size_t len = 0; + size_t text_len = r->info->text_len; + size_t buf_size = r->text_buf_size; + char *text = r->text_buf; char prefix[PREFIX_MAX]; - const size_t prefix_len = print_prefix(msg, syslog, time, prefix); + bool truncated = false; + size_t prefix_len; + size_t line_len; + size_t len = 0; + char *next; - do { - const char *next = memchr(text, '\n', text_size); - size_t text_len; + /* + * If the message was truncated because the buffer was not large + * enough, treat the available text as if it were the full text. + */ + if (text_len > buf_size) + text_len = buf_size; + prefix_len = info_print_prefix(r->info, syslog, time, prefix); + + /* + * @text_len: bytes of unprocessed text + * @line_len: bytes of current line _without_ newline + * @text: pointer to beginning of current line + * @len: number of bytes prepared in r->text_buf + */ + for (;;) { + next = memchr(text, '\n', text_len); if (next) { - text_len = next - text; - next++; - text_size -= next - text; + line_len = next - text; } else { - text_len = text_size; + /* Drop truncated line(s). */ + if (truncated) + break; + line_len = text_len; } - if (buf) { - if (prefix_len + text_len + 1 >= size - len) + /* + * Truncate the text if there is not enough space to add the + * prefix and a trailing newline. + */ + if (len + prefix_len + text_len + 1 > buf_size) { + /* Drop even the current line if no space. 
*/ + if (len + prefix_len + line_len + 1 > buf_size) break; - memcpy(buf + len, prefix, prefix_len); - len += prefix_len; - memcpy(buf + len, text, text_len); - len += text_len; - buf[len++] = '\n'; - } else { - /* SYSLOG_ACTION_* buffer size only calculation */ - len += prefix_len + text_len + 1; + text_len = buf_size - len - prefix_len - 1; + truncated = true; } - text = next; - } while (text); + memmove(text + prefix_len, text, text_len); + memcpy(text, prefix, prefix_len); + + len += prefix_len + line_len + 1; + + if (text_len == line_len) { + /* + * Add the trailing newline removed in + * vprintk_store(). + */ + text[prefix_len + line_len] = '\n'; + break; + } + + /* + * Advance beyond the added prefix and the related line with + * its newline. + */ + text += prefix_len + line_len + 1; + + /* + * The remaining text has only decreased by the line with its + * newline. + * + * Note that @text_len can become zero. It happens when @text + * ended with a newline (either due to truncation or the + * original string ending with "\n\n"). The loop is correctly + * repeated and (if not truncated) an empty line with a prefix + * will be prepared. + */ + text_len -= line_len + 1; + } return len; } +static size_t get_record_print_text_size(struct printk_info *info, + unsigned int line_count, + bool syslog, bool time) +{ + char prefix[PREFIX_MAX]; + size_t prefix_len; + + prefix_len = info_print_prefix(info, syslog, time, prefix); + + /* + * Each line will be preceded with a prefix. The intermediate + * newlines are already within the text, but a final trailing + * newline will be added. + */ + return ((prefix_len * line_count) + info->text_len + 1); +} + static int syslog_print(char __user *buf, int size) { + struct printk_info info; + struct printk_record r; char *text; - struct printk_log *msg; int len = 0; text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) return -ENOMEM; + prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); + while (size > 0) { size_t n; size_t skip; logbuf_lock_irq(); - if (syslog_seq < log_first_seq) { - /* messages are gone, move to first one */ - syslog_seq = log_first_seq; - syslog_idx = log_first_idx; - syslog_partial = 0; - } - if (syslog_seq == log_next_seq) { + if (!prb_read_valid(prb, syslog_seq, &r)) { logbuf_unlock_irq(); break; } + if (r.info->seq != syslog_seq) { + /* message is gone, move to next valid one */ + syslog_seq = r.info->seq; + syslog_partial = 0; + } /* * To keep reading/counting partial line consistent, @@ -1417,13 +1485,10 @@ static int syslog_print(char __user *buf, int size) syslog_time = printk_time; skip = syslog_partial; - msg = log_from_idx(syslog_idx); - n = msg_print_text(msg, true, syslog_time, text, - LOG_LINE_MAX + PREFIX_MAX); + n = record_print_text(&r, true, syslog_time); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ - syslog_idx = log_next(syslog_idx); - syslog_seq++; + syslog_seq = r.info->seq + 1; n -= syslog_partial; syslog_partial = 0; } else if (!len){ @@ -1454,11 +1519,12 @@ static int syslog_print(char __user *buf, int size) static int syslog_print_all(char __user *buf, int size, bool clear) { + struct printk_info info; + unsigned int line_count; + struct printk_record r; char *text; int len = 0; - u64 next_seq; u64 seq; - u32 idx; bool time; text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); @@ -1471,38 +1537,28 @@ static int syslog_print_all(char __user *buf, int size, bool clear) * Find first record that fits, including all following records, * into the 
user-provided buffer for this dump. */ - seq = clear_seq; - idx = clear_idx; - while (seq < log_next_seq) { - struct printk_log *msg = log_from_idx(idx); - - len += msg_print_text(msg, true, time, NULL, 0); - idx = log_next(idx); - seq++; - } + prb_for_each_info(clear_seq, prb, seq, &info, &line_count) + len += get_record_print_text_size(&info, line_count, true, time); /* move first record forward until length fits into the buffer */ - seq = clear_seq; - idx = clear_idx; - while (len > size && seq < log_next_seq) { - struct printk_log *msg = log_from_idx(idx); - - len -= msg_print_text(msg, true, time, NULL, 0); - idx = log_next(idx); - seq++; + prb_for_each_info(clear_seq, prb, seq, &info, &line_count) { + if (len <= size) + break; + len -= get_record_print_text_size(&info, line_count, true, time); } - /* last message fitting into this dump */ - next_seq = log_next_seq; + prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); len = 0; - while (len >= 0 && seq < next_seq) { - struct printk_log *msg = log_from_idx(idx); - int textlen = msg_print_text(msg, true, time, text, - LOG_LINE_MAX + PREFIX_MAX); + prb_for_each_record(seq, prb, seq, &r) { + int textlen; - idx = log_next(idx); - seq++; + textlen = record_print_text(&r, true, time); + + if (len + textlen > size) { + seq--; + break; + } logbuf_unlock_irq(); if (copy_to_user(buf + len, text, textlen)) @@ -1511,17 +1567,12 @@ static int syslog_print_all(char __user *buf, int size, bool clear) len += textlen; logbuf_lock_irq(); - if (seq < log_first_seq) { - /* messages are gone, move to next one */ - seq = log_first_seq; - idx = log_first_idx; - } + if (len < 0) + break; } - if (clear) { - clear_seq = log_next_seq; - clear_idx = log_next_idx; - } + if (clear) + clear_seq = seq; logbuf_unlock_irq(); kfree(text); @@ -1531,8 +1582,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) static void syslog_clear(void) { logbuf_lock_irq(); - clear_seq = log_next_seq; - clear_idx = log_next_idx; + clear_seq = prb_next_seq(prb); logbuf_unlock_irq(); } @@ -1559,7 +1609,7 @@ int do_syslog(int type, char __user *buf, int len, int source) if (!access_ok(buf, len)) return -EFAULT; error = wait_event_interruptible(log_wait, - syslog_seq != log_next_seq); + prb_read_valid(prb, syslog_seq, NULL)); if (error) return error; error = syslog_print(buf, len); @@ -1567,7 +1617,7 @@ int do_syslog(int type, char __user *buf, int len, int source) /* Read/clear last kernel messages */ case SYSLOG_ACTION_READ_CLEAR: clear = true; - /* FALL THRU */ + fallthrough; /* Read last kernel messages */ case SYSLOG_ACTION_READ_ALL: if (!buf || len < 0) @@ -1608,10 +1658,9 @@ int do_syslog(int type, char __user *buf, int len, int source) /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: logbuf_lock_irq(); - if (syslog_seq < log_first_seq) { + if (syslog_seq < prb_first_valid_seq(prb)) { /* messages are gone, move to first one */ - syslog_seq = log_first_seq; - syslog_idx = log_first_idx; + syslog_seq = prb_first_valid_seq(prb); syslog_partial = 0; } if (source == SYSLOG_FROM_PROC) { @@ -1620,20 +1669,18 @@ int do_syslog(int type, char __user *buf, int len, int source) * for pending data, not the size; return the count of * records, not the length. */ - error = log_next_seq - syslog_seq; + error = prb_next_seq(prb) - syslog_seq; } else { - u64 seq = syslog_seq; - u32 idx = syslog_idx; bool time = syslog_partial ? 
syslog_time : printk_time; + struct printk_info info; + unsigned int line_count; + u64 seq; - while (seq < log_next_seq) { - struct printk_log *msg = log_from_idx(idx); - - error += msg_print_text(msg, true, time, NULL, - 0); + prb_for_each_info(syslog_seq, prb, seq, &info, + &line_count) { + error += get_record_print_text_size(&info, line_count, + true, time); time = printk_time; - idx = log_next(idx); - seq++; } error -= syslog_partial; } @@ -1804,10 +1851,22 @@ static int console_trylock_spinning(void) static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) { + static char dropped_text[64]; + size_t dropped_len = 0; struct console *con; trace_console_rcuidle(text, len); + if (!console_drivers) + return; + + if (console_dropped) { + dropped_len = snprintf(dropped_text, sizeof(dropped_text), + "** %lu printk messages dropped **\n", + console_dropped); + console_dropped = 0; + } + for_each_console(con) { if (exclusive_console && con != exclusive_console) continue; @@ -1820,8 +1879,11 @@ static void call_console_drivers(const char *ext_text, size_t ext_len, continue; if (con->flags & CON_EXTENDED) con->write(con, ext_text, ext_len); - else + else { + if (dropped_len) + con->write(con, dropped_text, dropped_len); con->write(con, text, len); + } } } @@ -1845,97 +1907,38 @@ static inline u32 printk_caller_id(void) 0x80000000 + raw_smp_processor_id(); } -/* - * Continuation lines are buffered, and not committed to the record buffer - * until the line is complete, or a race forces it. The line fragments - * though, are printed immediately to the consoles to ensure everything has - * reached the console in case of a kernel crash. - */ -static struct cont { - char buf[LOG_LINE_MAX]; - size_t len; /* length == 0 means unused buffer */ - u32 caller_id; /* printk_caller_id() of first print */ - u64 ts_nsec; /* time of first print */ - u8 level; /* log level of first message */ - u8 facility; /* log facility of first message */ - enum log_flags flags; /* prefix, newline flags */ -} cont; - -static void cont_flush(void) -{ - if (cont.len == 0) - return; - - log_store(cont.caller_id, cont.facility, cont.level, cont.flags, - cont.ts_nsec, NULL, 0, cont.buf, cont.len); - cont.len = 0; -} - -static bool cont_add(u32 caller_id, int facility, int level, - enum log_flags flags, const char *text, size_t len) -{ - /* If the line gets too long, split it up in separate records. */ - if (cont.len + len > sizeof(cont.buf)) { - cont_flush(); - return false; - } - - if (!cont.len) { - cont.facility = facility; - cont.level = level; - cont.caller_id = caller_id; - cont.ts_nsec = local_clock(); - cont.flags = flags; - } - - memcpy(cont.buf + cont.len, text, len); - cont.len += len; - - // The original flags come from the first line, - // but later continuations can add a newline. - if (flags & LOG_NEWLINE) { - cont.flags |= LOG_NEWLINE; - cont_flush(); - } - - return true; -} - -static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len) +static size_t log_output(int facility, int level, enum log_flags lflags, + const struct dev_printk_info *dev_info, + char *text, size_t text_len) { const u32 caller_id = printk_caller_id(); - /* - * If an earlier line was buffered, and we're a continuation - * write from the same context, try to add it to the buffer. 
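Tying together record_print_text() and get_record_print_text_size() used above, a small worked example (the timestamp is invented; assumes printk_time is on and a level 6, LOG_KERN record):

  stored text:  "line one\nline two"        (17 bytes, no trailing newline)
  prefix:       "<6>[   12.345678] "        (18 bytes)

  record_print_text() rewrites the buffer in place to

  "<6>[   12.345678] line one\n<6>[   12.345678] line two\n"

  i.e. 2 * 18 + 17 + 1 = 54 bytes, exactly what get_record_print_text_size()
  predicts (prefix_len * line_count + text_len + 1).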
- */ - if (cont.len) { - if (cont.caller_id == caller_id && (lflags & LOG_CONT)) { - if (cont_add(caller_id, facility, level, lflags, text, text_len)) - return text_len; - } - /* Otherwise, make sure it's flushed */ - cont_flush(); - } + if (lflags & LOG_CONT) { + struct prb_reserved_entry e; + struct printk_record r; - /* Skip empty continuation lines that couldn't be added - they just flush */ - if (!text_len && (lflags & LOG_CONT)) - return 0; - - /* If it doesn't end in a newline, try to buffer the current line */ - if (!(lflags & LOG_NEWLINE)) { - if (cont_add(caller_id, facility, level, lflags, text, text_len)) + prb_rec_init_wr(&r, text_len); + if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { + memcpy(&r.text_buf[r.info->text_len], text, text_len); + r.info->text_len += text_len; + if (lflags & LOG_NEWLINE) { + r.info->flags |= LOG_NEWLINE; + prb_final_commit(&e); + } else { + prb_commit(&e); + } return text_len; + } } /* Store it in the record log */ return log_store(caller_id, facility, level, lflags, 0, - dict, dictlen, text, text_len); + dev_info, text, text_len); } /* Must be called under logbuf_lock. */ int vprintk_store(int facility, int level, - const char *dict, size_t dictlen, + const struct dev_printk_info *dev_info, const char *fmt, va_list args) { static char textbuf[LOG_LINE_MAX]; @@ -1977,21 +1980,19 @@ int vprintk_store(int facility, int level, if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; - if (dict) + if (dev_info) lflags |= LOG_NEWLINE; - return log_output(facility, level, lflags, - dict, dictlen, text, text_len); + return log_output(facility, level, lflags, dev_info, text, text_len); } asmlinkage int vprintk_emit(int facility, int level, - const char *dict, size_t dictlen, + const struct dev_printk_info *dev_info, const char *fmt, va_list args) { int printed_len; - bool in_sched = false, pending_output; + bool in_sched = false; unsigned long flags; - u64 curr_log_seq; /* Suppress unimportant messages after panic happens */ if (unlikely(suppress_printk)) @@ -2007,13 +2008,11 @@ asmlinkage int vprintk_emit(int facility, int level, /* This stops the holder of console_sem just where we want him */ logbuf_lock_irqsave(flags); - curr_log_seq = log_next_seq; - printed_len = vprintk_store(facility, level, dict, dictlen, fmt, args); - pending_output = (curr_log_seq != log_next_seq); + printed_len = vprintk_store(facility, level, dev_info, fmt, args); logbuf_unlock_irqrestore(flags); /* If called from the scheduler, we can not call up(). 
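The LOG_CONT branch above is what replaces the removed struct cont machinery: a continuation now tries to append to the newest record from the same caller via prb_reserve_in_last(). In caller terms, a sequence such as (a generic example, not taken from this patch):

	pr_info("checking firmware...");
	pr_cont(" found");
	pr_cont(" (version 2)\n");

typically ends up as a single record: pieces without a trailing newline are committed but left open for appending (prb_commit()), and only the final piece carrying the newline finalizes the record (prb_final_commit()).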
*/ - if (!in_sched && pending_output) { + if (!in_sched) { /* * Disable preemption to avoid being preempted while holding * console_sem which would prevent anyone from printing to @@ -2030,8 +2029,7 @@ asmlinkage int vprintk_emit(int facility, int level, preempt_enable(); } - if (pending_output) - wake_up_klogd(); + wake_up_klogd(); return printed_len; } EXPORT_SYMBOL(vprintk_emit); @@ -2044,7 +2042,7 @@ EXPORT_SYMBOL(vprintk); int vprintk_default(const char *fmt, va_list args) { - return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); + return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); } EXPORT_SYMBOL_GPL(vprintk_default); @@ -2088,30 +2086,31 @@ EXPORT_SYMBOL(printk); #define PREFIX_MAX 0 #define printk_time false +#define prb_read_valid(rb, seq, r) false +#define prb_first_valid_seq(rb) 0 + static u64 syslog_seq; -static u32 syslog_idx; static u64 console_seq; -static u32 console_idx; static u64 exclusive_console_stop_seq; -static u64 log_first_seq; -static u32 log_first_idx; -static u64 log_next_seq; -static char *log_text(const struct printk_log *msg) { return NULL; } -static char *log_dict(const struct printk_log *msg) { return NULL; } -static struct printk_log *log_from_idx(u32 idx) { return NULL; } -static u32 log_next(u32 idx) { return 0; } -static ssize_t msg_print_ext_header(char *buf, size_t size, - struct printk_log *msg, - u64 seq) { return 0; } +static unsigned long console_dropped; + +static size_t record_print_text(const struct printk_record *r, + bool syslog, bool time) +{ + return 0; +} +static ssize_t info_print_ext_header(char *buf, size_t size, + struct printk_info *info) +{ + return 0; +} static ssize_t msg_print_ext_body(char *buf, size_t size, - char *dict, size_t dict_len, - char *text, size_t text_len) { return 0; } + char *text, size_t text_len, + struct dev_printk_info *dev_info) { return 0; } static void console_lock_spinning_enable(void) { } static int console_lock_spinning_disable_and_check(void) { return 0; } static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) {} -static size_t msg_print_text(const struct printk_log *msg, bool syslog, - bool time, char *buf, size_t size) { return 0; } static bool suppress_message_printing(int level) { return false; } #endif /* CONFIG_PRINTK */ @@ -2398,12 +2397,16 @@ void console_unlock(void) static char text[LOG_LINE_MAX + PREFIX_MAX]; unsigned long flags; bool do_cond_resched, retry; + struct printk_info info; + struct printk_record r; if (console_suspended) { up_console_sem(); return; } + prb_rec_init_rd(&r, &info, text, sizeof(text)); + /* * Console drivers are called with interrupts disabled, so * @console_may_schedule should be cleared before; however, we may @@ -2416,7 +2419,7 @@ void console_unlock(void) * * console_trylock() is not able to detect the preemptive * context reliably. Therefore the value must be stored before - * and cleared after the the "again" goto label. + * and cleared after the "again" goto label. 
*/ do_cond_resched = console_may_schedule; again: @@ -2434,35 +2437,26 @@ again: } for (;;) { - struct printk_log *msg; size_t ext_len = 0; size_t len; printk_safe_enter_irqsave(flags); raw_spin_lock(&logbuf_lock); - if (console_seq < log_first_seq) { - len = snprintf(text, sizeof(text), - "** %llu printk messages dropped **\n", - log_first_seq - console_seq); - - /* messages are gone, move to first one */ - console_seq = log_first_seq; - console_idx = log_first_idx; - } else { - len = 0; - } skip: - if (console_seq == log_next_seq) + if (!prb_read_valid(prb, console_seq, &r)) break; - msg = log_from_idx(console_idx); - if (suppress_message_printing(msg->level)) { + if (console_seq != r.info->seq) { + console_dropped += r.info->seq - console_seq; + console_seq = r.info->seq; + } + + if (suppress_message_printing(r.info->level)) { /* * Skip record we have buffered and already printed * directly to the console when we received it, and * record that has level above the console loglevel. */ - console_idx = log_next(console_idx); console_seq++; goto skip; } @@ -2473,19 +2467,23 @@ skip: exclusive_console = NULL; } - len += msg_print_text(msg, - console_msg_format & MSG_FORMAT_SYSLOG, - printk_time, text + len, sizeof(text) - len); + /* + * Handle extended console text first because later + * record_print_text() will modify the record buffer in-place. + */ if (nr_ext_console_drivers) { - ext_len = msg_print_ext_header(ext_text, + ext_len = info_print_ext_header(ext_text, sizeof(ext_text), - msg, console_seq); + r.info); ext_len += msg_print_ext_body(ext_text + ext_len, sizeof(ext_text) - ext_len, - log_dict(msg), msg->dict_len, - log_text(msg), msg->text_len); + &r.text_buf[0], + r.info->text_len, + &r.info->dev_info); } - console_idx = log_next(console_idx); + len = record_print_text(&r, + console_msg_format & MSG_FORMAT_SYSLOG, + printk_time); console_seq++; raw_spin_unlock(&logbuf_lock); @@ -2525,7 +2523,7 @@ skip: * flush, no worries. 
*/ raw_spin_lock(&logbuf_lock); - retry = console_seq != log_next_seq; + retry = prb_read_valid(prb, console_seq, NULL); raw_spin_unlock(&logbuf_lock); printk_safe_exit_irqrestore(flags); @@ -2594,8 +2592,7 @@ void console_flush_on_panic(enum con_flush_mode mode) unsigned long flags; logbuf_lock_irqsave(flags); - console_seq = log_first_seq; - console_idx = log_first_idx; + console_seq = prb_first_valid_seq(prb); logbuf_unlock_irqrestore(flags); } console_unlock(); @@ -2838,7 +2835,6 @@ void register_console(struct console *newcon) exclusive_console = newcon; exclusive_console_stop_seq = console_seq; console_seq = syslog_seq; - console_idx = syslog_idx; logbuf_unlock_irqrestore(flags); } console_unlock(); @@ -3062,7 +3058,7 @@ int vprintk_deferred(const char *fmt, va_list args) { int r; - r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); + r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); defer_console_output(); return r; @@ -3227,9 +3223,7 @@ void kmsg_dump(enum kmsg_dump_reason reason) logbuf_lock_irqsave(flags); dumper->cur_seq = clear_seq; - dumper->cur_idx = clear_idx; - dumper->next_seq = log_next_seq; - dumper->next_idx = log_next_idx; + dumper->next_seq = prb_next_seq(prb); logbuf_unlock_irqrestore(flags); /* invoke dumper which will iterate over records */ @@ -3263,28 +3257,33 @@ void kmsg_dump(enum kmsg_dump_reason reason) bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, char *line, size_t size, size_t *len) { - struct printk_log *msg; + struct printk_info info; + unsigned int line_count; + struct printk_record r; size_t l = 0; bool ret = false; + prb_rec_init_rd(&r, &info, line, size); + if (!dumper->active) goto out; - if (dumper->cur_seq < log_first_seq) { - /* messages are gone, move to first available one */ - dumper->cur_seq = log_first_seq; - dumper->cur_idx = log_first_idx; + /* Read text or count text lines? */ + if (line) { + if (!prb_read_valid(prb, dumper->cur_seq, &r)) + goto out; + l = record_print_text(&r, syslog, printk_time); + } else { + if (!prb_read_valid_info(prb, dumper->cur_seq, + &info, &line_count)) { + goto out; + } + l = get_record_print_text_size(&info, line_count, syslog, + printk_time); + } - /* last entry */ - if (dumper->cur_seq >= log_next_seq) - goto out; - - msg = log_from_idx(dumper->cur_idx); - l = msg_print_text(msg, syslog, printk_time, line, size); - - dumper->cur_idx = log_next(dumper->cur_idx); - dumper->cur_seq++; + dumper->cur_seq = r.info->seq + 1; ret = true; out: if (len) @@ -3332,7 +3331,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line); * @len: length of line placed into buffer * * Start at the end of the kmsg buffer and fill the provided buffer - * with as many of the the *youngest* kmsg records that fit into it. + * with as many of the *youngest* kmsg records that fit into it. * If the buffer is large enough, all available kmsg records will be * copied with a single call. 
* @@ -3345,23 +3344,25 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line); bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, char *buf, size_t size, size_t *len) { + struct printk_info info; + unsigned int line_count; + struct printk_record r; unsigned long flags; u64 seq; - u32 idx; u64 next_seq; - u32 next_idx; size_t l = 0; bool ret = false; bool time = printk_time; - if (!dumper->active) + prb_rec_init_rd(&r, &info, buf, size); + + if (!dumper->active || !buf || !size) goto out; logbuf_lock_irqsave(flags); - if (dumper->cur_seq < log_first_seq) { + if (dumper->cur_seq < prb_first_valid_seq(prb)) { /* messages are gone, move to first available one */ - dumper->cur_seq = log_first_seq; - dumper->cur_idx = log_first_idx; + dumper->cur_seq = prb_first_valid_seq(prb); } /* last entry */ @@ -3372,41 +3373,41 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, /* calculate length of entire buffer */ seq = dumper->cur_seq; - idx = dumper->cur_idx; - while (seq < dumper->next_seq) { - struct printk_log *msg = log_from_idx(idx); - - l += msg_print_text(msg, true, time, NULL, 0); - idx = log_next(idx); - seq++; + while (prb_read_valid_info(prb, seq, &info, &line_count)) { + if (r.info->seq >= dumper->next_seq) + break; + l += get_record_print_text_size(&info, line_count, true, time); + seq = r.info->seq + 1; } /* move first record forward until length fits into the buffer */ seq = dumper->cur_seq; - idx = dumper->cur_idx; - while (l >= size && seq < dumper->next_seq) { - struct printk_log *msg = log_from_idx(idx); - - l -= msg_print_text(msg, true, time, NULL, 0); - idx = log_next(idx); - seq++; + while (l >= size && prb_read_valid_info(prb, seq, + &info, &line_count)) { + if (r.info->seq >= dumper->next_seq) + break; + l -= get_record_print_text_size(&info, line_count, true, time); + seq = r.info->seq + 1; } /* last message in next interation */ next_seq = seq; - next_idx = idx; + /* actually read text into the buffer now */ l = 0; - while (seq < dumper->next_seq) { - struct printk_log *msg = log_from_idx(idx); + while (prb_read_valid(prb, seq, &r)) { + if (r.info->seq >= dumper->next_seq) + break; - l += msg_print_text(msg, syslog, time, buf + l, size - l); - idx = log_next(idx); - seq++; + l += record_print_text(&r, syslog, time); + + /* adjust record to store to remaining buffer space */ + prb_rec_init_rd(&r, &info, buf + l, size - l); + + seq = r.info->seq + 1; } dumper->next_seq = next_seq; - dumper->next_idx = next_idx; ret = true; logbuf_unlock_irqrestore(flags); out: @@ -3429,9 +3430,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) { dumper->cur_seq = clear_seq; - dumper->cur_idx = clear_idx; - dumper->next_seq = log_next_seq; - dumper->next_idx = log_next_idx; + dumper->next_seq = prb_next_seq(prb); } /** diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c new file mode 100644 index 000000000000..2493348a1631 --- /dev/null +++ b/kernel/printk/printk_ringbuffer.c @@ -0,0 +1,2083 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include "printk_ringbuffer.h" + +/** + * DOC: printk_ringbuffer overview + * + * Data Structure + * -------------- + * The printk_ringbuffer is made up of 3 internal ringbuffers: + * + * desc_ring + * A ring of descriptors and their meta data (such as sequence number, + * timestamp, loglevel, etc.) 
as well as internal state information about + * the record and logical positions specifying where in the other + * ringbuffer the text strings are located. + * + * text_data_ring + * A ring of data blocks. A data block consists of an unsigned long + * integer (ID) that maps to a desc_ring index followed by the text + * string of the record. + * + * The internal state information of a descriptor is the key element to allow + * readers and writers to locklessly synchronize access to the data. + * + * Implementation + * -------------- + * + * Descriptor Ring + * ~~~~~~~~~~~~~~~ + * The descriptor ring is an array of descriptors. A descriptor contains + * essential meta data to track the data of a printk record using + * blk_lpos structs pointing to associated text data blocks (see + * "Data Rings" below). Each descriptor is assigned an ID that maps + * directly to index values of the descriptor array and has a state. The ID + * and the state are bitwise combined into a single descriptor field named + * @state_var, allowing ID and state to be synchronously and atomically + * updated. + * + * Descriptors have four states: + * + * reserved + * A writer is modifying the record. + * + * committed + * The record and all its data are written. A writer can reopen the + * descriptor (transitioning it back to reserved), but in the committed + * state the data is consistent. + * + * finalized + * The record and all its data are complete and available for reading. A + * writer cannot reopen the descriptor. + * + * reusable + * The record exists, but its text and/or meta data may no longer be + * available. + * + * Querying the @state_var of a record requires providing the ID of the + * descriptor to query. This can yield a possible fifth (pseudo) state: + * + * miss + * The descriptor being queried has an unexpected ID. + * + * The descriptor ring has a @tail_id that contains the ID of the oldest + * descriptor and @head_id that contains the ID of the newest descriptor. + * + * When a new descriptor should be created (and the ring is full), the tail + * descriptor is invalidated by first transitioning to the reusable state and + * then invalidating all tail data blocks up to and including the data blocks + * associated with the tail descriptor (for the text ring). Then + * @tail_id is advanced, followed by advancing @head_id. And finally the + * @state_var of the new descriptor is initialized to the new ID and reserved + * state. + * + * The @tail_id can only be advanced if the new @tail_id would be in the + * committed or reusable queried state. This makes it possible that a valid + * sequence number of the tail is always available. + * + * Descriptor Finalization + * ~~~~~~~~~~~~~~~~~~~~~~~ + * When a writer calls the commit function prb_commit(), record data is + * fully stored and is consistent within the ringbuffer. However, a writer can + * reopen that record, claiming exclusive access (as with prb_reserve()), and + * modify that record. When finished, the writer must again commit the record. + * + * In order for a record to be made available to readers (and also become + * recyclable for writers), it must be finalized. A finalized record cannot be + * reopened and can never become "unfinalized". Record finalization can occur + * in three different scenarios: + * + * 1) A writer can simultaneously commit and finalize its record by calling + * prb_final_commit() instead of prb_commit(). 
+ * + * 2) When a new record is reserved and the previous record has been + * committed via prb_commit(), that previous record is automatically + * finalized. + * + * 3) When a record is committed via prb_commit() and a newer record + * already exists, the record being committed is automatically finalized. + * + * Data Ring + * ~~~~~~~~~ + * The text data ring is a byte array composed of data blocks. Data blocks are + * referenced by blk_lpos structs that point to the logical position of the + * beginning of a data block and the beginning of the next adjacent data + * block. Logical positions are mapped directly to index values of the byte + * array ringbuffer. + * + * Each data block consists of an ID followed by the writer data. The ID is + * the identifier of a descriptor that is associated with the data block. A + * given data block is considered valid if all of the following conditions + * are met: + * + * 1) The descriptor associated with the data block is in the committed + * or finalized queried state. + * + * 2) The blk_lpos struct within the descriptor associated with the data + * block references back to the same data block. + * + * 3) The data block is within the head/tail logical position range. + * + * If the writer data of a data block would extend beyond the end of the + * byte array, only the ID of the data block is stored at the logical + * position and the full data block (ID and writer data) is stored at the + * beginning of the byte array. The referencing blk_lpos will point to the + * ID before the wrap and the next data block will be at the logical + * position adjacent the full data block after the wrap. + * + * Data rings have a @tail_lpos that points to the beginning of the oldest + * data block and a @head_lpos that points to the logical position of the + * next (not yet existing) data block. + * + * When a new data block should be created (and the ring is full), tail data + * blocks will first be invalidated by putting their associated descriptors + * into the reusable state and then pushing the @tail_lpos forward beyond + * them. Then the @head_lpos is pushed forward and is associated with a new + * descriptor. If a data block is not valid, the @tail_lpos cannot be + * advanced beyond it. + * + * Info Array + * ~~~~~~~~~~ + * The general meta data of printk records are stored in printk_info structs, + * stored in an array with the same number of elements as the descriptor ring. + * Each info corresponds to the descriptor of the same index in the + * descriptor ring. Info validity is confirmed by evaluating the corresponding + * descriptor before and after loading the info. + * + * Usage + * ----- + * Here are some simple examples demonstrating writers and readers. For the + * examples a global ringbuffer (test_rb) is available (which is not the + * actual ringbuffer used by printk):: + * + * DEFINE_PRINTKRB(test_rb, 15, 5); + * + * This ringbuffer allows up to 32768 records (2 ^ 15) and has a size of + * 1 MiB (2 ^ (15 + 5)) for text data. 
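/*
 * Editorial illustration (not part of the patch): a stand-alone userspace
 * sketch of the sizing arithmetic described above, assuming only what the
 * comment states -- 2^descbits records and 2^(descbits + avgtextbits) bytes
 * of text data. The variable names are invented for this example.
 */
#include <stdio.h>

int main(void)
{
	unsigned int descbits = 15;	/* second DEFINE_PRINTKRB() argument */
	unsigned int avgtextbits = 5;	/* third DEFINE_PRINTKRB() argument  */

	unsigned long records = 1UL << descbits;
	unsigned long text_bytes = 1UL << (descbits + avgtextbits);

	/* Prints: 32768 records, 1048576 bytes (1 MiB) of text data */
	printf("%lu records, %lu bytes (%lu MiB) of text data\n",
	       records, text_bytes, text_bytes >> 20);
	return 0;
}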
+ *
+ * Sample writer code::
+ *
+ *	const char *textstr = "message text";
+ *	struct prb_reserved_entry e;
+ *	struct printk_record r;
+ *
+ *	// specify how much to allocate
+ *	prb_rec_init_wr(&r, strlen(textstr) + 1);
+ *
+ *	if (prb_reserve(&e, &test_rb, &r)) {
+ *		snprintf(r.text_buf, r.text_buf_size, "%s", textstr);
+ *
+ *		r.info->text_len = strlen(textstr);
+ *		r.info->ts_nsec = local_clock();
+ *		r.info->caller_id = printk_caller_id();
+ *
+ *		// commit and finalize the record
+ *		prb_final_commit(&e);
+ *	}
+ *
+ * Note that additional writer functions are available to extend a record
+ * after it has been committed but not yet finalized. This can be done as
+ * long as no new records have been reserved and the caller is the same.
+ *
+ * Sample writer code (record extending)::
+ *
+ *		// alternate rest of previous example
+ *
+ *		r.info->text_len = strlen(textstr);
+ *		r.info->ts_nsec = local_clock();
+ *		r.info->caller_id = printk_caller_id();
+ *
+ *		// commit the record (but do not finalize yet)
+ *		prb_commit(&e);
+ *	}
+ *
+ *	...
+ *
+ *	// specify additional 5 bytes text space to extend
+ *	prb_rec_init_wr(&r, 5);
+ *
+ *	// try to extend, but only if it does not exceed 32 bytes
+ *	if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id(), 32)) {
+ *		snprintf(&r.text_buf[r.info->text_len],
+ *			 r.text_buf_size - r.info->text_len, "hello");
+ *
+ *		r.info->text_len += 5;
+ *
+ *		// commit and finalize the record
+ *		prb_final_commit(&e);
+ *	}
+ *
+ * Sample reader code::
+ *
+ *	struct printk_info info;
+ *	struct printk_record r;
+ *	char text_buf[32];
+ *	u64 seq;
+ *
+ *	prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf));
+ *
+ *	prb_for_each_record(0, &test_rb, &seq, &r) {
+ *		if (info.seq != seq)
+ *			pr_warn("lost %llu records\n", info.seq - seq);
+ *
+ *		if (info.text_len > r.text_buf_size) {
+ *			pr_warn("record %llu text truncated\n", info.seq);
+ *			text_buf[r.text_buf_size - 1] = 0;
+ *		}
+ *
+ *		pr_info("%llu: %llu: %s\n", info.seq, info.ts_nsec,
+ *			&text_buf[0]);
+ *	}
+ *
+ * Note that additional less convenient reader functions are available to
+ * allow complex record access.
+ *
+ * ABA Issues
+ * ~~~~~~~~~~
+ * To help avoid ABA issues, descriptors are referenced by IDs (array index
+ * values combined with tagged bits counting array wraps) and data blocks are
+ * referenced by logical positions (array index values combined with tagged
+ * bits counting array wraps). However, on 32-bit systems the number of
+ * tagged bits is relatively small such that an ABA incident is (at least
+ * theoretically) possible. For example, if 4 million maximally sized (1KiB)
+ * printk messages were to occur in NMI context on a 32-bit system, the
+ * interrupted context would not be able to recognize that the 32-bit integer
+ * completely wrapped and thus represents a different data block than the one
+ * the interrupted context expects.
+ *
+ * To help combat this possibility, additional state checking is performed
+ * (such as using cmpxchg() even though set() would suffice). These extra
+ * checks are commented as such and will hopefully catch any ABA issue that
+ * a 32-bit system might experience.
+ *
+ * Memory Barriers
+ * ~~~~~~~~~~~~~~~
+ * Multiple memory barriers are used. To simplify proving correctness and
+ * generating litmus tests, lines of code related to memory barriers
+ * (loads, stores, and the associated memory barriers) are labeled::
+ *
+ *	LMM(function:letter)
+ *
+ * Comments reference the labels using only the "function:letter" part.
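/*
 * Editorial illustration (not part of the patch): a stand-alone C11 sketch
 * of the LMM(function:letter) labeling convention described above. The
 * functions, variables and ordering primitives here are invented for the
 * example; the real ringbuffer uses the kernel's smp_*() barriers and
 * atomic_long_*() operations, as seen in the code below.
 */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic int demo_state;
static int demo_payload;

/* Writer: store the payload, then publish it by setting the state. */
static void demo_publish(int val)
{
	demo_payload = val;				/* LMM(demo_publish:A) */
	atomic_store_explicit(&demo_state, 1,
			      memory_order_release);	/* LMM(demo_publish:B) */
}

/* Reader: observe the state, then read the payload. */
static int demo_consume(void)
{
	/*
	 * Pairs with demo_publish:B. If demo_consume:A reads from
	 * demo_publish:B, then demo_consume:B reads from demo_publish:A.
	 */
	if (atomic_load_explicit(&demo_state,
				 memory_order_acquire))	/* LMM(demo_consume:A) */
		return demo_payload;			/* LMM(demo_consume:B) */
	return -1;
}

int main(void)
{
	demo_publish(42);
	printf("%d\n", demo_consume());
	return 0;
}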
+ * + * The memory barrier pairs and their ordering are: + * + * desc_reserve:D / desc_reserve:B + * push descriptor tail (id), then push descriptor head (id) + * + * desc_reserve:D / data_push_tail:B + * push data tail (lpos), then set new descriptor reserved (state) + * + * desc_reserve:D / desc_push_tail:C + * push descriptor tail (id), then set new descriptor reserved (state) + * + * desc_reserve:D / prb_first_seq:C + * push descriptor tail (id), then set new descriptor reserved (state) + * + * desc_reserve:F / desc_read:D + * set new descriptor id and reserved (state), then allow writer changes + * + * data_alloc:A (or data_realloc:A) / desc_read:D + * set old descriptor reusable (state), then modify new data block area + * + * data_alloc:A (or data_realloc:A) / data_push_tail:B + * push data tail (lpos), then modify new data block area + * + * _prb_commit:B / desc_read:B + * store writer changes, then set new descriptor committed (state) + * + * desc_reopen_last:A / _prb_commit:B + * set descriptor reserved (state), then read descriptor data + * + * _prb_commit:B / desc_reserve:D + * set new descriptor committed (state), then check descriptor head (id) + * + * data_push_tail:D / data_push_tail:A + * set descriptor reusable (state), then push data tail (lpos) + * + * desc_push_tail:B / desc_reserve:D + * set descriptor reusable (state), then push descriptor tail (id) + */ + +#define DATA_SIZE(data_ring) _DATA_SIZE((data_ring)->size_bits) +#define DATA_SIZE_MASK(data_ring) (DATA_SIZE(data_ring) - 1) + +#define DESCS_COUNT(desc_ring) _DESCS_COUNT((desc_ring)->count_bits) +#define DESCS_COUNT_MASK(desc_ring) (DESCS_COUNT(desc_ring) - 1) + +/* Determine the data array index from a logical position. */ +#define DATA_INDEX(data_ring, lpos) ((lpos) & DATA_SIZE_MASK(data_ring)) + +/* Determine the desc array index from an ID or sequence number. */ +#define DESC_INDEX(desc_ring, n) ((n) & DESCS_COUNT_MASK(desc_ring)) + +/* Determine how many times the data array has wrapped. */ +#define DATA_WRAPS(data_ring, lpos) ((lpos) >> (data_ring)->size_bits) + +/* Determine if a logical position refers to a data-less block. */ +#define LPOS_DATALESS(lpos) ((lpos) & 1UL) +#define BLK_DATALESS(blk) (LPOS_DATALESS((blk)->begin) && \ + LPOS_DATALESS((blk)->next)) + +/* Get the logical position at index 0 of the current wrap. */ +#define DATA_THIS_WRAP_START_LPOS(data_ring, lpos) \ +((lpos) & ~DATA_SIZE_MASK(data_ring)) + +/* Get the ID for the same index of the previous wrap as the given ID. */ +#define DESC_ID_PREV_WRAP(desc_ring, id) \ +DESC_ID((id) - DESCS_COUNT(desc_ring)) + +/* + * A data block: mapped directly to the beginning of the data block area + * specified as a logical position within the data ring. + * + * @id: the ID of the associated descriptor + * @data: the writer data + * + * Note that the size of a data block is only known by its associated + * descriptor. + */ +struct prb_data_block { + unsigned long id; + char data[0]; +}; + +/* + * Return the descriptor associated with @n. @n can be either a + * descriptor ID or a sequence number. + */ +static struct prb_desc *to_desc(struct prb_desc_ring *desc_ring, u64 n) +{ + return &desc_ring->descs[DESC_INDEX(desc_ring, n)]; +} + +/* + * Return the printk_info associated with @n. @n can be either a + * descriptor ID or a sequence number. 
+ */ +static struct printk_info *to_info(struct prb_desc_ring *desc_ring, u64 n) +{ + return &desc_ring->infos[DESC_INDEX(desc_ring, n)]; +} + +static struct prb_data_block *to_block(struct prb_data_ring *data_ring, + unsigned long begin_lpos) +{ + return (void *)&data_ring->data[DATA_INDEX(data_ring, begin_lpos)]; +} + +/* + * Increase the data size to account for data block meta data plus any + * padding so that the adjacent data block is aligned on the ID size. + */ +static unsigned int to_blk_size(unsigned int size) +{ + struct prb_data_block *db = NULL; + + size += sizeof(*db); + size = ALIGN(size, sizeof(db->id)); + return size; +} + +/* + * Sanity checker for reserve size. The ringbuffer code assumes that a data + * block does not exceed the maximum possible size that could fit within the + * ringbuffer. This function provides that basic size check so that the + * assumption is safe. + */ +static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size) +{ + struct prb_data_block *db = NULL; + + if (size == 0) + return true; + + /* + * Ensure the alignment padded size could possibly fit in the data + * array. The largest possible data block must still leave room for + * at least the ID of the next block. + */ + size = to_blk_size(size); + if (size > DATA_SIZE(data_ring) - sizeof(db->id)) + return false; + + return true; +} + +/* Query the state of a descriptor. */ +static enum desc_state get_desc_state(unsigned long id, + unsigned long state_val) +{ + if (id != DESC_ID(state_val)) + return desc_miss; + + return DESC_STATE(state_val); +} + +/* + * Get a copy of a specified descriptor and return its queried state. If the + * descriptor is in an inconsistent state (miss or reserved), the caller can + * only expect the descriptor's @state_var field to be valid. + * + * The sequence number and caller_id can be optionally retrieved. Like all + * non-state_var data, they are only valid if the descriptor is in a + * consistent state. + */ +static enum desc_state desc_read(struct prb_desc_ring *desc_ring, + unsigned long id, struct prb_desc *desc_out, + u64 *seq_out, u32 *caller_id_out) +{ + struct printk_info *info = to_info(desc_ring, id); + struct prb_desc *desc = to_desc(desc_ring, id); + atomic_long_t *state_var = &desc->state_var; + enum desc_state d_state; + unsigned long state_val; + + /* Check the descriptor state. */ + state_val = atomic_long_read(state_var); /* LMM(desc_read:A) */ + d_state = get_desc_state(id, state_val); + if (d_state == desc_miss || d_state == desc_reserved) { + /* + * The descriptor is in an inconsistent state. Set at least + * @state_var so that the caller can see the details of + * the inconsistent state. + */ + goto out; + } + + /* + * Guarantee the state is loaded before copying the descriptor + * content. This avoids copying obsolete descriptor content that might + * not apply to the descriptor state. This pairs with _prb_commit:B. + * + * Memory barrier involvement: + * + * If desc_read:A reads from _prb_commit:B, then desc_read:C reads + * from _prb_commit:A. + * + * Relies on: + * + * WMB from _prb_commit:A to _prb_commit:B + * matching + * RMB from desc_read:A to desc_read:C + */ + smp_rmb(); /* LMM(desc_read:B) */ + + /* + * Copy the descriptor data. The data is not valid until the + * state has been re-checked. A memcpy() for all of @desc + * cannot be used because of the atomic_t @state_var field. 
+ */ + memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos, + sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */ + if (seq_out) + *seq_out = info->seq; /* also part of desc_read:C */ + if (caller_id_out) + *caller_id_out = info->caller_id; /* also part of desc_read:C */ + + /* + * 1. Guarantee the descriptor content is loaded before re-checking + * the state. This avoids reading an obsolete descriptor state + * that may not apply to the copied content. This pairs with + * desc_reserve:F. + * + * Memory barrier involvement: + * + * If desc_read:C reads from desc_reserve:G, then desc_read:E + * reads from desc_reserve:F. + * + * Relies on: + * + * WMB from desc_reserve:F to desc_reserve:G + * matching + * RMB from desc_read:C to desc_read:E + * + * 2. Guarantee the record data is loaded before re-checking the + * state. This avoids reading an obsolete descriptor state that may + * not apply to the copied data. This pairs with data_alloc:A and + * data_realloc:A. + * + * Memory barrier involvement: + * + * If copy_data:A reads from data_alloc:B, then desc_read:E + * reads from desc_make_reusable:A. + * + * Relies on: + * + * MB from desc_make_reusable:A to data_alloc:B + * matching + * RMB from desc_read:C to desc_read:E + * + * Note: desc_make_reusable:A and data_alloc:B can be different + * CPUs. However, the data_alloc:B CPU (which performs the + * full memory barrier) must have previously seen + * desc_make_reusable:A. + */ + smp_rmb(); /* LMM(desc_read:D) */ + + /* + * The data has been copied. Return the current descriptor state, + * which may have changed since the load above. + */ + state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */ + d_state = get_desc_state(id, state_val); +out: + atomic_long_set(&desc_out->state_var, state_val); + return d_state; +} + +/* + * Take a specified descriptor out of the finalized state by attempting + * the transition from finalized to reusable. Either this context or some + * other context will have been successful. + */ +static void desc_make_reusable(struct prb_desc_ring *desc_ring, + unsigned long id) +{ + unsigned long val_finalized = DESC_SV(id, desc_finalized); + unsigned long val_reusable = DESC_SV(id, desc_reusable); + struct prb_desc *desc = to_desc(desc_ring, id); + atomic_long_t *state_var = &desc->state_var; + + atomic_long_cmpxchg_relaxed(state_var, val_finalized, + val_reusable); /* LMM(desc_make_reusable:A) */ +} + +/* + * Given the text data ring, put the associated descriptor of each + * data block from @lpos_begin until @lpos_end into the reusable state. + * + * If there is any problem making the associated descriptor reusable, either + * the descriptor has not yet been finalized or another writer context has + * already pushed the tail lpos past the problematic data block. Regardless, + * on error the caller can re-load the tail lpos to determine the situation. + */ +static bool data_make_reusable(struct printk_ringbuffer *rb, + struct prb_data_ring *data_ring, + unsigned long lpos_begin, + unsigned long lpos_end, + unsigned long *lpos_out) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + struct prb_data_block *blk; + enum desc_state d_state; + struct prb_desc desc; + struct prb_data_blk_lpos *blk_lpos = &desc.text_blk_lpos; + unsigned long id; + + /* Loop until @lpos_begin has advanced to or beyond @lpos_end. */ + while ((lpos_end - lpos_begin) - 1 < DATA_SIZE(data_ring)) { + blk = to_block(data_ring, lpos_begin); + + /* + * Load the block ID from the data block. 
This is a data race + * against a writer that may have newly reserved this data + * area. If the loaded value matches a valid descriptor ID, + * the blk_lpos of that descriptor will be checked to make + * sure it points back to this data block. If the check fails, + * the data area has been recycled by another writer. + */ + id = blk->id; /* LMM(data_make_reusable:A) */ + + d_state = desc_read(desc_ring, id, &desc, + NULL, NULL); /* LMM(data_make_reusable:B) */ + + switch (d_state) { + case desc_miss: + case desc_reserved: + case desc_committed: + return false; + case desc_finalized: + /* + * This data block is invalid if the descriptor + * does not point back to it. + */ + if (blk_lpos->begin != lpos_begin) + return false; + desc_make_reusable(desc_ring, id); + break; + case desc_reusable: + /* + * This data block is invalid if the descriptor + * does not point back to it. + */ + if (blk_lpos->begin != lpos_begin) + return false; + break; + } + + /* Advance @lpos_begin to the next data block. */ + lpos_begin = blk_lpos->next; + } + + *lpos_out = lpos_begin; + return true; +} + +/* + * Advance the data ring tail to at least @lpos. This function puts + * descriptors into the reusable state if the tail is pushed beyond + * their associated data block. + */ +static bool data_push_tail(struct printk_ringbuffer *rb, + struct prb_data_ring *data_ring, + unsigned long lpos) +{ + unsigned long tail_lpos_new; + unsigned long tail_lpos; + unsigned long next_lpos; + + /* If @lpos is from a data-less block, there is nothing to do. */ + if (LPOS_DATALESS(lpos)) + return true; + + /* + * Any descriptor states that have transitioned to reusable due to the + * data tail being pushed to this loaded value will be visible to this + * CPU. This pairs with data_push_tail:D. + * + * Memory barrier involvement: + * + * If data_push_tail:A reads from data_push_tail:D, then this CPU can + * see desc_make_reusable:A. + * + * Relies on: + * + * MB from desc_make_reusable:A to data_push_tail:D + * matches + * READFROM from data_push_tail:D to data_push_tail:A + * thus + * READFROM from desc_make_reusable:A to this CPU + */ + tail_lpos = atomic_long_read(&data_ring->tail_lpos); /* LMM(data_push_tail:A) */ + + /* + * Loop until the tail lpos is at or beyond @lpos. This condition + * may already be satisfied, resulting in no full memory barrier + * from data_push_tail:D being performed. However, since this CPU + * sees the new tail lpos, any descriptor states that transitioned to + * the reusable state must already be visible. + */ + while ((lpos - tail_lpos) - 1 < DATA_SIZE(data_ring)) { + /* + * Make all descriptors reusable that are associated with + * data blocks before @lpos. + */ + if (!data_make_reusable(rb, data_ring, tail_lpos, lpos, + &next_lpos)) { + /* + * 1. Guarantee the block ID loaded in + * data_make_reusable() is performed before + * reloading the tail lpos. The failed + * data_make_reusable() may be due to a newly + * recycled data area causing the tail lpos to + * have been previously pushed. This pairs with + * data_alloc:A and data_realloc:A. + * + * Memory barrier involvement: + * + * If data_make_reusable:A reads from data_alloc:B, + * then data_push_tail:C reads from + * data_push_tail:D. + * + * Relies on: + * + * MB from data_push_tail:D to data_alloc:B + * matching + * RMB from data_make_reusable:A to + * data_push_tail:C + * + * Note: data_push_tail:D and data_alloc:B can be + * different CPUs. 
However, the data_alloc:B + * CPU (which performs the full memory + * barrier) must have previously seen + * data_push_tail:D. + * + * 2. Guarantee the descriptor state loaded in + * data_make_reusable() is performed before + * reloading the tail lpos. The failed + * data_make_reusable() may be due to a newly + * recycled descriptor causing the tail lpos to + * have been previously pushed. This pairs with + * desc_reserve:D. + * + * Memory barrier involvement: + * + * If data_make_reusable:B reads from + * desc_reserve:F, then data_push_tail:C reads + * from data_push_tail:D. + * + * Relies on: + * + * MB from data_push_tail:D to desc_reserve:F + * matching + * RMB from data_make_reusable:B to + * data_push_tail:C + * + * Note: data_push_tail:D and desc_reserve:F can + * be different CPUs. However, the + * desc_reserve:F CPU (which performs the + * full memory barrier) must have previously + * seen data_push_tail:D. + */ + smp_rmb(); /* LMM(data_push_tail:B) */ + + tail_lpos_new = atomic_long_read(&data_ring->tail_lpos + ); /* LMM(data_push_tail:C) */ + if (tail_lpos_new == tail_lpos) + return false; + + /* Another CPU pushed the tail. Try again. */ + tail_lpos = tail_lpos_new; + continue; + } + + /* + * Guarantee any descriptor states that have transitioned to + * reusable are stored before pushing the tail lpos. A full + * memory barrier is needed since other CPUs may have made + * the descriptor states reusable. This pairs with + * data_push_tail:A. + */ + if (atomic_long_try_cmpxchg(&data_ring->tail_lpos, &tail_lpos, + next_lpos)) { /* LMM(data_push_tail:D) */ + break; + } + } + + return true; +} + +/* + * Advance the desc ring tail. This function advances the tail by one + * descriptor, thus invalidating the oldest descriptor. Before advancing + * the tail, the tail descriptor is made reusable and all data blocks up to + * and including the descriptor's data block are invalidated (i.e. the data + * ring tail is pushed past the data block of the descriptor being made + * reusable). + */ +static bool desc_push_tail(struct printk_ringbuffer *rb, + unsigned long tail_id) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + enum desc_state d_state; + struct prb_desc desc; + + d_state = desc_read(desc_ring, tail_id, &desc, NULL, NULL); + + switch (d_state) { + case desc_miss: + /* + * If the ID is exactly 1 wrap behind the expected, it is + * in the process of being reserved by another writer and + * must be considered reserved. + */ + if (DESC_ID(atomic_long_read(&desc.state_var)) == + DESC_ID_PREV_WRAP(desc_ring, tail_id)) { + return false; + } + + /* + * The ID has changed. Another writer must have pushed the + * tail and recycled the descriptor already. Success is + * returned because the caller is only interested in the + * specified tail being pushed, which it was. + */ + return true; + case desc_reserved: + case desc_committed: + return false; + case desc_finalized: + desc_make_reusable(desc_ring, tail_id); + break; + case desc_reusable: + break; + } + + /* + * Data blocks must be invalidated before their associated + * descriptor can be made available for recycling. Invalidating + * them later is not possible because there is no way to trust + * data blocks once their associated descriptor is gone. + */ + + if (!data_push_tail(rb, &rb->text_data_ring, desc.text_blk_lpos.next)) + return false; + + /* + * Check the next descriptor after @tail_id before pushing the tail + * to it because the tail must always be in a finalized or reusable + * state. 
The implementation of prb_first_seq() relies on this. + * + * A successful read implies that the next descriptor is less than or + * equal to @head_id so there is no risk of pushing the tail past the + * head. + */ + d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc, + NULL, NULL); /* LMM(desc_push_tail:A) */ + + if (d_state == desc_finalized || d_state == desc_reusable) { + /* + * Guarantee any descriptor states that have transitioned to + * reusable are stored before pushing the tail ID. This allows + * verifying the recycled descriptor state. A full memory + * barrier is needed since other CPUs may have made the + * descriptor states reusable. This pairs with desc_reserve:D. + */ + atomic_long_cmpxchg(&desc_ring->tail_id, tail_id, + DESC_ID(tail_id + 1)); /* LMM(desc_push_tail:B) */ + } else { + /* + * Guarantee the last state load from desc_read() is before + * reloading @tail_id in order to see a new tail ID in the + * case that the descriptor has been recycled. This pairs + * with desc_reserve:D. + * + * Memory barrier involvement: + * + * If desc_push_tail:A reads from desc_reserve:F, then + * desc_push_tail:D reads from desc_push_tail:B. + * + * Relies on: + * + * MB from desc_push_tail:B to desc_reserve:F + * matching + * RMB from desc_push_tail:A to desc_push_tail:D + * + * Note: desc_push_tail:B and desc_reserve:F can be different + * CPUs. However, the desc_reserve:F CPU (which performs + * the full memory barrier) must have previously seen + * desc_push_tail:B. + */ + smp_rmb(); /* LMM(desc_push_tail:C) */ + + /* + * Re-check the tail ID. The descriptor following @tail_id is + * not in an allowed tail state. But if the tail has since + * been moved by another CPU, then it does not matter. + */ + if (atomic_long_read(&desc_ring->tail_id) == tail_id) /* LMM(desc_push_tail:D) */ + return false; + } + + return true; +} + +/* Reserve a new descriptor, invalidating the oldest if necessary. */ +static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + unsigned long prev_state_val; + unsigned long id_prev_wrap; + struct prb_desc *desc; + unsigned long head_id; + unsigned long id; + + head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */ + + do { + desc = to_desc(desc_ring, head_id); + + id = DESC_ID(head_id + 1); + id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id); + + /* + * Guarantee the head ID is read before reading the tail ID. + * Since the tail ID is updated before the head ID, this + * guarantees that @id_prev_wrap is never ahead of the tail + * ID. This pairs with desc_reserve:D. + * + * Memory barrier involvement: + * + * If desc_reserve:A reads from desc_reserve:D, then + * desc_reserve:C reads from desc_push_tail:B. + * + * Relies on: + * + * MB from desc_push_tail:B to desc_reserve:D + * matching + * RMB from desc_reserve:A to desc_reserve:C + * + * Note: desc_push_tail:B and desc_reserve:D can be different + * CPUs. However, the desc_reserve:D CPU (which performs + * the full memory barrier) must have previously seen + * desc_push_tail:B. + */ + smp_rmb(); /* LMM(desc_reserve:B) */ + + if (id_prev_wrap == atomic_long_read(&desc_ring->tail_id + )) { /* LMM(desc_reserve:C) */ + /* + * Make space for the new descriptor by + * advancing the tail. + */ + if (!desc_push_tail(rb, id_prev_wrap)) + return false; + } + + /* + * 1. Guarantee the tail ID is read before validating the + * recycled descriptor state. A read memory barrier is + * sufficient for this. 
This pairs with desc_push_tail:B. + * + * Memory barrier involvement: + * + * If desc_reserve:C reads from desc_push_tail:B, then + * desc_reserve:E reads from desc_make_reusable:A. + * + * Relies on: + * + * MB from desc_make_reusable:A to desc_push_tail:B + * matching + * RMB from desc_reserve:C to desc_reserve:E + * + * Note: desc_make_reusable:A and desc_push_tail:B can be + * different CPUs. However, the desc_push_tail:B CPU + * (which performs the full memory barrier) must have + * previously seen desc_make_reusable:A. + * + * 2. Guarantee the tail ID is stored before storing the head + * ID. This pairs with desc_reserve:B. + * + * 3. Guarantee any data ring tail changes are stored before + * recycling the descriptor. Data ring tail changes can + * happen via desc_push_tail()->data_push_tail(). A full + * memory barrier is needed since another CPU may have + * pushed the data ring tails. This pairs with + * data_push_tail:B. + * + * 4. Guarantee a new tail ID is stored before recycling the + * descriptor. A full memory barrier is needed since + * another CPU may have pushed the tail ID. This pairs + * with desc_push_tail:C and this also pairs with + * prb_first_seq:C. + * + * 5. Guarantee the head ID is stored before trying to + * finalize the previous descriptor. This pairs with + * _prb_commit:B. + */ + } while (!atomic_long_try_cmpxchg(&desc_ring->head_id, &head_id, + id)); /* LMM(desc_reserve:D) */ + + desc = to_desc(desc_ring, id); + + /* + * If the descriptor has been recycled, verify the old state val. + * See "ABA Issues" about why this verification is performed. + */ + prev_state_val = atomic_long_read(&desc->state_var); /* LMM(desc_reserve:E) */ + if (prev_state_val && + get_desc_state(id_prev_wrap, prev_state_val) != desc_reusable) { + WARN_ON_ONCE(1); + return false; + } + + /* + * Assign the descriptor a new ID and set its state to reserved. + * See "ABA Issues" about why cmpxchg() instead of set() is used. + * + * Guarantee the new descriptor ID and state is stored before making + * any other changes. A write memory barrier is sufficient for this. + * This pairs with desc_read:D. + */ + if (!atomic_long_try_cmpxchg(&desc->state_var, &prev_state_val, + DESC_SV(id, desc_reserved))) { /* LMM(desc_reserve:F) */ + WARN_ON_ONCE(1); + return false; + } + + /* Now data in @desc can be modified: LMM(desc_reserve:G) */ + + *id_out = id; + return true; +} + +/* Determine the end of a data block. */ +static unsigned long get_next_lpos(struct prb_data_ring *data_ring, + unsigned long lpos, unsigned int size) +{ + unsigned long begin_lpos; + unsigned long next_lpos; + + begin_lpos = lpos; + next_lpos = lpos + size; + + /* First check if the data block does not wrap. */ + if (DATA_WRAPS(data_ring, begin_lpos) == DATA_WRAPS(data_ring, next_lpos)) + return next_lpos; + + /* Wrapping data blocks store their data at the beginning. */ + return (DATA_THIS_WRAP_START_LPOS(data_ring, next_lpos) + size); +} + +/* + * Allocate a new data block, invalidating the oldest data block(s) + * if necessary. This function also associates the data block with + * a specified descriptor. + */ +static char *data_alloc(struct printk_ringbuffer *rb, + struct prb_data_ring *data_ring, unsigned int size, + struct prb_data_blk_lpos *blk_lpos, unsigned long id) +{ + struct prb_data_block *blk; + unsigned long begin_lpos; + unsigned long next_lpos; + + if (size == 0) { + /* Specify a data-less block. 
*/ + blk_lpos->begin = NO_LPOS; + blk_lpos->next = NO_LPOS; + return NULL; + } + + size = to_blk_size(size); + + begin_lpos = atomic_long_read(&data_ring->head_lpos); + + do { + next_lpos = get_next_lpos(data_ring, begin_lpos, size); + + if (!data_push_tail(rb, data_ring, next_lpos - DATA_SIZE(data_ring))) { + /* Failed to allocate, specify a data-less block. */ + blk_lpos->begin = FAILED_LPOS; + blk_lpos->next = FAILED_LPOS; + return NULL; + } + + /* + * 1. Guarantee any descriptor states that have transitioned + * to reusable are stored before modifying the newly + * allocated data area. A full memory barrier is needed + * since other CPUs may have made the descriptor states + * reusable. See data_push_tail:A about why the reusable + * states are visible. This pairs with desc_read:D. + * + * 2. Guarantee any updated tail lpos is stored before + * modifying the newly allocated data area. Another CPU may + * be in data_make_reusable() and is reading a block ID + * from this area. data_make_reusable() can handle reading + * a garbage block ID value, but then it must be able to + * load a new tail lpos. A full memory barrier is needed + * since other CPUs may have updated the tail lpos. This + * pairs with data_push_tail:B. + */ + } while (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &begin_lpos, + next_lpos)); /* LMM(data_alloc:A) */ + + blk = to_block(data_ring, begin_lpos); + blk->id = id; /* LMM(data_alloc:B) */ + + if (DATA_WRAPS(data_ring, begin_lpos) != DATA_WRAPS(data_ring, next_lpos)) { + /* Wrapping data blocks store their data at the beginning. */ + blk = to_block(data_ring, 0); + + /* + * Store the ID on the wrapped block for consistency. + * The printk_ringbuffer does not actually use it. + */ + blk->id = id; + } + + blk_lpos->begin = begin_lpos; + blk_lpos->next = next_lpos; + + return &blk->data[0]; +} + +/* + * Try to resize an existing data block associated with the descriptor + * specified by @id. If the resized data block should become wrapped, it + * copies the old data to the new data block. If @size yields a data block + * with the same or less size, the data block is left as is. + * + * Fail if this is not the last allocated data block or if there is not + * enough space or it is not possible make enough space. + * + * Return a pointer to the beginning of the entire data buffer or NULL on + * failure. + */ +static char *data_realloc(struct printk_ringbuffer *rb, + struct prb_data_ring *data_ring, unsigned int size, + struct prb_data_blk_lpos *blk_lpos, unsigned long id) +{ + struct prb_data_block *blk; + unsigned long head_lpos; + unsigned long next_lpos; + bool wrapped; + + /* Reallocation only works if @blk_lpos is the newest data block. */ + head_lpos = atomic_long_read(&data_ring->head_lpos); + if (head_lpos != blk_lpos->next) + return NULL; + + /* Keep track if @blk_lpos was a wrapping data block. */ + wrapped = (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, blk_lpos->next)); + + size = to_blk_size(size); + + next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size); + + /* If the data block does not increase, there is nothing to do. */ + if (head_lpos - next_lpos < DATA_SIZE(data_ring)) { + blk = to_block(data_ring, blk_lpos->begin); + return &blk->data[0]; + } + + if (!data_push_tail(rb, data_ring, next_lpos - DATA_SIZE(data_ring))) + return NULL; + + /* The memory barrier involvement is the same as data_alloc:A. 
*/ + if (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &head_lpos, + next_lpos)) { /* LMM(data_realloc:A) */ + return NULL; + } + + blk = to_block(data_ring, blk_lpos->begin); + + if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, next_lpos)) { + struct prb_data_block *old_blk = blk; + + /* Wrapping data blocks store their data at the beginning. */ + blk = to_block(data_ring, 0); + + /* + * Store the ID on the wrapped block for consistency. + * The printk_ringbuffer does not actually use it. + */ + blk->id = id; + + if (!wrapped) { + /* + * Since the allocated space is now in the newly + * created wrapping data block, copy the content + * from the old data block. + */ + memcpy(&blk->data[0], &old_blk->data[0], + (blk_lpos->next - blk_lpos->begin) - sizeof(blk->id)); + } + } + + blk_lpos->next = next_lpos; + + return &blk->data[0]; +} + +/* Return the number of bytes used by a data block. */ +static unsigned int space_used(struct prb_data_ring *data_ring, + struct prb_data_blk_lpos *blk_lpos) +{ + /* Data-less blocks take no space. */ + if (BLK_DATALESS(blk_lpos)) + return 0; + + if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next)) { + /* Data block does not wrap. */ + return (DATA_INDEX(data_ring, blk_lpos->next) - + DATA_INDEX(data_ring, blk_lpos->begin)); + } + + /* + * For wrapping data blocks, the trailing (wasted) space is + * also counted. + */ + return (DATA_INDEX(data_ring, blk_lpos->next) + + DATA_SIZE(data_ring) - DATA_INDEX(data_ring, blk_lpos->begin)); +} + +/* + * Given @blk_lpos, return a pointer to the writer data from the data block + * and calculate the size of the data part. A NULL pointer is returned if + * @blk_lpos specifies values that could never be legal. + * + * This function (used by readers) performs strict validation on the lpos + * values to possibly detect bugs in the writer code. A WARN_ON_ONCE() is + * triggered if an internal error is detected. + */ +static const char *get_data(struct prb_data_ring *data_ring, + struct prb_data_blk_lpos *blk_lpos, + unsigned int *data_size) +{ + struct prb_data_block *db; + + /* Data-less data block description. */ + if (BLK_DATALESS(blk_lpos)) { + if (blk_lpos->begin == NO_LPOS && blk_lpos->next == NO_LPOS) { + *data_size = 0; + return ""; + } + return NULL; + } + + /* Regular data block: @begin less than @next and in same wrap. */ + if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) && + blk_lpos->begin < blk_lpos->next) { + db = to_block(data_ring, blk_lpos->begin); + *data_size = blk_lpos->next - blk_lpos->begin; + + /* Wrapping data block: @begin is one wrap behind @next. */ + } else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) == + DATA_WRAPS(data_ring, blk_lpos->next)) { + db = to_block(data_ring, 0); + *data_size = DATA_INDEX(data_ring, blk_lpos->next); + + /* Illegal block description. */ + } else { + WARN_ON_ONCE(1); + return NULL; + } + + /* A valid data block will always be aligned to the ID size. */ + if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) || + WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) { + return NULL; + } + + /* A valid data block will always have at least an ID. */ + if (WARN_ON_ONCE(*data_size < sizeof(db->id))) + return NULL; + + /* Subtract block ID space from size to reflect data size. 
*/ + *data_size -= sizeof(db->id); + + return &db->data[0]; +} + +/* + * Attempt to transition the newest descriptor from committed back to reserved + * so that the record can be modified by a writer again. This is only possible + * if the descriptor is not yet finalized and the provided @caller_id matches. + */ +static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, + u32 caller_id, unsigned long *id_out) +{ + unsigned long prev_state_val; + enum desc_state d_state; + struct prb_desc desc; + struct prb_desc *d; + unsigned long id; + u32 cid; + + id = atomic_long_read(&desc_ring->head_id); + + /* + * To reduce unnecessarily reopening, first check if the descriptor + * state and caller ID are correct. + */ + d_state = desc_read(desc_ring, id, &desc, NULL, &cid); + if (d_state != desc_committed || cid != caller_id) + return NULL; + + d = to_desc(desc_ring, id); + + prev_state_val = DESC_SV(id, desc_committed); + + /* + * Guarantee the reserved state is stored before reading any + * record data. A full memory barrier is needed because @state_var + * modification is followed by reading. This pairs with _prb_commit:B. + * + * Memory barrier involvement: + * + * If desc_reopen_last:A reads from _prb_commit:B, then + * prb_reserve_in_last:A reads from _prb_commit:A. + * + * Relies on: + * + * WMB from _prb_commit:A to _prb_commit:B + * matching + * MB If desc_reopen_last:A to prb_reserve_in_last:A + */ + if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, + DESC_SV(id, desc_reserved))) { /* LMM(desc_reopen_last:A) */ + return NULL; + } + + *id_out = id; + return d; +} + +/** + * prb_reserve_in_last() - Re-reserve and extend the space in the ringbuffer + * used by the newest record. + * + * @e: The entry structure to setup. + * @rb: The ringbuffer to re-reserve and extend data in. + * @r: The record structure to allocate buffers for. + * @caller_id: The caller ID of the caller (reserving writer). + * @max_size: Fail if the extended size would be greater than this. + * + * This is the public function available to writers to re-reserve and extend + * data. + * + * The writer specifies the text size to extend (not the new total size) by + * setting the @text_buf_size field of @r. To ensure proper initialization + * of @r, prb_rec_init_wr() should be used. + * + * This function will fail if @caller_id does not match the caller ID of the + * newest record. In that case the caller must reserve new data using + * prb_reserve(). + * + * Context: Any context. Disables local interrupts on success. + * Return: true if text data could be extended, otherwise false. + * + * On success: + * + * - @r->text_buf points to the beginning of the entire text buffer. + * + * - @r->text_buf_size is set to the new total size of the buffer. + * + * - @r->info is not touched so that @r->info->text_len could be used + * to append the text. + * + * - prb_record_text_space() can be used on @e to query the new + * actually used space. + * + * Important: All @r->info fields will already be set with the current values + * for the record. I.e. @r->info->text_len will be less than + * @text_buf_size. Writers can use @r->info->text_len to know + * where concatenation begins and writers should update + * @r->info->text_len after concatenating. 
+ */ +bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, + struct printk_record *r, u32 caller_id, unsigned int max_size) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + struct printk_info *info; + unsigned int data_size; + struct prb_desc *d; + unsigned long id; + + local_irq_save(e->irqflags); + + /* Transition the newest descriptor back to the reserved state. */ + d = desc_reopen_last(desc_ring, caller_id, &id); + if (!d) { + local_irq_restore(e->irqflags); + goto fail_reopen; + } + + /* Now the writer has exclusive access: LMM(prb_reserve_in_last:A) */ + + info = to_info(desc_ring, id); + + /* + * Set the @e fields here so that prb_commit() can be used if + * anything fails from now on. + */ + e->rb = rb; + e->id = id; + + /* + * desc_reopen_last() checked the caller_id, but there was no + * exclusive access at that point. The descriptor may have + * changed since then. + */ + if (caller_id != info->caller_id) + goto fail; + + if (BLK_DATALESS(&d->text_blk_lpos)) { + if (WARN_ON_ONCE(info->text_len != 0)) { + pr_warn_once("wrong text_len value (%hu, expecting 0)\n", + info->text_len); + info->text_len = 0; + } + + if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) + goto fail; + + if (r->text_buf_size > max_size) + goto fail; + + r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size, + &d->text_blk_lpos, id); + } else { + if (!get_data(&rb->text_data_ring, &d->text_blk_lpos, &data_size)) + goto fail; + + /* + * Increase the buffer size to include the original size. If + * the meta data (@text_len) is not sane, use the full data + * block size. + */ + if (WARN_ON_ONCE(info->text_len > data_size)) { + pr_warn_once("wrong text_len value (%hu, expecting <=%u)\n", + info->text_len, data_size); + info->text_len = data_size; + } + r->text_buf_size += info->text_len; + + if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) + goto fail; + + if (r->text_buf_size > max_size) + goto fail; + + r->text_buf = data_realloc(rb, &rb->text_data_ring, r->text_buf_size, + &d->text_blk_lpos, id); + } + if (r->text_buf_size && !r->text_buf) + goto fail; + + r->info = info; + + e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); + + return true; +fail: + prb_commit(e); + /* prb_commit() re-enabled interrupts. */ +fail_reopen: + /* Make it clear to the caller that the re-reserve failed. */ + memset(r, 0, sizeof(*r)); + return false; +} + +/* + * Attempt to finalize a specified descriptor. If this fails, the descriptor + * is either already final or it will finalize itself when the writer commits. + */ +static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id) +{ + unsigned long prev_state_val = DESC_SV(id, desc_committed); + struct prb_desc *d = to_desc(desc_ring, id); + + atomic_long_cmpxchg_relaxed(&d->state_var, prev_state_val, + DESC_SV(id, desc_finalized)); /* LMM(desc_make_final:A) */ +} + +/** + * prb_reserve() - Reserve space in the ringbuffer. + * + * @e: The entry structure to setup. + * @rb: The ringbuffer to reserve data in. + * @r: The record structure to allocate buffers for. + * + * This is the public function available to writers to reserve data. + * + * The writer specifies the text size to reserve by setting the + * @text_buf_size field of @r. To ensure proper initialization of @r, + * prb_rec_init_wr() should be used. + * + * Context: Any context. Disables local interrupts on success. + * Return: true if at least text data could be allocated, otherwise false. 
+ * + * On success, the fields @info and @text_buf of @r will be set by this + * function and should be filled in by the writer before committing. Also + * on success, prb_record_text_space() can be used on @e to query the actual + * space used for the text data block. + * + * Important: @info->text_len needs to be set correctly by the writer in + * order for data to be readable and/or extended. Its value + * is initialized to 0. + */ +bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, + struct printk_record *r) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + struct printk_info *info; + struct prb_desc *d; + unsigned long id; + u64 seq; + + if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) + goto fail; + + /* + * Descriptors in the reserved state act as blockers to all further + * reservations once the desc_ring has fully wrapped. Disable + * interrupts during the reserve/commit window in order to minimize + * the likelihood of this happening. + */ + local_irq_save(e->irqflags); + + if (!desc_reserve(rb, &id)) { + /* Descriptor reservation failures are tracked. */ + atomic_long_inc(&rb->fail); + local_irq_restore(e->irqflags); + goto fail; + } + + d = to_desc(desc_ring, id); + info = to_info(desc_ring, id); + + /* + * All @info fields (except @seq) are cleared and must be filled in + * by the writer. Save @seq before clearing because it is used to + * determine the new sequence number. + */ + seq = info->seq; + memset(info, 0, sizeof(*info)); + + /* + * Set the @e fields here so that prb_commit() can be used if + * text data allocation fails. + */ + e->rb = rb; + e->id = id; + + /* + * Initialize the sequence number if it has "never been set". + * Otherwise just increment it by a full wrap. + * + * @seq is considered "never been set" if it has a value of 0, + * _except_ for @infos[0], which was specially setup by the ringbuffer + * initializer and therefore is always considered as set. + * + * See the "Bootstrap" comment block in printk_ringbuffer.h for + * details about how the initializer bootstraps the descriptors. + */ + if (seq == 0 && DESC_INDEX(desc_ring, id) != 0) + info->seq = DESC_INDEX(desc_ring, id); + else + info->seq = seq + DESCS_COUNT(desc_ring); + + /* + * New data is about to be reserved. Once that happens, previous + * descriptors are no longer able to be extended. Finalize the + * previous descriptor now so that it can be made available to + * readers. (For seq==0 there is no previous descriptor.) + */ + if (info->seq > 0) + desc_make_final(desc_ring, DESC_ID(id - 1)); + + r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size, + &d->text_blk_lpos, id); + /* If text data allocation fails, a data-less record is committed. */ + if (r->text_buf_size && !r->text_buf) { + prb_commit(e); + /* prb_commit() re-enabled interrupts. */ + goto fail; + } + + r->info = info; + + /* Record full text space used by record. */ + e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); + + return true; +fail: + /* Make it clear to the caller that the reserve failed. */ + memset(r, 0, sizeof(*r)); + return false; +} + +/* Commit the data (possibly finalizing it) and restore interrupts. 
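+ *
+ * @state_val selects the resulting descriptor state: desc_committed when
+ * called via prb_commit() and desc_finalized when called via
+ * prb_final_commit().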
*/ +static void _prb_commit(struct prb_reserved_entry *e, unsigned long state_val) +{ + struct prb_desc_ring *desc_ring = &e->rb->desc_ring; + struct prb_desc *d = to_desc(desc_ring, e->id); + unsigned long prev_state_val = DESC_SV(e->id, desc_reserved); + + /* Now the writer has finished all writing: LMM(_prb_commit:A) */ + + /* + * Set the descriptor as committed. See "ABA Issues" about why + * cmpxchg() instead of set() is used. + * + * 1 Guarantee all record data is stored before the descriptor state + * is stored as committed. A write memory barrier is sufficient + * for this. This pairs with desc_read:B and desc_reopen_last:A. + * + * 2. Guarantee the descriptor state is stored as committed before + * re-checking the head ID in order to possibly finalize this + * descriptor. This pairs with desc_reserve:D. + * + * Memory barrier involvement: + * + * If prb_commit:A reads from desc_reserve:D, then + * desc_make_final:A reads from _prb_commit:B. + * + * Relies on: + * + * MB _prb_commit:B to prb_commit:A + * matching + * MB desc_reserve:D to desc_make_final:A + */ + if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, + DESC_SV(e->id, state_val))) { /* LMM(_prb_commit:B) */ + WARN_ON_ONCE(1); + } + + /* Restore interrupts, the reserve/commit window is finished. */ + local_irq_restore(e->irqflags); +} + +/** + * prb_commit() - Commit (previously reserved) data to the ringbuffer. + * + * @e: The entry containing the reserved data information. + * + * This is the public function available to writers to commit data. + * + * Note that the data is not yet available to readers until it is finalized. + * Finalizing happens automatically when space for the next record is + * reserved. + * + * See prb_final_commit() for a version of this function that finalizes + * immediately. + * + * Context: Any context. Enables local interrupts. + */ +void prb_commit(struct prb_reserved_entry *e) +{ + struct prb_desc_ring *desc_ring = &e->rb->desc_ring; + unsigned long head_id; + + _prb_commit(e, desc_committed); + + /* + * If this descriptor is no longer the head (i.e. a new record has + * been allocated), extending the data for this record is no longer + * allowed and therefore it must be finalized. + */ + head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */ + if (head_id != e->id) + desc_make_final(desc_ring, e->id); +} + +/** + * prb_final_commit() - Commit and finalize (previously reserved) data to + * the ringbuffer. + * + * @e: The entry containing the reserved data information. + * + * This is the public function available to writers to commit+finalize data. + * + * By finalizing, the data is made immediately available to readers. + * + * This function should only be used if there are no intentions of extending + * this data using prb_reserve_in_last(). + * + * Context: Any context. Enables local interrupts. + */ +void prb_final_commit(struct prb_reserved_entry *e) +{ + _prb_commit(e, desc_finalized); +} + +/* + * Count the number of lines in provided text. All text has at least 1 line + * (even if @text_size is 0). Each '\n' processed is counted as an additional + * line. 
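+ *
+ * For example (illustrative values): count_lines("a\nb", 3) returns 2 and
+ * count_lines("", 0) returns 1.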
+ */ +static unsigned int count_lines(const char *text, unsigned int text_size) +{ + unsigned int next_size = text_size; + unsigned int line_count = 1; + const char *next = text; + + while (next_size) { + next = memchr(next, '\n', next_size); + if (!next) + break; + line_count++; + next++; + next_size = text_size - (next - text); + } + + return line_count; +} + +/* + * Given @blk_lpos, copy an expected @len of data into the provided buffer. + * If @line_count is provided, count the number of lines in the data. + * + * This function (used by readers) performs strict validation on the data + * size to possibly detect bugs in the writer code. A WARN_ON_ONCE() is + * triggered if an internal error is detected. + */ +static bool copy_data(struct prb_data_ring *data_ring, + struct prb_data_blk_lpos *blk_lpos, u16 len, char *buf, + unsigned int buf_size, unsigned int *line_count) +{ + unsigned int data_size; + const char *data; + + /* Caller might not want any data. */ + if ((!buf || !buf_size) && !line_count) + return true; + + data = get_data(data_ring, blk_lpos, &data_size); + if (!data) + return false; + + /* + * Actual cannot be less than expected. It can be more than expected + * because of the trailing alignment padding. + * + * Note that invalid @len values can occur because the caller loads + * the value during an allowed data race. + */ + if (data_size < (unsigned int)len) + return false; + + /* Caller interested in the line count? */ + if (line_count) + *line_count = count_lines(data, data_size); + + /* Caller interested in the data content? */ + if (!buf || !buf_size) + return true; + + data_size = min_t(u16, buf_size, len); + + memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */ + return true; +} + +/* + * This is an extended version of desc_read(). It gets a copy of a specified + * descriptor. However, it also verifies that the record is finalized and has + * the sequence number @seq. On success, 0 is returned. + * + * Error return values: + * -EINVAL: A finalized record with sequence number @seq does not exist. + * -ENOENT: A finalized record with sequence number @seq exists, but its data + * is not available. This is a valid record, so readers should + * continue with the next record. + */ +static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring, + unsigned long id, u64 seq, + struct prb_desc *desc_out) +{ + struct prb_data_blk_lpos *blk_lpos = &desc_out->text_blk_lpos; + enum desc_state d_state; + u64 s; + + d_state = desc_read(desc_ring, id, desc_out, &s, NULL); + + /* + * An unexpected @id (desc_miss) or @seq mismatch means the record + * does not exist. A descriptor in the reserved or committed state + * means the record does not yet exist for the reader. + */ + if (d_state == desc_miss || + d_state == desc_reserved || + d_state == desc_committed || + s != seq) { + return -EINVAL; + } + + /* + * A descriptor in the reusable state may no longer have its data + * available; report it as existing but with lost data. Or the record + * may actually be a record with lost data. + */ + if (d_state == desc_reusable || + (blk_lpos->begin == FAILED_LPOS && blk_lpos->next == FAILED_LPOS)) { + return -ENOENT; + } + + return 0; +} + +/* + * Copy the ringbuffer data from the record with @seq to the provided + * @r buffer. On success, 0 is returned. + * + * See desc_read_finalized_seq() for error return values. 
+ */ +static int prb_read(struct printk_ringbuffer *rb, u64 seq, + struct printk_record *r, unsigned int *line_count) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + struct printk_info *info = to_info(desc_ring, seq); + struct prb_desc *rdesc = to_desc(desc_ring, seq); + atomic_long_t *state_var = &rdesc->state_var; + struct prb_desc desc; + unsigned long id; + int err; + + /* Extract the ID, used to specify the descriptor to read. */ + id = DESC_ID(atomic_long_read(state_var)); + + /* Get a local copy of the correct descriptor (if available). */ + err = desc_read_finalized_seq(desc_ring, id, seq, &desc); + + /* + * If @r is NULL, the caller is only interested in the availability + * of the record. + */ + if (err || !r) + return err; + + /* If requested, copy meta data. */ + if (r->info) + memcpy(r->info, info, sizeof(*(r->info))); + + /* Copy text data. If it fails, this is a data-less record. */ + if (!copy_data(&rb->text_data_ring, &desc.text_blk_lpos, info->text_len, + r->text_buf, r->text_buf_size, line_count)) { + return -ENOENT; + } + + /* Ensure the record is still finalized and has the same @seq. */ + return desc_read_finalized_seq(desc_ring, id, seq, &desc); +} + +/* Get the sequence number of the tail descriptor. */ +static u64 prb_first_seq(struct printk_ringbuffer *rb) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + enum desc_state d_state; + struct prb_desc desc; + unsigned long id; + u64 seq; + + for (;;) { + id = atomic_long_read(&rb->desc_ring.tail_id); /* LMM(prb_first_seq:A) */ + + d_state = desc_read(desc_ring, id, &desc, &seq, NULL); /* LMM(prb_first_seq:B) */ + + /* + * This loop will not be infinite because the tail is + * _always_ in the finalized or reusable state. + */ + if (d_state == desc_finalized || d_state == desc_reusable) + break; + + /* + * Guarantee the last state load from desc_read() is before + * reloading @tail_id in order to see a new tail in the case + * that the descriptor has been recycled. This pairs with + * desc_reserve:D. + * + * Memory barrier involvement: + * + * If prb_first_seq:B reads from desc_reserve:F, then + * prb_first_seq:A reads from desc_push_tail:B. + * + * Relies on: + * + * MB from desc_push_tail:B to desc_reserve:F + * matching + * RMB prb_first_seq:B to prb_first_seq:A + */ + smp_rmb(); /* LMM(prb_first_seq:C) */ + } + + return seq; +} + +/* + * Non-blocking read of a record. Updates @seq to the last finalized record + * (which may have no data available). + * + * See the description of prb_read_valid() and prb_read_valid_info() + * for details. + */ +static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, + struct printk_record *r, unsigned int *line_count) +{ + u64 tail_seq; + int err; + + while ((err = prb_read(rb, *seq, r, line_count))) { + tail_seq = prb_first_seq(rb); + + if (*seq < tail_seq) { + /* + * Behind the tail. Catch up and try again. This + * can happen for -ENOENT and -EINVAL cases. + */ + *seq = tail_seq; + + } else if (err == -ENOENT) { + /* Record exists, but no data available. Skip. */ + (*seq)++; + + } else { + /* Non-existent/non-finalized record. Must stop. */ + return false; + } + } + + return true; +} + +/** + * prb_read_valid() - Non-blocking read of a requested record or (if gone) + * the next available record. + * + * @rb: The ringbuffer to read from. + * @seq: The sequence number of the record to read. + * @r: A record data buffer to store the read record to. + * + * This is the public function available to readers to read a record. 
+ * + * The reader provides the @info and @text_buf buffers of @r to be + * filled in. Any of the buffer pointers can be set to NULL if the reader + * is not interested in that data. To ensure proper initialization of @r, + * prb_rec_init_rd() should be used. + * + * Context: Any context. + * Return: true if a record was read, otherwise false. + * + * On success, the reader must check r->info.seq to see which record was + * actually read. This allows the reader to detect dropped records. + * + * Failure means @seq refers to a not yet written record. + */ +bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, + struct printk_record *r) +{ + return _prb_read_valid(rb, &seq, r, NULL); +} + +/** + * prb_read_valid_info() - Non-blocking read of meta data for a requested + * record or (if gone) the next available record. + * + * @rb: The ringbuffer to read from. + * @seq: The sequence number of the record to read. + * @info: A buffer to store the read record meta data to. + * @line_count: A buffer to store the number of lines in the record text. + * + * This is the public function available to readers to read only the + * meta data of a record. + * + * The reader provides the @info, @line_count buffers to be filled in. + * Either of the buffer pointers can be set to NULL if the reader is not + * interested in that data. + * + * Context: Any context. + * Return: true if a record's meta data was read, otherwise false. + * + * On success, the reader must check info->seq to see which record meta data + * was actually read. This allows the reader to detect dropped records. + * + * Failure means @seq refers to a not yet written record. + */ +bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, + struct printk_info *info, unsigned int *line_count) +{ + struct printk_record r; + + prb_rec_init_rd(&r, info, NULL, 0); + + return _prb_read_valid(rb, &seq, &r, line_count); +} + +/** + * prb_first_valid_seq() - Get the sequence number of the oldest available + * record. + * + * @rb: The ringbuffer to get the sequence number from. + * + * This is the public function available to readers to see what the + * first/oldest valid sequence number is. + * + * This provides readers a starting point to begin iterating the ringbuffer. + * + * Context: Any context. + * Return: The sequence number of the first/oldest record or, if the + * ringbuffer is empty, 0 is returned. + */ +u64 prb_first_valid_seq(struct printk_ringbuffer *rb) +{ + u64 seq = 0; + + if (!_prb_read_valid(rb, &seq, NULL, NULL)) + return 0; + + return seq; +} + +/** + * prb_next_seq() - Get the sequence number after the last available record. + * + * @rb: The ringbuffer to get the sequence number from. + * + * This is the public function available to readers to see what the next + * newest sequence number available to readers will be. + * + * This provides readers a sequence number to jump to if all currently + * available records should be skipped. + * + * Context: Any context. + * Return: The sequence number of the next newest (not yet available) record + * for readers. + */ +u64 prb_next_seq(struct printk_ringbuffer *rb) +{ + u64 seq = 0; + + /* Search forward from the oldest descriptor. */ + while (_prb_read_valid(rb, &seq, NULL, NULL)) + seq++; + + return seq; +} + +/** + * prb_init() - Initialize a ringbuffer to use provided external buffers. + * + * @rb: The ringbuffer to initialize. + * @text_buf: The data buffer for text data. + * @textbits: The size of @text_buf as a power-of-2 value. 
+ * @descs: The descriptor buffer for ringbuffer records. + * @descbits: The count of @descs items as a power-of-2 value. + * @infos: The printk_info buffer for ringbuffer records. + * + * This is the public function available to writers to setup a ringbuffer + * during runtime using provided buffers. + * + * This must match the initialization of DEFINE_PRINTKRB(). + * + * Context: Any context. + */ +void prb_init(struct printk_ringbuffer *rb, + char *text_buf, unsigned int textbits, + struct prb_desc *descs, unsigned int descbits, + struct printk_info *infos) +{ + memset(descs, 0, _DESCS_COUNT(descbits) * sizeof(descs[0])); + memset(infos, 0, _DESCS_COUNT(descbits) * sizeof(infos[0])); + + rb->desc_ring.count_bits = descbits; + rb->desc_ring.descs = descs; + rb->desc_ring.infos = infos; + atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits)); + atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits)); + + rb->text_data_ring.size_bits = textbits; + rb->text_data_ring.data = text_buf; + atomic_long_set(&rb->text_data_ring.head_lpos, BLK0_LPOS(textbits)); + atomic_long_set(&rb->text_data_ring.tail_lpos, BLK0_LPOS(textbits)); + + atomic_long_set(&rb->fail, 0); + + atomic_long_set(&(descs[_DESCS_COUNT(descbits) - 1].state_var), DESC0_SV(descbits)); + descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = FAILED_LPOS; + descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = FAILED_LPOS; + + infos[0].seq = -(u64)_DESCS_COUNT(descbits); + infos[_DESCS_COUNT(descbits) - 1].seq = 0; +} + +/** + * prb_record_text_space() - Query the full actual used ringbuffer space for + * the text data of a reserved entry. + * + * @e: The successfully reserved entry to query. + * + * This is the public function available to writers to see how much actual + * space is used in the ringbuffer to store the text data of the specified + * entry. + * + * This function is only valid if @e has been successfully reserved using + * prb_reserve(). + * + * Context: Any context. + * Return: The size in bytes used by the text data of the associated record. + */ +unsigned int prb_record_text_space(struct prb_reserved_entry *e) +{ + return e->text_space; +} diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h new file mode 100644 index 000000000000..5dc9d022db07 --- /dev/null +++ b/kernel/printk/printk_ringbuffer.h @@ -0,0 +1,382 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _KERNEL_PRINTK_RINGBUFFER_H +#define _KERNEL_PRINTK_RINGBUFFER_H + +#include +#include + +/* + * Meta information about each stored message. + * + * All fields are set by the printk code except for @seq, which is + * set by the ringbuffer code. + */ +struct printk_info { + u64 seq; /* sequence number */ + u64 ts_nsec; /* timestamp in nanoseconds */ + u16 text_len; /* length of text message */ + u8 facility; /* syslog facility */ + u8 flags:5; /* internal record flags */ + u8 level:3; /* syslog level */ + u32 caller_id; /* thread id or processor id */ + + struct dev_printk_info dev_info; +}; + +/* + * A structure providing the buffers, used by writers and readers. + * + * Writers: + * Using prb_rec_init_wr(), a writer sets @text_buf_size before calling + * prb_reserve(). On success, prb_reserve() sets @info and @text_buf to + * buffers reserved for that writer. + * + * Readers: + * Using prb_rec_init_rd(), a reader sets all fields before calling + * prb_read_valid(). Note that the reader provides the @info and @text_buf, + * buffers. 
On success, the struct pointed to by @info will be filled and + * the char array pointed to by @text_buf will be filled with text data. + */ +struct printk_record { + struct printk_info *info; + char *text_buf; + unsigned int text_buf_size; +}; + +/* Specifies the logical position and span of a data block. */ +struct prb_data_blk_lpos { + unsigned long begin; + unsigned long next; +}; + +/* + * A descriptor: the complete meta-data for a record. + * + * @state_var: A bitwise combination of descriptor ID and descriptor state. + */ +struct prb_desc { + atomic_long_t state_var; + struct prb_data_blk_lpos text_blk_lpos; +}; + +/* A ringbuffer of "ID + data" elements. */ +struct prb_data_ring { + unsigned int size_bits; + char *data; + atomic_long_t head_lpos; + atomic_long_t tail_lpos; +}; + +/* A ringbuffer of "struct prb_desc" elements. */ +struct prb_desc_ring { + unsigned int count_bits; + struct prb_desc *descs; + struct printk_info *infos; + atomic_long_t head_id; + atomic_long_t tail_id; +}; + +/* + * The high level structure representing the printk ringbuffer. + * + * @fail: Count of failed prb_reserve() calls where not even a data-less + * record was created. + */ +struct printk_ringbuffer { + struct prb_desc_ring desc_ring; + struct prb_data_ring text_data_ring; + atomic_long_t fail; +}; + +/* + * Used by writers as a reserve/commit handle. + * + * @rb: Ringbuffer where the entry is reserved. + * @irqflags: Saved irq flags to restore on entry commit. + * @id: ID of the reserved descriptor. + * @text_space: Total occupied buffer space in the text data ring, including + * ID, alignment padding, and wrapping data blocks. + * + * This structure is an opaque handle for writers. Its contents are only + * to be used by the ringbuffer implementation. + */ +struct prb_reserved_entry { + struct printk_ringbuffer *rb; + unsigned long irqflags; + unsigned long id; + unsigned int text_space; +}; + +/* The possible responses of a descriptor state-query. */ +enum desc_state { + desc_miss = -1, /* ID mismatch (pseudo state) */ + desc_reserved = 0x0, /* reserved, in use by writer */ + desc_committed = 0x1, /* committed by writer, could get reopened */ + desc_finalized = 0x2, /* committed, no further modification allowed */ + desc_reusable = 0x3, /* free, not yet used by any writer */ +}; + +#define _DATA_SIZE(sz_bits) (1UL << (sz_bits)) +#define _DESCS_COUNT(ct_bits) (1U << (ct_bits)) +#define DESC_SV_BITS (sizeof(unsigned long) * 8) +#define DESC_FLAGS_SHIFT (DESC_SV_BITS - 2) +#define DESC_FLAGS_MASK (3UL << DESC_FLAGS_SHIFT) +#define DESC_STATE(sv) (3UL & (sv >> DESC_FLAGS_SHIFT)) +#define DESC_SV(id, state) (((unsigned long)state << DESC_FLAGS_SHIFT) | id) +#define DESC_ID_MASK (~DESC_FLAGS_MASK) +#define DESC_ID(sv) ((sv) & DESC_ID_MASK) +#define FAILED_LPOS 0x1 +#define NO_LPOS 0x3 + +#define FAILED_BLK_LPOS \ +{ \ + .begin = FAILED_LPOS, \ + .next = FAILED_LPOS, \ +} + +/* + * Descriptor Bootstrap + * + * The descriptor array is minimally initialized to allow immediate usage + * by readers and writers. The requirements that the descriptor array + * initialization must satisfy: + * + * Req1 + * The tail must point to an existing (committed or reusable) descriptor. + * This is required by the implementation of prb_first_seq(). + * + * Req2 + * Readers must see that the ringbuffer is initially empty. + * + * Req3 + * The first record reserved by a writer is assigned sequence number 0. 
+ * + * To satisfy Req1, the tail initially points to a descriptor that is + * minimally initialized (having no data block, i.e. data-less with the + * data block's lpos @begin and @next values set to FAILED_LPOS). + * + * To satisfy Req2, the initial tail descriptor is initialized to the + * reusable state. Readers recognize reusable descriptors as existing + * records, but skip over them. + * + * To satisfy Req3, the last descriptor in the array is used as the initial + * head (and tail) descriptor. This allows the first record reserved by a + * writer (head + 1) to be the first descriptor in the array. (Only the first + * descriptor in the array could have a valid sequence number of 0.) + * + * The first time a descriptor is reserved, it is assigned a sequence number + * with the value of the array index. A "first time reserved" descriptor can + * be recognized because it has a sequence number of 0 but does not have an + * index of 0. (Only the first descriptor in the array could have a valid + * sequence number of 0.) After the first reservation, all future reservations + * (recycling) simply involve incrementing the sequence number by the array + * count. + * + * Hack #1 + * Only the first descriptor in the array is allowed to have the sequence + * number 0. In this case it is not possible to recognize if it is being + * reserved the first time (set to index value) or has been reserved + * previously (increment by the array count). This is handled by _always_ + * incrementing the sequence number by the array count when reserving the + * first descriptor in the array. In order to satisfy Req3, the sequence + * number of the first descriptor in the array is initialized to minus + * the array count. Then, upon the first reservation, it is incremented + * to 0, thus satisfying Req3. + * + * Hack #2 + * prb_first_seq() can be called at any time by readers to retrieve the + * sequence number of the tail descriptor. However, due to Req2 and Req3, + * initially there are no records to report the sequence number of + * (sequence numbers are u64 and there is nothing less than 0). To handle + * this, the sequence number of the initial tail descriptor is initialized + * to 0. Technically this is incorrect, because there is no record with + * sequence number 0 (yet) and the tail descriptor is not the first + * descriptor in the array. But it allows prb_read_valid() to correctly + * report the existence of a record for _any_ given sequence number at all + * times. Bootstrapping is complete when the tail is pushed the first + * time, thus finally pointing to the first descriptor reserved by a + * writer, which has the assigned sequence number 0. + */ + +/* + * Initiating Logical Value Overflows + * + * Both logical position (lpos) and ID values can be mapped to array indexes + * but may experience overflows during the lifetime of the system. To ensure + * that printk_ringbuffer can handle the overflows for these types, initial + * values are chosen that map to the correct initial array indexes, but will + * result in overflows soon. + * + * BLK0_LPOS + * The initial @head_lpos and @tail_lpos for data rings. It is at index + * 0 and the lpos value is such that it will overflow on the first wrap. + * + * DESC0_ID + * The initial @head_id and @tail_id for the desc ring. It is at the last + * index of the descriptor array (see Req3 above) and the ID value is such + * that it will overflow on the second wrap. 
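+ *
+ * Worked example (illustrative sizes only): with sz_bits = 5 the data ring
+ * holds 32 bytes and BLK0_LPOS(5) = -32, which maps to data-array index 0
+ * and sits only 32 bytes below the unsigned long overflow point. With
+ * ct_bits = 3 there are 8 descriptors and DESC0_ID(3) = DESC_ID(-9); since
+ * -9 is congruent to 7 modulo 8, the initial head/tail map to array index 7,
+ * the last descriptor, as required by Req3.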
+ */ +#define BLK0_LPOS(sz_bits) (-(_DATA_SIZE(sz_bits))) +#define DESC0_ID(ct_bits) DESC_ID(-(_DESCS_COUNT(ct_bits) + 1)) +#define DESC0_SV(ct_bits) DESC_SV(DESC0_ID(ct_bits), desc_reusable) + +/* + * Define a ringbuffer with an external text data buffer. The same as + * DEFINE_PRINTKRB() but requires specifying an external buffer for the + * text data. + * + * Note: The specified external buffer must be of the size: + * 2 ^ (descbits + avgtextbits) + */ +#define _DEFINE_PRINTKRB(name, descbits, avgtextbits, text_buf) \ +static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = { \ + /* the initial head and tail */ \ + [_DESCS_COUNT(descbits) - 1] = { \ + /* reusable */ \ + .state_var = ATOMIC_INIT(DESC0_SV(descbits)), \ + /* no associated data block */ \ + .text_blk_lpos = FAILED_BLK_LPOS, \ + }, \ +}; \ +static struct printk_info _##name##_infos[_DESCS_COUNT(descbits)] = { \ + /* this will be the first record reserved by a writer */ \ + [0] = { \ + /* will be incremented to 0 on the first reservation */ \ + .seq = -(u64)_DESCS_COUNT(descbits), \ + }, \ + /* the initial head and tail */ \ + [_DESCS_COUNT(descbits) - 1] = { \ + /* reports the first seq value during the bootstrap phase */ \ + .seq = 0, \ + }, \ +}; \ +static struct printk_ringbuffer name = { \ + .desc_ring = { \ + .count_bits = descbits, \ + .descs = &_##name##_descs[0], \ + .infos = &_##name##_infos[0], \ + .head_id = ATOMIC_INIT(DESC0_ID(descbits)), \ + .tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \ + }, \ + .text_data_ring = { \ + .size_bits = (avgtextbits) + (descbits), \ + .data = text_buf, \ + .head_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ + .tail_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ + }, \ + .fail = ATOMIC_LONG_INIT(0), \ +} + +/** + * DEFINE_PRINTKRB() - Define a ringbuffer. + * + * @name: The name of the ringbuffer variable. + * @descbits: The number of descriptors as a power-of-2 value. + * @avgtextbits: The average text data size per record as a power-of-2 value. + * + * This is a macro for defining a ringbuffer and all internal structures + * such that it is ready for immediate use. See _DEFINE_PRINTKRB() for a + * variant where the text data buffer can be specified externally. + */ +#define DEFINE_PRINTKRB(name, descbits, avgtextbits) \ +static char _##name##_text[1U << ((avgtextbits) + (descbits))] \ + __aligned(__alignof__(unsigned long)); \ +_DEFINE_PRINTKRB(name, descbits, avgtextbits, &_##name##_text[0]) + +/* Writer Interface */ + +/** + * prb_rec_init_wd() - Initialize a buffer for writing records. + * + * @r: The record to initialize. + * @text_buf_size: The needed text buffer size. 
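+ *
+ * A minimal writer sketch (illustrative only; @rb and the nul-terminated
+ * string @msg are assumed to exist in the caller):
+ *
+ *	struct prb_reserved_entry e;
+ *	struct printk_record r;
+ *
+ *	prb_rec_init_wr(&r, strlen(msg) + 1);
+ *	if (prb_reserve(&e, rb, &r)) {
+ *		snprintf(r.text_buf, r.text_buf_size, "%s", msg);
+ *		r.info->text_len = strlen(msg);
+ *		prb_final_commit(&e);
+ *	}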
+ */ +static inline void prb_rec_init_wr(struct printk_record *r, + unsigned int text_buf_size) +{ + r->info = NULL; + r->text_buf = NULL; + r->text_buf_size = text_buf_size; +} + +bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, + struct printk_record *r); +bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, + struct printk_record *r, u32 caller_id, unsigned int max_size); +void prb_commit(struct prb_reserved_entry *e); +void prb_final_commit(struct prb_reserved_entry *e); + +void prb_init(struct printk_ringbuffer *rb, + char *text_buf, unsigned int text_buf_size, + struct prb_desc *descs, unsigned int descs_count_bits, + struct printk_info *infos); +unsigned int prb_record_text_space(struct prb_reserved_entry *e); + +/* Reader Interface */ + +/** + * prb_rec_init_rd() - Initialize a buffer for reading records. + * + * @r: The record to initialize. + * @info: A buffer to store record meta-data. + * @text_buf: A buffer to store text data. + * @text_buf_size: The size of @text_buf. + * + * Initialize all the fields that a reader is interested in. All arguments + * (except @r) are optional. Only record data for arguments that are + * non-NULL or non-zero will be read. + */ +static inline void prb_rec_init_rd(struct printk_record *r, + struct printk_info *info, + char *text_buf, unsigned int text_buf_size) +{ + r->info = info; + r->text_buf = text_buf; + r->text_buf_size = text_buf_size; +} + +/** + * prb_for_each_record() - Iterate over the records of a ringbuffer. + * + * @from: The sequence number to begin with. + * @rb: The ringbuffer to iterate over. + * @s: A u64 to store the sequence number on each iteration. + * @r: A printk_record to store the record on each iteration. + * + * This is a macro for conveniently iterating over a ringbuffer. + * Note that @s may not be the sequence number of the record on each + * iteration. For the sequence number, @r->info->seq should be checked. + * + * Context: Any context. + */ +#define prb_for_each_record(from, rb, s, r) \ +for ((s) = from; prb_read_valid(rb, s, r); (s) = (r)->info->seq + 1) + +/** + * prb_for_each_info() - Iterate over the meta data of a ringbuffer. + * + * @from: The sequence number to begin with. + * @rb: The ringbuffer to iterate over. + * @s: A u64 to store the sequence number on each iteration. + * @i: A printk_info to store the record meta data on each iteration. + * @lc: An unsigned int to store the text line count of each record. + * + * This is a macro for conveniently iterating over a ringbuffer. + * Note that @s may not be the sequence number of the record on each + * iteration. For the sequence number, @r->info->seq should be checked. + * + * Context: Any context. 
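+ *
+ * A minimal reader sketch (illustrative only; @rb is assumed to exist in
+ * the caller):
+ *
+ *	struct printk_info info;
+ *	unsigned int line_count;
+ *	u64 seq;
+ *
+ *	prb_for_each_info(0, rb, seq, &info, &line_count)
+ *		pr_info("%llu: %u line(s)\n", info.seq, line_count);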
+ */ +#define prb_for_each_info(from, rb, s, i, lc) \ +for ((s) = from; prb_read_valid_info(rb, s, i, lc); (s) = (i)->seq + 1) + +bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, + struct printk_record *r); +bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, + struct printk_info *info, unsigned int *line_count); + +u64 prb_first_valid_seq(struct printk_ringbuffer *rb); +u64 prb_next_seq(struct printk_ringbuffer *rb); + +#endif /* _KERNEL_PRINTK_RINGBUFFER_H */ diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 50aeae770434..5dbc40160990 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -375,7 +375,7 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args) raw_spin_trylock(&logbuf_lock)) { int len; - len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); + len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args); raw_spin_unlock(&logbuf_lock); defer_console_output(); return len; diff --git a/kernel/resource.c b/kernel/resource.c index 841737bbda9e..f1175ce93a1d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -382,10 +382,13 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, if (p) { /* copy data */ - res->start = max(start, p->start); - res->end = min(end, p->end); - res->flags = p->flags; - res->desc = p->desc; + *res = (struct resource) { + .start = max(start, p->start), + .end = min(end, p->end), + .flags = p->flags, + .desc = p->desc, + .parent = p->parent, + }; } read_unlock(&resource_lock); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 676d4af62103..8ad7a293255a 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -196,6 +196,10 @@ struct seccomp_filter { */ static void populate_seccomp_data(struct seccomp_data *sd) { + /* + * Instead of using current_pt_reg(), we're already doing the work + * to safely fetch "current", so just use "task" everywhere below. + */ struct task_struct *task = current; struct pt_regs *regs = task_pt_regs(task); unsigned long args[6]; @@ -910,7 +914,7 @@ out: if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) return 0; - syscall_set_return_value(current, task_pt_regs(current), + syscall_set_return_value(current, current_pt_regs(), err, ret); return -1; } @@ -943,13 +947,13 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, /* Set low-order bits as an errno, capped at MAX_ERRNO. */ if (data > MAX_ERRNO) data = MAX_ERRNO; - syscall_set_return_value(current, task_pt_regs(current), + syscall_set_return_value(current, current_pt_regs(), -data, 0); goto skip; case SECCOMP_RET_TRAP: /* Show the handler the original registers. */ - syscall_rollback(current, task_pt_regs(current)); + syscall_rollback(current, current_pt_regs()); /* Let the filter pass back 16 bits of data. */ seccomp_send_sigsys(this_syscall, data); goto skip; @@ -962,7 +966,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, /* ENOSYS these calls if there is no tracer attached. */ if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { syscall_set_return_value(current, - task_pt_regs(current), + current_pt_regs(), -ENOSYS, 0); goto skip; } @@ -982,7 +986,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, if (fatal_signal_pending(current)) goto skip; /* Check if the tracer forced the syscall to be skipped. 
*/ - this_syscall = syscall_get_nr(current, task_pt_regs(current)); + this_syscall = syscall_get_nr(current, current_pt_regs()); if (this_syscall < 0) goto skip; @@ -1020,20 +1024,20 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, default: seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. */ - if (action == SECCOMP_RET_KILL_PROCESS || + if (action != SECCOMP_RET_KILL_THREAD || get_nr_threads(current) == 1) { kernel_siginfo_t info; /* Show the original registers in the dump. */ - syscall_rollback(current, task_pt_regs(current)); + syscall_rollback(current, current_pt_regs()); /* Trigger a manual coredump since do_exit skips it. */ seccomp_init_siginfo(&info, this_syscall, data); do_coredump(&info); } - if (action == SECCOMP_RET_KILL_PROCESS) - do_group_exit(SIGSYS); - else + if (action == SECCOMP_RET_KILL_THREAD) do_exit(SIGSYS); + else + do_group_exit(SIGSYS); } unreachable(); @@ -1060,7 +1064,7 @@ int __secure_computing(const struct seccomp_data *sd) return 0; this_syscall = sd ? sd->nr : - syscall_get_nr(current, task_pt_regs(current)); + syscall_get_nr(current, current_pt_regs()); switch (mode) { case SECCOMP_MODE_STRICT: @@ -1472,13 +1476,7 @@ static const struct file_operations seccomp_notify_ops = { static struct file *init_listener(struct seccomp_filter *filter) { - struct file *ret = ERR_PTR(-EBUSY); - struct seccomp_filter *cur; - - for (cur = current->seccomp.filter; cur; cur = cur->prev) { - if (cur->notif) - goto out; - } + struct file *ret; ret = ERR_PTR(-ENOMEM); filter->notif = kzalloc(sizeof(*(filter->notif)), GFP_KERNEL); @@ -1504,6 +1502,31 @@ out: return ret; } +/* + * Does @new_child have a listener while an ancestor also has a listener? + * If so, we'll want to reject this filter. + * This only has to be tested for the current process, even in the TSYNC case, + * because TSYNC installs @child with the same parent on all threads. + * Note that @new_child is not hooked up to its parent at this point yet, so + * we use current->seccomp.filter. + */ +static bool has_duplicate_listener(struct seccomp_filter *new_child) +{ + struct seccomp_filter *cur; + + /* must be protected against concurrent TSYNC */ + lockdep_assert_held(¤t->sighand->siglock); + + if (!new_child->notif) + return false; + for (cur = current->seccomp.filter; cur; cur = cur->prev) { + if (cur->notif) + return true; + } + + return false; +} + /** * seccomp_set_mode_filter: internal function for setting seccomp filter * @flags: flags to change filter behavior @@ -1575,6 +1598,11 @@ static long seccomp_set_mode_filter(unsigned int flags, if (!seccomp_may_assign_mode(seccomp_mode)) goto out; + if (has_duplicate_listener(prepared)) { + ret = -EBUSY; + goto out; + } + ret = seccomp_attach_filter(flags, prepared); if (ret) goto out; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 0c781f912f9f..491789a793ae 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2367,6 +2367,15 @@ config TEST_HMM If unsure, say N. +config TEST_FREE_PAGES + tristate "Test freeing pages" + help + Test that a memory leak does not occur due to a race between + freeing a block of pages and a speculative page reference. + Loading this module is safe if your kernel has the bug fixed. + If the bug is not fixed, it will leak gigabytes of memory and + probably OOM your system. 
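+
+# Note: the module built from TEST_FREE_PAGES is named test_free_pages;
+# loading it (for example with "modprobe test_free_pages") runs the check
+# once at module init.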
+ config TEST_FPU tristate "Test floating point operations in kernel space" depends on X86 && !KCOV_INSTRUMENT_ALL diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 047b53dbfd58..542a9c18398e 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -54,9 +54,9 @@ config KASAN_GENERIC Enables generic KASAN mode. This mode is supported in both GCC and Clang. With GCC it requires - version 8.3.0 or later. With Clang it requires version 7.0.0 or - later, but detection of out-of-bounds accesses for global variables - is supported only since Clang 11. + version 8.3.0 or later. Any supported Clang version is compatible, + but detection of out-of-bounds accesses for global variables is + supported only since Clang 11. This mode consumes about 1/8th of available memory at kernel start and introduces an overhead of ~x1.5 for the rest of the allocations. @@ -78,8 +78,7 @@ config KASAN_SW_TAGS Enables software tag-based KASAN mode. This mode requires Top Byte Ignore support by the CPU and therefore - is only supported for arm64. This mode requires Clang version 7.0.0 - or later. + is only supported for arm64. This mode requires Clang. This mode consumes about 1/16th of available memory at kernel start and introduces an overhead of ~20% for the rest of the allocations. @@ -167,12 +166,24 @@ config KASAN_VMALLOC for KASAN to detect more sorts of errors (and to support vmapped stacks), but at the cost of higher memory usage. -config TEST_KASAN - tristate "Module for testing KASAN for bug detection" - depends on m +config KASAN_KUNIT_TEST + tristate "KUnit-compatible tests of KASAN bug detection capabilities" if !KUNIT_ALL_TESTS + depends on KASAN && KUNIT + default KUNIT_ALL_TESTS help - This is a test module doing various nasty things like - out of bounds accesses, use after free. It is useful for testing + This is a KUnit test suite doing various nasty things like + out of bounds and use after free accesses. It is useful for testing kernel debugging features like KASAN. + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation in Documentation/dev-tools/kunit + +config TEST_KASAN_MODULE + tristate "KUnit-incompatible tests of KASAN bug detection capabilities" + depends on m && KASAN + help + This is a part of the KASAN test suite that is incompatible with + KUnit. Currently includes tests that do bad copy_from/to_user + accesses. + endif # KASAN diff --git a/lib/Makefile b/lib/Makefile index a4a4c6864f51..49a2a9e36224 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -65,9 +65,11 @@ CFLAGS_test_bitops.o += -Werror obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o obj-$(CONFIG_TEST_IDA) += test_ida.o -obj-$(CONFIG_TEST_KASAN) += test_kasan.o +obj-$(CONFIG_KASAN_KUNIT_TEST) += test_kasan.o CFLAGS_test_kasan.o += -fno-builtin CFLAGS_test_kasan.o += $(call cc-disable-warning, vla) +obj-$(CONFIG_TEST_KASAN_MODULE) += test_kasan_module.o +CFLAGS_test_kasan_module.o += -fno-builtin obj-$(CONFIG_TEST_UBSAN) += test_ubsan.o CFLAGS_test_ubsan.o += $(call cc-disable-warning, vla) UBSAN_SANITIZE_test_ubsan.o := y @@ -99,6 +101,7 @@ obj-$(CONFIG_TEST_BLACKHOLE_DEV) += test_blackhole_dev.o obj-$(CONFIG_TEST_MEMINIT) += test_meminit.o obj-$(CONFIG_TEST_LOCKUP) += test_lockup.o obj-$(CONFIG_TEST_HMM) += test_hmm.o +obj-$(CONFIG_TEST_FREE_PAGES) += test_free_pages.o # # CFLAGS for compiling floating point code inside the kernel. 
x86/Makefile turns diff --git a/lib/kunit/test.c b/lib/kunit/test.c index c36037200310..dcc35fd30d95 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -10,16 +10,12 @@ #include #include #include +#include #include "debugfs.h" #include "string-stream.h" #include "try-catch-impl.h" -static void kunit_set_failure(struct kunit *test) -{ - WRITE_ONCE(test->success, false); -} - static void kunit_print_tap_version(void) { static bool kunit_has_printed_tap_version; @@ -288,6 +284,10 @@ static void kunit_try_run_case(void *data) struct kunit_suite *suite = ctx->suite; struct kunit_case *test_case = ctx->test_case; +#if (IS_ENABLED(CONFIG_KASAN) && IS_ENABLED(CONFIG_KUNIT)) + current->kunit_test = test; +#endif /* IS_ENABLED(CONFIG_KASAN) && IS_ENABLED(CONFIG_KUNIT) */ + /* * kunit_run_case_internal may encounter a fatal error; if it does, * abort will be called, this thread will exit, and finally the parent @@ -602,6 +602,9 @@ void kunit_cleanup(struct kunit *test) spin_unlock(&test->lock); kunit_remove_resource(test, res); } +#if (IS_ENABLED(CONFIG_KASAN) && IS_ENABLED(CONFIG_KUNIT)) + current->kunit_test = NULL; +#endif /* IS_ENABLED(CONFIG_KASAN) && IS_ENABLED(CONFIG_KUNIT)*/ } EXPORT_SYMBOL_GPL(kunit_cleanup); diff --git a/lib/test_free_pages.c b/lib/test_free_pages.c new file mode 100644 index 000000000000..074e76bd76b2 --- /dev/null +++ b/lib/test_free_pages.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * test_free_pages.c: Check that free_pages() doesn't leak memory + * Copyright (c) 2020 Oracle + * Author: Matthew Wilcox + */ + +#include +#include +#include + +static void test_free_pages(gfp_t gfp) +{ + unsigned int i; + + for (i = 0; i < 1000 * 1000; i++) { + unsigned long addr = __get_free_pages(gfp, 3); + struct page *page = virt_to_page(addr); + + /* Simulate page cache getting a speculative reference */ + get_page(page); + free_pages(addr, 3); + put_page(page); + } +} + +static int m_in(void) +{ + test_free_pages(GFP_KERNEL); + test_free_pages(GFP_KERNEL | __GFP_COMP); + + return 0; +} + +static void m_ex(void) +{ +} + +module_init(m_in); +module_exit(m_ex); +MODULE_AUTHOR("Matthew Wilcox "); +MODULE_LICENSE("GPL"); diff --git a/lib/test_hmm.c b/lib/test_hmm.c index e7dc3de355b7..e151a7f10519 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -36,7 +36,6 @@ static const struct dev_pagemap_ops dmirror_devmem_ops; static const struct mmu_interval_notifier_ops dmirror_min_ops; static dev_t dmirror_dev; -static struct page *dmirror_zero_page; struct dmirror_device; @@ -460,6 +459,22 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice, unsigned long pfn_last; void *ptr; + devmem = kzalloc(sizeof(*devmem), GFP_KERNEL); + if (!devmem) + return -ENOMEM; + + res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE, + "hmm_dmirror"); + if (IS_ERR(res)) + goto err_devmem; + + devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; + devmem->pagemap.range.start = res->start; + devmem->pagemap.range.end = res->end; + devmem->pagemap.nr_range = 1; + devmem->pagemap.ops = &dmirror_devmem_ops; + devmem->pagemap.owner = mdevice; + mutex_lock(&mdevice->devmem_lock); if (mdevice->devmem_count == mdevice->devmem_capacity) { @@ -472,33 +487,18 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice, sizeof(new_chunks[0]) * new_capacity, GFP_KERNEL); if (!new_chunks) - goto err; + goto err_release; mdevice->devmem_capacity = new_capacity; mdevice->devmem_chunks = new_chunks; } - res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE, - 
"hmm_dmirror"); - if (IS_ERR(res)) - goto err; - - devmem = kzalloc(sizeof(*devmem), GFP_KERNEL); - if (!devmem) - goto err_release; - - devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; - devmem->pagemap.res = *res; - devmem->pagemap.ops = &dmirror_devmem_ops; - devmem->pagemap.owner = mdevice; - ptr = memremap_pages(&devmem->pagemap, numa_node_id()); if (IS_ERR(ptr)) - goto err_free; + goto err_release; devmem->mdevice = mdevice; - pfn_first = devmem->pagemap.res.start >> PAGE_SHIFT; - pfn_last = pfn_first + - (resource_size(&devmem->pagemap.res) >> PAGE_SHIFT); + pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT; + pfn_last = pfn_first + (range_len(&devmem->pagemap.range) >> PAGE_SHIFT); mdevice->devmem_chunks[mdevice->devmem_count++] = devmem; mutex_unlock(&mdevice->devmem_lock); @@ -525,12 +525,12 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice, return true; -err_free: - kfree(devmem); err_release: - release_mem_region(res->start, resource_size(res)); -err: mutex_unlock(&mdevice->devmem_lock); + release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range)); +err_devmem: + kfree(devmem); + return false; } @@ -1100,8 +1100,8 @@ static void dmirror_device_remove(struct dmirror_device *mdevice) mdevice->devmem_chunks[i]; memunmap_pages(&devmem->pagemap); - release_mem_region(devmem->pagemap.res.start, - resource_size(&devmem->pagemap.res)); + release_mem_region(devmem->pagemap.range.start, + range_len(&devmem->pagemap.range)); kfree(devmem); } kfree(mdevice->devmem_chunks); @@ -1126,17 +1126,6 @@ static int __init hmm_dmirror_init(void) goto err_chrdev; } - /* - * Allocate a zero page to simulate a reserved page of device private - * memory which is always zero. The zero_pfn page isn't used just to - * make the code here simpler (i.e., we need a struct page for it). - */ - dmirror_zero_page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); - if (!dmirror_zero_page) { - ret = -ENOMEM; - goto err_chrdev; - } - pr_info("HMM test module loaded. This is only for testing HMM.\n"); return 0; @@ -1152,8 +1141,6 @@ static void __exit hmm_dmirror_exit(void) { int id; - if (dmirror_zero_page) - __free_page(dmirror_zero_page); for (id = 0; id < DMIRROR_NDEVICES; id++) dmirror_device_remove(dmirror_devices + id); unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES); diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 53e953bb1d1d..63c26171a791 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -5,8 +5,6 @@ * Author: Andrey Ryabinin */ -#define pr_fmt(fmt) "kasan test: %s " fmt, __func__ - #include #include #include @@ -23,6 +21,8 @@ #include +#include + #include "../mm/kasan/kasan.h" #define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_SHADOW_SCALE_SIZE) @@ -32,418 +32,370 @@ * are not eliminated as dead code. */ -int kasan_int_result; void *kasan_ptr_result; +int kasan_int_result; -/* - * Note: test functions are marked noinline so that their names appear in - * reports. +static struct kunit_resource resource; +static struct kunit_kasan_expectation fail_data; +static bool multishot; + +static int kasan_test_init(struct kunit *test) +{ + /* + * Temporarily enable multi-shot mode and set panic_on_warn=0. + * Otherwise, we'd only get a report for the first case. + */ + multishot = kasan_save_enable_multi_shot(); + + return 0; +} + +static void kasan_test_exit(struct kunit *test) +{ + kasan_restore_multi_shot(multishot); +} + +/** + * KUNIT_EXPECT_KASAN_FAIL() - Causes a test failure when the expression does + * not cause a KASAN error. 
This uses a KUnit resource named "kasan_data." Do + * Do not use this name for a KUnit resource outside here. + * */ +#define KUNIT_EXPECT_KASAN_FAIL(test, condition) do { \ + fail_data.report_expected = true; \ + fail_data.report_found = false; \ + kunit_add_named_resource(test, \ + NULL, \ + NULL, \ + &resource, \ + "kasan_data", &fail_data); \ + condition; \ + KUNIT_EXPECT_EQ(test, \ + fail_data.report_expected, \ + fail_data.report_found); \ +} while (0) -static noinline void __init kmalloc_oob_right(void) +static void kmalloc_oob_right(struct kunit *test) { char *ptr; size_t size = 123; - pr_info("out-of-bounds to right\n"); ptr = kmalloc(size, GFP_KERNEL); - if (!ptr) { - pr_err("Allocation failed\n"); - return; - } - - ptr[size + OOB_TAG_OFF] = 'x'; + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + OOB_TAG_OFF] = 'x'); kfree(ptr); } -static noinline void __init kmalloc_oob_left(void) +static void kmalloc_oob_left(struct kunit *test) { char *ptr; size_t size = 15; - pr_info("out-of-bounds to left\n"); ptr = kmalloc(size, GFP_KERNEL); - if (!ptr) { - pr_err("Allocation failed\n"); - return; - } + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); - *ptr = *(ptr - 1); + KUNIT_EXPECT_KASAN_FAIL(test, *ptr = *(ptr - 1)); kfree(ptr); } -static noinline void __init kmalloc_node_oob_right(void) +static void kmalloc_node_oob_right(struct kunit *test) { char *ptr; size_t size = 4096; - pr_info("kmalloc_node(): out-of-bounds to right\n"); ptr = kmalloc_node(size, GFP_KERNEL, 0); - if (!ptr) { - pr_err("Allocation failed\n"); - return; - } + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); - ptr[size] = 0; + KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 0); kfree(ptr); } -#ifdef CONFIG_SLUB -static noinline void __init kmalloc_pagealloc_oob_right(void) +static void kmalloc_pagealloc_oob_right(struct kunit *test) { char *ptr; size_t size = KMALLOC_MAX_CACHE_SIZE + 10; + if (!IS_ENABLED(CONFIG_SLUB)) { + kunit_info(test, "CONFIG_SLUB is not enabled."); + return; + } + /* Allocate a chunk that does not fit into a SLUB cache to trigger * the page allocator fallback. 
*/ - pr_info("kmalloc pagealloc allocation: out-of-bounds to right\n"); ptr = kmalloc(size, GFP_KERNEL); - if (!ptr) { - pr_err("Allocation failed\n"); - return; - } - - ptr[size + OOB_TAG_OFF] = 0; + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + OOB_TAG_OFF] = 0); kfree(ptr); } -static noinline void __init kmalloc_pagealloc_uaf(void) +static void kmalloc_pagealloc_uaf(struct kunit *test) { char *ptr; size_t size = KMALLOC_MAX_CACHE_SIZE + 10; - pr_info("kmalloc pagealloc allocation: use-after-free\n"); - ptr = kmalloc(size, GFP_KERNEL); - if (!ptr) { - pr_err("Allocation failed\n"); + if (!IS_ENABLED(CONFIG_SLUB)) { + kunit_info(test, "CONFIG_SLUB is not enabled."); return; } + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + kfree(ptr); - ptr[0] = 0; + KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = 0); } -static noinline void __init kmalloc_pagealloc_invalid_free(void) +static void kmalloc_pagealloc_invalid_free(struct kunit *test) { char *ptr; size_t size = KMALLOC_MAX_CACHE_SIZE + 10; - pr_info("kmalloc pagealloc allocation: invalid-free\n"); - ptr = kmalloc(size, GFP_KERNEL); - if (!ptr) { - pr_err("Allocation failed\n"); + if (!IS_ENABLED(CONFIG_SLUB)) { + kunit_info(test, "CONFIG_SLUB is not enabled."); return; } - kfree(ptr + 1); -} -#endif + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); -static noinline void __init kmalloc_large_oob_right(void) + KUNIT_EXPECT_KASAN_FAIL(test, kfree(ptr + 1)); +} + +static void kmalloc_large_oob_right(struct kunit *test) { char *ptr; size_t size = KMALLOC_MAX_CACHE_SIZE - 256; /* Allocate a chunk that is large enough, but still fits into a slab * and does not trigger the page allocator fallback in SLUB. */ - pr_info("kmalloc large allocation: out-of-bounds to right\n"); ptr = kmalloc(size, GFP_KERNEL); - if (!ptr) { - pr_err("Allocation failed\n"); - return; - } + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); - ptr[size] = 0; + KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 0); kfree(ptr); } -static noinline void __init kmalloc_oob_krealloc_more(void) +static void kmalloc_oob_krealloc_more(struct kunit *test) { char *ptr1, *ptr2; size_t size1 = 17; size_t size2 = 19; - pr_info("out-of-bounds after krealloc more\n"); ptr1 = kmalloc(size1, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); + ptr2 = krealloc(ptr1, size2, GFP_KERNEL); - if (!ptr1 || !ptr2) { - pr_err("Allocation failed\n"); - kfree(ptr1); - kfree(ptr2); - return; - } - - ptr2[size2 + OOB_TAG_OFF] = 'x'; + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size2 + OOB_TAG_OFF] = 'x'); kfree(ptr2); } -static noinline void __init kmalloc_oob_krealloc_less(void) +static void kmalloc_oob_krealloc_less(struct kunit *test) { char *ptr1, *ptr2; size_t size1 = 17; size_t size2 = 15; - pr_info("out-of-bounds after krealloc less\n"); ptr1 = kmalloc(size1, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); + ptr2 = krealloc(ptr1, size2, GFP_KERNEL); - if (!ptr1 || !ptr2) { - pr_err("Allocation failed\n"); - kfree(ptr1); - return; - } - - ptr2[size2 + OOB_TAG_OFF] = 'x'; + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size2 + OOB_TAG_OFF] = 'x'); kfree(ptr2); } -static noinline void __init kmalloc_oob_16(void) +static void kmalloc_oob_16(struct kunit *test) { struct { u64 words[2]; } *ptr1, *ptr2; - pr_info("kmalloc out-of-bounds for 16-bytes access\n"); ptr1 = kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); + 
ptr2 = kmalloc(sizeof(*ptr2), GFP_KERNEL); - if (!ptr1 || !ptr2) { - pr_err("Allocation failed\n"); - kfree(ptr1); - kfree(ptr2); - return; - } - *ptr1 = *ptr2; + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + + KUNIT_EXPECT_KASAN_FAIL(test, *ptr1 = *ptr2); kfree(ptr1); kfree(ptr2); } -static noinline void __init kmalloc_oob_memset_2(void) +static void kmalloc_oob_memset_2(struct kunit *test) { char *ptr; size_t size = 8; - pr_info("out-of-bounds in memset2\n"); ptr = kmalloc(size, GFP_KERNEL); - if (!ptr) { - pr_err("Allocation failed\n"); - return; - } - - memset(ptr + 7 + OOB_TAG_OFF, 0, 2); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + 7 + OOB_TAG_OFF, 0, 2)); kfree(ptr); } -static noinline void __init kmalloc_oob_memset_4(void) +static void kmalloc_oob_memset_4(struct kunit *test) { char *ptr; size_t size = 8; - pr_info("out-of-bounds in memset4\n"); ptr = kmalloc(size, GFP_KERNEL); - if (!ptr) { - pr_err("Allocation failed\n"); - return; - } - - memset(ptr + 5 + OOB_TAG_OFF, 0, 4); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + 5 + OOB_TAG_OFF, 0, 4)); kfree(ptr); } -static noinline void __init kmalloc_oob_memset_8(void) +static void kmalloc_oob_memset_8(struct kunit *test) { char *ptr; size_t size = 8; - pr_info("out-of-bounds in memset8\n"); ptr = kmalloc(size, GFP_KERNEL); - if (!ptr) { - pr_err("Allocation failed\n"); - return; - } - - memset(ptr + 1 + OOB_TAG_OFF, 0, 8); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + 1 + OOB_TAG_OFF, 0, 8)); kfree(ptr); } -static noinline void __init kmalloc_oob_memset_16(void) +static void kmalloc_oob_memset_16(struct kunit *test) { char *ptr; size_t size = 16; - pr_info("out-of-bounds in memset16\n"); ptr = kmalloc(size, GFP_KERNEL); - if (!ptr) { - pr_err("Allocation failed\n"); - return; - } - - memset(ptr + 1 + OOB_TAG_OFF, 0, 16); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + 1 + OOB_TAG_OFF, 0, 16)); kfree(ptr); } -static noinline void __init kmalloc_oob_in_memset(void) +static void kmalloc_oob_in_memset(struct kunit *test) { char *ptr; size_t size = 666; - pr_info("out-of-bounds in memset\n"); ptr = kmalloc(size, GFP_KERNEL); - if (!ptr) { - pr_err("Allocation failed\n"); - return; - } - - memset(ptr, 0, size + 5 + OOB_TAG_OFF); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr, 0, size + 5 + OOB_TAG_OFF)); kfree(ptr); } -static noinline void __init kmalloc_memmove_invalid_size(void) +static void kmalloc_memmove_invalid_size(struct kunit *test) { char *ptr; size_t size = 64; volatile size_t invalid_size = -2; - pr_info("invalid size in memmove\n"); ptr = kmalloc(size, GFP_KERNEL); - if (!ptr) { - pr_err("Allocation failed\n"); - return; - } + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); memset((char *)ptr, 0, 64); - memmove((char *)ptr, (char *)ptr + 4, invalid_size); + + KUNIT_EXPECT_KASAN_FAIL(test, + memmove((char *)ptr, (char *)ptr + 4, invalid_size)); kfree(ptr); } -static noinline void __init kmalloc_uaf(void) +static void kmalloc_uaf(struct kunit *test) { char *ptr; size_t size = 10; - pr_info("use-after-free\n"); ptr = kmalloc(size, GFP_KERNEL); - if (!ptr) { - pr_err("Allocation failed\n"); - return; - } + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); kfree(ptr); - *(ptr + 8) = 'x'; + KUNIT_EXPECT_KASAN_FAIL(test, *(ptr + 8) = 'x'); } -static noinline void __init kmalloc_uaf_memset(void) +static void kmalloc_uaf_memset(struct kunit *test) { 
char *ptr; size_t size = 33; - pr_info("use-after-free in memset\n"); ptr = kmalloc(size, GFP_KERNEL); - if (!ptr) { - pr_err("Allocation failed\n"); - return; - } + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); kfree(ptr); - memset(ptr, 0, size); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr, 0, size)); } -static noinline void __init kmalloc_uaf2(void) +static void kmalloc_uaf2(struct kunit *test) { char *ptr1, *ptr2; size_t size = 43; - pr_info("use-after-free after another kmalloc\n"); ptr1 = kmalloc(size, GFP_KERNEL); - if (!ptr1) { - pr_err("Allocation failed\n"); - return; - } + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); kfree(ptr1); - ptr2 = kmalloc(size, GFP_KERNEL); - if (!ptr2) { - pr_err("Allocation failed\n"); - return; - } - ptr1[40] = 'x'; - if (ptr1 == ptr2) - pr_err("Could not detect use-after-free: ptr1 == ptr2\n"); + ptr2 = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + + KUNIT_EXPECT_KASAN_FAIL(test, ptr1[40] = 'x'); + KUNIT_EXPECT_PTR_NE(test, ptr1, ptr2); + kfree(ptr2); } -static noinline void __init kfree_via_page(void) +static void kfree_via_page(struct kunit *test) { char *ptr; size_t size = 8; struct page *page; unsigned long offset; - pr_info("invalid-free false positive (via page)\n"); ptr = kmalloc(size, GFP_KERNEL); - if (!ptr) { - pr_err("Allocation failed\n"); - return; - } + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); page = virt_to_page(ptr); offset = offset_in_page(ptr); kfree(page_address(page) + offset); } -static noinline void __init kfree_via_phys(void) +static void kfree_via_phys(struct kunit *test) { char *ptr; size_t size = 8; phys_addr_t phys; - pr_info("invalid-free false positive (via phys)\n"); ptr = kmalloc(size, GFP_KERNEL); - if (!ptr) { - pr_err("Allocation failed\n"); - return; - } + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); phys = virt_to_phys(ptr); kfree(phys_to_virt(phys)); } -static noinline void __init kmem_cache_oob(void) +static void kmem_cache_oob(struct kunit *test) { char *p; size_t size = 200; struct kmem_cache *cache = kmem_cache_create("test_cache", size, 0, 0, NULL); - if (!cache) { - pr_err("Cache allocation failed\n"); - return; - } - pr_info("out-of-bounds in kmem_cache_alloc\n"); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); p = kmem_cache_alloc(cache, GFP_KERNEL); if (!p) { - pr_err("Allocation failed\n"); + kunit_err(test, "Allocation failed: %s\n", __func__); kmem_cache_destroy(cache); return; } - *p = p[size + OOB_TAG_OFF]; - + KUNIT_EXPECT_KASAN_FAIL(test, *p = p[size + OOB_TAG_OFF]); kmem_cache_free(cache, p); kmem_cache_destroy(cache); } -static noinline void __init memcg_accounted_kmem_cache(void) +static void memcg_accounted_kmem_cache(struct kunit *test) { int i; char *p; @@ -451,12 +403,8 @@ static noinline void __init memcg_accounted_kmem_cache(void) struct kmem_cache *cache; cache = kmem_cache_create("test_cache", size, 0, SLAB_ACCOUNT, NULL); - if (!cache) { - pr_err("Cache allocation failed\n"); - return; - } + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); - pr_info("allocate memcg accounted object\n"); /* * Several allocations with a delay to allow for lazy per memcg kmem * cache creation. 
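The conversion above follows one pattern throughout: every test now takes a struct kunit * argument, allocation failures become test failures via KUNIT_ASSERT_NOT_ERR_OR_NULL(), and the deliberately bad access is wrapped in KUNIT_EXPECT_KASAN_FAIL(), which fails the test unless KASAN actually reports the access. A minimal sketch of that shape, using a hypothetical test name and size rather than anything taken from the patch:

    /* Sketch only: mirrors the converted tests above, not part of the patch. */
    static void kmalloc_oob_example(struct kunit *test)
    {
            char *ptr;
            size_t size = 128;      /* hypothetical size */

            ptr = kmalloc(size, GFP_KERNEL);
            KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);

            /*
             * KUNIT_EXPECT_KASAN_FAIL() (provided by this series) runs the
             * expression and fails the test if no KASAN report is produced.
             */
            KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 'x');

            kfree(ptr);
    }

Each such function is then registered exactly once with KUNIT_CASE() in the kasan_kunit_test_cases[] array that this patch adds further down.
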
@@ -476,134 +424,93 @@ free_cache: static char global_array[10]; -static noinline void __init kasan_global_oob(void) +static void kasan_global_oob(struct kunit *test) { volatile int i = 3; char *p = &global_array[ARRAY_SIZE(global_array) + i]; - pr_info("out-of-bounds global variable\n"); - *(volatile char *)p; + KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); } -static noinline void __init kasan_stack_oob(void) +static void ksize_unpoisons_memory(struct kunit *test) +{ + char *ptr; + size_t size = 123, real_size; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + real_size = ksize(ptr); + /* This access doesn't trigger an error. */ + ptr[size] = 'x'; + /* This one does. */ + KUNIT_EXPECT_KASAN_FAIL(test, ptr[real_size] = 'y'); + kfree(ptr); +} + +static void kasan_stack_oob(struct kunit *test) { char stack_array[10]; volatile int i = OOB_TAG_OFF; char *p = &stack_array[ARRAY_SIZE(stack_array) + i]; - pr_info("out-of-bounds on stack\n"); - *(volatile char *)p; -} - -static noinline void __init ksize_unpoisons_memory(void) -{ - char *ptr; - size_t size = 123, real_size; - - pr_info("ksize() unpoisons the whole allocated chunk\n"); - ptr = kmalloc(size, GFP_KERNEL); - if (!ptr) { - pr_err("Allocation failed\n"); - return; - } - real_size = ksize(ptr); - /* This access doesn't trigger an error. */ - ptr[size] = 'x'; - /* This one does. */ - ptr[real_size] = 'y'; - kfree(ptr); -} - -static noinline void __init copy_user_test(void) -{ - char *kmem; - char __user *usermem; - size_t size = 10; - int unused; - - kmem = kmalloc(size, GFP_KERNEL); - if (!kmem) - return; - - usermem = (char __user *)vm_mmap(NULL, 0, PAGE_SIZE, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_ANONYMOUS | MAP_PRIVATE, 0); - if (IS_ERR(usermem)) { - pr_err("Failed to allocate user memory\n"); - kfree(kmem); + if (!IS_ENABLED(CONFIG_KASAN_STACK)) { + kunit_info(test, "CONFIG_KASAN_STACK is not enabled"); return; } - pr_info("out-of-bounds in copy_from_user()\n"); - unused = copy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF); - - pr_info("out-of-bounds in copy_to_user()\n"); - unused = copy_to_user(usermem, kmem, size + 1 + OOB_TAG_OFF); - - pr_info("out-of-bounds in __copy_from_user()\n"); - unused = __copy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF); - - pr_info("out-of-bounds in __copy_to_user()\n"); - unused = __copy_to_user(usermem, kmem, size + 1 + OOB_TAG_OFF); - - pr_info("out-of-bounds in __copy_from_user_inatomic()\n"); - unused = __copy_from_user_inatomic(kmem, usermem, size + 1 + OOB_TAG_OFF); - - pr_info("out-of-bounds in __copy_to_user_inatomic()\n"); - unused = __copy_to_user_inatomic(usermem, kmem, size + 1 + OOB_TAG_OFF); - - pr_info("out-of-bounds in strncpy_from_user()\n"); - unused = strncpy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF); - - vm_munmap((unsigned long)usermem, PAGE_SIZE); - kfree(kmem); + KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); } -static noinline void __init kasan_alloca_oob_left(void) +static void kasan_alloca_oob_left(struct kunit *test) { volatile int i = 10; char alloca_array[i]; char *p = alloca_array - 1; - pr_info("out-of-bounds to left on alloca\n"); - *(volatile char *)p; + if (!IS_ENABLED(CONFIG_KASAN_STACK)) { + kunit_info(test, "CONFIG_KASAN_STACK is not enabled"); + return; + } + + KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); } -static noinline void __init kasan_alloca_oob_right(void) +static void kasan_alloca_oob_right(struct kunit *test) { volatile int i = 10; char alloca_array[i]; char *p = alloca_array + i; - 
pr_info("out-of-bounds to right on alloca\n"); - *(volatile char *)p; + if (!IS_ENABLED(CONFIG_KASAN_STACK)) { + kunit_info(test, "CONFIG_KASAN_STACK is not enabled"); + return; + } + + KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); } -static noinline void __init kmem_cache_double_free(void) +static void kmem_cache_double_free(struct kunit *test) { char *p; size_t size = 200; struct kmem_cache *cache; cache = kmem_cache_create("test_cache", size, 0, 0, NULL); - if (!cache) { - pr_err("Cache allocation failed\n"); - return; - } - pr_info("double-free on heap object\n"); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); + p = kmem_cache_alloc(cache, GFP_KERNEL); if (!p) { - pr_err("Allocation failed\n"); + kunit_err(test, "Allocation failed: %s\n", __func__); kmem_cache_destroy(cache); return; } kmem_cache_free(cache, p); - kmem_cache_free(cache, p); + KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_free(cache, p)); kmem_cache_destroy(cache); } -static noinline void __init kmem_cache_invalid_free(void) +static void kmem_cache_invalid_free(struct kunit *test) { char *p; size_t size = 200; @@ -611,20 +518,17 @@ static noinline void __init kmem_cache_invalid_free(void) cache = kmem_cache_create("test_cache", size, 0, SLAB_TYPESAFE_BY_RCU, NULL); - if (!cache) { - pr_err("Cache allocation failed\n"); - return; - } - pr_info("invalid-free of heap object\n"); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); + p = kmem_cache_alloc(cache, GFP_KERNEL); if (!p) { - pr_err("Allocation failed\n"); + kunit_err(test, "Allocation failed: %s\n", __func__); kmem_cache_destroy(cache); return; } /* Trigger invalid free, the object doesn't get freed */ - kmem_cache_free(cache, p + 1); + KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_free(cache, p + 1)); /* * Properly free the object to prevent the "Objects remaining in @@ -635,45 +539,63 @@ static noinline void __init kmem_cache_invalid_free(void) kmem_cache_destroy(cache); } -static noinline void __init kasan_memchr(void) +static void kasan_memchr(struct kunit *test) { char *ptr; size_t size = 24; - pr_info("out-of-bounds in memchr\n"); - ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); - if (!ptr) + /* See https://bugzilla.kernel.org/show_bug.cgi?id=206337 */ + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { + kunit_info(test, + "str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT"); return; + } + + ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + KUNIT_EXPECT_KASAN_FAIL(test, + kasan_ptr_result = memchr(ptr, '1', size + 1)); - kasan_ptr_result = memchr(ptr, '1', size + 1); kfree(ptr); } -static noinline void __init kasan_memcmp(void) +static void kasan_memcmp(struct kunit *test) { char *ptr; size_t size = 24; int arr[9]; - pr_info("out-of-bounds in memcmp\n"); - ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); - if (!ptr) + /* See https://bugzilla.kernel.org/show_bug.cgi?id=206337 */ + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { + kunit_info(test, + "str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT"); return; + } + ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); memset(arr, 0, sizeof(arr)); - kasan_int_result = memcmp(ptr, arr, size + 1); + + KUNIT_EXPECT_KASAN_FAIL(test, + kasan_int_result = memcmp(ptr, arr, size+1)); kfree(ptr); } -static noinline void __init kasan_strings(void) +static void kasan_strings(struct kunit *test) { char *ptr; size_t size = 24; - pr_info("use-after-free in strchr\n"); - ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); - if (!ptr) + /* See 
https://bugzilla.kernel.org/show_bug.cgi?id=206337 */ + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { + kunit_info(test, + "str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT"); return; + } + + ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); kfree(ptr); @@ -684,220 +606,164 @@ static noinline void __init kasan_strings(void) * will likely point to zeroed byte. */ ptr += 16; - kasan_ptr_result = strchr(ptr, '1'); + KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = strchr(ptr, '1')); - pr_info("use-after-free in strrchr\n"); - kasan_ptr_result = strrchr(ptr, '1'); + KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = strrchr(ptr, '1')); - pr_info("use-after-free in strcmp\n"); - kasan_int_result = strcmp(ptr, "2"); + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strcmp(ptr, "2")); - pr_info("use-after-free in strncmp\n"); - kasan_int_result = strncmp(ptr, "2", 1); + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strncmp(ptr, "2", 1)); - pr_info("use-after-free in strlen\n"); - kasan_int_result = strlen(ptr); + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strlen(ptr)); - pr_info("use-after-free in strnlen\n"); - kasan_int_result = strnlen(ptr, 1); + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strnlen(ptr, 1)); } -static noinline void __init kasan_bitops(void) +static void kasan_bitops(struct kunit *test) { /* * Allocate 1 more byte, which causes kzalloc to round up to 16-bytes; * this way we do not actually corrupt other memory. */ long *bits = kzalloc(sizeof(*bits) + 1, GFP_KERNEL); - if (!bits) - return; + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bits); /* * Below calls try to access bit within allocated memory; however, the * below accesses are still out-of-bounds, since bitops are defined to * operate on the whole long the bit is in. */ - pr_info("out-of-bounds in set_bit\n"); - set_bit(BITS_PER_LONG, bits); + KUNIT_EXPECT_KASAN_FAIL(test, set_bit(BITS_PER_LONG, bits)); - pr_info("out-of-bounds in __set_bit\n"); - __set_bit(BITS_PER_LONG, bits); + KUNIT_EXPECT_KASAN_FAIL(test, __set_bit(BITS_PER_LONG, bits)); - pr_info("out-of-bounds in clear_bit\n"); - clear_bit(BITS_PER_LONG, bits); + KUNIT_EXPECT_KASAN_FAIL(test, clear_bit(BITS_PER_LONG, bits)); - pr_info("out-of-bounds in __clear_bit\n"); - __clear_bit(BITS_PER_LONG, bits); + KUNIT_EXPECT_KASAN_FAIL(test, __clear_bit(BITS_PER_LONG, bits)); - pr_info("out-of-bounds in clear_bit_unlock\n"); - clear_bit_unlock(BITS_PER_LONG, bits); + KUNIT_EXPECT_KASAN_FAIL(test, clear_bit_unlock(BITS_PER_LONG, bits)); - pr_info("out-of-bounds in __clear_bit_unlock\n"); - __clear_bit_unlock(BITS_PER_LONG, bits); + KUNIT_EXPECT_KASAN_FAIL(test, __clear_bit_unlock(BITS_PER_LONG, bits)); - pr_info("out-of-bounds in change_bit\n"); - change_bit(BITS_PER_LONG, bits); + KUNIT_EXPECT_KASAN_FAIL(test, change_bit(BITS_PER_LONG, bits)); - pr_info("out-of-bounds in __change_bit\n"); - __change_bit(BITS_PER_LONG, bits); + KUNIT_EXPECT_KASAN_FAIL(test, __change_bit(BITS_PER_LONG, bits)); /* * Below calls try to access bit beyond allocated memory. 
*/ - pr_info("out-of-bounds in test_and_set_bit\n"); - test_and_set_bit(BITS_PER_LONG + BITS_PER_BYTE, bits); + KUNIT_EXPECT_KASAN_FAIL(test, + test_and_set_bit(BITS_PER_LONG + BITS_PER_BYTE, bits)); - pr_info("out-of-bounds in __test_and_set_bit\n"); - __test_and_set_bit(BITS_PER_LONG + BITS_PER_BYTE, bits); + KUNIT_EXPECT_KASAN_FAIL(test, + __test_and_set_bit(BITS_PER_LONG + BITS_PER_BYTE, bits)); - pr_info("out-of-bounds in test_and_set_bit_lock\n"); - test_and_set_bit_lock(BITS_PER_LONG + BITS_PER_BYTE, bits); + KUNIT_EXPECT_KASAN_FAIL(test, + test_and_set_bit_lock(BITS_PER_LONG + BITS_PER_BYTE, bits)); - pr_info("out-of-bounds in test_and_clear_bit\n"); - test_and_clear_bit(BITS_PER_LONG + BITS_PER_BYTE, bits); + KUNIT_EXPECT_KASAN_FAIL(test, + test_and_clear_bit(BITS_PER_LONG + BITS_PER_BYTE, bits)); - pr_info("out-of-bounds in __test_and_clear_bit\n"); - __test_and_clear_bit(BITS_PER_LONG + BITS_PER_BYTE, bits); + KUNIT_EXPECT_KASAN_FAIL(test, + __test_and_clear_bit(BITS_PER_LONG + BITS_PER_BYTE, bits)); - pr_info("out-of-bounds in test_and_change_bit\n"); - test_and_change_bit(BITS_PER_LONG + BITS_PER_BYTE, bits); + KUNIT_EXPECT_KASAN_FAIL(test, + test_and_change_bit(BITS_PER_LONG + BITS_PER_BYTE, bits)); - pr_info("out-of-bounds in __test_and_change_bit\n"); - __test_and_change_bit(BITS_PER_LONG + BITS_PER_BYTE, bits); + KUNIT_EXPECT_KASAN_FAIL(test, + __test_and_change_bit(BITS_PER_LONG + BITS_PER_BYTE, bits)); - pr_info("out-of-bounds in test_bit\n"); - kasan_int_result = test_bit(BITS_PER_LONG + BITS_PER_BYTE, bits); + KUNIT_EXPECT_KASAN_FAIL(test, + kasan_int_result = + test_bit(BITS_PER_LONG + BITS_PER_BYTE, bits)); #if defined(clear_bit_unlock_is_negative_byte) - pr_info("out-of-bounds in clear_bit_unlock_is_negative_byte\n"); - kasan_int_result = clear_bit_unlock_is_negative_byte(BITS_PER_LONG + - BITS_PER_BYTE, bits); + KUNIT_EXPECT_KASAN_FAIL(test, + kasan_int_result = clear_bit_unlock_is_negative_byte( + BITS_PER_LONG + BITS_PER_BYTE, bits)); #endif kfree(bits); } -static noinline void __init kmalloc_double_kzfree(void) +static void kmalloc_double_kzfree(struct kunit *test) { char *ptr; size_t size = 16; - pr_info("double-free (kfree_sensitive)\n"); ptr = kmalloc(size, GFP_KERNEL); - if (!ptr) { - pr_err("Allocation failed\n"); - return; - } + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); kfree_sensitive(ptr); - kfree_sensitive(ptr); + KUNIT_EXPECT_KASAN_FAIL(test, kfree_sensitive(ptr)); } -#ifdef CONFIG_KASAN_VMALLOC -static noinline void __init vmalloc_oob(void) +static void vmalloc_oob(struct kunit *test) { void *area; - pr_info("vmalloc out-of-bounds\n"); + if (!IS_ENABLED(CONFIG_KASAN_VMALLOC)) { + kunit_info(test, "CONFIG_KASAN_VMALLOC is not enabled."); + return; + } /* * We have to be careful not to hit the guard page. * The MMU will catch that and crash us. 
*/ area = vmalloc(3000); - if (!area) { - pr_err("Allocation failed\n"); - return; - } + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, area); - ((volatile char *)area)[3100]; + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)area)[3100]); vfree(area); } -#else -static void __init vmalloc_oob(void) {} -#endif -static struct kasan_rcu_info { - int i; - struct rcu_head rcu; -} *global_rcu_ptr; +static struct kunit_case kasan_kunit_test_cases[] = { + KUNIT_CASE(kmalloc_oob_right), + KUNIT_CASE(kmalloc_oob_left), + KUNIT_CASE(kmalloc_node_oob_right), + KUNIT_CASE(kmalloc_pagealloc_oob_right), + KUNIT_CASE(kmalloc_pagealloc_uaf), + KUNIT_CASE(kmalloc_pagealloc_invalid_free), + KUNIT_CASE(kmalloc_large_oob_right), + KUNIT_CASE(kmalloc_oob_krealloc_more), + KUNIT_CASE(kmalloc_oob_krealloc_less), + KUNIT_CASE(kmalloc_oob_16), + KUNIT_CASE(kmalloc_oob_in_memset), + KUNIT_CASE(kmalloc_oob_memset_2), + KUNIT_CASE(kmalloc_oob_memset_4), + KUNIT_CASE(kmalloc_oob_memset_8), + KUNIT_CASE(kmalloc_oob_memset_16), + KUNIT_CASE(kmalloc_memmove_invalid_size), + KUNIT_CASE(kmalloc_uaf), + KUNIT_CASE(kmalloc_uaf_memset), + KUNIT_CASE(kmalloc_uaf2), + KUNIT_CASE(kfree_via_page), + KUNIT_CASE(kfree_via_phys), + KUNIT_CASE(kmem_cache_oob), + KUNIT_CASE(memcg_accounted_kmem_cache), + KUNIT_CASE(kasan_global_oob), + KUNIT_CASE(kasan_stack_oob), + KUNIT_CASE(kasan_alloca_oob_left), + KUNIT_CASE(kasan_alloca_oob_right), + KUNIT_CASE(ksize_unpoisons_memory), + KUNIT_CASE(kmem_cache_double_free), + KUNIT_CASE(kmem_cache_invalid_free), + KUNIT_CASE(kasan_memchr), + KUNIT_CASE(kasan_memcmp), + KUNIT_CASE(kasan_strings), + KUNIT_CASE(kasan_bitops), + KUNIT_CASE(kmalloc_double_kzfree), + KUNIT_CASE(vmalloc_oob), + {} +}; -static noinline void __init kasan_rcu_reclaim(struct rcu_head *rp) -{ - struct kasan_rcu_info *fp = container_of(rp, - struct kasan_rcu_info, rcu); +static struct kunit_suite kasan_kunit_test_suite = { + .name = "kasan", + .init = kasan_test_init, + .test_cases = kasan_kunit_test_cases, + .exit = kasan_test_exit, +}; - kfree(fp); - fp->i = 1; -} +kunit_test_suite(kasan_kunit_test_suite); -static noinline void __init kasan_rcu_uaf(void) -{ - struct kasan_rcu_info *ptr; - - pr_info("use-after-free in kasan_rcu_reclaim\n"); - ptr = kmalloc(sizeof(struct kasan_rcu_info), GFP_KERNEL); - if (!ptr) { - pr_err("Allocation failed\n"); - return; - } - - global_rcu_ptr = rcu_dereference_protected(ptr, NULL); - call_rcu(&global_rcu_ptr->rcu, kasan_rcu_reclaim); -} - -static int __init kmalloc_tests_init(void) -{ - /* - * Temporarily enable multi-shot mode. Otherwise, we'd only get a - * report for the first case. 
- */ - bool multishot = kasan_save_enable_multi_shot(); - - kmalloc_oob_right(); - kmalloc_oob_left(); - kmalloc_node_oob_right(); -#ifdef CONFIG_SLUB - kmalloc_pagealloc_oob_right(); - kmalloc_pagealloc_uaf(); - kmalloc_pagealloc_invalid_free(); -#endif - kmalloc_large_oob_right(); - kmalloc_oob_krealloc_more(); - kmalloc_oob_krealloc_less(); - kmalloc_oob_16(); - kmalloc_oob_in_memset(); - kmalloc_oob_memset_2(); - kmalloc_oob_memset_4(); - kmalloc_oob_memset_8(); - kmalloc_oob_memset_16(); - kmalloc_memmove_invalid_size(); - kmalloc_uaf(); - kmalloc_uaf_memset(); - kmalloc_uaf2(); - kfree_via_page(); - kfree_via_phys(); - kmem_cache_oob(); - memcg_accounted_kmem_cache(); - kasan_stack_oob(); - kasan_global_oob(); - kasan_alloca_oob_left(); - kasan_alloca_oob_right(); - ksize_unpoisons_memory(); - copy_user_test(); - kmem_cache_double_free(); - kmem_cache_invalid_free(); - kasan_memchr(); - kasan_memcmp(); - kasan_strings(); - kasan_bitops(); - kmalloc_double_kzfree(); - vmalloc_oob(); - kasan_rcu_uaf(); - - kasan_restore_multi_shot(multishot); - - return -EAGAIN; -} - -module_init(kmalloc_tests_init); MODULE_LICENSE("GPL"); diff --git a/lib/test_kasan_module.c b/lib/test_kasan_module.c new file mode 100644 index 000000000000..2d68db6ae67b --- /dev/null +++ b/lib/test_kasan_module.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. + * Author: Andrey Ryabinin + */ + +#define pr_fmt(fmt) "kasan test: %s " fmt, __func__ + +#include +#include +#include +#include +#include + +#include "../mm/kasan/kasan.h" + +#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_SHADOW_SCALE_SIZE) + +static noinline void __init copy_user_test(void) +{ + char *kmem; + char __user *usermem; + size_t size = 10; + int unused; + + kmem = kmalloc(size, GFP_KERNEL); + if (!kmem) + return; + + usermem = (char __user *)vm_mmap(NULL, 0, PAGE_SIZE, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANONYMOUS | MAP_PRIVATE, 0); + if (IS_ERR(usermem)) { + pr_err("Failed to allocate user memory\n"); + kfree(kmem); + return; + } + + pr_info("out-of-bounds in copy_from_user()\n"); + unused = copy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF); + + pr_info("out-of-bounds in copy_to_user()\n"); + unused = copy_to_user(usermem, kmem, size + 1 + OOB_TAG_OFF); + + pr_info("out-of-bounds in __copy_from_user()\n"); + unused = __copy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF); + + pr_info("out-of-bounds in __copy_to_user()\n"); + unused = __copy_to_user(usermem, kmem, size + 1 + OOB_TAG_OFF); + + pr_info("out-of-bounds in __copy_from_user_inatomic()\n"); + unused = __copy_from_user_inatomic(kmem, usermem, size + 1 + OOB_TAG_OFF); + + pr_info("out-of-bounds in __copy_to_user_inatomic()\n"); + unused = __copy_to_user_inatomic(usermem, kmem, size + 1 + OOB_TAG_OFF); + + pr_info("out-of-bounds in strncpy_from_user()\n"); + unused = strncpy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF); + + vm_munmap((unsigned long)usermem, PAGE_SIZE); + kfree(kmem); +} + +static struct kasan_rcu_info { + int i; + struct rcu_head rcu; +} *global_rcu_ptr; + +static noinline void __init kasan_rcu_reclaim(struct rcu_head *rp) +{ + struct kasan_rcu_info *fp = container_of(rp, + struct kasan_rcu_info, rcu); + + kfree(fp); + fp->i = 1; +} + +static noinline void __init kasan_rcu_uaf(void) +{ + struct kasan_rcu_info *ptr; + + pr_info("use-after-free in kasan_rcu_reclaim\n"); + ptr = kmalloc(sizeof(struct kasan_rcu_info), GFP_KERNEL); + if (!ptr) { + pr_err("Allocation 
failed\n"); + return; + } + + global_rcu_ptr = rcu_dereference_protected(ptr, NULL); + call_rcu(&global_rcu_ptr->rcu, kasan_rcu_reclaim); +} + + +static int __init test_kasan_module_init(void) +{ + /* + * Temporarily enable multi-shot mode. Otherwise, we'd only get a + * report for the first case. + */ + bool multishot = kasan_save_enable_multi_shot(); + + copy_user_test(); + kasan_rcu_uaf(); + + kasan_restore_multi_shot(multishot); + return -EAGAIN; +} + +module_init(test_kasan_module_init); +MODULE_LICENSE("GPL"); diff --git a/mm/Kconfig b/mm/Kconfig index e3ee7b32c637..8c60c49a123b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -831,10 +831,10 @@ config PERCPU_STATS be used to help understand percpu memory usage. config GUP_BENCHMARK - bool "Enable infrastructure for get_user_pages_fast() benchmarking" + bool "Enable infrastructure for get_user_pages() and related calls benchmarking" help Provides /sys/kernel/debug/gup_benchmark that helps with testing - performance of get_user_pages_fast(). + performance of get_user_pages() and related calls. See tools/testing/selftests/vm/gup_benchmark.c diff --git a/mm/Makefile b/mm/Makefile index d5649f1c12c0..d73aed0fc99c 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -94,7 +94,6 @@ obj-$(CONFIG_GUP_BENCHMARK) += gup_benchmark.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o -obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o obj-$(CONFIG_PAGE_OWNER) += page_owner.o diff --git a/mm/compaction.c b/mm/compaction.c index 176dcded298e..6c63844fc061 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -180,11 +180,10 @@ bool compaction_deferred(struct zone *zone, int order) return false; /* Avoid possible overflow */ - if (++zone->compact_considered > defer_limit) + if (++zone->compact_considered >= defer_limit) { zone->compact_considered = defer_limit; - - if (zone->compact_considered >= defer_limit) return false; + } trace_mm_compaction_deferred(zone, order); diff --git a/mm/debug.c b/mm/debug.c index ca8d1cacdecc..ccca576b2899 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -102,12 +102,12 @@ void __dump_page(struct page *page, const char *reason) if (hpage_pincount_available(page)) { pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n", head, compound_order(head), - head_mapcount(head), - head_pincount(head)); + head_compound_mapcount(head), + head_compound_pincount(head)); } else { pr_warn("head:%p order:%u compound_mapcount:%d\n", head, compound_order(head), - head_mapcount(head)); + head_compound_mapcount(head)); } } if (PageKsm(page)) @@ -120,6 +120,7 @@ void __dump_page(struct page *page, const char *reason) struct hlist_node *dentry_first; struct dentry *dentry_ptr; struct dentry dentry; + unsigned long ino; /* * mapping can be invalid pointer and we don't want to crash @@ -136,21 +137,22 @@ void __dump_page(struct page *page, const char *reason) goto out_mapping; } - if (get_kernel_nofault(dentry_first, &host->i_dentry.first)) { + if (get_kernel_nofault(dentry_first, &host->i_dentry.first) || + get_kernel_nofault(ino, &host->i_ino)) { pr_warn("aops:%ps with invalid host inode %px\n", a_ops, host); goto out_mapping; } if (!dentry_first) { - pr_warn("aops:%ps ino:%lx\n", a_ops, host->i_ino); + pr_warn("aops:%ps ino:%lx\n", a_ops, ino); goto out_mapping; } dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); if 
(get_kernel_nofault(dentry, dentry_ptr)) { - pr_warn("aops:%ps with invalid dentry %px\n", a_ops, - dentry_ptr); + pr_warn("aops:%ps ino:%lx with invalid dentry %px\n", + a_ops, ino, dentry_ptr); } else { /* * if dentry is corrupted, the %pd handler may still @@ -158,7 +160,7 @@ void __dump_page(struct page *page, const char *reason) * corrupted struct page */ pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n", - a_ops, host->i_ino, &dentry); + a_ops, ino, &dentry); } } out_mapping: diff --git a/mm/dmapool.c b/mm/dmapool.c index f9fb9bbd733e..a97c97232337 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -266,6 +266,7 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page) */ void dma_pool_destroy(struct dma_pool *pool) { + struct dma_page *page, *tmp; bool empty = false; if (unlikely(!pool)) @@ -281,17 +282,13 @@ void dma_pool_destroy(struct dma_pool *pool) device_remove_file(pool->dev, &dev_attr_pools); mutex_unlock(&pools_reg_lock); - while (!list_empty(&pool->page_list)) { - struct dma_page *page; - page = list_entry(pool->page_list.next, - struct dma_page, page_list); + list_for_each_entry_safe(page, tmp, &pool->page_list, page_list) { if (is_page_busy(page)) { if (pool->dev) - dev_err(pool->dev, - "dma_pool_destroy %s, %p busy\n", + dev_err(pool->dev, "%s %s, %p busy\n", __func__, pool->name, page->vaddr); else - pr_err("dma_pool_destroy %s, %p busy\n", + pr_err("%s %s, %p busy\n", __func__, pool->name, page->vaddr); /* leak the still-in-use consistent memory */ list_del(&page->page_list); @@ -355,12 +352,11 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, if (data[i] == POOL_POISON_FREED) continue; if (pool->dev) - dev_err(pool->dev, - "dma_pool_alloc %s, %p (corrupted)\n", - pool->name, retval); + dev_err(pool->dev, "%s %s, %p (corrupted)\n", + __func__, pool->name, retval); else - pr_err("dma_pool_alloc %s, %p (corrupted)\n", - pool->name, retval); + pr_err("%s %s, %p (corrupted)\n", + __func__, pool->name, retval); /* * Dump the first 4 bytes even if they are not @@ -416,12 +412,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) if (!page) { spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) - dev_err(pool->dev, - "dma_pool_free %s, %p/%lx (bad dma)\n", - pool->name, vaddr, (unsigned long)dma); + dev_err(pool->dev, "%s %s, %p/%pad (bad dma)\n", + __func__, pool->name, vaddr, &dma); else - pr_err("dma_pool_free %s, %p/%lx (bad dma)\n", - pool->name, vaddr, (unsigned long)dma); + pr_err("%s %s, %p/%pad (bad dma)\n", + __func__, pool->name, vaddr, &dma); return; } @@ -432,12 +427,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) if ((dma - page->dma) != offset) { spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) - dev_err(pool->dev, - "dma_pool_free %s, %p (bad vaddr)/%pad\n", - pool->name, vaddr, &dma); + dev_err(pool->dev, "%s %s, %p (bad vaddr)/%pad\n", + __func__, pool->name, vaddr, &dma); else - pr_err("dma_pool_free %s, %p (bad vaddr)/%pad\n", - pool->name, vaddr, &dma); + pr_err("%s %s, %p (bad vaddr)/%pad\n", + __func__, pool->name, vaddr, &dma); return; } { @@ -449,11 +443,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) } spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) - dev_err(pool->dev, "dma_pool_free %s, dma %pad already free\n", - pool->name, &dma); + dev_err(pool->dev, "%s %s, dma %pad already free\n", + __func__, pool->name, &dma); else - pr_err("dma_pool_free %s, dma %pad already free\n", - pool->name, &dma); + pr_err("%s %s, 
dma %pad already free\n", + __func__, pool->name, &dma); return; } } diff --git a/mm/fadvise.c b/mm/fadvise.c index 0e66f2aaeea3..d6baa4f451c5 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -141,7 +141,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) } if (end_index >= start_index) { - unsigned long count; + unsigned long nr_pagevec = 0; /* * It's common to FADV_DONTNEED right after @@ -154,8 +154,9 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) */ lru_add_drain(); - count = invalidate_mapping_pages(mapping, - start_index, end_index); + invalidate_mapping_pagevec(mapping, + start_index, end_index, + &nr_pagevec); /* * If fewer pages were invalidated than expected then @@ -163,7 +164,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) * a per-cpu pagevec for a remote CPU. Drain all * pagevecs and try again. */ - if (count < (end_index - start_index + 1)) { + if (nr_pagevec) { lru_add_drain_all(); invalidate_mapping_pages(mapping, start_index, end_index); diff --git a/mm/filemap.c b/mm/filemap.c index 748b7b1b4f6d..38546dca58fe 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1645,19 +1645,19 @@ EXPORT_SYMBOL(page_cache_prev_miss); /** * find_get_entry - find and get a page cache entry * @mapping: the address_space to search - * @offset: the page cache index + * @index: The page cache index. * * Looks up the page cache slot at @mapping & @offset. If there is a - * page cache page, it is returned with an increased refcount. + * page cache page, the head page is returned with an increased refcount. * * If the slot holds a shadow entry of a previously evicted page, or a * swap entry from shmem/tmpfs, it is returned. * - * Return: the found page or shadow entry, %NULL if nothing is found. + * Return: The head page or shadow entry, %NULL if nothing is found. */ -struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) +struct page *find_get_entry(struct address_space *mapping, pgoff_t index) { - XA_STATE(xas, &mapping->i_pages, offset); + XA_STATE(xas, &mapping->i_pages, index); struct page *page; rcu_read_lock(); @@ -1685,7 +1685,6 @@ repeat: put_page(page); goto repeat; } - page = find_subpage(page, offset); out: rcu_read_unlock(); @@ -1693,40 +1692,37 @@ out: } /** - * find_lock_entry - locate, pin and lock a page cache entry - * @mapping: the address_space to search - * @offset: the page cache index + * find_lock_entry - Locate and lock a page cache entry. + * @mapping: The address_space to search. + * @index: The page cache index. * - * Looks up the page cache slot at @mapping & @offset. If there is a - * page cache page, it is returned locked and with an increased - * refcount. + * Looks up the page at @mapping & @index. If there is a page in the + * cache, the head page is returned locked and with an increased refcount. * * If the slot holds a shadow entry of a previously evicted page, or a * swap entry from shmem/tmpfs, it is returned. * - * find_lock_entry() may sleep. - * - * Return: the found page or shadow entry, %NULL if nothing is found. + * Context: May sleep. + * Return: The head page or shadow entry, %NULL if nothing is found. 
*/ -struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) +struct page *find_lock_entry(struct address_space *mapping, pgoff_t index) { struct page *page; repeat: - page = find_get_entry(mapping, offset); + page = find_get_entry(mapping, index); if (page && !xa_is_value(page)) { lock_page(page); /* Has the page been truncated? */ - if (unlikely(page_mapping(page) != mapping)) { + if (unlikely(page->mapping != mapping)) { unlock_page(page); put_page(page); goto repeat; } - VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); + VM_BUG_ON_PAGE(!thp_contains(page, index), page); } return page; } -EXPORT_SYMBOL(find_lock_entry); /** * pagecache_get_page - Find and get a reference to a page. @@ -1741,6 +1737,8 @@ EXPORT_SYMBOL(find_lock_entry); * * * %FGP_ACCESSED - The page will be marked accessed. * * %FGP_LOCK - The page is returned locked. + * * %FGP_HEAD - If the page is present and a THP, return the head page + * rather than the exact page specified by the index. * * %FGP_CREAT - If no page is present then a new page is allocated using * @gfp_mask and added to the page cache and the VM's LRU list. * The page is returned locked and with an increased refcount. @@ -1781,12 +1779,12 @@ repeat: } /* Has the page been truncated? */ - if (unlikely(compound_head(page)->mapping != mapping)) { + if (unlikely(page->mapping != mapping)) { unlock_page(page); put_page(page); goto repeat; } - VM_BUG_ON_PAGE(page->index != index, page); + VM_BUG_ON_PAGE(!thp_contains(page, index), page); } if (fgp_flags & FGP_ACCESSED) @@ -1796,6 +1794,8 @@ repeat: if (page_is_idle(page)) clear_page_idle(page); } + if (!(fgp_flags & FGP_HEAD)) + page = find_subpage(page, index); no_page: if (!page && (fgp_flags & FGP_CREAT)) { @@ -2793,42 +2793,42 @@ void filemap_map_pages(struct vm_fault *vmf, pgoff_t last_pgoff = start_pgoff; unsigned long max_idx; XA_STATE(xas, &mapping->i_pages, start_pgoff); - struct page *page; + struct page *head, *page; unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); rcu_read_lock(); - xas_for_each(&xas, page, end_pgoff) { - if (xas_retry(&xas, page)) + xas_for_each(&xas, head, end_pgoff) { + if (xas_retry(&xas, head)) continue; - if (xa_is_value(page)) + if (xa_is_value(head)) goto next; /* * Check for a locked page first, as a speculative * reference may adversely influence page migration. */ - if (PageLocked(page)) + if (PageLocked(head)) goto next; - if (!page_cache_get_speculative(page)) + if (!page_cache_get_speculative(head)) goto next; /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) + if (unlikely(head != xas_reload(&xas))) goto skip; - page = find_subpage(page, xas.xa_index); + page = find_subpage(head, xas.xa_index); - if (!PageUptodate(page) || + if (!PageUptodate(head) || PageReadahead(page) || PageHWPoison(page)) goto skip; - if (!trylock_page(page)) + if (!trylock_page(head)) goto skip; - if (page->mapping != mapping || !PageUptodate(page)) + if (head->mapping != mapping || !PageUptodate(head)) goto unlock; max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); - if (page->index >= max_idx) + if (xas.xa_index >= max_idx) goto unlock; if (mmap_miss > 0) @@ -2840,12 +2840,12 @@ void filemap_map_pages(struct vm_fault *vmf, last_pgoff = xas.xa_index; if (alloc_set_pte(vmf, page)) goto unlock; - unlock_page(page); + unlock_page(head); goto next; unlock: - unlock_page(page); + unlock_page(head); skip: - put_page(page); + put_page(head); next: /* Huge page is mapped? No need to proceed. 
*/ if (pmd_trans_huge(*vmf->pmd)) diff --git a/mm/gup.c b/mm/gup.c index e869c634cc9a..ad617e7f22f5 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -328,6 +328,13 @@ void unpin_user_pages(struct page **pages, unsigned long npages) { unsigned long index; + /* + * If this WARN_ON() fires, then the system *might* be leaking pages (by + * leaving them pinned), but probably not. More likely, gup/pup returned + * a hard -ERRNO error to the caller, who erroneously passed it here. + */ + if (WARN_ON(IS_ERR_VALUE(npages))) + return; /* * TODO: this can be optimized for huge pages: if a series of pages is * physically contiguous and part of the same compound page, then a @@ -1747,6 +1754,25 @@ static __always_inline long __gup_longterm_locked(struct mm_struct *mm, } #endif /* CONFIG_FS_DAX || CONFIG_CMA */ +static bool is_valid_gup_flags(unsigned int gup_flags) +{ + /* + * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, + * never directly by the caller, so enforce that with an assertion: + */ + if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) + return false; + /* + * FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying + * that is, FOLL_LONGTERM is a specific case, more restrictive case of + * FOLL_PIN. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return false; + + return true; +} + #ifdef CONFIG_MMU static long __get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, @@ -1842,11 +1868,7 @@ long get_user_pages_remote(struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) { - /* - * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, - * never directly by the caller, so enforce that with an assertion: - */ - if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) + if (!is_valid_gup_flags(gup_flags)) return -EINVAL; return __get_user_pages_remote(mm, start, nr_pages, gup_flags, @@ -1892,11 +1914,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { - /* - * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, - * never directly by the caller, so enforce that with an assertion: - */ - if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) + if (!is_valid_gup_flags(gup_flags)) return -EINVAL; return __gup_longterm_locked(current->mm, start, nr_pages, @@ -2786,11 +2804,7 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast_only); int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { - /* - * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, - * never directly by the caller, so enforce that: - */ - if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) + if (!is_valid_gup_flags(gup_flags)) return -EINVAL; /* diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index be690fa66a46..464cae1fa3ea 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -6,10 +6,10 @@ #include #define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) -#define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark) -#define GUP_BENCHMARK _IOWR('g', 3, struct gup_benchmark) -#define PIN_FAST_BENCHMARK _IOWR('g', 4, struct gup_benchmark) -#define PIN_BENCHMARK _IOWR('g', 5, struct gup_benchmark) +#define GUP_BENCHMARK _IOWR('g', 2, struct gup_benchmark) +#define PIN_FAST_BENCHMARK _IOWR('g', 3, struct gup_benchmark) +#define PIN_BENCHMARK _IOWR('g', 4, struct gup_benchmark) +#define PIN_LONGTERM_BENCHMARK _IOWR('g', 5, struct gup_benchmark) struct gup_benchmark { __u64 
get_delta_usec; @@ -28,7 +28,6 @@ static void put_back_pages(unsigned int cmd, struct page **pages, switch (cmd) { case GUP_FAST_BENCHMARK: - case GUP_LONGTERM_BENCHMARK: case GUP_BENCHMARK: for (i = 0; i < nr_pages; i++) put_page(pages[i]); @@ -36,6 +35,7 @@ static void put_back_pages(unsigned int cmd, struct page **pages, case PIN_FAST_BENCHMARK: case PIN_BENCHMARK: + case PIN_LONGTERM_BENCHMARK: unpin_user_pages(pages, nr_pages); break; } @@ -50,6 +50,7 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages, switch (cmd) { case PIN_FAST_BENCHMARK: case PIN_BENCHMARK: + case PIN_LONGTERM_BENCHMARK: for (i = 0; i < nr_pages; i++) { page = pages[i]; if (WARN(!page_maybe_dma_pinned(page), @@ -101,11 +102,6 @@ static int __gup_benchmark_ioctl(unsigned int cmd, nr = get_user_pages_fast(addr, nr, gup->flags, pages + i); break; - case GUP_LONGTERM_BENCHMARK: - nr = get_user_pages(addr, nr, - gup->flags | FOLL_LONGTERM, - pages + i, NULL); - break; case GUP_BENCHMARK: nr = get_user_pages(addr, nr, gup->flags, pages + i, NULL); @@ -118,6 +114,11 @@ static int __gup_benchmark_ioctl(unsigned int cmd, nr = pin_user_pages(addr, nr, gup->flags, pages + i, NULL); break; + case PIN_LONGTERM_BENCHMARK: + nr = pin_user_pages(addr, nr, + gup->flags | FOLL_LONGTERM, + pages + i, NULL); + break; default: kvfree(pages); ret = -EINVAL; @@ -162,10 +163,10 @@ static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd, switch (cmd) { case GUP_FAST_BENCHMARK: - case GUP_LONGTERM_BENCHMARK: case GUP_BENCHMARK: case PIN_FAST_BENCHMARK: case PIN_BENCHMARK: + case PIN_LONGTERM_BENCHMARK: break; default: return -EINVAL; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ec0f0cc49545..65c289c13b58 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2306,13 +2306,13 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, /* * If we're also updating the vma->vm_next->vm_start, if the new - * vm_next->vm_start isn't page aligned and it could previously + * vm_next->vm_start isn't hpage aligned and it could previously * contain an hugepage: check if we need to split an huge pmd. */ if (adjust_next > 0) { struct vm_area_struct *next = vma->vm_next; unsigned long nstart = next->vm_start; - nstart += adjust_next << PAGE_SHIFT; + nstart += adjust_next; if (nstart & ~HPAGE_PMD_MASK && (nstart & HPAGE_PMD_MASK) >= next->vm_start && (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 67fc6383995b..2fb9a4c7a161 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -240,7 +240,6 @@ get_file_region_entry_from_cache(struct resv_map *resv, long from, long to) resv->region_cache_count--; nrg = list_first_entry(&resv->region_cache, struct file_region, link); - VM_BUG_ON(!nrg); list_del(&nrg->link); nrg->from = from; @@ -309,8 +308,7 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) list_del(&rg->link); kfree(rg); - coalesce_file_region(resv, prg); - return; + rg = prg; } nrg = list_next_entry(rg, link); @@ -320,22 +318,20 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) list_del(&rg->link); kfree(rg); - - coalesce_file_region(resv, nrg); - return; } } -/* Must be called with resv->lock held. Calling this with count_only == true - * will count the number of pages to be added but will not modify the linked - * list. 
If regions_needed != NULL and count_only == true, then regions_needed - * will indicate the number of file_regions needed in the cache to carry out to - * add the regions for this range. +/* + * Must be called with resv->lock held. + * + * Calling this with regions_needed != NULL will count the number of pages + * to be added but will not modify the linked list. And regions_needed will + * indicate the number of file_regions needed in the cache to carry out to add + * the regions for this range. */ static long add_reservation_in_range(struct resv_map *resv, long f, long t, struct hugetlb_cgroup *h_cg, - struct hstate *h, long *regions_needed, - bool count_only) + struct hstate *h, long *regions_needed) { long add = 0; struct list_head *head = &resv->regions; @@ -371,14 +367,14 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t, */ if (rg->from > last_accounted_offset) { add += rg->from - last_accounted_offset; - if (!count_only) { + if (!regions_needed) { nrg = get_file_region_entry_from_cache( resv, last_accounted_offset, rg->from); record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg); list_add(&nrg->link, rg->link.prev); coalesce_file_region(resv, nrg); - } else if (regions_needed) + } else *regions_needed += 1; } @@ -390,13 +386,13 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t, */ if (last_accounted_offset < t) { add += t - last_accounted_offset; - if (!count_only) { + if (!regions_needed) { nrg = get_file_region_entry_from_cache( resv, last_accounted_offset, t); record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg); list_add(&nrg->link, rg->link.prev); coalesce_file_region(resv, nrg); - } else if (regions_needed) + } else *regions_needed += 1; } @@ -448,11 +444,8 @@ static int allocate_file_region_entries(struct resv_map *resv, spin_lock(&resv->lock); - list_for_each_entry_safe(rg, trg, &allocated_regions, link) { - list_del(&rg->link); - list_add(&rg->link, &resv->region_cache); - resv->region_cache_count++; - } + list_splice(&allocated_regions, &resv->region_cache); + resv->region_cache_count += to_allocate; } return 0; @@ -492,8 +485,8 @@ static long region_add(struct resv_map *resv, long f, long t, retry: /* Count how many regions are actually needed to execute this add. */ - add_reservation_in_range(resv, f, t, NULL, NULL, &actual_regions_needed, - true); + add_reservation_in_range(resv, f, t, NULL, NULL, + &actual_regions_needed); /* * Check for sufficient descriptors in the cache to accommodate @@ -521,7 +514,7 @@ retry: goto retry; } - add = add_reservation_in_range(resv, f, t, h_cg, h, NULL, false); + add = add_reservation_in_range(resv, f, t, h_cg, h, NULL); resv->adds_in_progress -= in_regions_needed; @@ -557,9 +550,9 @@ static long region_chg(struct resv_map *resv, long f, long t, spin_lock(&resv->lock); - /* Count how many hugepages in this range are NOT respresented. */ + /* Count how many hugepages in this range are NOT represented. 
*/ chg = add_reservation_in_range(resv, f, t, NULL, NULL, - out_regions_needed, true); + out_regions_needed); if (*out_regions_needed == 0) *out_regions_needed = 1; @@ -1047,21 +1040,17 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) if (nocma && is_migrate_cma_page(page)) continue; - if (!PageHWPoison(page)) - break; + if (PageHWPoison(page)) + continue; + + list_move(&page->lru, &h->hugepage_activelist); + set_page_refcounted(page); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + return page; } - /* - * if 'non-isolated free hugepage' not found on the list, - * the allocation fails. - */ - if (&h->hugepage_freelists[nid] == &page->lru) - return NULL; - list_move(&page->lru, &h->hugepage_activelist); - set_page_refcounted(page); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; - return page; + return NULL; } static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid, @@ -1511,9 +1500,9 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); - spin_lock(&hugetlb_lock); set_hugetlb_cgroup(page, NULL); set_hugetlb_cgroup_rsvd(page, NULL); + spin_lock(&hugetlb_lock); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; spin_unlock(&hugetlb_lock); @@ -2423,7 +2412,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, h->resv_huge_pages--; } spin_lock(&hugetlb_lock); - list_move(&page->lru, &h->hugepage_activelist); + list_add(&page->lru, &h->hugepage_activelist); /* Fall through */ } hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); @@ -3799,23 +3788,23 @@ bool is_hugetlb_entry_migration(pte_t pte) if (huge_pte_none(pte) || pte_present(pte)) return false; swp = pte_to_swp_entry(pte); - if (non_swap_entry(swp) && is_migration_entry(swp)) + if (is_migration_entry(swp)) return true; else return false; } -static int is_hugetlb_entry_hwpoisoned(pte_t pte) +static bool is_hugetlb_entry_hwpoisoned(pte_t pte) { swp_entry_t swp; if (huge_pte_none(pte) || pte_present(pte)) - return 0; + return false; swp = pte_to_swp_entry(pte); - if (non_swap_entry(swp) && is_hwpoison_entry(swp)) - return 1; + if (is_hwpoison_entry(swp)) + return true; else - return 0; + return false; } int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, @@ -5348,10 +5337,16 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, * !shared pmd case because we can allocate the pmd later as well, it makes the * code much cleaner. * - * This routine must be called with i_mmap_rwsem held in at least read mode. - * For hugetlbfs, this prevents removal of any page table entries associated - * with the address space. This is important as we are setting up sharing - * based on existing page table entries (mappings). + * This routine must be called with i_mmap_rwsem held in at least read mode if + * sharing is possible. For hugetlbfs, this prevents removal of any page + * table entries associated with the address space. This is important as we + * are setting up sharing based on existing page table entries (mappings). + * + * NOTE: This routine is only called from huge_pte_alloc. Some callers of + * huge_pte_alloc know that sharing is not possible and do not take + * i_mmap_rwsem as a performance optimization. This is handled by the + * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is + * only required for subsequent processing. 
*/ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) { @@ -5368,6 +5363,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!vma_shareable(vma, addr)) return (pte_t *)pmd_alloc(mm, pud, addr); + i_mmap_assert_locked(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; diff --git a/mm/internal.h b/mm/internal.h index 10c677655912..a801a4d51f26 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -65,6 +65,9 @@ static inline void ra_submit(struct file_ra_state *ra, ra->start, ra->size, ra->async_size); } +struct page *find_get_entry(struct address_space *mapping, pgoff_t index); +struct page *find_lock_entry(struct address_space *mapping, pgoff_t index); + /** * page_evictable - test whether a page is evictable * @page: the page to test diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 4f49fa6cd1aa..00a53f1355ae 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -33,6 +33,8 @@ #include +#include + #include "kasan.h" #include "../slab.h" @@ -93,7 +95,7 @@ static void end_report(unsigned long *flags) pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, *flags); - if (panic_on_warn) { + if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) { /* * This thread may hit another WARN() in the panic path. * Resetting this prevents additional WARN() from panicking the @@ -464,12 +466,37 @@ static bool report_enabled(void) return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); } +#if IS_ENABLED(CONFIG_KUNIT) +static void kasan_update_kunit_status(struct kunit *cur_test) +{ + struct kunit_resource *resource; + struct kunit_kasan_expectation *kasan_data; + + resource = kunit_find_named_resource(cur_test, "kasan_data"); + + if (!resource) { + kunit_set_failure(cur_test); + return; + } + + kasan_data = (struct kunit_kasan_expectation *)resource->data; + kasan_data->report_found = true; + kunit_put_resource(resource); +} +#endif /* IS_ENABLED(CONFIG_KUNIT) */ + void kasan_report_invalid_free(void *object, unsigned long ip) { unsigned long flags; u8 tag = get_tag(object); object = reset_tag(object); + +#if IS_ENABLED(CONFIG_KUNIT) + if (current->kunit_test) + kasan_update_kunit_status(current->kunit_test); +#endif /* IS_ENABLED(CONFIG_KUNIT) */ + start_report(&flags); pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip); print_tags(tag, object); @@ -488,6 +515,11 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write, void *untagged_addr; unsigned long flags; +#if IS_ENABLED(CONFIG_KUNIT) + if (current->kunit_test) + kasan_update_kunit_status(current->kunit_test); +#endif /* IS_ENABLED(CONFIG_KUNIT) */ + disable_trace_on_warning(); tagged_addr = (void *)addr; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 5e252d91eb14..c0014d3b91c1 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1471,15 +1471,15 @@ static void kmemleak_scan(void) if (kmemleak_stack_scan) { struct task_struct *p, *g; - read_lock(&tasklist_lock); - do_each_thread(g, p) { + rcu_read_lock(); + for_each_process_thread(g, p) { void *stack = try_get_task_stack(p); if (stack) { scan_block(stack, stack + THREAD_SIZE, NULL); put_task_stack(p); } - } while_each_thread(g, p); - read_unlock(&tasklist_lock); + } + rcu_read_unlock(); } /* diff --git a/mm/madvise.c b/mm/madvise.c index 6641c1157a3e..9f0a7498c4a3 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -224,25 
+224,28 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct address_space *mapping) { - pgoff_t index; + XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start)); + pgoff_t end_index = end / PAGE_SIZE; struct page *page; - swp_entry_t swap; - for (; start < end; start += PAGE_SIZE) { - index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + rcu_read_lock(); + xas_for_each(&xas, page, end_index) { + swp_entry_t swap; - page = find_get_entry(mapping, index); - if (!xa_is_value(page)) { - if (page) - put_page(page); + if (!xa_is_value(page)) continue; - } + xas_pause(&xas); + rcu_read_unlock(); + swap = radix_to_swp_entry(page); page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, NULL, 0, false); if (page) put_page(page); + + rcu_read_lock(); } + rcu_read_unlock(); lru_add_drain(); /* Push any new pages onto the LRU now */ } diff --git a/mm/memblock.c b/mm/memblock.c index 45f198750be9..165f40a8a254 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -132,7 +132,26 @@ struct memblock_type physmem = { }; #endif -int memblock_debug __initdata_memblock; +/* + * keep a pointer to &memblock.memory in the text section to use it in + * __next_mem_range() and its helpers. + * For architectures that do not keep memblock data after init, this + * pointer will be reset to NULL at memblock_discard() + */ +static __refdata struct memblock_type *memblock_memory = &memblock.memory; + +#define for_each_memblock_type(i, memblock_type, rgn) \ + for (i = 0, rgn = &memblock_type->regions[0]; \ + i < memblock_type->cnt; \ + i++, rgn = &memblock_type->regions[i]) + +#define memblock_dbg(fmt, ...) \ + do { \ + if (memblock_debug) \ + pr_info(fmt, ##__VA_ARGS__); \ + } while (0) + +static int memblock_debug __initdata_memblock; static bool system_has_some_mirror __initdata_memblock = false; static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock = 0; @@ -391,6 +410,8 @@ void __init memblock_discard(void) memblock.memory.max); __memblock_free_late(addr, size); } + + memblock_memory = NULL; } #endif @@ -941,42 +962,16 @@ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size) return memblock_setclr_flag(base, size, 0, MEMBLOCK_NOMAP); } -/** - * __next_reserved_mem_region - next function for for_each_reserved_region() - * @idx: pointer to u64 loop variable - * @out_start: ptr to phys_addr_t for start address of the region, can be %NULL - * @out_end: ptr to phys_addr_t for end address of the region, can be %NULL - * - * Iterate over all reserved memory regions. 
- */ -void __init_memblock __next_reserved_mem_region(u64 *idx, - phys_addr_t *out_start, - phys_addr_t *out_end) -{ - struct memblock_type *type = &memblock.reserved; - - if (*idx < type->cnt) { - struct memblock_region *r = &type->regions[*idx]; - phys_addr_t base = r->base; - phys_addr_t size = r->size; - - if (out_start) - *out_start = base; - if (out_end) - *out_end = base + size - 1; - - *idx += 1; - return; - } - - /* signal end of iteration */ - *idx = ULLONG_MAX; -} - -static bool should_skip_region(struct memblock_region *m, int nid, int flags) +static bool should_skip_region(struct memblock_type *type, + struct memblock_region *m, + int nid, int flags) { int m_nid = memblock_get_region_node(m); + /* we never skip regions when iterating memblock.reserved or physmem */ + if (type != memblock_memory) + return false; + /* only memory regions are associated with nodes, check it */ if (nid != NUMA_NO_NODE && nid != m_nid) return true; @@ -1041,7 +1036,7 @@ void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags, phys_addr_t m_end = m->base + m->size; int m_nid = memblock_get_region_node(m); - if (should_skip_region(m, nid, flags)) + if (should_skip_region(type_a, m, nid, flags)) continue; if (!type_b) { @@ -1145,7 +1140,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, phys_addr_t m_end = m->base + m->size; int m_nid = memblock_get_region_node(m); - if (should_skip_region(m, nid, flags)) + if (should_skip_region(type_a, m, nid, flags)) continue; if (!type_b) { @@ -1649,23 +1644,6 @@ phys_addr_t __init_memblock memblock_reserved_size(void) return memblock.reserved.total_size; } -phys_addr_t __init memblock_mem_size(unsigned long limit_pfn) -{ - unsigned long pages = 0; - struct memblock_region *r; - unsigned long start_pfn, end_pfn; - - for_each_memblock(memory, r) { - start_pfn = memblock_region_memory_base_pfn(r); - end_pfn = memblock_region_memory_end_pfn(r); - start_pfn = min_t(unsigned long, start_pfn, limit_pfn); - end_pfn = min_t(unsigned long, end_pfn, limit_pfn); - pages += end_pfn - start_pfn; - } - - return PFN_PHYS(pages); -} - /* lowest address */ phys_addr_t __init_memblock memblock_start_of_DRAM(void) { @@ -1689,7 +1667,7 @@ static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit) * the memory memblock regions, if the @limit exceeds the total size * of those regions, max_addr will keep original value PHYS_ADDR_MAX */ - for_each_memblock(memory, r) { + for_each_mem_region(r) { if (limit <= r->size) { max_addr = r->base + limit; break; @@ -1859,7 +1837,7 @@ void __init_memblock memblock_trim_memory(phys_addr_t align) phys_addr_t start, end, orig_start, orig_end; struct memblock_region *r; - for_each_memblock(memory, r) { + for_each_mem_region(r) { orig_start = r->base; orig_end = r->base + r->size; start = round_up(orig_start, align); @@ -1915,7 +1893,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) } } -void __init_memblock __memblock_dump_all(void) +static void __init_memblock __memblock_dump_all(void) { pr_info("MEMBLOCK configuration:\n"); pr_info(" memory size = %pa reserved size = %pa\n", @@ -1929,6 +1907,12 @@ void __init_memblock __memblock_dump_all(void) #endif } +void __init_memblock memblock_dump_all(void) +{ + if (memblock_debug) + __memblock_dump_all(); +} + void __init memblock_allow_resize(void) { memblock_can_resize = 1; @@ -1981,7 +1965,7 @@ static unsigned long __init free_low_memory_core_early(void) memblock_clear_hotplug(0, -1); - for_each_reserved_mem_region(i, &start, &end) + 
for_each_reserved_mem_range(i, &start, &end) reserve_bootmem_region(start, end); /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5c1983c84395..7f74a158cfa8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -197,14 +197,6 @@ static struct move_charge_struct { #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 -enum charge_type { - MEM_CGROUP_CHARGE_TYPE_CACHE = 0, - MEM_CGROUP_CHARGE_TYPE_ANON, - MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ - MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ - NR_CHARGE_TYPE, -}; - /* for encoding cft->private value on file */ enum res_type { _MEM, @@ -1102,9 +1094,9 @@ static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void) * invocations for reference counting, or use mem_cgroup_iter_break() * to cancel a hierarchy walk before the round-trip is complete. * - * Reclaimers can specify a node and a priority level in @reclaim to - * divide up the memcgs in the hierarchy among all concurrent - * reclaimers operating on the same node and priority. + * Reclaimers can specify a node in @reclaim to divide up the memcgs + * in the hierarchy among all concurrent reclaimers operating on the + * same node. */ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, @@ -1456,6 +1448,70 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) return false; } +struct memory_stat { + const char *name; + unsigned int ratio; + unsigned int idx; +}; + +static struct memory_stat memory_stats[] = { + { "anon", PAGE_SIZE, NR_ANON_MAPPED }, + { "file", PAGE_SIZE, NR_FILE_PAGES }, + { "kernel_stack", 1024, NR_KERNEL_STACK_KB }, + { "percpu", 1, MEMCG_PERCPU_B }, + { "sock", PAGE_SIZE, MEMCG_SOCK }, + { "shmem", PAGE_SIZE, NR_SHMEM }, + { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED }, + { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY }, + { "file_writeback", PAGE_SIZE, NR_WRITEBACK }, +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* + * The ratio will be initialized in memory_stats_init(). Because + * on some architectures, the macro of HPAGE_PMD_SIZE is not + * constant(e.g. powerpc). + */ + { "anon_thp", 0, NR_ANON_THPS }, +#endif + { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON }, + { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON }, + { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE }, + { "active_file", PAGE_SIZE, NR_ACTIVE_FILE }, + { "unevictable", PAGE_SIZE, NR_UNEVICTABLE }, + + /* + * Note: The slab_reclaimable and slab_unreclaimable must be + * together and slab_reclaimable must be in front. 
+ */ + { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B }, + { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B }, + + /* The memory events */ + { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON }, + { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE }, + { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON }, + { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE }, + { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON }, + { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE }, + { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM }, +}; + +static int __init memory_stats_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (memory_stats[i].idx == NR_ANON_THPS) + memory_stats[i].ratio = HPAGE_PMD_SIZE; +#endif + VM_BUG_ON(!memory_stats[i].ratio); + VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT); + } + + return 0; +} +pure_initcall(memory_stats_init); + static char *memory_stat_format(struct mem_cgroup *memcg) { struct seq_buf s; @@ -1476,52 +1532,19 @@ static char *memory_stat_format(struct mem_cgroup *memcg) * Current memory state: */ - seq_buf_printf(&s, "anon %llu\n", - (u64)memcg_page_state(memcg, NR_ANON_MAPPED) * - PAGE_SIZE); - seq_buf_printf(&s, "file %llu\n", - (u64)memcg_page_state(memcg, NR_FILE_PAGES) * - PAGE_SIZE); - seq_buf_printf(&s, "kernel_stack %llu\n", - (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) * - 1024); - seq_buf_printf(&s, "slab %llu\n", - (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) + - memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B))); - seq_buf_printf(&s, "percpu %llu\n", - (u64)memcg_page_state(memcg, MEMCG_PERCPU_B)); - seq_buf_printf(&s, "sock %llu\n", - (u64)memcg_page_state(memcg, MEMCG_SOCK) * - PAGE_SIZE); + for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { + u64 size; - seq_buf_printf(&s, "shmem %llu\n", - (u64)memcg_page_state(memcg, NR_SHMEM) * - PAGE_SIZE); - seq_buf_printf(&s, "file_mapped %llu\n", - (u64)memcg_page_state(memcg, NR_FILE_MAPPED) * - PAGE_SIZE); - seq_buf_printf(&s, "file_dirty %llu\n", - (u64)memcg_page_state(memcg, NR_FILE_DIRTY) * - PAGE_SIZE); - seq_buf_printf(&s, "file_writeback %llu\n", - (u64)memcg_page_state(memcg, NR_WRITEBACK) * - PAGE_SIZE); + size = memcg_page_state(memcg, memory_stats[i].idx); + size *= memory_stats[i].ratio; + seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - seq_buf_printf(&s, "anon_thp %llu\n", - (u64)memcg_page_state(memcg, NR_ANON_THPS) * - HPAGE_PMD_SIZE); -#endif - - for (i = 0; i < NR_LRU_LISTS; i++) - seq_buf_printf(&s, "%s %llu\n", lru_list_name(i), - (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * - PAGE_SIZE); - - seq_buf_printf(&s, "slab_reclaimable %llu\n", - (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B)); - seq_buf_printf(&s, "slab_unreclaimable %llu\n", - (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B)); + if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) { + size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) + + memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B); + seq_buf_printf(&s, "slab %llu\n", size); + } + } /* Accumulated memory events */ @@ -1529,22 +1552,6 @@ static char *memory_stat_format(struct mem_cgroup *memcg) memcg_events(memcg, PGFAULT)); seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT), memcg_events(memcg, PGMAJFAULT)); - - seq_buf_printf(&s, "workingset_refault_anon %lu\n", - memcg_page_state(memcg, WORKINGSET_REFAULT_ANON)); - seq_buf_printf(&s, "workingset_refault_file %lu\n", - 
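
With the memory_stats[] table above, memory_stat_format() (and memory_numa_stat_show() further down) iterate one {name, ratio, idx} triple per counter instead of open-coding a seq_buf_printf() call for each, so memory.stat and memory.numa_stat cannot drift apart. A plain-C sketch of the shape of that pattern, with made-up counter names and values, just to show how the table drives the output:

#include <stdio.h>

#define PAGE_SZ 4096UL

struct memory_stat {
	const char *name;
	unsigned long ratio;	/* converts the raw counter to bytes */
	unsigned int idx;	/* index into the counter array */
};

static const struct memory_stat stats[] = {
	{ "anon",         PAGE_SZ, 0 },
	{ "kernel_stack", 1024,    1 },	/* counter kept in KiB */
	{ "slab",         1,       2 },	/* counter already in bytes */
};

int main(void)
{
	unsigned long counters[] = { 123, 16, 8192 };	/* fake state */

	for (size_t i = 0; i < sizeof(stats) / sizeof(stats[0]); i++)
		printf("%s %lu\n", stats[i].name,
		       counters[stats[i].idx] * stats[i].ratio);
	return 0;
}
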
memcg_page_state(memcg, WORKINGSET_REFAULT_FILE)); - seq_buf_printf(&s, "workingset_activate_anon %lu\n", - memcg_page_state(memcg, WORKINGSET_ACTIVATE_ANON)); - seq_buf_printf(&s, "workingset_activate_file %lu\n", - memcg_page_state(memcg, WORKINGSET_ACTIVATE_FILE)); - seq_buf_printf(&s, "workingset_restore_anon %lu\n", - memcg_page_state(memcg, WORKINGSET_RESTORE_ANON)); - seq_buf_printf(&s, "workingset_restore_file %lu\n", - memcg_page_state(memcg, WORKINGSET_RESTORE_FILE)); - seq_buf_printf(&s, "workingset_nodereclaim %lu\n", - memcg_page_state(memcg, WORKINGSET_NODERECLAIM)); - seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL), memcg_events(memcg, PGREFILL)); seq_buf_printf(&s, "pgscan %lu\n", @@ -1641,17 +1648,19 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) */ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { - unsigned long max; + unsigned long max = READ_ONCE(memcg->memory.max); - max = READ_ONCE(memcg->memory.max); - if (mem_cgroup_swappiness(memcg)) { - unsigned long memsw_max; - unsigned long swap_max; + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) { + if (mem_cgroup_swappiness(memcg)) + max += min(READ_ONCE(memcg->swap.max), + (unsigned long)total_swap_pages); + } else { /* v1 */ + if (mem_cgroup_swappiness(memcg)) { + /* Calculate swap excess capacity from memsw limit */ + unsigned long swap = READ_ONCE(memcg->memsw.max) - max; - memsw_max = memcg->memsw.max; - swap_max = READ_ONCE(memcg->swap.max); - swap_max = min(swap_max, (unsigned long)total_swap_pages); - max = min(max + swap_max, memsw_max); + max += min(swap, (unsigned long)total_swap_pages); + } } return max; } @@ -1817,8 +1826,8 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) struct mem_cgroup *iter; /* - * When a new child is created while the hierarchy is under oom, - * mem_cgroup_oom_lock() may not be called. Watch for underflow. + * Be careful about under_oom underflows becase a child memcg + * could have been added after mem_cgroup_mark_under_oom. */ spin_lock(&memcg_oom_lock); for_each_mem_cgroup_tree(iter, memcg) @@ -2887,6 +2896,17 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p) page = virt_to_head_page(p); + /* + * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer + * or a pointer to obj_cgroup vector. In the latter case the lowest + * bit of the pointer is set. + * The page->mem_cgroup pointer can be asynchronously changed + * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed + * from a valid memcg pointer to objcg vector or back. + */ + if (!page->mem_cgroup) + return NULL; + /* * Slab objects are accounted individually, not per-page. * Memcg membership data for each individual object is saved in @@ -4255,17 +4275,16 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, new->size = size; /* Copy thresholds (if any) to new array */ - if (thresholds->primary) { - memcpy(new->entries, thresholds->primary->entries, (size - 1) * - sizeof(struct mem_cgroup_threshold)); - } + if (thresholds->primary) + memcpy(new->entries, thresholds->primary->entries, + flex_array_size(new, entries, size - 1)); /* Add new threshold */ new->entries[size - 1].eventfd = eventfd; new->entries[size - 1].threshold = threshold; /* Sort thresholds. 
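
The threshold-copy hunk above switches the memcpy() length to flex_array_size() from <linux/overflow.h>, which multiplies the element size of a flexible array member by a count and saturates to SIZE_MAX on overflow rather than wrapping. A minimal sketch with an illustrative structure (not the real thresholds type):

#include <linux/overflow.h>
#include <linux/string.h>

struct thr_array {			/* illustrative */
	unsigned int size;
	unsigned long entries[];	/* flexible array member */
};

static void copy_entries(struct thr_array *dst, const struct thr_array *src,
			 unsigned int count)
{
	memcpy(dst->entries, src->entries,
	       flex_array_size(dst, entries, count));
}
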
Registering of new threshold isn't time-critical */ - sort(new->entries, size, sizeof(struct mem_cgroup_threshold), + sort(new->entries, size, sizeof(*new->entries), compare_thresholds, NULL); /* Find current threshold */ @@ -5291,13 +5310,11 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) memcg->use_hierarchy = true; page_counter_init(&memcg->memory, &parent->memory); page_counter_init(&memcg->swap, &parent->swap); - page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); } else { page_counter_init(&memcg->memory, NULL); page_counter_init(&memcg->swap, NULL); - page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); page_counter_init(&memcg->tcpmem, NULL); /* @@ -5426,7 +5443,6 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); - page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); page_counter_set_min(&memcg->memory, 0); @@ -5500,7 +5516,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, struct page *page = NULL; swp_entry_t ent = pte_to_swp_entry(ptent); - if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent)) + if (!(mc.flags & MOVE_ANON)) return NULL; /* @@ -5519,6 +5535,9 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, return page; } + if (non_swap_entry(ent)) + return NULL; + /* * Because lookup_swap_cache() updates some statistics counter, * we call find_get_page() with swapper_space directly. @@ -5539,35 +5558,15 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, static struct page *mc_handle_file_pte(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, swp_entry_t *entry) { - struct page *page = NULL; - struct address_space *mapping; - pgoff_t pgoff; - if (!vma->vm_file) /* anonymous vma */ return NULL; if (!(mc.flags & MOVE_FILE)) return NULL; - mapping = vma->vm_file->f_mapping; - pgoff = linear_page_index(vma, addr); - /* page is moved even if it's not RSS of this task(page-faulted). */ -#ifdef CONFIG_SWAP /* shmem/tmpfs may report page out on swap: account for that too. 
*/ - if (shmem_mapping(mapping)) { - page = find_get_entry(mapping, pgoff); - if (xa_is_value(page)) { - swp_entry_t swp = radix_to_swp_entry(page); - *entry = swp; - page = find_get_page(swap_address_space(swp), - swp_offset(swp)); - } - } else - page = find_get_page(mapping, pgoff); -#else - page = find_get_page(mapping, pgoff); -#endif - return page; + return find_get_incore_page(vma->vm_file->f_mapping, + linear_page_index(vma, addr)); } /** @@ -6393,6 +6392,35 @@ static int memory_stat_show(struct seq_file *m, void *v) return 0; } +#ifdef CONFIG_NUMA +static int memory_numa_stat_show(struct seq_file *m, void *v) +{ + int i; + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + + for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { + int nid; + + if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS) + continue; + + seq_printf(m, "%s", memory_stats[i].name); + for_each_node_state(nid, N_MEMORY) { + u64 size; + struct lruvec *lruvec; + + lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); + size = lruvec_page_state(lruvec, memory_stats[i].idx); + size *= memory_stats[i].ratio; + seq_printf(m, " N%d=%llu", nid, size); + } + seq_putc(m, '\n'); + } + + return 0; +} +#endif + static int memory_oom_group_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(m); @@ -6470,6 +6498,12 @@ static struct cftype memory_files[] = { .name = "stat", .seq_show = memory_stat_show, }, +#ifdef CONFIG_NUMA + { + .name = "numa_stat", + .seq_show = memory_numa_stat_show, + }, +#endif { .name = "oom.group", .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a1e73943445e..990e3b2e37d5 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -484,11 +484,12 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, struct vm_area_struct *vma; struct task_struct *tsk; struct address_space *mapping = page->mapping; + pgoff_t pgoff; i_mmap_lock_read(mapping); read_lock(&tasklist_lock); + pgoff = page_to_pgoff(page); for_each_process(tsk) { - pgoff_t pgoff = page_to_pgoff(page); struct task_struct *t = task_early_kill(tsk, force_early); if (!t) @@ -824,7 +825,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) #define sc ((1UL << PG_swapcache) | (1UL << PG_swapbacked)) #define unevict (1UL << PG_unevictable) #define mlock (1UL << PG_mlocked) -#define writeback (1UL << PG_writeback) #define lru (1UL << PG_lru) #define head (1UL << PG_head) #define slab (1UL << PG_slab) @@ -873,7 +873,6 @@ static struct page_state { #undef sc #undef unevict #undef mlock -#undef writeback #undef lru #undef head #undef slab diff --git a/mm/memory.c b/mm/memory.c index 81d0414082f2..b51536c7b7fe 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -807,15 +807,14 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * lock. 
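
The mc_handle_file_pte() hunk above and the mincore_page() hunk later in this patch drop their open-coded CONFIG_SWAP special case in favour of find_get_incore_page(), a helper introduced elsewhere in this series that returns the in-memory page for (mapping, index) even when a shmem page currently sits in the swap cache; the caller still owns the returned reference. A hedged sketch of a caller:

static bool page_is_resident(struct address_space *mapping, pgoff_t index)
{
	struct page *page = find_get_incore_page(mapping, index);
	bool uptodate = false;

	if (page) {
		uptodate = PageUptodate(page);
		put_page(page);		/* drop the reference the helper took */
	}
	return uptodate;
}
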
*/ static inline int -copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pte_t *dst_pte, pte_t *src_pte, - struct vm_area_struct *vma, struct vm_area_struct *new, - unsigned long addr, int *rss, struct page **prealloc, - pte_t pte, struct page *page) +copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, + struct page **prealloc, pte_t pte, struct page *page) { + struct mm_struct *src_mm = src_vma->vm_mm; struct page *new_page; - if (!is_cow_mapping(vma->vm_flags)) + if (!is_cow_mapping(src_vma->vm_flags)) return 1; /* @@ -845,16 +844,16 @@ copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm, * over and copy the page & arm it. */ *prealloc = NULL; - copy_user_highpage(new_page, page, addr, vma); + copy_user_highpage(new_page, page, addr, src_vma); __SetPageUptodate(new_page); - page_add_new_anon_rmap(new_page, new, addr, false); - lru_cache_add_inactive_or_unevictable(new_page, new); + page_add_new_anon_rmap(new_page, dst_vma, addr, false); + lru_cache_add_inactive_or_unevictable(new_page, dst_vma); rss[mm_counter(new_page)]++; /* All done, just insert the new page copy in the child */ - pte = mk_pte(new_page, new->vm_page_prot); - pte = maybe_mkwrite(pte_mkdirty(pte), new); - set_pte_at(dst_mm, addr, dst_pte, pte); + pte = mk_pte(new_page, dst_vma->vm_page_prot); + pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); + set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); return 0; } @@ -863,24 +862,21 @@ copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm, * is required to copy this pte. */ static inline int -copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, - struct vm_area_struct *new, - unsigned long addr, int *rss, struct page **prealloc) +copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, + struct page **prealloc) { - unsigned long vm_flags = vma->vm_flags; + struct mm_struct *src_mm = src_vma->vm_mm; + unsigned long vm_flags = src_vma->vm_flags; pte_t pte = *src_pte; struct page *page; - page = vm_normal_page(vma, addr, pte); + page = vm_normal_page(src_vma, addr, pte); if (page) { int retval; - retval = copy_present_page(dst_mm, src_mm, - dst_pte, src_pte, - vma, new, - addr, rss, prealloc, - pte, page); + retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte, + addr, rss, prealloc, pte, page); if (retval <= 0) return retval; @@ -914,7 +910,7 @@ copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (!(vm_flags & VM_UFFD_WP)) pte = pte_clear_uffd_wp(pte); - set_pte_at(dst_mm, addr, dst_pte, pte); + set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); return 0; } @@ -937,11 +933,13 @@ page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma, return new_page; } -static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, - struct vm_area_struct *new, - unsigned long addr, unsigned long end) +static int +copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + unsigned long end) { + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = src_vma->vm_mm; pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; @@ -984,15 +982,15 @@ again: 
if (unlikely(!pte_present(*src_pte))) { entry.val = copy_nonpresent_pte(dst_mm, src_mm, dst_pte, src_pte, - vma, addr, rss); + src_vma, addr, rss); if (entry.val) break; progress += 8; continue; } /* copy_present_pte() will clear `*prealloc' if consumed */ - ret = copy_present_pte(dst_mm, src_mm, dst_pte, src_pte, - vma, new, addr, rss, &prealloc); + ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, + addr, rss, &prealloc); /* * If we need a pre-allocated page for this pte, drop the * locks, allocate, and try again. @@ -1027,7 +1025,7 @@ again: entry.val = 0; } else if (ret) { WARN_ON_ONCE(ret != -EAGAIN); - prealloc = page_copy_prealloc(src_mm, vma, addr); + prealloc = page_copy_prealloc(src_mm, src_vma, addr); if (!prealloc) return -ENOMEM; /* We've captured and resolved the error. Reset, try again. */ @@ -1041,11 +1039,13 @@ out: return ret; } -static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, - struct vm_area_struct *new, - unsigned long addr, unsigned long end) +static inline int +copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pud_t *dst_pud, pud_t *src_pud, unsigned long addr, + unsigned long end) { + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = src_vma->vm_mm; pmd_t *src_pmd, *dst_pmd; unsigned long next; @@ -1058,9 +1058,9 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { int err; - VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma); + VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma); err = copy_huge_pmd(dst_mm, src_mm, - dst_pmd, src_pmd, addr, vma); + dst_pmd, src_pmd, addr, src_vma); if (err == -ENOMEM) return -ENOMEM; if (!err) @@ -1069,18 +1069,20 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src } if (pmd_none_or_clear_bad(src_pmd)) continue; - if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, - vma, new, addr, next)) + if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd, + addr, next)) return -ENOMEM; } while (dst_pmd++, src_pmd++, addr = next, addr != end); return 0; } -static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma, - struct vm_area_struct *new, - unsigned long addr, unsigned long end) +static inline int +copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr, + unsigned long end) { + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = src_vma->vm_mm; pud_t *src_pud, *dst_pud; unsigned long next; @@ -1093,9 +1095,9 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) { int err; - VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma); + VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma); err = copy_huge_pud(dst_mm, src_mm, - dst_pud, src_pud, addr, vma); + dst_pud, src_pud, addr, src_vma); if (err == -ENOMEM) return -ENOMEM; if (!err) @@ -1104,18 +1106,19 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src } if (pud_none_or_clear_bad(src_pud)) continue; - if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, - vma, new, addr, next)) + if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud, + addr, next)) return -ENOMEM; } while (dst_pud++, src_pud++, addr = 
next, addr != end); return 0; } -static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, - struct vm_area_struct *new, - unsigned long addr, unsigned long end) +static inline int +copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr, + unsigned long end) { + struct mm_struct *dst_mm = dst_vma->vm_mm; p4d_t *src_p4d, *dst_p4d; unsigned long next; @@ -1127,20 +1130,22 @@ static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(src_p4d)) continue; - if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d, - vma, new, addr, next)) + if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d, + addr, next)) return -ENOMEM; } while (dst_p4d++, src_p4d++, addr = next, addr != end); return 0; } -int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - struct vm_area_struct *vma, struct vm_area_struct *new) +int +copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { pgd_t *src_pgd, *dst_pgd; unsigned long next; - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; + unsigned long addr = src_vma->vm_start; + unsigned long end = src_vma->vm_end; + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = src_vma->vm_mm; struct mmu_notifier_range range; bool is_cow; int ret; @@ -1151,19 +1156,19 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. */ - if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) && - !vma->anon_vma) + if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) && + !src_vma->anon_vma) return 0; - if (is_vm_hugetlb_page(vma)) - return copy_hugetlb_page_range(dst_mm, src_mm, vma); + if (is_vm_hugetlb_page(src_vma)) + return copy_hugetlb_page_range(dst_mm, src_mm, src_vma); - if (unlikely(vma->vm_flags & VM_PFNMAP)) { + if (unlikely(src_vma->vm_flags & VM_PFNMAP)) { /* * We do not free on error cases below as remove_vma * gets called on error from higher level routine */ - ret = track_pfn_copy(vma); + ret = track_pfn_copy(src_vma); if (ret) return ret; } @@ -1174,11 +1179,11 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * parent mm. And a permission downgrade will only happen if * is_cow_mapping() returns true. 
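
The copy_*_range() rework running through these hunks threads the two vm_area_structs down the page-table walk instead of both mm_structs plus a separate src/dst vma pair: each level recovers the mm it needs via vma->vm_mm, and the leaf code still gets the destination vma it needs for rmap/LRU insertion, just without the extra "new" parameter. An illustrative wrapper (not a quoted hunk) showing the call-shape change:

static int copy_vma_pages(struct vm_area_struct *dst_vma,
			  struct vm_area_struct *src_vma)
{
	/*
	 * Previously: copy_page_range(dst_vma->vm_mm, src_vma->vm_mm,
	 *                             src_vma, dst_vma);
	 */
	return copy_page_range(dst_vma, src_vma);
}
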
*/ - is_cow = is_cow_mapping(vma->vm_flags); + is_cow = is_cow_mapping(src_vma->vm_flags); if (is_cow) { mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, - 0, vma, src_mm, addr, end); + 0, src_vma, src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); } @@ -1189,8 +1194,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(src_pgd)) continue; - if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd, - vma, new, addr, next))) { + if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd, + addr, next))) { ret = -ENOMEM; break; } @@ -3602,7 +3607,7 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) * unlock_page(A) * lock_page(B) * lock_page(B) - * pte_alloc_pne + * pte_alloc_one * shrink_page_list * wait_on_page_writeback(A) * SetPageWriteback(B) @@ -3610,7 +3615,7 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) * # flush A, B to clear the writeback */ if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) { - vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); + vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; smp_wmb(); /* See comment in __pte_alloc() */ @@ -3777,7 +3782,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) /** * alloc_set_pte - setup new PTE entry for given page and add reverse page - * mapping. If needed, the fucntion allocates page table or use pre-allocated. + * mapping. If needed, the function allocates page table or use pre-allocated. * * @vmf: fault environment * @page: page to map diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ce3e73e3a5c1..8e9e2d44cdad 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -353,11 +353,19 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, #ifdef CONFIG_NUMA int __weak memory_add_physaddr_to_nid(u64 start) { - pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n", + pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n", start); return 0; } EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); + +int __weak phys_to_target_node(u64 start) +{ + pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n", + start); + return 0; +} +EXPORT_SYMBOL_GPL(phys_to_target_node); #endif /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 133ec4a89faf..eb2998e1a516 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -876,13 +876,12 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, goto out; } - task_lock(current); ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { - task_unlock(current); mpol_put(new); goto out; } + task_lock(current); old = current->mempolicy; current->mempolicy = new; if (new && new->mode == MPOL_INTERLEAVE) @@ -1325,9 +1324,7 @@ static long do_mbind(unsigned long start, unsigned long len, NODEMASK_SCRATCH(scratch); if (scratch) { mmap_write_lock(mm); - task_lock(current); err = mpol_set_nodemask(new, nmask, scratch); - task_unlock(current); if (err) mmap_write_unlock(mm); } else @@ -1886,8 +1883,7 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) } /* Return the node id preferred by the given mempolicy, or the given id */ -static int policy_node(gfp_t gfp, struct mempolicy *policy, - int nd) +static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) { if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL)) nd = 
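
phys_to_target_node() above follows the same convention as memory_add_physaddr_to_nid(): the generic file supplies a __weak default that logs once and falls back to node 0, and an architecture overrides it simply by providing a non-weak definition of the same symbol in its own object file. A minimal sketch of that convention with an illustrative hook name:

/* generic code: used only when no architecture defines the symbol itself */
int __weak my_phys_to_node(u64 start)
{
	pr_info_once("Unknown node for memory at 0x%llx, assuming node 0\n",
		     start);
	return 0;
}
EXPORT_SYMBOL_GPL(my_phys_to_node);
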
policy->v.preferred_node; diff --git a/mm/mempool.c b/mm/mempool.c index 79bff63ecf27..f473cdddaff0 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -58,11 +58,10 @@ static void __check_element(mempool_t *pool, void *element, size_t size) static void check_element(mempool_t *pool, void *element) { /* Mempools backed by slab allocator */ - if (pool->free == mempool_free_slab || pool->free == mempool_kfree) + if (pool->free == mempool_free_slab || pool->free == mempool_kfree) { __check_element(pool, element, ksize(element)); - - /* Mempools backed by page allocator */ - if (pool->free == mempool_free_pages) { + } else if (pool->free == mempool_free_pages) { + /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; void *addr = kmap_atomic((struct page *)element); @@ -82,11 +81,10 @@ static void __poison_element(void *element, size_t size) static void poison_element(mempool_t *pool, void *element) { /* Mempools backed by slab allocator */ - if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) + if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) { __poison_element(element, ksize(element)); - - /* Mempools backed by page allocator */ - if (pool->alloc == mempool_alloc_pages) { + } else if (pool->alloc == mempool_alloc_pages) { + /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; void *addr = kmap_atomic((struct page *)element); @@ -107,7 +105,7 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) kasan_poison_kfree(element, _RET_IP_); - if (pool->alloc == mempool_alloc_pages) + else if (pool->alloc == mempool_alloc_pages) kasan_free_pages(element, (unsigned long)pool->pool_data); } @@ -115,7 +113,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) kasan_unpoison_slab(element); - if (pool->alloc == mempool_alloc_pages) + else if (pool->alloc == mempool_alloc_pages) kasan_alloc_pages(element, (unsigned long)pool->pool_data); } diff --git a/mm/memremap.c b/mm/memremap.c index 006dace60b1a..198083453182 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -40,12 +40,10 @@ EXPORT_SYMBOL_GPL(memremap_compat_align); #ifdef CONFIG_DEV_PAGEMAP_OPS DEFINE_STATIC_KEY_FALSE(devmap_managed_key); EXPORT_SYMBOL(devmap_managed_key); -static atomic_t devmap_managed_enable; static void devmap_managed_enable_put(void) { - if (atomic_dec_and_test(&devmap_managed_enable)) - static_branch_disable(&devmap_managed_key); + static_branch_dec(&devmap_managed_key); } static int devmap_managed_enable_get(struct dev_pagemap *pgmap) @@ -56,8 +54,7 @@ static int devmap_managed_enable_get(struct dev_pagemap *pgmap) return -EINVAL; } - if (atomic_inc_return(&devmap_managed_enable) == 1) - static_branch_enable(&devmap_managed_key); + static_branch_inc(&devmap_managed_key); return 0; } #else @@ -70,24 +67,28 @@ static void devmap_managed_enable_put(void) } #endif /* CONFIG_DEV_PAGEMAP_OPS */ -static void pgmap_array_delete(struct resource *res) +static void pgmap_array_delete(struct range *range) { - xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end), + xa_store_range(&pgmap_array, PHYS_PFN(range->start), PHYS_PFN(range->end), NULL, GFP_KERNEL); synchronize_rcu(); } -static unsigned long pfn_first(struct dev_pagemap *pgmap) +static unsigned long pfn_first(struct dev_pagemap *pgmap, int range_id) { - return 
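
The devmap_managed hunks above drop the hand-rolled atomic_t counter that wrapped static_branch_enable()/static_branch_disable(): static_branch_inc() and static_branch_dec() already act as a reference-counted enable, flipping the branch only on the 0->1 and 1->0 transitions. A small sketch of that pattern with an illustrative key:

#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(my_feature_key);	/* illustrative */

static void my_feature_get(void)
{
	static_branch_inc(&my_feature_key);	/* enables the key on 0 -> 1 */
}

static void my_feature_put(void)
{
	static_branch_dec(&my_feature_key);	/* disables the key on 1 -> 0 */
}

static bool my_feature_enabled(void)
{
	return static_branch_unlikely(&my_feature_key);
}
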
PHYS_PFN(pgmap->res.start) + - vmem_altmap_offset(pgmap_altmap(pgmap)); + struct range *range = &pgmap->ranges[range_id]; + unsigned long pfn = PHYS_PFN(range->start); + + if (range_id) + return pfn; + return pfn + vmem_altmap_offset(pgmap_altmap(pgmap)); } -static unsigned long pfn_end(struct dev_pagemap *pgmap) +static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id) { - const struct resource *res = &pgmap->res; + const struct range *range = &pgmap->ranges[range_id]; - return (res->start + resource_size(res)) >> PAGE_SHIFT; + return (range->start + range_len(range)) >> PAGE_SHIFT; } static unsigned long pfn_next(unsigned long pfn) @@ -97,8 +98,8 @@ static unsigned long pfn_next(unsigned long pfn) return pfn + 1; } -#define for_each_device_pfn(pfn, map) \ - for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn)) +#define for_each_device_pfn(pfn, map, i) \ + for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn)) static void dev_pagemap_kill(struct dev_pagemap *pgmap) { @@ -124,39 +125,49 @@ static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) pgmap->ref = NULL; } -void memunmap_pages(struct dev_pagemap *pgmap) +static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) { - struct resource *res = &pgmap->res; + struct range *range = &pgmap->ranges[range_id]; struct page *first_page; - unsigned long pfn; int nid; - dev_pagemap_kill(pgmap); - for_each_device_pfn(pfn, pgmap) - put_page(pfn_to_page(pfn)); - dev_pagemap_cleanup(pgmap); - /* make sure to access a memmap that was actually initialized */ - first_page = pfn_to_page(pfn_first(pgmap)); + first_page = pfn_to_page(pfn_first(pgmap, range_id)); /* pages are dead and unused, undo the arch mapping */ nid = page_to_nid(first_page); mem_hotplug_begin(); - remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(res->start), - PHYS_PFN(resource_size(res))); + remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(range->start), + PHYS_PFN(range_len(range))); if (pgmap->type == MEMORY_DEVICE_PRIVATE) { - __remove_pages(PHYS_PFN(res->start), - PHYS_PFN(resource_size(res)), NULL); + __remove_pages(PHYS_PFN(range->start), + PHYS_PFN(range_len(range)), NULL); } else { - arch_remove_memory(nid, res->start, resource_size(res), + arch_remove_memory(nid, range->start, range_len(range), pgmap_altmap(pgmap)); - kasan_remove_zero_shadow(__va(res->start), resource_size(res)); + kasan_remove_zero_shadow(__va(range->start), range_len(range)); } mem_hotplug_done(); - untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res)); - pgmap_array_delete(res); + untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range)); + pgmap_array_delete(range); +} + +void memunmap_pages(struct dev_pagemap *pgmap) +{ + unsigned long pfn; + int i; + + dev_pagemap_kill(pgmap); + for (i = 0; i < pgmap->nr_range; i++) + for_each_device_pfn(pfn, pgmap, i) + put_page(pfn_to_page(pfn)); + dev_pagemap_cleanup(pgmap); + + for (i = 0; i < pgmap->nr_range; i++) + pageunmap_range(pgmap, i); + WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n"); devmap_managed_enable_put(); } @@ -175,6 +186,114 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref) complete(&pgmap->done); } +static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, + int range_id, int nid) +{ + struct range *range = &pgmap->ranges[range_id]; + struct dev_pagemap *conflict_pgmap; + int error, is_ram; + + if (WARN_ONCE(pgmap_altmap(pgmap) && range_id > 0, + "altmap not supported for multiple ranges\n")) + 
return -EINVAL; + + conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->start), NULL); + if (conflict_pgmap) { + WARN(1, "Conflicting mapping in same section\n"); + put_dev_pagemap(conflict_pgmap); + return -ENOMEM; + } + + conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->end), NULL); + if (conflict_pgmap) { + WARN(1, "Conflicting mapping in same section\n"); + put_dev_pagemap(conflict_pgmap); + return -ENOMEM; + } + + is_ram = region_intersects(range->start, range_len(range), + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); + + if (is_ram != REGION_DISJOINT) { + WARN_ONCE(1, "attempted on %s region %#llx-%#llx\n", + is_ram == REGION_MIXED ? "mixed" : "ram", + range->start, range->end); + return -ENXIO; + } + + error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(range->start), + PHYS_PFN(range->end), pgmap, GFP_KERNEL)); + if (error) + return error; + + if (nid < 0) + nid = numa_mem_id(); + + error = track_pfn_remap(NULL, ¶ms->pgprot, PHYS_PFN(range->start), 0, + range_len(range)); + if (error) + goto err_pfn_remap; + + mem_hotplug_begin(); + + /* + * For device private memory we call add_pages() as we only need to + * allocate and initialize struct page for the device memory. More- + * over the device memory is un-accessible thus we do not want to + * create a linear mapping for the memory like arch_add_memory() + * would do. + * + * For all other device memory types, which are accessible by + * the CPU, we do want the linear mapping and thus use + * arch_add_memory(). + */ + if (pgmap->type == MEMORY_DEVICE_PRIVATE) { + error = add_pages(nid, PHYS_PFN(range->start), + PHYS_PFN(range_len(range)), params); + } else { + error = kasan_add_zero_shadow(__va(range->start), range_len(range)); + if (error) { + mem_hotplug_done(); + goto err_kasan; + } + + error = arch_add_memory(nid, range->start, range_len(range), + params); + } + + if (!error) { + struct zone *zone; + + zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; + move_pfn_range_to_zone(zone, PHYS_PFN(range->start), + PHYS_PFN(range_len(range)), params->altmap); + } + + mem_hotplug_done(); + if (error) + goto err_add_memory; + + /* + * Initialization of the pages has been deferred until now in order + * to allow us to do the work while not holding the hotplug lock. + */ + memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], + PHYS_PFN(range->start), + PHYS_PFN(range_len(range)), pgmap); + percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id) + - pfn_first(pgmap, range_id)); + return 0; + +err_add_memory: + kasan_remove_zero_shadow(__va(range->start), range_len(range)); +err_kasan: + untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range)); +err_pfn_remap: + pgmap_array_delete(range); + return error; +} + + /* * Not device managed version of dev_memremap_pages, undone by * memunmap_pages(). 
Please use dev_memremap_pages if you have a struct @@ -182,17 +301,16 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref) */ void *memremap_pages(struct dev_pagemap *pgmap, int nid) { - struct resource *res = &pgmap->res; - struct dev_pagemap *conflict_pgmap; struct mhp_params params = { - /* - * We do not want any optional features only our own memmap - */ .altmap = pgmap_altmap(pgmap), .pgprot = PAGE_KERNEL, }; - int error, is_ram; + const int nr_range = pgmap->nr_range; bool need_devmap_managed = true; + int error, i; + + if (WARN_ONCE(!nr_range, "nr_range must be specified\n")) + return ERR_PTR(-EINVAL); switch (pgmap->type) { case MEMORY_DEVICE_PRIVATE: @@ -251,105 +369,27 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) return ERR_PTR(error); } - conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->start), NULL); - if (conflict_pgmap) { - WARN(1, "Conflicting mapping in same section\n"); - put_dev_pagemap(conflict_pgmap); - error = -ENOMEM; - goto err_array; - } - - conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->end), NULL); - if (conflict_pgmap) { - WARN(1, "Conflicting mapping in same section\n"); - put_dev_pagemap(conflict_pgmap); - error = -ENOMEM; - goto err_array; - } - - is_ram = region_intersects(res->start, resource_size(res), - IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); - - if (is_ram != REGION_DISJOINT) { - WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__, - is_ram == REGION_MIXED ? "mixed" : "ram", res); - error = -ENXIO; - goto err_array; - } - - error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start), - PHYS_PFN(res->end), pgmap, GFP_KERNEL)); - if (error) - goto err_array; - - if (nid < 0) - nid = numa_mem_id(); - - error = track_pfn_remap(NULL, ¶ms.pgprot, PHYS_PFN(res->start), - 0, resource_size(res)); - if (error) - goto err_pfn_remap; - - mem_hotplug_begin(); - /* - * For device private memory we call add_pages() as we only need to - * allocate and initialize struct page for the device memory. More- - * over the device memory is un-accessible thus we do not want to - * create a linear mapping for the memory like arch_add_memory() - * would do. - * - * For all other device memory types, which are accessible by - * the CPU, we do want the linear mapping and thus use - * arch_add_memory(). + * Clear the pgmap nr_range as it will be incremented for each + * successfully processed range. This communicates how many + * regions to unwind in the abort case. */ - if (pgmap->type == MEMORY_DEVICE_PRIVATE) { - error = add_pages(nid, PHYS_PFN(res->start), - PHYS_PFN(resource_size(res)), ¶ms); - } else { - error = kasan_add_zero_shadow(__va(res->start), resource_size(res)); - if (error) { - mem_hotplug_done(); - goto err_kasan; - } - - error = arch_add_memory(nid, res->start, resource_size(res), - ¶ms); + pgmap->nr_range = 0; + error = 0; + for (i = 0; i < nr_range; i++) { + error = pagemap_range(pgmap, ¶ms, i, nid); + if (error) + break; + pgmap->nr_range++; } - if (!error) { - struct zone *zone; - - zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; - move_pfn_range_to_zone(zone, PHYS_PFN(res->start), - PHYS_PFN(resource_size(res)), params.altmap); + if (i < nr_range) { + memunmap_pages(pgmap); + pgmap->nr_range = nr_range; + return ERR_PTR(error); } - mem_hotplug_done(); - if (error) - goto err_add_memory; - - /* - * Initialization of the pages has been deferred until now in order - * to allow us to do the work while not holding the hotplug lock. 
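
memremap_pages() above now clears pgmap->nr_range, bumps it once per successfully mapped range, and on failure unwinds exactly the ranges that made it, reusing memunmap_pages() for the cleanup. A plain-C sketch of that control flow; setup_one(), teardown_one() and the struct are placeholders, not kernel API:

#include <stdio.h>

struct mapping { int nr_done; };		/* how many ranges are live */

static int setup_one(struct mapping *m, int i)
{
	printf("setup range %d\n", i);
	return i == 2 ? -1 : 0;			/* fail on the third range, for the demo */
}

static void teardown_one(struct mapping *m, int i)
{
	printf("teardown range %d\n", i);
}

static int setup_all(struct mapping *m, int nr)
{
	int i, err = 0;

	m->nr_done = 0;
	for (i = 0; i < nr; i++) {
		err = setup_one(m, i);
		if (err)
			break;
		m->nr_done++;
	}
	if (err) {				/* undo only the ranges that succeeded */
		for (i = 0; i < m->nr_done; i++)
			teardown_one(m, i);
		m->nr_done = 0;
	}
	return err;
}

int main(void)
{
	struct mapping m;

	return setup_all(&m, 4) ? 1 : 0;
}
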
- */ - memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], - PHYS_PFN(res->start), - PHYS_PFN(resource_size(res)), pgmap); - percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap)); - return __va(res->start); - - err_add_memory: - kasan_remove_zero_shadow(__va(res->start), resource_size(res)); - err_kasan: - untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res)); - err_pfn_remap: - pgmap_array_delete(res); - err_array: - dev_pagemap_kill(pgmap); - dev_pagemap_cleanup(pgmap); - devmap_managed_enable_put(); - return ERR_PTR(error); + return __va(pgmap->ranges[0].start); } EXPORT_SYMBOL_GPL(memremap_pages); @@ -369,7 +409,7 @@ EXPORT_SYMBOL_GPL(memremap_pages); * 'live' on entry and will be killed and reaped at * devm_memremap_pages_release() time, or if this routine fails. * - * 4/ res is expected to be a host memory range that could feasibly be + * 4/ range is expected to be a host memory range that could feasibly be * treated as a "System RAM" range, i.e. not a device mmio range, but * this is not enforced. */ @@ -426,7 +466,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, * In the cached case we're already holding a live reference. */ if (pgmap) { - if (phys >= pgmap->res.start && phys <= pgmap->res.end) + if (phys >= pgmap->range.start && phys <= pgmap->range.end) return pgmap; put_dev_pagemap(pgmap); } @@ -451,8 +491,6 @@ void free_devmap_managed_page(struct page *page) return; } - /* Clear Active bit in case of parallel mark_page_accessed */ - __ClearPageActive(page); __ClearPageWaiters(page); mem_cgroup_uncharge(page); diff --git a/mm/migrate.c b/mm/migrate.c index 4de11dfd730b..f94d7c7eeddf 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -381,7 +381,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page) int expected_count = 1; /* - * Device public or private pages have an extra refcount as they are + * Device private pages have an extra refcount as they are * ZONE_DEVICE pages. */ expected_count += is_device_private_page(page); @@ -3077,7 +3077,6 @@ void migrate_vma_finalize(struct migrate_vma *migrate) remove_migration_ptes(page, newpage, false); unlock_page(page); - migrate->cpages--; if (is_zone_device_page(page)) put_page(page); diff --git a/mm/mincore.c b/mm/mincore.c index 453ff112470f..02db1a834021 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -48,7 +48,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, * and is up to date; i.e. that no page-in operation would be required * at this time if an application were to map and access this page. */ -static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) +static unsigned char mincore_page(struct address_space *mapping, pgoff_t index) { unsigned char present = 0; struct page *page; @@ -59,31 +59,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. */ -#ifdef CONFIG_SWAP - if (shmem_mapping(mapping)) { - page = find_get_entry(mapping, pgoff); - /* - * shmem/tmpfs may return swap: account for swapcache - * page too. 
- */ - if (xa_is_value(page)) { - swp_entry_t swp = radix_to_swp_entry(page); - struct swap_info_struct *si; - - /* Prevent swap device to being swapoff under us */ - si = get_swap_device(swp); - if (si) { - page = find_get_page(swap_address_space(swp), - swp_offset(swp)); - put_swap_device(si); - } else - page = NULL; - } - } else - page = find_get_page(mapping, pgoff); -#else - page = find_get_page(mapping, pgoff); -#endif + page = find_get_incore_page(mapping, index); if (page) { present = PageUptodate(page); put_page(page); diff --git a/mm/mmap.c b/mm/mmap.c index 4473e08157a0..29b9c84e9a5f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -143,7 +143,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct file *file, struct address_space *mapping) { if (vma->vm_flags & VM_DENYWRITE) - atomic_inc(&file_inode(file)->i_writecount); + allow_write_access(file); if (vma->vm_flags & VM_SHARED) mapping_unmap_writable(mapping); @@ -474,8 +474,12 @@ static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma, { /* * All rb_subtree_gap values must be consistent prior to erase, - * with the possible exception of the "next" vma being erased if - * next->vm_start was reduced. + * with the possible exception of + * + * a. the "next" vma being erased if next->vm_start was reduced in + * __vma_adjust() -> __vma_unlink() + * b. the vma being erased in detach_vmas_to_be_unmapped() -> + * vma_rb_erase() */ validate_mm_rb(root, ignore); @@ -485,13 +489,7 @@ static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma, static __always_inline void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) { - /* - * All rb_subtree_gap values must be consistent prior to erase, - * with the possible exception of the vma being erased. - */ - validate_mm_rb(root, vma); - - __vma_rb_erase(vma, root); + vma_rb_erase_ignore(vma, root, vma); } /* @@ -623,7 +621,7 @@ static void __vma_link_file(struct vm_area_struct *vma) if (vma->vm_flags & VM_DENYWRITE) atomic_dec(&file_inode(file)->i_writecount); if (vma->vm_flags & VM_SHARED) - atomic_inc(&mapping->i_mmap_writable); + mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); vma_interval_tree_insert(vma, &mapping->i_mmap); @@ -677,7 +675,7 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) mm->map_count++; } -static __always_inline void __vma_unlink_common(struct mm_struct *mm, +static __always_inline void __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *ignore) { @@ -760,7 +758,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * vma expands, overlapping part of the next: * mprotect case 5 shifting the boundary up. */ - adjust_next = (end - next->vm_start) >> PAGE_SHIFT; + adjust_next = (end - next->vm_start); exporter = next; importer = vma; VM_WARN_ON(expand != importer); @@ -770,7 +768,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * split_vma inserting another: so it must be * mprotect case 4 shifting the boundary down. 
*/ - adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT); + adjust_next = -(vma->vm_end - end); exporter = vma; importer = next; VM_WARN_ON(expand != importer); @@ -825,7 +823,7 @@ again: anon_vma_interval_tree_pre_update_vma(next); } - if (root) { + if (file) { flush_dcache_mmap_lock(mapping); vma_interval_tree_remove(vma, root); if (adjust_next) @@ -842,11 +840,11 @@ again: } vma->vm_pgoff = pgoff; if (adjust_next) { - next->vm_start += adjust_next << PAGE_SHIFT; - next->vm_pgoff += adjust_next; + next->vm_start += adjust_next; + next->vm_pgoff += adjust_next >> PAGE_SHIFT; } - if (root) { + if (file) { if (adjust_next) vma_interval_tree_insert(next, root); vma_interval_tree_insert(vma, root); @@ -859,7 +857,7 @@ again: * us to remove next before dropping the locks. */ if (remove_next != 3) - __vma_unlink_common(mm, next, next); + __vma_unlink(mm, next, next); else /* * vma is not before next if they've been @@ -870,7 +868,7 @@ again: * "next" (which is stored in post-swap() * "vma"). */ - __vma_unlink_common(mm, next, vma); + __vma_unlink(mm, next, vma); if (file) __remove_shared_vm_struct(next, file, mapping); } else if (insert) { @@ -897,10 +895,9 @@ again: anon_vma_interval_tree_post_update_vma(next); anon_vma_unlock_write(anon_vma); } - if (mapping) - i_mmap_unlock_write(mapping); - if (root) { + if (file) { + i_mmap_unlock_write(mapping); uprobe_mmap(vma); if (adjust_next) @@ -3246,7 +3243,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) * By setting it to reflect the virtual start address of the * vma, merges and splits can happen in a seamless way, just * using the existing file pgoff checks and manipulations. - * Similarly in do_mmap and in do_brk. + * Similarly in do_mmap and in do_brk_flags. */ if (vma_is_anonymous(vma)) { BUG_ON(vma->anon_vma); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e90f25d6385d..8b84661a6410 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -64,6 +64,8 @@ int sysctl_oom_dump_tasks = 1; * and mark_oom_victim */ DEFINE_MUTEX(oom_lock); +/* Serializes oom_score_adj and oom_score_adj_min updates */ +DEFINE_MUTEX(oom_adj_mutex); static inline bool is_memcg_oom(struct oom_control *oc) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9e59749a8803..84148aecbef1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -156,16 +156,16 @@ static int __init early_init_on_alloc(char *buf) int ret; bool bool_result; - if (!buf) - return -EINVAL; ret = kstrtobool(buf, &bool_result); + if (ret) + return ret; if (bool_result && page_poisoning_enabled()) pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n"); if (bool_result) static_branch_enable(&init_on_alloc); else static_branch_disable(&init_on_alloc); - return ret; + return 0; } early_param("init_on_alloc", early_init_on_alloc); @@ -174,16 +174,16 @@ static int __init early_init_on_free(char *buf) int ret; bool bool_result; - if (!buf) - return -EINVAL; ret = kstrtobool(buf, &bool_result); + if (ret) + return ret; if (bool_result && page_poisoning_enabled()) pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n"); if (bool_result) static_branch_enable(&init_on_free); else static_branch_disable(&init_on_free); - return ret; + return 0; } early_param("init_on_free", early_init_on_free); @@ -3753,8 +3753,8 @@ retry: */ no_fallback = alloc_flags & ALLOC_NOFRAGMENT; z = ac->preferred_zoneref; - for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, - ac->highest_zoneidx, ac->nodemask) { + 
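
The init_on_alloc/init_on_free hunks above lean on the fact that kstrtobool() already rejects a NULL or unparseable string, so the handlers can simply propagate its error and only touch the static branches on success, instead of pre-checking buf and returning a stale ret. Sketch of the resulting early_param shape with an illustrative flag:

static bool my_flag;				/* illustrative */

static int __init early_my_flag(char *buf)
{
	bool val;
	int ret = kstrtobool(buf, &val);	/* handles NULL and bad input */

	if (ret)
		return ret;
	my_flag = val;
	return 0;
}
early_param("my_flag", early_my_flag);
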
for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx, + ac->nodemask) { struct page *page; unsigned long mark; @@ -3998,8 +3998,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, * success so it is time to admit defeat. We will skip the OOM killer * because it is very likely that the caller has a more reasonable * fallback than shooting a random task. + * + * The OOM killer may not free memory on a specific node. */ - if (gfp_mask & __GFP_RETRY_MAYFAIL) + if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE)) goto out; /* The OOM killer does not needlessly kill tasks for lowmem */ if (ac->highest_zoneidx < ZONE_NORMAL) @@ -4016,10 +4018,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, * failures more gracefully we should just bail out here. */ - /* The OOM killer may not free memory on a specific node */ - if (gfp_mask & __GFP_THISNODE) - goto out; - /* Exhausted what can be done so it's blame time */ if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { *did_some_progress = 1; @@ -4267,13 +4265,12 @@ EXPORT_SYMBOL_GPL(fs_reclaim_release); #endif /* Perform direct synchronous page reclaim */ -static int +static unsigned long __perform_reclaim(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac) { - int progress; unsigned int noreclaim_flag; - unsigned long pflags; + unsigned long pflags, progress; cond_resched(); @@ -4852,12 +4849,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags); - return true; -} - -/* Determine whether to spread dirty pages and what the first usable zone */ -static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) -{ /* Dirty zone balancing only done in the fast path */ ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); @@ -4868,6 +4859,8 @@ static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) */ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->highest_zoneidx, ac->nodemask); + + return true; } /* @@ -4896,8 +4889,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) return NULL; - finalise_ac(gfp_mask, &ac); - /* * Forbid the first pass from falling back to types that fragment * memory until all local zones are considered. 
@@ -4973,6 +4964,9 @@ void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) free_the_page(page, order); + else if (!PageHead(page)) + while (order-- > 0) + free_the_page(page + (1 << order), order); } EXPORT_SYMBOL(__free_pages); @@ -5663,7 +5657,6 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) int n, val; int min_val = INT_MAX; int best_node = NUMA_NO_NODE; - const struct cpumask *tmp = cpumask_of_node(0); /* Use the local node if we haven't already */ if (!node_isset(node, *used_node_mask)) { @@ -5684,8 +5677,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) val += (n < node); /* Give preference to headless and unused nodes */ - tmp = cpumask_of_node(n); - if (!cpumask_empty(tmp)) + if (!cpumask_empty(cpumask_of_node(n))) val += PENALTY_FOR_NODE_WITH_CPUS; /* Slight preference for less loaded node */ @@ -5981,7 +5973,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) if (mirrored_kernelcore && zone == ZONE_MOVABLE) { if (!r || *pfn >= memblock_region_memory_end_pfn(r)) { - for_each_memblock(memory, r) { + for_each_mem_region(r) { if (*pfn < memblock_region_memory_end_pfn(r)) break; } @@ -6566,7 +6558,7 @@ static unsigned long __init zone_absent_pages_in_node(int nid, unsigned long start_pfn, end_pfn; struct memblock_region *r; - for_each_memblock(memory, r) { + for_each_mem_region(r) { start_pfn = clamp(memblock_region_memory_base_pfn(r), zone_start_pfn, zone_end_pfn); end_pfn = clamp(memblock_region_memory_end_pfn(r), @@ -7010,8 +7002,7 @@ static void __init init_unavailable_mem(void) * Loop through unavailable ranges not covered by memblock.memory. */ pgcnt = 0; - for_each_mem_range(i, &memblock.memory, NULL, - NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) { + for_each_mem_range(i, &start, &end) { if (next < start) pgcnt += init_unavailable_range(PFN_DOWN(next), PFN_UP(start)); @@ -7161,7 +7152,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) * options. */ if (movable_node_is_enabled()) { - for_each_memblock(memory, r) { + for_each_mem_region(r) { if (!memblock_is_hotpluggable(r)) continue; @@ -7182,7 +7173,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) if (mirrored_kernelcore) { bool mem_below_4gb_not_mirrored = false; - for_each_memblock(memory, r) { + for_each_mem_region(r) { if (memblock_is_mirror(r)) continue; @@ -8249,14 +8240,7 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page, { unsigned long iter = 0; unsigned long pfn = page_to_pfn(page); - - /* - * TODO we could make this much more efficient by not checking every - * page in the range if we know all of them are in MOVABLE_ZONE and - * that the movable zone guarantees that pages are migratable but - * the later is not the case right now unfortunatelly. E.g. movablecore - * can still lead to having bootmem allocations in zone_movable. - */ + unsigned long offset = pfn % pageblock_nr_pages; if (is_migrate_cma_page(page)) { /* @@ -8270,12 +8254,18 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page, return page; } - for (; iter < pageblock_nr_pages; iter++) { + for (; iter < pageblock_nr_pages - offset; iter++) { if (!pfn_valid_within(pfn + iter)) continue; page = pfn_to_page(pfn + iter); + /* + * Both, bootmem allocations and memory holes are marked + * PG_reserved and are unmovable. We can even have unmovable + * allocations inside ZONE_MOVABLE, for example when + * specifying "movablecore". 
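
The __free_pages() hunk above covers the case where put_page_testzero() fails on a non-compound higher-order allocation because somebody still holds a speculative reference to its first page: the upper power-of-two halves can be freed immediately, and only page 0 is left to whoever drops that last reference. A plain-C illustration of which chunks the added loop releases (values made up):

#include <stdio.h>

int main(void)
{
	unsigned int order = 3;		/* an 8-page block, pages 0..7 */

	while (order-- > 0)
		printf("free pages %u..%u (order %u)\n",
		       1u << order, (2u << order) - 1, order);
	/* prints 4..7, 2..3 and 1..1; page 0 stays behind */
	return 0;
}
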
+ */ if (PageReserved(page)) return page; @@ -8349,14 +8339,6 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page, * it. But now, memory offline itself doesn't call * shrink_node_slabs() and it still to be fixed. */ - /* - * If the page is not RAM, page_count()should be 0. - * we don't need more check. This is an _used_ not-movable page. - * - * The problematic thing here is PG_reserved pages. PG_reserved - * is set to both of a memory hole page and a _used_ kernel - * page at boot. - */ return page; } return NULL; diff --git a/mm/page_counter.c b/mm/page_counter.c index afe22ad335cc..b24a60b28bb0 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -109,7 +109,7 @@ bool page_counter_try_charge(struct page_counter *counter, * * The atomic_long_add_return() implies a full memory * barrier between incrementing the count and reading - * the limit. When racing with page_counter_limit(), + * the limit. When racing with page_counter_set_max(), * we either see the new limit or the setter sees the * counter has changed and retries. */ diff --git a/mm/page_io.c b/mm/page_io.c index f9e9267f296f..433df1263349 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -312,7 +312,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, struct swap_info_struct *sis = page_swap_info(page); VM_BUG_ON_PAGE(!PageSwapCache(page), page); - if (data_race(sis->flags & SWP_FS)) { + if (data_race(sis->flags & SWP_FS_OPS)) { struct kiocb kiocb; struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; @@ -359,13 +359,11 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, return 0; } - ret = 0; bio = get_swap_bio(GFP_NOIO, page, end_write_func); if (bio == NULL) { set_page_dirty(page); unlock_page(page); - ret = -ENOMEM; - goto out; + return -ENOMEM; } bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc); bio_associate_blkg_from_page(bio, page); @@ -373,8 +371,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, set_page_writeback(page); unlock_page(page); submit_bio(bio); -out: - return ret; + + return 0; } int swap_readpage(struct page *page, bool synchronous) @@ -403,7 +401,7 @@ int swap_readpage(struct page *page, bool synchronous) goto out; } - if (data_race(sis->flags & SWP_FS)) { + if (data_race(sis->flags & SWP_FS_OPS)) { struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; @@ -467,7 +465,7 @@ int swap_set_page_dirty(struct page *page) { struct swap_info_struct *sis = page_swap_info(page); - if (data_race(sis->flags & SWP_FS)) { + if (data_race(sis->flags & SWP_FS_OPS)) { struct address_space *mapping = sis->swap_file->f_mapping; VM_BUG_ON_PAGE(!PageSwapCache(page), page); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 63a3db10a8c0..aa94afb63823 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -17,22 +17,21 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags) { - struct page *unmovable = NULL; - struct zone *zone; + struct zone *zone = page_zone(page); + struct page *unmovable; unsigned long flags; - int ret = -EBUSY; - - zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); /* * We assume the caller intended to SET migrate type to isolate. * If it is already set, then someone else must have raced and - * set it before us. Return -EBUSY + * set it before us. 
*/ - if (is_migrate_isolate_page(page)) - goto out; + if (is_migrate_isolate_page(page)) { + spin_unlock_irqrestore(&zone->lock, flags); + return -EBUSY; + } /* * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. @@ -49,25 +48,21 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ NULL); __mod_zone_freepage_state(zone, -nr_pages, mt); - ret = 0; - } - -out: - spin_unlock_irqrestore(&zone->lock, flags); - if (!ret) { + spin_unlock_irqrestore(&zone->lock, flags); drain_all_pages(zone); - } else { - WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE); - - if ((isol_flags & REPORT_FAILURE) && unmovable) - /* - * printk() with zone->lock held will likely trigger a - * lockdep splat, so defer it here. - */ - dump_page(unmovable, "unmovable page"); + return 0; } - return ret; + spin_unlock_irqrestore(&zone->lock, flags); + if (isol_flags & REPORT_FAILURE) { + /* + * printk() with zone->lock held will likely trigger a + * lockdep splat, so defer it here. + */ + dump_page(unmovable, "unmovable page"); + } + + return -EBUSY; } static void unset_migratetype_isolate(struct page *page, unsigned migratetype) diff --git a/mm/shmem.c b/mm/shmem.c index b52476aa0d0a..c61b4d10ff25 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1830,6 +1830,8 @@ repeat: return error; } + if (page) + hindex = page->index; if (page && sgp == SGP_WRITE) mark_page_accessed(page); @@ -1840,11 +1842,10 @@ repeat: unlock_page(page); put_page(page); page = NULL; + hindex = index; } - if (page || sgp == SGP_READ) { - *pagep = page; - return 0; - } + if (page || sgp == SGP_READ) + goto out; /* * Fast cache lookup did not find it: @@ -1969,14 +1970,13 @@ clear: * it now, lest undo on failure cancel our earlier guarantee. */ if (sgp != SGP_WRITE && !PageUptodate(page)) { - struct page *head = compound_head(page); int i; - for (i = 0; i < compound_nr(head); i++) { - clear_highpage(head + i); - flush_dcache_page(head + i); + for (i = 0; i < compound_nr(page); i++) { + clear_highpage(page + i); + flush_dcache_page(page + i); } - SetPageUptodate(head); + SetPageUptodate(page); } /* Perhaps the file has been truncated since we checked */ @@ -1992,6 +1992,7 @@ clear: error = -EINVAL; goto unlock; } +out: *pagep = page + index - hindex; return 0; diff --git a/mm/slab.c b/mm/slab.c index f658e86ec8ce..399a9d185b0f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2305,8 +2305,6 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, /* Slab management obj is off-slab. */ freelist = kmem_cache_alloc_node(cachep->freelist_cache, local_flags, nodeid); - if (!freelist) - return NULL; } else { /* We will use last bytes at the slab for freelist */ freelist = addr + (PAGE_SIZE << cachep->gfporder) - @@ -3440,7 +3438,7 @@ void ___cache_free(struct kmem_cache *cachep, void *objp, memset(objp, 0, cachep->object_size); kmemleak_free_recursive(objp, cachep->flags); objp = cache_free_debugcheck(cachep, objp, caller); - memcg_slab_free_hook(cachep, virt_to_head_page(objp), objp); + memcg_slab_free_hook(cachep, &objp, 1); /* * Skip calling cache_free_alien() when the platform is not numa. 
diff --git a/mm/slab.h b/mm/slab.h index 6cc323f1313a..6dd4b702888a 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -345,30 +345,42 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, obj_cgroup_put(objcg); } -static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page, - void *p) +static inline void memcg_slab_free_hook(struct kmem_cache *s_orig, + void **p, int objects) { + struct kmem_cache *s; struct obj_cgroup *objcg; + struct page *page; unsigned int off; + int i; if (!memcg_kmem_enabled()) return; - if (!page_has_obj_cgroups(page)) - return; + for (i = 0; i < objects; i++) { + if (unlikely(!p[i])) + continue; - off = obj_to_index(s, page, p); - objcg = page_obj_cgroups(page)[off]; - page_obj_cgroups(page)[off] = NULL; + page = virt_to_head_page(p[i]); + if (!page_has_obj_cgroups(page)) + continue; - if (!objcg) - return; + if (!s_orig) + s = page->slab_cache; + else + s = s_orig; - obj_cgroup_uncharge(objcg, obj_full_size(s)); - mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s), - -obj_full_size(s)); + off = obj_to_index(s, page, p[i]); + objcg = page_obj_cgroups(page)[off]; + if (!objcg) + continue; - obj_cgroup_put(objcg); + page_obj_cgroups(page)[off] = NULL; + obj_cgroup_uncharge(objcg, obj_full_size(s)); + mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s), + -obj_full_size(s)); + obj_cgroup_put(objcg); + } } #else /* CONFIG_MEMCG_KMEM */ @@ -406,8 +418,8 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, { } -static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page, - void *p) +static inline void memcg_slab_free_hook(struct kmem_cache *s, + void **p, int objects) { } #endif /* CONFIG_MEMCG_KMEM */ diff --git a/mm/slub.c b/mm/slub.c index 6d3574013b2f..61d0d2968413 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2245,7 +2245,8 @@ redo: } } else { m = M_FULL; - if (kmem_cache_debug(s) && !lock) { +#ifdef CONFIG_SLUB_DEBUG + if ((s->flags & SLAB_STORE_USER) && !lock) { lock = 1; /* * This also ensures that the scanning of full @@ -2254,6 +2255,7 @@ redo: */ spin_lock(&n->list_lock); } +#endif } if (l != m) { @@ -2661,6 +2663,8 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, void *freelist; struct page *page; + stat(s, ALLOC_SLOWPATH); + page = c->page; if (!page) { /* @@ -2850,7 +2854,6 @@ redo: page = c->page; if (unlikely(!object || !node_match(page, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); - stat(s, ALLOC_SLOWPATH); } else { void *next_object = get_freepointer_safe(s, object); @@ -3019,20 +3022,21 @@ static void __slab_free(struct kmem_cache *s, struct page *page, if (likely(!n)) { - /* - * If we just froze the page then put it onto the - * per cpu partial list. - */ - if (new.frozen && !was_frozen) { + if (likely(was_frozen)) { + /* + * The list lock was not taken therefore no list + * activity can be necessary. + */ + stat(s, FREE_FROZEN); + } else if (new.frozen) { + /* + * If we just froze the page then put it onto the + * per cpu partial list. + */ put_cpu_partial(s, page, 1); stat(s, CPU_PARTIAL_FREE); } - /* - * The list lock was not taken therefore no list - * activity can be necessary. - */ - if (was_frozen) - stat(s, FREE_FROZEN); + return; } @@ -3091,7 +3095,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s, struct kmem_cache_cpu *c; unsigned long tid; - memcg_slab_free_hook(s, page, head); + memcg_slab_free_hook(s, &head, 1); redo: /* * Determine the currently cpus per cpu slab. 
@@ -3253,6 +3257,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) if (WARN_ON(!size)) return; + memcg_slab_free_hook(s, p, size); do { struct detached_freelist df; diff --git a/mm/sparse.c b/mm/sparse.c index fcc3d176f1ea..b25ad8e64839 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -291,13 +291,11 @@ static void __init memory_present(int nid, unsigned long start, unsigned long en */ static void __init memblocks_present(void) { - struct memblock_region *reg; + unsigned long start, end; + int i, nid; - for_each_memblock(memory, reg) { - memory_present(memblock_get_region_node(reg), - memblock_region_memory_base_pfn(reg), - memblock_region_memory_end_pfn(reg)); - } + for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) + memory_present(nid, start, end); } /* diff --git a/mm/swap.c b/mm/swap.c index 65ef7e3525bf..47a47681c86b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -348,7 +348,7 @@ static bool need_activate_page_drain(int cpu) return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0; } -void activate_page(struct page *page) +static void activate_page(struct page *page) { page = compound_head(page); if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { @@ -368,7 +368,7 @@ static inline void activate_page_drain(int cpu) { } -void activate_page(struct page *page) +static void activate_page(struct page *page) { pg_data_t *pgdat = page_pgdat(page); @@ -481,9 +481,7 @@ EXPORT_SYMBOL(lru_cache_add); * @vma: vma in which page is mapped for determining reclaimability * * Place @page on the inactive or unevictable LRU list, depending on its - * evictability. Note that if the page is not evictable, it goes - * directly back onto it's zone's unevictable list, it does NOT use a - * per cpu pagevec. + * evictability. */ void lru_cache_add_inactive_or_unevictable(struct page *page, struct vm_area_struct *vma) @@ -891,6 +889,7 @@ void release_pages(struct page **pages, int nr) locked_pgdat = NULL; } + page = compound_head(page); if (is_huge_zero_page(page)) continue; @@ -902,7 +901,7 @@ void release_pages(struct page **pages, int nr) } /* * ZONE_DEVICE pages that return 'false' from - * put_devmap_managed_page() do not require special + * page_is_devmap_managed() do not require special * processing, and instead, expect a call to * put_page_testzero(). 
*/ @@ -912,7 +911,6 @@ void release_pages(struct page **pages, int nr) } } - page = compound_head(page); if (!put_page_testzero(page)) continue; @@ -943,8 +941,6 @@ void release_pages(struct page **pages, int nr) del_page_from_lru_list(page, lruvec, page_off_lru(page)); } - /* Clear Active bit in case of parallel mark_page_accessed */ - __ClearPageActive(page); __ClearPageWaiters(page); list_add(&page->lru, &pages_to_free); diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 3e6453573a89..0357fbe70645 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -237,7 +237,7 @@ static int free_slot_cache(unsigned int cpu) return 0; } -int enable_swap_slots_cache(void) +void enable_swap_slots_cache(void) { mutex_lock(&swap_slots_cache_enable_mutex); if (!swap_slot_cache_initialized) { @@ -255,7 +255,6 @@ int enable_swap_slots_cache(void) __reenable_swap_slots_cache(); out_unlock: mutex_unlock(&swap_slots_cache_enable_mutex); - return 0; } /* called with swap slot cache's alloc lock held */ diff --git a/mm/swap_state.c b/mm/swap_state.c index c16eebb81d8b..aa40e706604c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "internal.h" /* @@ -414,6 +415,39 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, return page; } +/** + * find_get_incore_page - Find and get a page from the page or swap caches. + * @mapping: The address_space to search. + * @index: The page cache index. + * + * This differs from find_get_page() in that it will also look for the + * page in the swap cache. + * + * Return: The found page or %NULL. + */ +struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) +{ + swp_entry_t swp; + struct swap_info_struct *si; + struct page *page = find_get_entry(mapping, index); + + if (!page) + return page; + if (!xa_is_value(page)) + return find_subpage(page, index); + if (!shmem_mapping(mapping)) + return NULL; + + swp = radix_to_swp_entry(page); + /* Prevent swapoff from happening to us */ + si = get_swap_device(swp); + if (!si) + return NULL; + page = find_get_page(swap_address_space(swp), swp_offset(swp)); + put_swap_device(si); + return page; +} + struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, bool *new_page_allocated) @@ -631,7 +665,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, goto skip; /* Test swap type to make sure the dereference is safe */ - if (likely(si->flags & (SWP_BLKDEV | SWP_FS))) { + if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) { struct inode *inode = si->swap_file->f_mapping->host; if (inode_read_congested(inode)) goto skip; diff --git a/mm/swapfile.c b/mm/swapfile.c index ced4635d924c..c4a613688a17 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1184,7 +1184,6 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry) bad_free: pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val); - goto out; out: return NULL; } @@ -1929,11 +1928,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, lru_cache_add_inactive_or_unevictable(page, vma); } swap_free(entry); - /* - * Move the page to the active list so it is not - * immediately swapped out again after swapon. 
- */ - activate_page(page); out: pte_unmap_unlock(pte, ptl); if (page != swapcache) { @@ -2437,7 +2431,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) if (ret >= 0) sis->flags |= SWP_ACTIVATED; if (!ret) { - sis->flags |= SWP_FS; + sis->flags |= SWP_FS_OPS; ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; } @@ -3348,7 +3342,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = inode_drain_writes(inode); if (error) { inode->i_flags &= ~S_SWAPFILE; - goto bad_swap_unlock_inode; + goto free_swap_address_space; } mutex_lock(&swapon_mutex); @@ -3373,6 +3367,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = 0; goto out; +free_swap_address_space: + exit_swap_address_space(p->type); bad_swap_unlock_inode: inode_unlock(inode); bad_swap: diff --git a/mm/truncate.c b/mm/truncate.c index dd9ebc1da356..6bbe0f0b3ce9 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -528,23 +528,8 @@ void truncate_inode_pages_final(struct address_space *mapping) } EXPORT_SYMBOL(truncate_inode_pages_final); -/** - * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode - * @mapping: the address_space which holds the pages to invalidate - * @start: the offset 'from' which to invalidate - * @end: the offset 'to' which to invalidate (inclusive) - * - * This function only removes the unlocked pages, if you want to - * remove all the pages of one inode, you must call truncate_inode_pages. - * - * invalidate_mapping_pages() will not block on IO activity. It will not - * invalidate pages which are dirty, locked, under writeback or mapped into - * pagetables. - * - * Return: the number of the pages that were invalidated - */ -unsigned long invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end) +unsigned long __invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) { pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; @@ -610,8 +595,13 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, * Invalidation is a hint that the page is no longer * of interest and try to speed up its reclaim. */ - if (!ret) + if (!ret) { deactivate_file_page(page); + /* It is likely on the pagevec of a remote CPU */ + if (nr_pagevec) + (*nr_pagevec)++; + } + if (PageTransHuge(page)) put_page(page); count += ret; @@ -623,8 +613,40 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, } return count; } + +/** + * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode + * @mapping: the address_space which holds the pages to invalidate + * @start: the offset 'from' which to invalidate + * @end: the offset 'to' which to invalidate (inclusive) + * + * This function only removes the unlocked pages, if you want to + * remove all the pages of one inode, you must call truncate_inode_pages. + * + * invalidate_mapping_pages() will not block on IO activity. It will not + * invalidate pages which are dirty, locked, under writeback or mapped into + * pagetables. 
+ * + * Return: the number of the pages that were invalidated + */ +unsigned long invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + return __invalidate_mapping_pages(mapping, start, end, NULL); +} EXPORT_SYMBOL(invalidate_mapping_pages); +/** + * This helper is similar to the above one, except that it accounts for pages + * that are likely on a pagevec and counts them in @nr_pagevec, which will be + * used by the caller. + */ +void invalidate_mapping_pagevec(struct address_space *mapping, + pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) +{ + __invalidate_mapping_pages(mapping, start, end, nr_pagevec); +} + /* * This is like invalidate_complete_page(), except it ignores the page's * refcount. We do this because invalidate_inode_pages2() needs stronger diff --git a/mm/vmalloc.c b/mm/vmalloc.c index be4724b916b3..04ac98bf5045 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2133,7 +2133,7 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, * It is up to the caller to do all required locking to keep the returned * pointer valid. * - * Return: pointer to the found area or %NULL on faulure + * Return: the area descriptor on success or %NULL on failure. */ struct vm_struct *find_vm_area(const void *addr) { @@ -2154,7 +2154,7 @@ struct vm_struct *find_vm_area(const void *addr) * This function returns the found VM area, but using it is NOT safe * on SMP machines, except for its size or flags. * - * Return: pointer to the found area or %NULL on faulure + * Return: the area descriptor on success or %NULL on failure. */ struct vm_struct *remove_vm_area(const void *addr) { @@ -2447,7 +2447,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, page = alloc_pages_node(node, alloc_mask|highmem_mask, 0); if (unlikely(!page)) { - /* Successfully allocated i pages, free them in __vunmap() */ + /* Successfully allocated i pages, free them in __vfree() */ area->nr_pages = i; atomic_long_add(area->nr_pages, &nr_vmalloc_pages); goto fail; diff --git a/mm/vmscan.c b/mm/vmscan.c index 466fc3144fff..879fb57c5045 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -699,6 +699,9 @@ void drop_slab_node(int nid) do { struct mem_cgroup *memcg = NULL; + if (fatal_signal_pending(current)) + return; + freed = 0; memcg = mem_cgroup_iter(NULL, NULL, NULL); do { @@ -1751,7 +1754,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * Restrictions: * * (1) Must be called with an elevated refcount on the page. This is a - * fundamentnal difference from isolate_lru_pages (which is called + * fundamental difference from isolate_lru_pages (which is called * without a stable reference). * (2) the lru_lock must not be held. * (3) interrupts must be enabled. diff --git a/mm/z3fold.c b/mm/z3fold.c index 460b0feced26..18feaa0bc537 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -212,13 +212,12 @@ static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool, { struct z3fold_buddy_slots *slots; - slots = kmem_cache_alloc(pool->c_handle, + slots = kmem_cache_zalloc(pool->c_handle, (gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE))); if (slots) { /* It will be freed separately in free_handle(). 
*/ kmemleak_not_leak(slots); - memset(slots->slot, 0, sizeof(slots->slot)); slots->pool = (unsigned long)pool; rwlock_init(&slots->lock); } diff --git a/mm/zbud.c b/mm/zbud.c index bc93aa4e46fc..c49966ece674 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -367,7 +367,6 @@ int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp, spin_lock(&pool->lock); /* First, try to find an unbuddied zbud page. */ - zhdr = NULL; for_each_unbuddied_list(i, chunks) { if (!list_empty(&pool->unbuddied[i])) { zhdr = list_first_entry(&pool->unbuddied[i], diff --git a/samples/Makefile b/samples/Makefile index 754553597581..c3392a595e4b 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -28,3 +28,4 @@ subdir-$(CONFIG_SAMPLE_VFS) += vfs obj-$(CONFIG_SAMPLE_INTEL_MEI) += mei/ subdir-$(CONFIG_SAMPLE_WATCHDOG) += watchdog subdir-$(CONFIG_SAMPLE_WATCH_QUEUE) += watch_queue +obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak/ diff --git a/samples/kmemleak/Makefile b/samples/kmemleak/Makefile new file mode 100644 index 000000000000..16b6132c540c --- /dev/null +++ b/samples/kmemleak/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o diff --git a/mm/kmemleak-test.c b/samples/kmemleak/kmemleak-test.c similarity index 98% rename from mm/kmemleak-test.c rename to samples/kmemleak/kmemleak-test.c index e19279ff6aa3..7b476eb8285f 100644 --- a/mm/kmemleak-test.c +++ b/samples/kmemleak/kmemleak-test.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * mm/kmemleak-test.c + * samples/kmemleak/kmemleak-test.c * * Copyright (C) 2008 ARM Limited * Written by Catalin Marinas diff --git a/scripts/decodecode b/scripts/decodecode index fbdb325cdf4f..31d884e35f2f 100755 --- a/scripts/decodecode +++ b/scripts/decodecode @@ -6,6 +6,7 @@ # options: set env. 
variable AFLAGS=options to pass options to "as"; # e.g., to decode an i386 oops on an x86_64 system, use: # AFLAGS=--32 decodecode < 386.oops +# PC=hex - the PC (program counter) the oops points to cleanup() { rm -f $T $T.s $T.o $T.oo $T.aa $T.dis @@ -67,15 +68,19 @@ if [ -z "$ARCH" ]; then esac fi +# Params: (tmp_file, pc_sub) disas() { - ${CROSS_COMPILE}as $AFLAGS -o $1.o $1.s > /dev/null 2>&1 + t=$1 + pc_sub=$2 + + ${CROSS_COMPILE}as $AFLAGS -o $t.o $t.s > /dev/null 2>&1 if [ "$ARCH" = "arm" ]; then if [ $width -eq 2 ]; then OBJDUMPFLAGS="-M force-thumb" fi - ${CROSS_COMPILE}strip $1.o + ${CROSS_COMPILE}strip $t.o fi if [ "$ARCH" = "arm64" ]; then @@ -83,11 +88,18 @@ disas() { type=inst fi - ${CROSS_COMPILE}strip $1.o + ${CROSS_COMPILE}strip $t.o fi - ${CROSS_COMPILE}objdump $OBJDUMPFLAGS -S $1.o | \ - grep -v "/tmp\|Disassembly\|\.text\|^$" > $1.dis 2>&1 + if [ $pc_sub -ne 0 ]; then + if [ $PC ]; then + adj_vma=$(( $PC - $pc_sub )) + OBJDUMPFLAGS="$OBJDUMPFLAGS --adjust-vma=$adj_vma" + fi + fi + + ${CROSS_COMPILE}objdump $OBJDUMPFLAGS -S $t.o | \ + grep -v "/tmp\|Disassembly\|\.text\|^$" > $t.dis 2>&1 } marker=`expr index "$code" "\<"` @@ -95,14 +107,17 @@ if [ $marker -eq 0 ]; then marker=`expr index "$code" "\("` fi + touch $T.oo if [ $marker -ne 0 ]; then + # 2 opcode bytes and a single space + pc_sub=$(( $marker / 3 )) echo All code >> $T.oo echo ======== >> $T.oo beforemark=`echo "$code"` echo -n " .$type 0x" > $T.s echo $beforemark | sed -e 's/ /,0x/g; s/[<>()]//g' >> $T.s - disas $T + disas $T $pc_sub cat $T.dis >> $T.oo rm -f $T.o $T.s $T.dis @@ -114,7 +129,7 @@ echo =========================================== >> $T.aa code=`echo $code | sed -e 's/ [<(]/ /;s/[>)] / /;s/ /,0x/g; s/[>)]$//'` echo -n " .$type 0x" > $T.s echo $code >> $T.s -disas $T +disas $T 0 cat $T.dis >> $T.aa # (lines of whole $T.oo) - (lines of $T.aa, i.e. 
"Code starting") + 3, diff --git a/scripts/gdb/linux/dmesg.py b/scripts/gdb/linux/dmesg.py index 2fa7bb83885f..a92c55bd8de5 100644 --- a/scripts/gdb/linux/dmesg.py +++ b/scripts/gdb/linux/dmesg.py @@ -16,8 +16,13 @@ import sys from linux import utils -printk_log_type = utils.CachedType("struct printk_log") - +printk_info_type = utils.CachedType("struct printk_info") +prb_data_blk_lpos_type = utils.CachedType("struct prb_data_blk_lpos") +prb_desc_type = utils.CachedType("struct prb_desc") +prb_desc_ring_type = utils.CachedType("struct prb_desc_ring") +prb_data_ring_type = utils.CachedType("struct prb_data_ring") +printk_ringbuffer_type = utils.CachedType("struct printk_ringbuffer") +atomic_long_type = utils.CachedType("atomic_long_t") class LxDmesg(gdb.Command): """Print Linux kernel log buffer.""" @@ -26,44 +31,110 @@ class LxDmesg(gdb.Command): super(LxDmesg, self).__init__("lx-dmesg", gdb.COMMAND_DATA) def invoke(self, arg, from_tty): - log_buf_addr = int(str(gdb.parse_and_eval( - "(void *)'printk.c'::log_buf")).split()[0], 16) - log_first_idx = int(gdb.parse_and_eval("'printk.c'::log_first_idx")) - log_next_idx = int(gdb.parse_and_eval("'printk.c'::log_next_idx")) - log_buf_len = int(gdb.parse_and_eval("'printk.c'::log_buf_len")) - inf = gdb.inferiors()[0] - start = log_buf_addr + log_first_idx - if log_first_idx < log_next_idx: - log_buf_2nd_half = -1 - length = log_next_idx - log_first_idx - log_buf = utils.read_memoryview(inf, start, length).tobytes() - else: - log_buf_2nd_half = log_buf_len - log_first_idx - a = utils.read_memoryview(inf, start, log_buf_2nd_half) - b = utils.read_memoryview(inf, log_buf_addr, log_next_idx) - log_buf = a.tobytes() + b.tobytes() - length_offset = printk_log_type.get_type()['len'].bitpos // 8 - text_len_offset = printk_log_type.get_type()['text_len'].bitpos // 8 - time_stamp_offset = printk_log_type.get_type()['ts_nsec'].bitpos // 8 - text_offset = printk_log_type.get_type().sizeof + # read in prb structure + prb_addr = int(str(gdb.parse_and_eval("(void *)'printk.c'::prb")).split()[0], 16) + sz = printk_ringbuffer_type.get_type().sizeof + prb = utils.read_memoryview(inf, prb_addr, sz).tobytes() - pos = 0 - while pos < log_buf.__len__(): - length = utils.read_u16(log_buf, pos + length_offset) - if length == 0: - if log_buf_2nd_half == -1: - gdb.write("Corrupted log buffer!\n") + # read in descriptor ring structure + off = printk_ringbuffer_type.get_type()['desc_ring'].bitpos // 8 + addr = prb_addr + off + sz = prb_desc_ring_type.get_type().sizeof + desc_ring = utils.read_memoryview(inf, addr, sz).tobytes() + + # read in descriptor array + off = prb_desc_ring_type.get_type()['count_bits'].bitpos // 8 + desc_ring_count = 1 << utils.read_u32(desc_ring, off) + desc_sz = prb_desc_type.get_type().sizeof + off = prb_desc_ring_type.get_type()['descs'].bitpos // 8 + addr = utils.read_ulong(desc_ring, off) + descs = utils.read_memoryview(inf, addr, desc_sz * desc_ring_count).tobytes() + + # read in info array + info_sz = printk_info_type.get_type().sizeof + off = prb_desc_ring_type.get_type()['infos'].bitpos // 8 + addr = utils.read_ulong(desc_ring, off) + infos = utils.read_memoryview(inf, addr, info_sz * desc_ring_count).tobytes() + + # read in text data ring structure + off = printk_ringbuffer_type.get_type()['text_data_ring'].bitpos // 8 + addr = prb_addr + off + sz = prb_data_ring_type.get_type().sizeof + text_data_ring = utils.read_memoryview(inf, addr, sz).tobytes() + + # read in text data + off = prb_data_ring_type.get_type()['size_bits'].bitpos // 8 + 
text_data_sz = 1 << utils.read_u32(text_data_ring, off) + off = prb_data_ring_type.get_type()['data'].bitpos // 8 + addr = utils.read_ulong(text_data_ring, off) + text_data = utils.read_memoryview(inf, addr, text_data_sz).tobytes() + + counter_off = atomic_long_type.get_type()['counter'].bitpos // 8 + + sv_off = prb_desc_type.get_type()['state_var'].bitpos // 8 + + off = prb_desc_type.get_type()['text_blk_lpos'].bitpos // 8 + begin_off = off + (prb_data_blk_lpos_type.get_type()['begin'].bitpos // 8) + next_off = off + (prb_data_blk_lpos_type.get_type()['next'].bitpos // 8) + + ts_off = printk_info_type.get_type()['ts_nsec'].bitpos // 8 + len_off = printk_info_type.get_type()['text_len'].bitpos // 8 + + # definitions from kernel/printk/printk_ringbuffer.h + desc_committed = 1 + desc_finalized = 2 + desc_sv_bits = utils.get_long_type().sizeof * 8 + desc_flags_shift = desc_sv_bits - 2 + desc_flags_mask = 3 << desc_flags_shift + desc_id_mask = ~desc_flags_mask + + # read in tail and head descriptor ids + off = prb_desc_ring_type.get_type()['tail_id'].bitpos // 8 + tail_id = utils.read_u64(desc_ring, off + counter_off) + off = prb_desc_ring_type.get_type()['head_id'].bitpos // 8 + head_id = utils.read_u64(desc_ring, off + counter_off) + + did = tail_id + while True: + ind = did % desc_ring_count + desc_off = desc_sz * ind + info_off = info_sz * ind + + # skip non-committed record + state = 3 & (utils.read_u64(descs, desc_off + sv_off + + counter_off) >> desc_flags_shift) + if state != desc_committed and state != desc_finalized: + if did == head_id: break - pos = log_buf_2nd_half + did = (did + 1) & desc_id_mask continue - text_len = utils.read_u16(log_buf, pos + text_len_offset) - text_start = pos + text_offset - text = log_buf[text_start:text_start + text_len].decode( - encoding='utf8', errors='replace') - time_stamp = utils.read_u64(log_buf, pos + time_stamp_offset) + begin = utils.read_ulong(descs, desc_off + begin_off) % text_data_sz + end = utils.read_ulong(descs, desc_off + next_off) % text_data_sz + + # handle data-less record + if begin & 1 == 1: + text = "" + else: + # handle wrapping data block + if begin > end: + begin = 0 + + # skip over descriptor id + text_start = begin + utils.get_long_type().sizeof + + text_len = utils.read_u16(infos, info_off + len_off) + + # handle truncated message + if end - text_start < text_len: + text_len = end - text_start + + text = text_data[text_start:text_start + text_len].decode( + encoding='utf8', errors='replace') + + time_stamp = utils.read_u64(infos, info_off + ts_off) for line in text.splitlines(): msg = u"[{time:12.6f}] {line}\n".format( @@ -75,7 +146,9 @@ class LxDmesg(gdb.Command): msg = msg.encode(encoding='utf8', errors='replace') gdb.write(msg) - pos += length + if did == head_id: + break + did = (did + 1) & desc_id_mask LxDmesg() diff --git a/scripts/gdb/linux/utils.py b/scripts/gdb/linux/utils.py index ea94221dbd39..ff7c1799d588 100644 --- a/scripts/gdb/linux/utils.py +++ b/scripts/gdb/linux/utils.py @@ -123,6 +123,13 @@ def read_u64(buffer, offset): return read_u32(buffer, offset + 4) + (read_u32(buffer, offset) << 32) +def read_ulong(buffer, offset): + if get_long_type().sizeof == 8: + return read_u64(buffer, offset) + else: + return read_u32(buffer, offset) + + target_arch = None diff --git a/scripts/selinux/mdp/mdp.c b/scripts/selinux/mdp/mdp.c index 6ceb88eb9b59..105c1c31a316 100644 --- a/scripts/selinux/mdp/mdp.c +++ b/scripts/selinux/mdp/mdp.c @@ -35,6 +35,9 @@ struct security_class_mapping { #include "classmap.h" #include 
"initial_sid_to_string.h" +#include "policycap_names.h" + +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) int main(int argc, char *argv[]) { @@ -115,6 +118,10 @@ int main(int argc, char *argv[]) } } + /* enable all policy capabilities */ + for (i = 0; i < ARRAY_SIZE(selinux_policycap_names); i++) + fprintf(fout, "policycap %s;\n", selinux_policycap_names[i]); + /* types, roles, and allows */ fprintf(fout, "type base_t;\n"); fprintf(fout, "role base_r;\n"); diff --git a/scripts/spelling.txt b/scripts/spelling.txt index feb2efaaa5e6..cb46a79665f5 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -9,6 +9,7 @@ # abandonning||abandoning abigious||ambiguous +abitrary||arbitrary abitrate||arbitrate abnornally||abnormally abnrormal||abnormal @@ -482,6 +483,7 @@ disgest||digest dispalying||displaying diplay||display directon||direction +direcly||directly direectly||directly diregard||disregard disassocation||disassociation @@ -871,6 +873,7 @@ malplace||misplace managable||manageable managment||management mangement||management +manger||manager manoeuvering||maneuvering manufaucturing||manufacturing mappping||mapping @@ -1478,6 +1481,7 @@ unsolicitied||unsolicited unsuccessfull||unsuccessful unsuported||unsupported untill||until +ununsed||unused unuseful||useless unvalid||invalid upate||update diff --git a/security/selinux/avc.c b/security/selinux/avc.c index d18cb32a242a..3c05827608b6 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -31,6 +31,9 @@ #include "avc_ss.h" #include "classmap.h" +#define CREATE_TRACE_POINTS +#include + #define AVC_CACHE_SLOTS 512 #define AVC_DEF_CACHE_THRESHOLD 512 #define AVC_CACHE_RECLAIM 16 @@ -702,33 +705,37 @@ static void avc_audit_post_callback(struct audit_buffer *ab, void *a) { struct common_audit_data *ad = a; struct selinux_audit_data *sad = ad->selinux_audit_data; - char *scontext; + char *scontext = NULL; + char *tcontext = NULL; + const char *tclass = NULL; u32 scontext_len; + u32 tcontext_len; int rc; rc = security_sid_to_context(sad->state, sad->ssid, &scontext, &scontext_len); if (rc) audit_log_format(ab, " ssid=%d", sad->ssid); - else { + else audit_log_format(ab, " scontext=%s", scontext); - kfree(scontext); - } - rc = security_sid_to_context(sad->state, sad->tsid, &scontext, - &scontext_len); + rc = security_sid_to_context(sad->state, sad->tsid, &tcontext, + &tcontext_len); if (rc) audit_log_format(ab, " tsid=%d", sad->tsid); - else { - audit_log_format(ab, " tcontext=%s", scontext); - kfree(scontext); - } + else + audit_log_format(ab, " tcontext=%s", tcontext); - audit_log_format(ab, " tclass=%s", secclass_map[sad->tclass-1].name); + tclass = secclass_map[sad->tclass-1].name; + audit_log_format(ab, " tclass=%s", tclass); if (sad->denied) audit_log_format(ab, " permissive=%u", sad->result ? 0 : 1); + trace_selinux_audited(sad, scontext, tcontext, tclass); + kfree(tcontext); + kfree(scontext); + /* in case of invalid context report also the actual context string */ rc = security_sid_to_context_inval(sad->state, sad->ssid, &scontext, &scontext_len); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 1828a9e4146d..d644087f63f9 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1981,7 +1981,7 @@ static inline u32 file_to_av(struct file *file) } /* - * Convert a file to an access vector and include the correct open + * Convert a file to an access vector and include the correct * open permission. 
*/ static inline u32 open_file_to_av(struct file *file) @@ -3274,6 +3274,9 @@ static int selinux_inode_removexattr(struct dentry *dentry, const char *name) return dentry_has_perm(current_cred(), dentry, FILE__SETATTR); } + if (!selinux_initialized(&selinux_state)) + return 0; + /* No one is allowed to remove a SELinux security label. You can change the label, but all data must be labeled. */ return -EACCES; @@ -3712,7 +3715,7 @@ static int selinux_mmap_file(struct file *file, unsigned long reqprot, return rc; } - if (selinux_state.checkreqprot) + if (checkreqprot_get(&selinux_state)) prot = reqprot; return file_map_prot_check(file, prot, @@ -3726,7 +3729,7 @@ static int selinux_file_mprotect(struct vm_area_struct *vma, const struct cred *cred = current_cred(); u32 sid = cred_sid(cred); - if (selinux_state.checkreqprot) + if (checkreqprot_get(&selinux_state)) prot = reqprot; if (default_noexec && @@ -4441,7 +4444,7 @@ static int selinux_skb_peerlbl_sid(struct sk_buff *skb, u16 family, u32 *sid) * * If @skb_sid is valid then the user:role:type information from @sk_sid is * combined with the MLS information from @skb_sid in order to create - * @conn_sid. If @skb_sid is not valid then then @conn_sid is simply a copy + * @conn_sid. If @skb_sid is not valid then @conn_sid is simply a copy * of @sk_sid. Returns zero on success, negative values on failure. * */ @@ -5311,7 +5314,7 @@ static int selinux_sctp_bind_connect(struct sock *sk, int optname, /* As selinux_sctp_bind_connect() is called by the * SCTP protocol layer, the socket is already locked, - * therefore selinux_netlbl_socket_connect_locked() is + * therefore selinux_netlbl_socket_connect_locked() * is called here. The situations handled are: * sctp_connectx(3), sctp_sendmsg(3), sendmsg(2), * whenever a new IP address is added or when a new @@ -7198,10 +7201,10 @@ static __init int selinux_init(void) memset(&selinux_state, 0, sizeof(selinux_state)); enforcing_set(&selinux_state, selinux_enforcing_boot); - selinux_state.checkreqprot = selinux_checkreqprot_boot; - selinux_ss_init(&selinux_state.ss); + checkreqprot_set(&selinux_state, selinux_checkreqprot_boot); selinux_avc_init(&selinux_state.avc); mutex_init(&selinux_state.status_lock); + mutex_init(&selinux_state.policy_mutex); /* Set the security state for the initial task. 
*/ cred_init_security(); diff --git a/security/selinux/include/conditional.h b/security/selinux/include/conditional.h index 539ab357707d..b09343346e3f 100644 --- a/security/selinux/include/conditional.h +++ b/security/selinux/include/conditional.h @@ -13,7 +13,7 @@ #include "security.h" -int security_get_bools(struct selinux_state *state, +int security_get_bools(struct selinux_policy *policy, u32 *len, char ***names, int **values); int security_set_bools(struct selinux_state *state, u32 len, int *values); diff --git a/security/selinux/include/policycap.h b/security/selinux/include/policycap.h new file mode 100644 index 000000000000..2ec038efbb03 --- /dev/null +++ b/security/selinux/include/policycap.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _SELINUX_POLICYCAP_H_ +#define _SELINUX_POLICYCAP_H_ + +/* Policy capabilities */ +enum { + POLICYDB_CAPABILITY_NETPEER, + POLICYDB_CAPABILITY_OPENPERM, + POLICYDB_CAPABILITY_EXTSOCKCLASS, + POLICYDB_CAPABILITY_ALWAYSNETWORK, + POLICYDB_CAPABILITY_CGROUPSECLABEL, + POLICYDB_CAPABILITY_NNP_NOSUID_TRANSITION, + POLICYDB_CAPABILITY_GENFS_SECLABEL_SYMLINKS, + __POLICYDB_CAPABILITY_MAX +}; +#define POLICYDB_CAPABILITY_MAX (__POLICYDB_CAPABILITY_MAX - 1) + +extern const char *selinux_policycap_names[__POLICYDB_CAPABILITY_MAX]; + +#endif /* _SELINUX_POLICYCAP_H_ */ diff --git a/security/selinux/include/policycap_names.h b/security/selinux/include/policycap_names.h new file mode 100644 index 000000000000..b89289f092c9 --- /dev/null +++ b/security/selinux/include/policycap_names.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _SELINUX_POLICYCAP_NAMES_H_ +#define _SELINUX_POLICYCAP_NAMES_H_ + +#include "policycap.h" + +/* Policy capability names */ +const char *selinux_policycap_names[__POLICYDB_CAPABILITY_MAX] = { + "network_peer_controls", + "open_perms", + "extended_socket_class", + "always_check_network", + "cgroup_seclabel", + "nnp_nosuid_transition", + "genfs_seclabel_symlinks" +}; + +#endif /* _SELINUX_POLICYCAP_NAMES_H_ */ diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index 83f1ea1d00e1..c940f73779cd 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -13,9 +13,11 @@ #include #include #include +#include #include #include #include "flask.h" +#include "policycap.h" #define SECSID_NULL 0x00000000 /* unspecified SID */ #define SECSID_WILD 0xffffffff /* wildcard SID */ @@ -72,21 +74,6 @@ struct netlbl_lsm_secattr; extern int selinux_enabled_boot; -/* Policy capabilities */ -enum { - POLICYDB_CAPABILITY_NETPEER, - POLICYDB_CAPABILITY_OPENPERM, - POLICYDB_CAPABILITY_EXTSOCKCLASS, - POLICYDB_CAPABILITY_ALWAYSNETWORK, - POLICYDB_CAPABILITY_CGROUPSECLABEL, - POLICYDB_CAPABILITY_NNP_NOSUID_TRANSITION, - POLICYDB_CAPABILITY_GENFS_SECLABEL_SYMLINKS, - __POLICYDB_CAPABILITY_MAX -}; -#define POLICYDB_CAPABILITY_MAX (__POLICYDB_CAPABILITY_MAX - 1) - -extern const char *selinux_policycap_names[__POLICYDB_CAPABILITY_MAX]; - /* * type_datum properties * available at the kernel policy version >= POLICYDB_VERSION_BOUNDARY @@ -98,7 +85,7 @@ extern const char *selinux_policycap_names[__POLICYDB_CAPABILITY_MAX]; #define POLICYDB_BOUNDS_MAXDEPTH 4 struct selinux_avc; -struct selinux_ss; +struct selinux_policy; struct selinux_state { #ifdef CONFIG_SECURITY_SELINUX_DISABLE @@ -116,10 +103,10 @@ struct selinux_state { struct mutex status_lock; struct selinux_avc *avc; - struct selinux_ss *ss; + struct selinux_policy __rcu *policy; + struct mutex 
policy_mutex; } __randomize_layout; -void selinux_ss_init(struct selinux_ss **ss); void selinux_avc_init(struct selinux_avc **avc); extern struct selinux_state selinux_state; @@ -157,6 +144,16 @@ static inline void enforcing_set(struct selinux_state *state, bool value) } #endif +static inline bool checkreqprot_get(const struct selinux_state *state) +{ + return READ_ONCE(state->checkreqprot); +} + +static inline void checkreqprot_set(struct selinux_state *state, bool value) +{ + WRITE_ONCE(state->checkreqprot, value); +} + #ifdef CONFIG_SECURITY_SELINUX_DISABLE static inline bool selinux_disabled(struct selinux_state *state) { @@ -178,49 +175,49 @@ static inline bool selinux_policycap_netpeer(void) { struct selinux_state *state = &selinux_state; - return state->policycap[POLICYDB_CAPABILITY_NETPEER]; + return READ_ONCE(state->policycap[POLICYDB_CAPABILITY_NETPEER]); } static inline bool selinux_policycap_openperm(void) { struct selinux_state *state = &selinux_state; - return state->policycap[POLICYDB_CAPABILITY_OPENPERM]; + return READ_ONCE(state->policycap[POLICYDB_CAPABILITY_OPENPERM]); } static inline bool selinux_policycap_extsockclass(void) { struct selinux_state *state = &selinux_state; - return state->policycap[POLICYDB_CAPABILITY_EXTSOCKCLASS]; + return READ_ONCE(state->policycap[POLICYDB_CAPABILITY_EXTSOCKCLASS]); } static inline bool selinux_policycap_alwaysnetwork(void) { struct selinux_state *state = &selinux_state; - return state->policycap[POLICYDB_CAPABILITY_ALWAYSNETWORK]; + return READ_ONCE(state->policycap[POLICYDB_CAPABILITY_ALWAYSNETWORK]); } static inline bool selinux_policycap_cgroupseclabel(void) { struct selinux_state *state = &selinux_state; - return state->policycap[POLICYDB_CAPABILITY_CGROUPSECLABEL]; + return READ_ONCE(state->policycap[POLICYDB_CAPABILITY_CGROUPSECLABEL]); } static inline bool selinux_policycap_nnp_nosuid_transition(void) { struct selinux_state *state = &selinux_state; - return state->policycap[POLICYDB_CAPABILITY_NNP_NOSUID_TRANSITION]; + return READ_ONCE(state->policycap[POLICYDB_CAPABILITY_NNP_NOSUID_TRANSITION]); } static inline bool selinux_policycap_genfs_seclabel_symlinks(void) { struct selinux_state *state = &selinux_state; - return state->policycap[POLICYDB_CAPABILITY_GENFS_SECLABEL_SYMLINKS]; + return READ_ONCE(state->policycap[POLICYDB_CAPABILITY_GENFS_SECLABEL_SYMLINKS]); } static inline bool selinux_android_nlroute_getlink(void) @@ -232,10 +229,14 @@ static inline bool selinux_android_nlroute_getlink(void) int security_mls_enabled(struct selinux_state *state); int security_load_policy(struct selinux_state *state, - void *data, size_t len); + void *data, size_t len, + struct selinux_policy **newpolicyp); +void selinux_policy_commit(struct selinux_state *state, + struct selinux_policy *newpolicy); +void selinux_policy_cancel(struct selinux_state *state, + struct selinux_policy *policy); int security_read_policy(struct selinux_state *state, void **data, size_t *len); -size_t security_policydb_len(struct selinux_state *state); int security_policycap_supported(struct selinux_state *state, unsigned int req_cap); @@ -366,9 +367,9 @@ int security_net_peersid_resolve(struct selinux_state *state, u32 xfrm_sid, u32 *peer_sid); -int security_get_classes(struct selinux_state *state, +int security_get_classes(struct selinux_policy *policy, char ***classes, int *nclasses); -int security_get_permissions(struct selinux_state *state, +int security_get_permissions(struct selinux_policy *policy, char *class, char ***perms, int *nperms); int 
security_get_reject_unknown(struct selinux_state *state); int security_get_allow_unknown(struct selinux_state *state); @@ -388,6 +389,10 @@ int security_genfs_sid(struct selinux_state *state, const char *fstype, char *name, u16 sclass, u32 *sid); +int selinux_policy_genfs_sid(struct selinux_policy *policy, + const char *fstype, char *name, u16 sclass, + u32 *sid); + #ifdef CONFIG_NETLABEL int security_netlbl_secattr_to_sid(struct selinux_state *state, struct netlbl_lsm_secattr *secattr, diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 4781314c2510..4bde570d56a2 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -74,7 +75,6 @@ struct selinux_fs_info { unsigned long last_class_ino; bool policy_opened; struct dentry *policycap_dir; - struct mutex mutex; unsigned long last_ino; struct selinux_state *state; struct super_block *sb; @@ -88,7 +88,6 @@ static int selinux_fs_info_create(struct super_block *sb) if (!fsi) return -ENOMEM; - mutex_init(&fsi->mutex); fsi->last_ino = SEL_INO_NEXT - 1; fsi->state = &selinux_state; fsi->sb = sb; @@ -117,6 +116,10 @@ static void selinux_fs_info_free(struct super_block *sb) #define SEL_POLICYCAP_INO_OFFSET 0x08000000 #define SEL_INO_MASK 0x00ffffff +#define BOOL_DIR_NAME "booleans" +#define CLASS_DIR_NAME "class" +#define POLICYCAP_DIR_NAME "policy_capabilities" + #define TMPBUFLEN 12 static ssize_t sel_read_enforce(struct file *filp, char __user *buf, size_t count, loff_t *ppos) @@ -346,14 +349,24 @@ static const struct file_operations sel_policyvers_ops = { }; /* declaration for sel_write_load */ -static int sel_make_bools(struct selinux_fs_info *fsi); -static int sel_make_classes(struct selinux_fs_info *fsi); -static int sel_make_policycap(struct selinux_fs_info *fsi); +static int sel_make_bools(struct selinux_policy *newpolicy, struct dentry *bool_dir, + unsigned int *bool_num, char ***bool_pending_names, + unsigned int **bool_pending_values); +static int sel_make_classes(struct selinux_policy *newpolicy, + struct dentry *class_dir, + unsigned long *last_class_ino); /* declaration for sel_make_class_dirs */ static struct dentry *sel_make_dir(struct dentry *dir, const char *name, unsigned long *ino); +/* declaration for sel_make_policy_nodes */ +static struct dentry *sel_make_disconnected_dir(struct super_block *sb, + unsigned long *ino); + +/* declaration for sel_make_policy_nodes */ +static void sel_remove_entries(struct dentry *de); + static ssize_t sel_read_mls(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { @@ -385,7 +398,7 @@ static int sel_open_policy(struct inode *inode, struct file *filp) BUG_ON(filp->private_data); - mutex_lock(&fsi->mutex); + mutex_lock(&fsi->state->policy_mutex); rc = avc_has_perm(&selinux_state, current_sid(), SECINITSID_SECURITY, @@ -402,25 +415,25 @@ static int sel_open_policy(struct inode *inode, struct file *filp) if (!plm) goto err; - if (i_size_read(inode) != security_policydb_len(state)) { - inode_lock(inode); - i_size_write(inode, security_policydb_len(state)); - inode_unlock(inode); - } - rc = security_read_policy(state, &plm->data, &plm->len); if (rc) goto err; + if ((size_t)i_size_read(inode) != plm->len) { + inode_lock(inode); + i_size_write(inode, plm->len); + inode_unlock(inode); + } + fsi->policy_opened = 1; filp->private_data = plm; - mutex_unlock(&fsi->mutex); + mutex_unlock(&fsi->state->policy_mutex); return 0; err: - mutex_unlock(&fsi->mutex); + 
mutex_unlock(&fsi->state->policy_mutex); if (plm) vfree(plm->data); @@ -508,29 +521,94 @@ static const struct file_operations sel_policy_ops = { .llseek = generic_file_llseek, }; -static int sel_make_policy_nodes(struct selinux_fs_info *fsi) +static void sel_remove_old_bool_data(unsigned int bool_num, char **bool_names, + unsigned int *bool_values) { - int ret; + u32 i; - ret = sel_make_bools(fsi); + /* bool_dir cleanup */ + for (i = 0; i < bool_num; i++) + kfree(bool_names[i]); + kfree(bool_names); + kfree(bool_values); +} + +static int sel_make_policy_nodes(struct selinux_fs_info *fsi, + struct selinux_policy *newpolicy) +{ + int ret = 0; + struct dentry *tmp_parent, *tmp_bool_dir, *tmp_class_dir, *old_dentry; + unsigned int tmp_bool_num, old_bool_num; + char **tmp_bool_names, **old_bool_names; + unsigned int *tmp_bool_values, *old_bool_values; + unsigned long tmp_ino = fsi->last_ino; /* Don't increment last_ino in this function */ + + tmp_parent = sel_make_disconnected_dir(fsi->sb, &tmp_ino); + if (IS_ERR(tmp_parent)) + return PTR_ERR(tmp_parent); + + tmp_ino = fsi->bool_dir->d_inode->i_ino - 1; /* sel_make_dir will increment and set */ + tmp_bool_dir = sel_make_dir(tmp_parent, BOOL_DIR_NAME, &tmp_ino); + if (IS_ERR(tmp_bool_dir)) { + ret = PTR_ERR(tmp_bool_dir); + goto out; + } + + tmp_ino = fsi->class_dir->d_inode->i_ino - 1; /* sel_make_dir will increment and set */ + tmp_class_dir = sel_make_dir(tmp_parent, CLASS_DIR_NAME, &tmp_ino); + if (IS_ERR(tmp_class_dir)) { + ret = PTR_ERR(tmp_class_dir); + goto out; + } + + ret = sel_make_bools(newpolicy, tmp_bool_dir, &tmp_bool_num, + &tmp_bool_names, &tmp_bool_values); if (ret) { pr_err("SELinux: failed to load policy booleans\n"); - return ret; + goto out; } - ret = sel_make_classes(fsi); + ret = sel_make_classes(newpolicy, tmp_class_dir, + &fsi->last_class_ino); if (ret) { pr_err("SELinux: failed to load policy classes\n"); - return ret; + goto out; } - ret = sel_make_policycap(fsi); - if (ret) { - pr_err("SELinux: failed to load policy capabilities\n"); - return ret; - } + /* booleans */ + old_dentry = fsi->bool_dir; + lock_rename(tmp_bool_dir, old_dentry); + d_exchange(tmp_bool_dir, fsi->bool_dir); - return 0; + old_bool_num = fsi->bool_num; + old_bool_names = fsi->bool_pending_names; + old_bool_values = fsi->bool_pending_values; + + fsi->bool_num = tmp_bool_num; + fsi->bool_pending_names = tmp_bool_names; + fsi->bool_pending_values = tmp_bool_values; + + sel_remove_old_bool_data(old_bool_num, old_bool_names, old_bool_values); + + fsi->bool_dir = tmp_bool_dir; + unlock_rename(tmp_bool_dir, old_dentry); + + /* classes */ + old_dentry = fsi->class_dir; + lock_rename(tmp_class_dir, old_dentry); + d_exchange(tmp_class_dir, fsi->class_dir); + fsi->class_dir = tmp_class_dir; + unlock_rename(tmp_class_dir, old_dentry); + +out: + /* Since the other temporary dirs are children of tmp_parent + * this will handle all the cleanup in the case of a failure before + * the swapover + */ + sel_remove_entries(tmp_parent); + dput(tmp_parent); /* d_genocide() only handles the children */ + + return ret; } static ssize_t sel_write_load(struct file *file, const char __user *buf, @@ -538,10 +616,11 @@ static ssize_t sel_write_load(struct file *file, const char __user *buf, { struct selinux_fs_info *fsi = file_inode(file)->i_sb->s_fs_info; + struct selinux_policy *newpolicy; ssize_t length; void *data = NULL; - mutex_lock(&fsi->mutex); + mutex_lock(&fsi->state->policy_mutex); length = avc_has_perm(&selinux_state, current_sid(), SECINITSID_SECURITY, @@ -563,15 
+642,19 @@ static ssize_t sel_write_load(struct file *file, const char __user *buf, if (copy_from_user(data, buf, count) != 0) goto out; - length = security_load_policy(fsi->state, data, count); + length = security_load_policy(fsi->state, data, count, &newpolicy); if (length) { pr_warn_ratelimited("SELinux: failed to load policy\n"); goto out; } - length = sel_make_policy_nodes(fsi); - if (length) + length = sel_make_policy_nodes(fsi, newpolicy); + if (length) { + selinux_policy_cancel(fsi->state, newpolicy); goto out1; + } + + selinux_policy_commit(fsi->state, newpolicy); length = count; @@ -581,7 +664,7 @@ out1: from_kuid(&init_user_ns, audit_get_loginuid(current)), audit_get_sessionid(current)); out: - mutex_unlock(&fsi->mutex); + mutex_unlock(&fsi->state->policy_mutex); vfree(data); return length; } @@ -634,7 +717,8 @@ static ssize_t sel_read_checkreqprot(struct file *filp, char __user *buf, char tmpbuf[TMPBUFLEN]; ssize_t length; - length = scnprintf(tmpbuf, TMPBUFLEN, "%u", fsi->state->checkreqprot); + length = scnprintf(tmpbuf, TMPBUFLEN, "%u", + checkreqprot_get(fsi->state)); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } @@ -676,7 +760,7 @@ static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf, comm, current->pid); } - fsi->state->checkreqprot = new_value ? 1 : 0; + checkreqprot_set(fsi->state, (new_value ? 1 : 0)); length = count; out: kfree(page); @@ -1186,7 +1270,7 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf, unsigned index = file_inode(filep)->i_ino & SEL_INO_MASK; const char *name = filep->f_path.dentry->d_name.name; - mutex_lock(&fsi->mutex); + mutex_lock(&fsi->state->policy_mutex); ret = -EINVAL; if (index >= fsi->bool_num || strcmp(name, @@ -1205,14 +1289,14 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf, } length = scnprintf(page, PAGE_SIZE, "%d %d", cur_enforcing, fsi->bool_pending_values[index]); - mutex_unlock(&fsi->mutex); + mutex_unlock(&fsi->state->policy_mutex); ret = simple_read_from_buffer(buf, count, ppos, page, length); out_free: free_page((unsigned long)page); return ret; out_unlock: - mutex_unlock(&fsi->mutex); + mutex_unlock(&fsi->state->policy_mutex); goto out_free; } @@ -1237,7 +1321,7 @@ static ssize_t sel_write_bool(struct file *filep, const char __user *buf, if (IS_ERR(page)) return PTR_ERR(page); - mutex_lock(&fsi->mutex); + mutex_lock(&fsi->state->policy_mutex); length = avc_has_perm(&selinux_state, current_sid(), SECINITSID_SECURITY, @@ -1262,7 +1346,7 @@ static ssize_t sel_write_bool(struct file *filep, const char __user *buf, length = count; out: - mutex_unlock(&fsi->mutex); + mutex_unlock(&fsi->state->policy_mutex); kfree(page); return length; } @@ -1293,7 +1377,7 @@ static ssize_t sel_commit_bools_write(struct file *filep, if (IS_ERR(page)) return PTR_ERR(page); - mutex_lock(&fsi->mutex); + mutex_lock(&fsi->state->policy_mutex); length = avc_has_perm(&selinux_state, current_sid(), SECINITSID_SECURITY, @@ -1315,7 +1399,7 @@ static ssize_t sel_commit_bools_write(struct file *filep, length = count; out: - mutex_unlock(&fsi->mutex); + mutex_unlock(&fsi->state->policy_mutex); kfree(page); return length; } @@ -1331,14 +1415,13 @@ static void sel_remove_entries(struct dentry *de) shrink_dcache_parent(de); } -#define BOOL_DIR_NAME "booleans" - -static int sel_make_bools(struct selinux_fs_info *fsi) +static int sel_make_bools(struct selinux_policy *newpolicy, struct dentry *bool_dir, + unsigned int *bool_num, char ***bool_pending_names, + unsigned int 
**bool_pending_values) { int ret; ssize_t len; struct dentry *dentry = NULL; - struct dentry *dir = fsi->bool_dir; struct inode *inode = NULL; struct inode_security_struct *isec; char **names = NULL, *page; @@ -1346,34 +1429,23 @@ static int sel_make_bools(struct selinux_fs_info *fsi) int *values = NULL; u32 sid; - /* remove any existing files */ - for (i = 0; i < fsi->bool_num; i++) - kfree(fsi->bool_pending_names[i]); - kfree(fsi->bool_pending_names); - kfree(fsi->bool_pending_values); - fsi->bool_num = 0; - fsi->bool_pending_names = NULL; - fsi->bool_pending_values = NULL; - - sel_remove_entries(dir); - ret = -ENOMEM; page = (char *)get_zeroed_page(GFP_KERNEL); if (!page) goto out; - ret = security_get_bools(fsi->state, &num, &names, &values); + ret = security_get_bools(newpolicy, &num, &names, &values); if (ret) goto out; for (i = 0; i < num; i++) { ret = -ENOMEM; - dentry = d_alloc_name(dir, names[i]); + dentry = d_alloc_name(bool_dir, names[i]); if (!dentry) goto out; ret = -ENOMEM; - inode = sel_make_inode(dir->d_sb, S_IFREG | S_IRUGO | S_IWUSR); + inode = sel_make_inode(bool_dir->d_sb, S_IFREG | S_IRUGO | S_IWUSR); if (!inode) { dput(dentry); goto out; @@ -1388,7 +1460,7 @@ static int sel_make_bools(struct selinux_fs_info *fsi) } isec = selinux_inode(inode); - ret = security_genfs_sid(fsi->state, "selinuxfs", page, + ret = selinux_policy_genfs_sid(newpolicy, "selinuxfs", page, SECCLASS_FILE, &sid); if (ret) { pr_warn_ratelimited("SELinux: no sid found, defaulting to security isid for %s\n", @@ -1402,9 +1474,9 @@ static int sel_make_bools(struct selinux_fs_info *fsi) inode->i_ino = i|SEL_BOOL_INO_OFFSET; d_add(dentry, inode); } - fsi->bool_num = num; - fsi->bool_pending_names = names; - fsi->bool_pending_values = values; + *bool_num = num; + *bool_pending_names = names; + *bool_pending_values = values; free_page((unsigned long)page); return 0; @@ -1417,7 +1489,7 @@ out: kfree(names); } kfree(values); - sel_remove_entries(dir); + sel_remove_entries(bool_dir); return ret; } @@ -1791,14 +1863,14 @@ static const struct file_operations sel_policycap_ops = { .llseek = generic_file_llseek, }; -static int sel_make_perm_files(char *objclass, int classvalue, - struct dentry *dir) +static int sel_make_perm_files(struct selinux_policy *newpolicy, + char *objclass, int classvalue, + struct dentry *dir) { - struct selinux_fs_info *fsi = dir->d_sb->s_fs_info; int i, rc, nperms; char **perms; - rc = security_get_permissions(fsi->state, objclass, &perms, &nperms); + rc = security_get_permissions(newpolicy, objclass, &perms, &nperms); if (rc) return rc; @@ -1831,8 +1903,9 @@ out: return rc; } -static int sel_make_class_dir_entries(char *classname, int index, - struct dentry *dir) +static int sel_make_class_dir_entries(struct selinux_policy *newpolicy, + char *classname, int index, + struct dentry *dir) { struct super_block *sb = dir->d_sb; struct selinux_fs_info *fsi = sb->s_fs_info; @@ -1858,39 +1931,38 @@ static int sel_make_class_dir_entries(char *classname, int index, if (IS_ERR(dentry)) return PTR_ERR(dentry); - rc = sel_make_perm_files(classname, index, dentry); + rc = sel_make_perm_files(newpolicy, classname, index, dentry); return rc; } -static int sel_make_classes(struct selinux_fs_info *fsi) +static int sel_make_classes(struct selinux_policy *newpolicy, + struct dentry *class_dir, + unsigned long *last_class_ino) { int rc, nclasses, i; char **classes; - /* delete any existing entries */ - sel_remove_entries(fsi->class_dir); - - rc = security_get_classes(fsi->state, &classes, &nclasses); + 
rc = security_get_classes(newpolicy, &classes, &nclasses); if (rc) return rc; /* +2 since classes are 1-indexed */ - fsi->last_class_ino = sel_class_to_ino(nclasses + 2); + *last_class_ino = sel_class_to_ino(nclasses + 2); for (i = 0; i < nclasses; i++) { struct dentry *class_name_dir; - class_name_dir = sel_make_dir(fsi->class_dir, classes[i], - &fsi->last_class_ino); + class_name_dir = sel_make_dir(class_dir, classes[i], + last_class_ino); if (IS_ERR(class_name_dir)) { rc = PTR_ERR(class_name_dir); goto out; } /* i+1 since class values are 1-indexed */ - rc = sel_make_class_dir_entries(classes[i], i + 1, + rc = sel_make_class_dir_entries(newpolicy, classes[i], i + 1, class_name_dir); if (rc) goto out; @@ -1909,8 +1981,6 @@ static int sel_make_policycap(struct selinux_fs_info *fsi) struct dentry *dentry = NULL; struct inode *inode = NULL; - sel_remove_entries(fsi->policycap_dir); - for (iter = 0; iter <= POLICYDB_CAPABILITY_MAX; iter++) { if (iter < ARRAY_SIZE(selinux_policycap_names)) dentry = d_alloc_name(fsi->policycap_dir, @@ -1962,6 +2032,22 @@ static struct dentry *sel_make_dir(struct dentry *dir, const char *name, return dentry; } +static struct dentry *sel_make_disconnected_dir(struct super_block *sb, + unsigned long *ino) +{ + struct inode *inode = sel_make_inode(sb, S_IFDIR | S_IRUGO | S_IXUGO); + + if (!inode) + return ERR_PTR(-ENOMEM); + + inode->i_op = &simple_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + inode->i_ino = ++(*ino); + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + return d_obtain_alias(inode); +} + #define NULL_FILE_NAME "null" static int sel_fill_super(struct super_block *sb, struct fs_context *fc) @@ -2060,14 +2146,14 @@ static int sel_fill_super(struct super_block *sb, struct fs_context *fc) if (ret) goto err; - fsi->class_dir = sel_make_dir(sb->s_root, "class", &fsi->last_ino); + fsi->class_dir = sel_make_dir(sb->s_root, CLASS_DIR_NAME, &fsi->last_ino); if (IS_ERR(fsi->class_dir)) { ret = PTR_ERR(fsi->class_dir); fsi->class_dir = NULL; goto err; } - fsi->policycap_dir = sel_make_dir(sb->s_root, "policy_capabilities", + fsi->policycap_dir = sel_make_dir(sb->s_root, POLICYCAP_DIR_NAME, &fsi->last_ino); if (IS_ERR(fsi->policycap_dir)) { ret = PTR_ERR(fsi->policycap_dir); @@ -2075,9 +2161,12 @@ static int sel_fill_super(struct super_block *sb, struct fs_context *fc) goto err; } - ret = sel_make_policy_nodes(fsi); - if (ret) + ret = sel_make_policycap(fsi); + if (ret) { + pr_err("SELinux: failed to load policy capabilities\n"); goto err; + } + return 0; err: pr_err("SELinux: %s: failed while creating inodes\n", diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c index 01b300a4a882..0172d87e2b9a 100644 --- a/security/selinux/ss/avtab.c +++ b/security/selinux/ss/avtab.c @@ -301,7 +301,6 @@ void avtab_destroy(struct avtab *h) void avtab_init(struct avtab *h) { - kvfree(h->htable); h->htable = NULL; h->nel = 0; } @@ -340,6 +339,54 @@ int avtab_alloc(struct avtab *h, u32 nrules) return 0; } +int avtab_duplicate(struct avtab *new, struct avtab *orig) +{ + int i; + struct avtab_node *node, *tmp, *tail; + + memset(new, 0, sizeof(*new)); + + new->htable = kvcalloc(orig->nslot, sizeof(void *), GFP_KERNEL); + if (!new->htable) + return -ENOMEM; + new->nslot = orig->nslot; + new->mask = orig->mask; + + for (i = 0; i < orig->nslot; i++) { + tail = NULL; + for (node = orig->htable[i]; node; node = node->next) { + tmp = kmem_cache_zalloc(avtab_node_cachep, GFP_KERNEL); + if (!tmp) + goto error; + 
tmp->key = node->key; + if (tmp->key.specified & AVTAB_XPERMS) { + tmp->datum.u.xperms = + kmem_cache_zalloc(avtab_xperms_cachep, + GFP_KERNEL); + if (!tmp->datum.u.xperms) { + kmem_cache_free(avtab_node_cachep, tmp); + goto error; + } + tmp->datum.u.xperms = node->datum.u.xperms; + } else + tmp->datum.u.data = node->datum.u.data; + + if (tail) + tail->next = tmp; + else + new->htable[i] = tmp; + + tail = tmp; + new->nel++; + } + } + + return 0; +error: + avtab_destroy(new); + return -ENOMEM; +} + void avtab_hash_eval(struct avtab *h, char *tag) { int i, chain_len, slots_used, max_chain_len; diff --git a/security/selinux/ss/avtab.h b/security/selinux/ss/avtab.h index 5fdcb6696bcc..4c4445ca9118 100644 --- a/security/selinux/ss/avtab.h +++ b/security/selinux/ss/avtab.h @@ -89,6 +89,7 @@ struct avtab { void avtab_init(struct avtab *h); int avtab_alloc(struct avtab *, u32); +int avtab_duplicate(struct avtab *new, struct avtab *orig); struct avtab_datum *avtab_search(struct avtab *h, struct avtab_key *k); void avtab_destroy(struct avtab *h); void avtab_hash_eval(struct avtab *h, char *tag); diff --git a/security/selinux/ss/conditional.c b/security/selinux/ss/conditional.c index 5a47258c1d77..0b32f3ab025e 100644 --- a/security/selinux/ss/conditional.c +++ b/security/selinux/ss/conditional.c @@ -600,3 +600,158 @@ void cond_compute_av(struct avtab *ctab, struct avtab_key *key, services_compute_xperms_drivers(xperms, node); } } + +static int cond_dup_av_list(struct cond_av_list *new, + struct cond_av_list *orig, + struct avtab *avtab) +{ + struct avtab_node *avnode; + u32 i; + + memset(new, 0, sizeof(*new)); + + new->nodes = kcalloc(orig->len, sizeof(*new->nodes), GFP_KERNEL); + if (!new->nodes) + return -ENOMEM; + + for (i = 0; i < orig->len; i++) { + avnode = avtab_search_node(avtab, &orig->nodes[i]->key); + if (WARN_ON(!avnode)) + return -EINVAL; + new->nodes[i] = avnode; + new->len++; + } + + return 0; +} + +static int duplicate_policydb_cond_list(struct policydb *newp, + struct policydb *origp) +{ + int rc, i, j; + + rc = avtab_duplicate(&newp->te_cond_avtab, &origp->te_cond_avtab); + if (rc) + return rc; + + newp->cond_list_len = 0; + newp->cond_list = kcalloc(origp->cond_list_len, + sizeof(*newp->cond_list), + GFP_KERNEL); + if (!newp->cond_list) + goto error; + + for (i = 0; i < origp->cond_list_len; i++) { + struct cond_node *newn = &newp->cond_list[i]; + struct cond_node *orign = &origp->cond_list[i]; + + newp->cond_list_len++; + + newn->cur_state = orign->cur_state; + newn->expr.nodes = kcalloc(orign->expr.len, + sizeof(*newn->expr.nodes), GFP_KERNEL); + if (!newn->expr.nodes) + goto error; + for (j = 0; j < orign->expr.len; j++) + newn->expr.nodes[j] = orign->expr.nodes[j]; + newn->expr.len = orign->expr.len; + + rc = cond_dup_av_list(&newn->true_list, &orign->true_list, + &newp->te_cond_avtab); + if (rc) + goto error; + + rc = cond_dup_av_list(&newn->false_list, &orign->false_list, + &newp->te_cond_avtab); + if (rc) + goto error; + } + + return 0; + +error: + avtab_destroy(&newp->te_cond_avtab); + cond_list_destroy(newp); + return -ENOMEM; +} + +static int cond_bools_destroy(void *key, void *datum, void *args) +{ + /* key was not copied so no need to free here */ + kfree(datum); + return 0; +} + +static int cond_bools_copy(struct hashtab_node *new, struct hashtab_node *orig, void *args) +{ + struct cond_bool_datum *datum; + + datum = kmemdup(orig->datum, sizeof(struct cond_bool_datum), + GFP_KERNEL); + if (!datum) + return -ENOMEM; + + new->key = orig->key; /* No need to copy, never 
modified */ + new->datum = datum; + return 0; +} + +static int cond_bools_index(void *key, void *datum, void *args) +{ + struct cond_bool_datum *booldatum, **cond_bool_array; + + booldatum = datum; + cond_bool_array = args; + cond_bool_array[booldatum->value - 1] = booldatum; + + return 0; +} + +static int duplicate_policydb_bools(struct policydb *newdb, + struct policydb *orig) +{ + struct cond_bool_datum **cond_bool_array; + int rc; + + cond_bool_array = kmalloc_array(orig->p_bools.nprim, + sizeof(*orig->bool_val_to_struct), + GFP_KERNEL); + if (!cond_bool_array) + return -ENOMEM; + + rc = hashtab_duplicate(&newdb->p_bools.table, &orig->p_bools.table, + cond_bools_copy, cond_bools_destroy, NULL); + if (rc) { + kfree(cond_bool_array); + return -ENOMEM; + } + + hashtab_map(&newdb->p_bools.table, cond_bools_index, cond_bool_array); + newdb->bool_val_to_struct = cond_bool_array; + + newdb->p_bools.nprim = orig->p_bools.nprim; + + return 0; +} + +void cond_policydb_destroy_dup(struct policydb *p) +{ + hashtab_map(&p->p_bools.table, cond_bools_destroy, NULL); + hashtab_destroy(&p->p_bools.table); + cond_policydb_destroy(p); +} + +int cond_policydb_dup(struct policydb *new, struct policydb *orig) +{ + cond_policydb_init(new); + + if (duplicate_policydb_bools(new, orig)) + return -ENOMEM; + + if (duplicate_policydb_cond_list(new, orig)) { + cond_policydb_destroy_dup(new); + return -ENOMEM; + } + + return 0; +} diff --git a/security/selinux/ss/conditional.h b/security/selinux/ss/conditional.h index 79e7e03db859..e47ec6ddeaf6 100644 --- a/security/selinux/ss/conditional.h +++ b/security/selinux/ss/conditional.h @@ -79,5 +79,7 @@ void cond_compute_av(struct avtab *ctab, struct avtab_key *key, void cond_compute_xperms(struct avtab *ctab, struct avtab_key *key, struct extended_perms_decision *xpermd); void evaluate_cond_nodes(struct policydb *p); +void cond_policydb_destroy_dup(struct policydb *p); +int cond_policydb_dup(struct policydb *new, struct policydb *orig); #endif /* _CONDITIONAL_H_ */ diff --git a/security/selinux/ss/hashtab.c b/security/selinux/ss/hashtab.c index d9287bb4bfeb..dab8c25c739b 100644 --- a/security/selinux/ss/hashtab.c +++ b/security/selinux/ss/hashtab.c @@ -122,6 +122,59 @@ void hashtab_stat(struct hashtab *h, struct hashtab_info *info) info->max_chain_len = max_chain_len; } +int hashtab_duplicate(struct hashtab *new, struct hashtab *orig, + int (*copy)(struct hashtab_node *new, + struct hashtab_node *orig, void *args), + int (*destroy)(void *k, void *d, void *args), + void *args) +{ + struct hashtab_node *cur, *tmp, *tail; + int i, rc; + + memset(new, 0, sizeof(*new)); + + new->htable = kcalloc(orig->size, sizeof(*new->htable), GFP_KERNEL); + if (!new->htable) + return -ENOMEM; + + new->size = orig->size; + + for (i = 0; i < orig->size; i++) { + tail = NULL; + for (cur = orig->htable[i]; cur; cur = cur->next) { + tmp = kmem_cache_zalloc(hashtab_node_cachep, + GFP_KERNEL); + if (!tmp) + goto error; + rc = copy(tmp, cur, args); + if (rc) { + kmem_cache_free(hashtab_node_cachep, tmp); + goto error; + } + tmp->next = NULL; + if (!tail) + new->htable[i] = tmp; + else + tail->next = tmp; + tail = tmp; + new->nel++; + } + } + + return 0; + + error: + for (i = 0; i < new->size; i++) { + for (cur = new->htable[i]; cur; cur = tmp) { + tmp = cur->next; + destroy(cur->key, cur->datum, args); + kmem_cache_free(hashtab_node_cachep, cur); + } + } + kmem_cache_free(hashtab_node_cachep, new); + return -ENOMEM; +} + void __init hashtab_cache_init(void) { hashtab_node_cachep = 
kmem_cache_create("hashtab_node", diff --git a/security/selinux/ss/hashtab.h b/security/selinux/ss/hashtab.h index 3c952f0f01f9..043a773bf0b7 100644 --- a/security/selinux/ss/hashtab.h +++ b/security/selinux/ss/hashtab.h @@ -136,6 +136,12 @@ int hashtab_map(struct hashtab *h, int (*apply)(void *k, void *d, void *args), void *args); +int hashtab_duplicate(struct hashtab *new, struct hashtab *orig, + int (*copy)(struct hashtab_node *new, + struct hashtab_node *orig, void *args), + int (*destroy)(void *k, void *d, void *args), + void *args); + /* Fill info with some hash table statistics */ void hashtab_stat(struct hashtab *h, struct hashtab_info *info); diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index f2108ed9b151..17979bbd0b03 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -64,25 +64,7 @@ #include "xfrm.h" #include "ebitmap.h" #include "audit.h" - -/* Policy capability names */ -const char *selinux_policycap_names[__POLICYDB_CAPABILITY_MAX] = { - "network_peer_controls", - "open_perms", - "extended_socket_class", - "always_check_network", - "cgroup_seclabel", - "nnp_nosuid_transition", - "genfs_seclabel_symlinks" -}; - -static struct selinux_ss selinux_ss; - -void selinux_ss_init(struct selinux_ss **ss) -{ - rwlock_init(&selinux_ss.policy_rwlock); - *ss = &selinux_ss; -} +#include "policycap_names.h" /* Forward declaration. */ static int context_struct_to_string(struct policydb *policydb, @@ -248,9 +230,17 @@ static void map_decision(struct selinux_map *map, int security_mls_enabled(struct selinux_state *state) { - struct policydb *p = &state->ss->policydb; + int mls_enabled; + struct selinux_policy *policy; - return p->mls_enabled; + if (!selinux_initialized(state)) + return 0; + + rcu_read_lock(); + policy = rcu_dereference(state->policy); + mls_enabled = policy->policydb.mls_enabled; + rcu_read_unlock(); + return mls_enabled; } /* @@ -721,13 +711,14 @@ static void context_struct_compute_av(struct policydb *policydb, } static int security_validtrans_handle_fail(struct selinux_state *state, - struct sidtab_entry *oentry, - struct sidtab_entry *nentry, - struct sidtab_entry *tentry, - u16 tclass) + struct selinux_policy *policy, + struct sidtab_entry *oentry, + struct sidtab_entry *nentry, + struct sidtab_entry *tentry, + u16 tclass) { - struct policydb *p = &state->ss->policydb; - struct sidtab *sidtab = state->ss->sidtab; + struct policydb *p = &policy->policydb; + struct sidtab *sidtab = policy->sidtab; char *o = NULL, *n = NULL, *t = NULL; u32 olen, nlen, tlen; @@ -755,6 +746,7 @@ static int security_compute_validatetrans(struct selinux_state *state, u32 oldsid, u32 newsid, u32 tasksid, u16 orig_tclass, bool user) { + struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct sidtab_entry *oentry; @@ -769,13 +761,14 @@ static int security_compute_validatetrans(struct selinux_state *state, if (!selinux_initialized(state)) return 0; - read_lock(&state->ss->policy_rwlock); + rcu_read_lock(); - policydb = &state->ss->policydb; - sidtab = state->ss->sidtab; + policy = rcu_dereference(state->policy); + policydb = &policy->policydb; + sidtab = policy->sidtab; if (!user) - tclass = unmap_class(&state->ss->map, orig_tclass); + tclass = unmap_class(&policy->map, orig_tclass); else tclass = orig_tclass; @@ -818,17 +811,18 @@ static int security_compute_validatetrans(struct selinux_state *state, rc = -EPERM; else rc = security_validtrans_handle_fail(state, - oentry, - nentry, - tentry, - tclass); + 
policy, + oentry, + nentry, + tentry, + tclass); goto out; } constraint = constraint->next; } out: - read_unlock(&state->ss->policy_rwlock); + rcu_read_unlock(); return rc; } @@ -860,6 +854,7 @@ int security_validate_transition(struct selinux_state *state, int security_bounded_transition(struct selinux_state *state, u32 old_sid, u32 new_sid) { + struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct sidtab_entry *old_entry, *new_entry; @@ -870,10 +865,10 @@ int security_bounded_transition(struct selinux_state *state, if (!selinux_initialized(state)) return 0; - read_lock(&state->ss->policy_rwlock); - - policydb = &state->ss->policydb; - sidtab = state->ss->sidtab; + rcu_read_lock(); + policy = rcu_dereference(state->policy); + policydb = &policy->policydb; + sidtab = policy->sidtab; rc = -EINVAL; old_entry = sidtab_search_entry(sidtab, old_sid); @@ -934,17 +929,20 @@ int security_bounded_transition(struct selinux_state *state, kfree(old_name); } out: - read_unlock(&state->ss->policy_rwlock); + rcu_read_unlock(); return rc; } -static void avd_init(struct selinux_state *state, struct av_decision *avd) +static void avd_init(struct selinux_policy *policy, struct av_decision *avd) { avd->allowed = 0; avd->auditallow = 0; avd->auditdeny = 0xffffffff; - avd->seqno = state->ss->latest_granting; + if (policy) + avd->seqno = policy->latest_granting; + else + avd->seqno = 0; avd->flags = 0; } @@ -1009,6 +1007,7 @@ void security_compute_xperms_decision(struct selinux_state *state, u8 driver, struct extended_perms_decision *xpermd) { + struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; u16 tclass; @@ -1025,12 +1024,13 @@ void security_compute_xperms_decision(struct selinux_state *state, memset(xpermd->auditallow->p, 0, sizeof(xpermd->auditallow->p)); memset(xpermd->dontaudit->p, 0, sizeof(xpermd->dontaudit->p)); - read_lock(&state->ss->policy_rwlock); + rcu_read_lock(); if (!selinux_initialized(state)) goto allow; - policydb = &state->ss->policydb; - sidtab = state->ss->sidtab; + policy = rcu_dereference(state->policy); + policydb = &policy->policydb; + sidtab = policy->sidtab; scontext = sidtab_search(sidtab, ssid); if (!scontext) { @@ -1046,7 +1046,7 @@ void security_compute_xperms_decision(struct selinux_state *state, goto out; } - tclass = unmap_class(&state->ss->map, orig_tclass); + tclass = unmap_class(&policy->map, orig_tclass); if (unlikely(orig_tclass && !tclass)) { if (policydb->allow_unknown) goto allow; @@ -1078,7 +1078,7 @@ void security_compute_xperms_decision(struct selinux_state *state, } } out: - read_unlock(&state->ss->policy_rwlock); + rcu_read_unlock(); return; allow: memset(xpermd->allowed->p, 0xff, sizeof(xpermd->allowed->p)); @@ -1103,19 +1103,21 @@ void security_compute_av(struct selinux_state *state, struct av_decision *avd, struct extended_perms *xperms) { + struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; u16 tclass; struct context *scontext = NULL, *tcontext = NULL; - read_lock(&state->ss->policy_rwlock); - avd_init(state, avd); + rcu_read_lock(); + policy = rcu_dereference(state->policy); + avd_init(policy, avd); xperms->len = 0; if (!selinux_initialized(state)) goto allow; - policydb = &state->ss->policydb; - sidtab = state->ss->sidtab; + policydb = &policy->policydb; + sidtab = policy->sidtab; scontext = sidtab_search(sidtab, ssid); if (!scontext) { @@ -1135,7 +1137,7 @@ void security_compute_av(struct selinux_state *state, goto out; } - tclass = unmap_class(&state->ss->map, 
orig_tclass); + tclass = unmap_class(&policy->map, orig_tclass); if (unlikely(orig_tclass && !tclass)) { if (policydb->allow_unknown) goto allow; @@ -1143,10 +1145,10 @@ void security_compute_av(struct selinux_state *state, } context_struct_compute_av(policydb, scontext, tcontext, tclass, avd, xperms); - map_decision(&state->ss->map, orig_tclass, avd, + map_decision(&policy->map, orig_tclass, avd, policydb->allow_unknown); out: - read_unlock(&state->ss->policy_rwlock); + rcu_read_unlock(); return; allow: avd->allowed = 0xffffffff; @@ -1159,17 +1161,19 @@ void security_compute_av_user(struct selinux_state *state, u16 tclass, struct av_decision *avd) { + struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct context *scontext = NULL, *tcontext = NULL; - read_lock(&state->ss->policy_rwlock); - avd_init(state, avd); + rcu_read_lock(); + policy = rcu_dereference(state->policy); + avd_init(policy, avd); if (!selinux_initialized(state)) goto allow; - policydb = &state->ss->policydb; - sidtab = state->ss->sidtab; + policydb = &policy->policydb; + sidtab = policy->sidtab; scontext = sidtab_search(sidtab, ssid); if (!scontext) { @@ -1198,7 +1202,7 @@ void security_compute_av_user(struct selinux_state *state, context_struct_compute_av(policydb, scontext, tcontext, tclass, avd, NULL); out: - read_unlock(&state->ss->policy_rwlock); + rcu_read_unlock(); return; allow: avd->allowed = 0xffffffff; @@ -1283,6 +1287,7 @@ static int sidtab_entry_to_string(struct policydb *p, int security_sidtab_hash_stats(struct selinux_state *state, char *page) { + struct selinux_policy *policy; int rc; if (!selinux_initialized(state)) { @@ -1291,9 +1296,10 @@ int security_sidtab_hash_stats(struct selinux_state *state, char *page) return -EINVAL; } - read_lock(&state->ss->policy_rwlock); - rc = sidtab_hash_stats(state->ss->sidtab, page); - read_unlock(&state->ss->policy_rwlock); + rcu_read_lock(); + policy = rcu_dereference(state->policy); + rc = sidtab_hash_stats(policy->sidtab, page); + rcu_read_unlock(); return rc; } @@ -1310,6 +1316,7 @@ static int security_sid_to_context_core(struct selinux_state *state, u32 *scontext_len, int force, int only_invalid) { + struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct sidtab_entry *entry; @@ -1339,9 +1346,10 @@ static int security_sid_to_context_core(struct selinux_state *state, "load_policy on unknown SID %d\n", __func__, sid); return -EINVAL; } - read_lock(&state->ss->policy_rwlock); - policydb = &state->ss->policydb; - sidtab = state->ss->sidtab; + rcu_read_lock(); + policy = rcu_dereference(state->policy); + policydb = &policy->policydb; + sidtab = policy->sidtab; if (force) entry = sidtab_search_entry_force(sidtab, sid); @@ -1360,7 +1368,7 @@ static int security_sid_to_context_core(struct selinux_state *state, scontext_len); out_unlock: - read_unlock(&state->ss->policy_rwlock); + rcu_read_unlock(); return rc; } @@ -1495,6 +1503,7 @@ static int security_context_to_sid_core(struct selinux_state *state, u32 *sid, u32 def_sid, gfp_t gfp_flags, int force) { + struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; char *scontext2, *str = NULL; @@ -1533,9 +1542,10 @@ static int security_context_to_sid_core(struct selinux_state *state, if (!str) goto out; } - read_lock(&state->ss->policy_rwlock); - policydb = &state->ss->policydb; - sidtab = state->ss->sidtab; + rcu_read_lock(); + policy = rcu_dereference(state->policy); + policydb = &policy->policydb; + sidtab = policy->sidtab; rc = 
string_to_context_struct(policydb, sidtab, scontext2, &context, def_sid); if (rc == -EINVAL && force) { @@ -1547,7 +1557,7 @@ static int security_context_to_sid_core(struct selinux_state *state, rc = sidtab_context_to_sid(sidtab, &context, sid); context_destroy(&context); out_unlock: - read_unlock(&state->ss->policy_rwlock); + rcu_read_unlock(); out: kfree(scontext2); kfree(str); @@ -1617,13 +1627,14 @@ int security_context_to_sid_force(struct selinux_state *state, static int compute_sid_handle_invalid_context( struct selinux_state *state, + struct selinux_policy *policy, struct sidtab_entry *sentry, struct sidtab_entry *tentry, u16 tclass, struct context *newcontext) { - struct policydb *policydb = &state->ss->policydb; - struct sidtab *sidtab = state->ss->sidtab; + struct policydb *policydb = &policy->policydb; + struct sidtab *sidtab = policy->sidtab; char *s = NULL, *t = NULL, *n = NULL; u32 slen, tlen, nlen; struct audit_buffer *ab; @@ -1690,6 +1701,7 @@ static int security_compute_sid(struct selinux_state *state, u32 *out_sid, bool kern) { + struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct class_datum *cladatum = NULL; @@ -1716,19 +1728,21 @@ static int security_compute_sid(struct selinux_state *state, context_init(&newcontext); - read_lock(&state->ss->policy_rwlock); + rcu_read_lock(); + + policy = rcu_dereference(state->policy); if (kern) { - tclass = unmap_class(&state->ss->map, orig_tclass); + tclass = unmap_class(&policy->map, orig_tclass); sock = security_is_socket_class(orig_tclass); } else { tclass = orig_tclass; - sock = security_is_socket_class(map_class(&state->ss->map, + sock = security_is_socket_class(map_class(&policy->map, tclass)); } - policydb = &state->ss->policydb; - sidtab = state->ss->sidtab; + policydb = &policy->policydb; + sidtab = policy->sidtab; sentry = sidtab_search_entry(sidtab, ssid); if (!sentry) { @@ -1848,15 +1862,16 @@ static int security_compute_sid(struct selinux_state *state, /* Check the validity of the context. */ if (!policydb_context_isvalid(policydb, &newcontext)) { - rc = compute_sid_handle_invalid_context(state, sentry, tentry, - tclass, &newcontext); + rc = compute_sid_handle_invalid_context(state, policy, sentry, + tentry, tclass, + &newcontext); if (rc) goto out_unlock; } /* Obtain the sid for the context. */ rc = sidtab_context_to_sid(sidtab, &newcontext, out_sid); out_unlock: - read_unlock(&state->ss->policy_rwlock); + rcu_read_unlock(); context_destroy(&newcontext); out: return rc; @@ -1943,9 +1958,9 @@ int security_change_sid(struct selinux_state *state, static inline int convert_context_handle_invalid_context( struct selinux_state *state, + struct policydb *policydb, struct context *context) { - struct policydb *policydb = &state->ss->policydb; char *s; u32 len; @@ -2077,7 +2092,9 @@ static int convert_context(struct context *oldc, struct context *newc, void *p) /* Check the validity of the new context. 
*/ if (!policydb_context_isvalid(args->newp, newc)) { - rc = convert_context_handle_invalid_context(args->state, oldc); + rc = convert_context_handle_invalid_context(args->state, + args->oldp, + oldc); if (rc) goto bad; } @@ -2096,14 +2113,18 @@ bad: return 0; } -static void security_load_policycaps(struct selinux_state *state) +static void security_load_policycaps(struct selinux_state *state, + struct selinux_policy *policy) { - struct policydb *p = &state->ss->policydb; + struct policydb *p; unsigned int i; struct ebitmap_node *node; + p = &policy->policydb; + for (i = 0; i < ARRAY_SIZE(state->policycap); i++) - state->policycap[i] = ebitmap_get_bit(&p->policycaps, i); + WRITE_ONCE(state->policycap[i], + ebitmap_get_bit(&p->policycaps, i)); for (i = 0; i < ARRAY_SIZE(selinux_policycap_names); i++) pr_info("SELinux: policy capability %s=%d\n", @@ -2120,8 +2141,97 @@ static void security_load_policycaps(struct selinux_state *state) selinux_nlmsg_init(); } -static int security_preserve_bools(struct selinux_state *state, - struct policydb *newpolicydb); +static int security_preserve_bools(struct selinux_policy *oldpolicy, + struct selinux_policy *newpolicy); + +static void selinux_policy_free(struct selinux_policy *policy) +{ + if (!policy) + return; + + sidtab_destroy(policy->sidtab); + kfree(policy->map.mapping); + policydb_destroy(&policy->policydb); + kfree(policy->sidtab); + kfree(policy); +} + +static void selinux_policy_cond_free(struct selinux_policy *policy) +{ + cond_policydb_destroy_dup(&policy->policydb); + kfree(policy); +} + +void selinux_policy_cancel(struct selinux_state *state, + struct selinux_policy *policy) +{ + struct selinux_policy *oldpolicy; + + oldpolicy = rcu_dereference_protected(state->policy, + lockdep_is_held(&state->policy_mutex)); + + sidtab_cancel_convert(oldpolicy->sidtab); + selinux_policy_free(policy); +} + +static void selinux_notify_policy_change(struct selinux_state *state, + u32 seqno) +{ + /* Flush external caches and notify userspace of policy load */ + avc_ss_reset(state->avc, seqno); + selnl_notify_policyload(seqno); + selinux_status_update_policyload(state, seqno); + selinux_netlbl_cache_invalidate(); + selinux_xfrm_notify_policyload(); +} + +void selinux_policy_commit(struct selinux_state *state, + struct selinux_policy *newpolicy) +{ + struct selinux_policy *oldpolicy; + u32 seqno; + + oldpolicy = rcu_dereference_protected(state->policy, + lockdep_is_held(&state->policy_mutex)); + + /* If switching between different policy types, log MLS status */ + if (oldpolicy) { + if (oldpolicy->policydb.mls_enabled && !newpolicy->policydb.mls_enabled) + pr_info("SELinux: Disabling MLS support...\n"); + else if (!oldpolicy->policydb.mls_enabled && newpolicy->policydb.mls_enabled) + pr_info("SELinux: Enabling MLS support...\n"); + } + + /* Set latest granting seqno for new policy. */ + if (oldpolicy) + newpolicy->latest_granting = oldpolicy->latest_granting + 1; + else + newpolicy->latest_granting = 1; + seqno = newpolicy->latest_granting; + + /* Install the new policy. */ + rcu_assign_pointer(state->policy, newpolicy); + + /* Load the policycaps from the new policy */ + security_load_policycaps(state, newpolicy); + + if (!selinux_initialized(state)) { + /* + * After first policy load, the security server is + * marked as initialized and ready to handle requests and + * any objects created prior to policy load are then labeled. 
+ */ + selinux_mark_initialized(state); + selinux_complete_init(); + } + + /* Free the old policy */ + synchronize_rcu(); + selinux_policy_free(oldpolicy); + + /* Notify others of the policy change */ + selinux_notify_policy_change(state, seqno); +} /** * security_load_policy - Load a security policy configuration. @@ -2133,173 +2243,95 @@ static int security_preserve_bools(struct selinux_state *state, * This function will flush the access vector cache after * loading the new policy. */ -int security_load_policy(struct selinux_state *state, void *data, size_t len) +int security_load_policy(struct selinux_state *state, void *data, size_t len, + struct selinux_policy **newpolicyp) { - struct policydb *policydb; - struct sidtab *oldsidtab, *newsidtab; - struct policydb *oldpolicydb, *newpolicydb; - struct selinux_mapping *oldmapping; - struct selinux_map newmap; + struct selinux_policy *newpolicy, *oldpolicy; struct sidtab_convert_params convert_params; struct convert_context_args args; - u32 seqno; int rc = 0; struct policy_file file = { data, len }, *fp = &file; - policydb = &state->ss->policydb; - - newsidtab = kmalloc(sizeof(*newsidtab), GFP_KERNEL); - if (!newsidtab) + newpolicy = kzalloc(sizeof(*newpolicy), GFP_KERNEL); + if (!newpolicy) return -ENOMEM; + newpolicy->sidtab = kzalloc(sizeof(*newpolicy->sidtab), GFP_KERNEL); + if (!newpolicy->sidtab) { + rc = -ENOMEM; + goto err_policy; + } + + rc = policydb_read(&newpolicy->policydb, fp); + if (rc) + goto err_sidtab; + + newpolicy->policydb.len = len; + rc = selinux_set_mapping(&newpolicy->policydb, secclass_map, + &newpolicy->map); + if (rc) + goto err_policydb; + + rc = policydb_load_isids(&newpolicy->policydb, newpolicy->sidtab); + if (rc) { + pr_err("SELinux: unable to load the initial SIDs\n"); + goto err_mapping; + } + + if (!selinux_initialized(state)) { - rc = policydb_read(policydb, fp); - if (rc) { - kfree(newsidtab); - return rc; - } - - policydb->len = len; - rc = selinux_set_mapping(policydb, secclass_map, - &state->ss->map); - if (rc) { - kfree(newsidtab); - policydb_destroy(policydb); - return rc; - } - - rc = policydb_load_isids(policydb, newsidtab); - if (rc) { - kfree(newsidtab); - policydb_destroy(policydb); - return rc; - } - - state->ss->sidtab = newsidtab; - security_load_policycaps(state); - selinux_mark_initialized(state); - seqno = ++state->ss->latest_granting; - selinux_complete_init(); - avc_ss_reset(state->avc, seqno); - selnl_notify_policyload(seqno); - selinux_status_update_policyload(state, seqno); - selinux_netlbl_cache_invalidate(); - selinux_xfrm_notify_policyload(); + /* First policy load, so no need to preserve state from old policy */ + *newpolicyp = newpolicy; return 0; } - oldpolicydb = kcalloc(2, sizeof(*oldpolicydb), GFP_KERNEL); - if (!oldpolicydb) { - kfree(newsidtab); - return -ENOMEM; - } - newpolicydb = oldpolicydb + 1; + oldpolicy = rcu_dereference_protected(state->policy, + lockdep_is_held(&state->policy_mutex)); - rc = policydb_read(newpolicydb, fp); - if (rc) { - kfree(newsidtab); - goto out; - } - - newpolicydb->len = len; - /* If switching between different policy types, log MLS status */ - if (policydb->mls_enabled && !newpolicydb->mls_enabled) - pr_info("SELinux: Disabling MLS support...\n"); - else if (!policydb->mls_enabled && newpolicydb->mls_enabled) - pr_info("SELinux: Enabling MLS support...\n"); - - rc = policydb_load_isids(newpolicydb, newsidtab); - if (rc) { - pr_err("SELinux: unable to load the initial SIDs\n"); - policydb_destroy(newpolicydb); - kfree(newsidtab); - goto 
out; - } - - rc = selinux_set_mapping(newpolicydb, secclass_map, &newmap); - if (rc) - goto err; - - rc = security_preserve_bools(state, newpolicydb); + /* Preserve active boolean values from the old policy */ + rc = security_preserve_bools(oldpolicy, newpolicy); if (rc) { pr_err("SELinux: unable to preserve booleans\n"); - goto err; + goto err_free_isids; } - oldsidtab = state->ss->sidtab; - /* * Convert the internal representations of contexts * in the new SID table. */ args.state = state; - args.oldp = policydb; - args.newp = newpolicydb; + args.oldp = &oldpolicy->policydb; + args.newp = &newpolicy->policydb; convert_params.func = convert_context; convert_params.args = &args; - convert_params.target = newsidtab; + convert_params.target = newpolicy->sidtab; - rc = sidtab_convert(oldsidtab, &convert_params); + rc = sidtab_convert(oldpolicy->sidtab, &convert_params); if (rc) { pr_err("SELinux: unable to convert the internal" " representation of contexts in the new SID" " table\n"); - goto err; + goto err_free_isids; } - /* Save the old policydb and SID table to free later. */ - memcpy(oldpolicydb, policydb, sizeof(*policydb)); + *newpolicyp = newpolicy; + return 0; - /* Install the new policydb and SID table. */ - write_lock_irq(&state->ss->policy_rwlock); - memcpy(policydb, newpolicydb, sizeof(*policydb)); - state->ss->sidtab = newsidtab; - security_load_policycaps(state); - oldmapping = state->ss->map.mapping; - state->ss->map.mapping = newmap.mapping; - state->ss->map.size = newmap.size; - seqno = ++state->ss->latest_granting; - write_unlock_irq(&state->ss->policy_rwlock); +err_free_isids: + sidtab_destroy(newpolicy->sidtab); +err_mapping: + kfree(newpolicy->map.mapping); +err_policydb: + policydb_destroy(&newpolicy->policydb); +err_sidtab: + kfree(newpolicy->sidtab); +err_policy: + kfree(newpolicy); - /* Free the old policydb and SID table. */ - policydb_destroy(oldpolicydb); - sidtab_destroy(oldsidtab); - kfree(oldsidtab); - kfree(oldmapping); - - avc_ss_reset(state->avc, seqno); - selnl_notify_policyload(seqno); - selinux_status_update_policyload(state, seqno); - selinux_netlbl_cache_invalidate(); - selinux_xfrm_notify_policyload(); - - rc = 0; - goto out; - -err: - kfree(newmap.mapping); - sidtab_destroy(newsidtab); - kfree(newsidtab); - policydb_destroy(newpolicydb); - -out: - kfree(oldpolicydb); return rc; } -size_t security_policydb_len(struct selinux_state *state) -{ - struct policydb *p = &state->ss->policydb; - size_t len; - - read_lock(&state->ss->policy_rwlock); - len = p->len; - read_unlock(&state->ss->policy_rwlock); - - return len; -} - /** * security_port_sid - Obtain the SID for a port. 
* @protocol: protocol number @@ -2309,15 +2341,21 @@ size_t security_policydb_len(struct selinux_state *state) int security_port_sid(struct selinux_state *state, u8 protocol, u16 port, u32 *out_sid) { + struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct ocontext *c; int rc = 0; - read_lock(&state->ss->policy_rwlock); + if (!selinux_initialized(state)) { + *out_sid = SECINITSID_PORT; + return 0; + } - policydb = &state->ss->policydb; - sidtab = state->ss->sidtab; + rcu_read_lock(); + policy = rcu_dereference(state->policy); + policydb = &policy->policydb; + sidtab = policy->sidtab; c = policydb->ocontexts[OCON_PORT]; while (c) { @@ -2341,7 +2379,7 @@ int security_port_sid(struct selinux_state *state, } out: - read_unlock(&state->ss->policy_rwlock); + rcu_read_unlock(); return rc; } @@ -2354,15 +2392,21 @@ out: int security_ib_pkey_sid(struct selinux_state *state, u64 subnet_prefix, u16 pkey_num, u32 *out_sid) { + struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct ocontext *c; int rc = 0; - read_lock(&state->ss->policy_rwlock); + if (!selinux_initialized(state)) { + *out_sid = SECINITSID_UNLABELED; + return 0; + } - policydb = &state->ss->policydb; - sidtab = state->ss->sidtab; + rcu_read_lock(); + policy = rcu_dereference(state->policy); + policydb = &policy->policydb; + sidtab = policy->sidtab; c = policydb->ocontexts[OCON_IBPKEY]; while (c) { @@ -2387,7 +2431,7 @@ int security_ib_pkey_sid(struct selinux_state *state, *out_sid = SECINITSID_UNLABELED; out: - read_unlock(&state->ss->policy_rwlock); + rcu_read_unlock(); return rc; } @@ -2400,15 +2444,21 @@ out: int security_ib_endport_sid(struct selinux_state *state, const char *dev_name, u8 port_num, u32 *out_sid) { + struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct ocontext *c; int rc = 0; - read_lock(&state->ss->policy_rwlock); + if (!selinux_initialized(state)) { + *out_sid = SECINITSID_UNLABELED; + return 0; + } - policydb = &state->ss->policydb; - sidtab = state->ss->sidtab; + rcu_read_lock(); + policy = rcu_dereference(state->policy); + policydb = &policy->policydb; + sidtab = policy->sidtab; c = policydb->ocontexts[OCON_IBENDPORT]; while (c) { @@ -2433,7 +2483,7 @@ int security_ib_endport_sid(struct selinux_state *state, *out_sid = SECINITSID_UNLABELED; out: - read_unlock(&state->ss->policy_rwlock); + rcu_read_unlock(); return rc; } @@ -2445,15 +2495,21 @@ out: int security_netif_sid(struct selinux_state *state, char *name, u32 *if_sid) { + struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; int rc = 0; struct ocontext *c; - read_lock(&state->ss->policy_rwlock); + if (!selinux_initialized(state)) { + *if_sid = SECINITSID_NETIF; + return 0; + } - policydb = &state->ss->policydb; - sidtab = state->ss->sidtab; + rcu_read_lock(); + policy = rcu_dereference(state->policy); + policydb = &policy->policydb; + sidtab = policy->sidtab; c = policydb->ocontexts[OCON_NETIF]; while (c) { @@ -2478,7 +2534,7 @@ int security_netif_sid(struct selinux_state *state, *if_sid = SECINITSID_NETIF; out: - read_unlock(&state->ss->policy_rwlock); + rcu_read_unlock(); return rc; } @@ -2508,15 +2564,21 @@ int security_node_sid(struct selinux_state *state, u32 addrlen, u32 *out_sid) { + struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; int rc; struct ocontext *c; - read_lock(&state->ss->policy_rwlock); + if (!selinux_initialized(state)) { + *out_sid = SECINITSID_NODE; + return 0; + } - policydb 
= &state->ss->policydb; - sidtab = state->ss->sidtab; + rcu_read_lock(); + policy = rcu_dereference(state->policy); + policydb = &policy->policydb; + sidtab = policy->sidtab; switch (domain) { case AF_INET: { @@ -2571,7 +2633,7 @@ int security_node_sid(struct selinux_state *state, rc = 0; out: - read_unlock(&state->ss->policy_rwlock); + rcu_read_unlock(); return rc; } @@ -2597,6 +2659,7 @@ int security_get_user_sids(struct selinux_state *state, u32 **sids, u32 *nel) { + struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct context *fromcon, usercon; @@ -2613,10 +2676,10 @@ int security_get_user_sids(struct selinux_state *state, if (!selinux_initialized(state)) goto out; - read_lock(&state->ss->policy_rwlock); - - policydb = &state->ss->policydb; - sidtab = state->ss->sidtab; + rcu_read_lock(); + policy = rcu_dereference(state->policy); + policydb = &policy->policydb; + sidtab = policy->sidtab; context_init(&usercon); @@ -2667,7 +2730,7 @@ int security_get_user_sids(struct selinux_state *state, } rc = 0; out_unlock: - read_unlock(&state->ss->policy_rwlock); + rcu_read_unlock(); if (rc || !mynel) { kfree(mysids); goto out; @@ -2708,17 +2771,15 @@ out: * Obtain a SID to use for a file in a filesystem that * cannot support xattr or use a fixed labeling behavior like * transition SIDs or task SIDs. - * - * The caller must acquire the policy_rwlock before calling this function. */ -static inline int __security_genfs_sid(struct selinux_state *state, +static inline int __security_genfs_sid(struct selinux_policy *policy, const char *fstype, char *path, u16 orig_sclass, u32 *sid) { - struct policydb *policydb = &state->ss->policydb; - struct sidtab *sidtab = state->ss->sidtab; + struct policydb *policydb = &policy->policydb; + struct sidtab *sidtab = policy->sidtab; int len; u16 sclass; struct genfs *genfs; @@ -2728,7 +2789,7 @@ static inline int __security_genfs_sid(struct selinux_state *state, while (path[0] == '/' && path[1] == '/') path++; - sclass = unmap_class(&state->ss->map, orig_sclass); + sclass = unmap_class(&policy->map, orig_sclass); *sid = SECINITSID_UNLABELED; for (genfs = policydb->genfs; genfs; genfs = genfs->next) { @@ -2780,20 +2841,39 @@ int security_genfs_sid(struct selinux_state *state, u16 orig_sclass, u32 *sid) { + struct selinux_policy *policy; int retval; - read_lock(&state->ss->policy_rwlock); - retval = __security_genfs_sid(state, fstype, path, orig_sclass, sid); - read_unlock(&state->ss->policy_rwlock); + if (!selinux_initialized(state)) { + *sid = SECINITSID_UNLABELED; + return 0; + } + + rcu_read_lock(); + policy = rcu_dereference(state->policy); + retval = __security_genfs_sid(policy, + fstype, path, orig_sclass, sid); + rcu_read_unlock(); return retval; } +int selinux_policy_genfs_sid(struct selinux_policy *policy, + const char *fstype, + char *path, + u16 orig_sclass, + u32 *sid) +{ + /* no lock required, policy is not yet accessible by other threads */ + return __security_genfs_sid(policy, fstype, path, orig_sclass, sid); +} + /** * security_fs_use - Determine how to handle labeling for a filesystem. 
* @sb: superblock in question */ int security_fs_use(struct selinux_state *state, struct super_block *sb) { + struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; int rc = 0; @@ -2801,10 +2881,16 @@ int security_fs_use(struct selinux_state *state, struct super_block *sb) struct superblock_security_struct *sbsec = sb->s_security; const char *fstype = sb->s_type->name; - read_lock(&state->ss->policy_rwlock); + if (!selinux_initialized(state)) { + sbsec->behavior = SECURITY_FS_USE_NONE; + sbsec->sid = SECINITSID_UNLABELED; + return 0; + } - policydb = &state->ss->policydb; - sidtab = state->ss->sidtab; + rcu_read_lock(); + policy = rcu_dereference(state->policy); + policydb = &policy->policydb; + sidtab = policy->sidtab; c = policydb->ocontexts[OCON_FSUSE]; while (c) { @@ -2823,8 +2909,8 @@ int security_fs_use(struct selinux_state *state, struct super_block *sb) } sbsec->sid = c->sid[0]; } else { - rc = __security_genfs_sid(state, fstype, "/", SECCLASS_DIR, - &sbsec->sid); + rc = __security_genfs_sid(policy, fstype, "/", + SECCLASS_DIR, &sbsec->sid); if (rc) { sbsec->behavior = SECURITY_FS_USE_NONE; rc = 0; @@ -2834,27 +2920,18 @@ int security_fs_use(struct selinux_state *state, struct super_block *sb) } out: - read_unlock(&state->ss->policy_rwlock); + rcu_read_unlock(); return rc; } -int security_get_bools(struct selinux_state *state, +int security_get_bools(struct selinux_policy *policy, u32 *len, char ***names, int **values) { struct policydb *policydb; u32 i; int rc; - if (!selinux_initialized(state)) { - *len = 0; - *names = NULL; - *values = NULL; - return 0; - } - - read_lock(&state->ss->policy_rwlock); - - policydb = &state->ss->policydb; + policydb = &policy->policydb; *names = NULL; *values = NULL; @@ -2885,7 +2962,6 @@ int security_get_bools(struct selinux_state *state, } rc = 0; out: - read_unlock(&state->ss->policy_rwlock); return rc; err: if (*names) { @@ -2903,61 +2979,89 @@ err: int security_set_bools(struct selinux_state *state, u32 len, int *values) { - struct policydb *policydb; + struct selinux_policy *newpolicy, *oldpolicy; int rc; - u32 i, lenp, seqno = 0; + u32 i, seqno = 0; - write_lock_irq(&state->ss->policy_rwlock); + if (!selinux_initialized(state)) + return -EINVAL; - policydb = &state->ss->policydb; + oldpolicy = rcu_dereference_protected(state->policy, + lockdep_is_held(&state->policy_mutex)); - rc = -EFAULT; - lenp = policydb->p_bools.nprim; - if (len != lenp) - goto out; + /* Consistency check on number of booleans, should never fail */ + if (WARN_ON(len != oldpolicy->policydb.p_bools.nprim)) + return -EINVAL; + newpolicy = kmemdup(oldpolicy, sizeof(*newpolicy), GFP_KERNEL); + if (!newpolicy) + return -ENOMEM; + + /* + * Deep copy only the parts of the policydb that might be + * modified as a result of changing booleans. 
+ */ + rc = cond_policydb_dup(&newpolicy->policydb, &oldpolicy->policydb); + if (rc) { + kfree(newpolicy); + return -ENOMEM; + } + + /* Update the boolean states in the copy */ for (i = 0; i < len; i++) { - if (!!values[i] != policydb->bool_val_to_struct[i]->state) { + int new_state = !!values[i]; + int old_state = newpolicy->policydb.bool_val_to_struct[i]->state; + + if (new_state != old_state) { audit_log(audit_context(), GFP_ATOMIC, AUDIT_MAC_CONFIG_CHANGE, "bool=%s val=%d old_val=%d auid=%u ses=%u", - sym_name(policydb, SYM_BOOLS, i), - !!values[i], - policydb->bool_val_to_struct[i]->state, + sym_name(&newpolicy->policydb, SYM_BOOLS, i), + new_state, + old_state, from_kuid(&init_user_ns, audit_get_loginuid(current)), audit_get_sessionid(current)); + newpolicy->policydb.bool_val_to_struct[i]->state = new_state; } - if (values[i]) - policydb->bool_val_to_struct[i]->state = 1; - else - policydb->bool_val_to_struct[i]->state = 0; } - evaluate_cond_nodes(policydb); + /* Re-evaluate the conditional rules in the copy */ + evaluate_cond_nodes(&newpolicy->policydb); - seqno = ++state->ss->latest_granting; - rc = 0; -out: - write_unlock_irq(&state->ss->policy_rwlock); - if (!rc) { - avc_ss_reset(state->avc, seqno); - selnl_notify_policyload(seqno); - selinux_status_update_policyload(state, seqno); - selinux_xfrm_notify_policyload(); - } - return rc; + /* Set latest granting seqno for new policy */ + newpolicy->latest_granting = oldpolicy->latest_granting + 1; + seqno = newpolicy->latest_granting; + + /* Install the new policy */ + rcu_assign_pointer(state->policy, newpolicy); + + /* + * Free the conditional portions of the old policydb + * that were copied for the new policy, and the oldpolicy + * structure itself but not what it references. + */ + synchronize_rcu(); + selinux_policy_cond_free(oldpolicy); + + /* Notify others of the policy change */ + selinux_notify_policy_change(state, seqno); + return 0; } int security_get_bool_value(struct selinux_state *state, u32 index) { + struct selinux_policy *policy; struct policydb *policydb; int rc; u32 len; - read_lock(&state->ss->policy_rwlock); + if (!selinux_initialized(state)) + return 0; - policydb = &state->ss->policydb; + rcu_read_lock(); + policy = rcu_dereference(state->policy); + policydb = &policy->policydb; rc = -EFAULT; len = policydb->p_bools.nprim; @@ -2966,27 +3070,28 @@ int security_get_bool_value(struct selinux_state *state, rc = policydb->bool_val_to_struct[index]->state; out: - read_unlock(&state->ss->policy_rwlock); + rcu_read_unlock(); return rc; } -static int security_preserve_bools(struct selinux_state *state, - struct policydb *policydb) +static int security_preserve_bools(struct selinux_policy *oldpolicy, + struct selinux_policy *newpolicy) { int rc, *bvalues = NULL; char **bnames = NULL; struct cond_bool_datum *booldatum; u32 i, nbools = 0; - rc = security_get_bools(state, &nbools, &bnames, &bvalues); + rc = security_get_bools(oldpolicy, &nbools, &bnames, &bvalues); if (rc) goto out; for (i = 0; i < nbools; i++) { - booldatum = symtab_search(&policydb->p_bools, bnames[i]); + booldatum = symtab_search(&newpolicy->policydb.p_bools, + bnames[i]); if (booldatum) booldatum->state = bvalues[i]; } - evaluate_cond_nodes(policydb); + evaluate_cond_nodes(&newpolicy->policydb); out: if (bnames) { @@ -3005,8 +3110,9 @@ out: int security_sid_mls_copy(struct selinux_state *state, u32 sid, u32 mls_sid, u32 *new_sid) { - struct policydb *policydb = &state->ss->policydb; - struct sidtab *sidtab = state->ss->sidtab; + struct selinux_policy 
*policy; + struct policydb *policydb; + struct sidtab *sidtab; struct context *context1; struct context *context2; struct context newcon; @@ -3015,14 +3121,22 @@ int security_sid_mls_copy(struct selinux_state *state, int rc; rc = 0; - if (!selinux_initialized(state) || !policydb->mls_enabled) { + if (!selinux_initialized(state)) { *new_sid = sid; goto out; } context_init(&newcon); - read_lock(&state->ss->policy_rwlock); + rcu_read_lock(); + policy = rcu_dereference(state->policy); + policydb = &policy->policydb; + sidtab = policy->sidtab; + + if (!policydb->mls_enabled) { + *new_sid = sid; + goto out_unlock; + } rc = -EINVAL; context1 = sidtab_search(sidtab, sid); @@ -3049,7 +3163,8 @@ int security_sid_mls_copy(struct selinux_state *state, /* Check the validity of the new context. */ if (!policydb_context_isvalid(policydb, &newcon)) { - rc = convert_context_handle_invalid_context(state, &newcon); + rc = convert_context_handle_invalid_context(state, policydb, + &newcon); if (rc) { if (!context_struct_to_string(policydb, &newcon, &s, &len)) { @@ -3070,7 +3185,7 @@ int security_sid_mls_copy(struct selinux_state *state, } rc = sidtab_context_to_sid(sidtab, &newcon, new_sid); out_unlock: - read_unlock(&state->ss->policy_rwlock); + rcu_read_unlock(); context_destroy(&newcon); out: return rc; @@ -3101,8 +3216,9 @@ int security_net_peersid_resolve(struct selinux_state *state, u32 xfrm_sid, u32 *peer_sid) { - struct policydb *policydb = &state->ss->policydb; - struct sidtab *sidtab = state->ss->sidtab; + struct selinux_policy *policy; + struct policydb *policydb; + struct sidtab *sidtab; int rc; struct context *nlbl_ctx; struct context *xfrm_ctx; @@ -3124,15 +3240,23 @@ int security_net_peersid_resolve(struct selinux_state *state, return 0; } + if (!selinux_initialized(state)) + return 0; + + rcu_read_lock(); + policy = rcu_dereference(state->policy); + policydb = &policy->policydb; + sidtab = policy->sidtab; + /* * We don't need to check initialized here since the only way both * nlbl_sid and xfrm_sid are not equal to SECSID_NULL would be if the * security server was initialized and state->initialized was true. 
*/ - if (!policydb->mls_enabled) - return 0; - - read_lock(&state->ss->policy_rwlock); + if (!policydb->mls_enabled) { + rc = 0; + goto out; + } rc = -EINVAL; nlbl_ctx = sidtab_search(sidtab, nlbl_sid); @@ -3159,7 +3283,7 @@ int security_net_peersid_resolve(struct selinux_state *state, * expressive */ *peer_sid = xfrm_sid; out: - read_unlock(&state->ss->policy_rwlock); + rcu_read_unlock(); return rc; } @@ -3176,19 +3300,13 @@ static int get_classes_callback(void *k, void *d, void *args) return 0; } -int security_get_classes(struct selinux_state *state, +int security_get_classes(struct selinux_policy *policy, char ***classes, int *nclasses) { - struct policydb *policydb = &state->ss->policydb; + struct policydb *policydb; int rc; - if (!selinux_initialized(state)) { - *nclasses = 0; - *classes = NULL; - return 0; - } - - read_lock(&state->ss->policy_rwlock); + policydb = &policy->policydb; rc = -ENOMEM; *nclasses = policydb->p_classes.nprim; @@ -3206,7 +3324,6 @@ int security_get_classes(struct selinux_state *state, } out: - read_unlock(&state->ss->policy_rwlock); return rc; } @@ -3223,14 +3340,14 @@ static int get_permissions_callback(void *k, void *d, void *args) return 0; } -int security_get_permissions(struct selinux_state *state, +int security_get_permissions(struct selinux_policy *policy, char *class, char ***perms, int *nperms) { - struct policydb *policydb = &state->ss->policydb; + struct policydb *policydb; int rc, i; struct class_datum *match; - read_lock(&state->ss->policy_rwlock); + policydb = &policy->policydb; rc = -EINVAL; match = symtab_search(&policydb->p_classes, class); @@ -3259,11 +3376,9 @@ int security_get_permissions(struct selinux_state *state, goto err; out: - read_unlock(&state->ss->policy_rwlock); return rc; err: - read_unlock(&state->ss->policy_rwlock); for (i = 0; i < *nperms; i++) kfree((*perms)[i]); kfree(*perms); @@ -3272,12 +3387,32 @@ err: int security_get_reject_unknown(struct selinux_state *state) { - return state->ss->policydb.reject_unknown; + struct selinux_policy *policy; + int value; + + if (!selinux_initialized(state)) + return 0; + + rcu_read_lock(); + policy = rcu_dereference(state->policy); + value = policy->policydb.reject_unknown; + rcu_read_unlock(); + return value; } int security_get_allow_unknown(struct selinux_state *state) { - return state->ss->policydb.allow_unknown; + struct selinux_policy *policy; + int value; + + if (!selinux_initialized(state)) + return 0; + + rcu_read_lock(); + policy = rcu_dereference(state->policy); + value = policy->policydb.allow_unknown; + rcu_read_unlock(); + return value; } /** @@ -3293,12 +3428,16 @@ int security_get_allow_unknown(struct selinux_state *state) int security_policycap_supported(struct selinux_state *state, unsigned int req_cap) { - struct policydb *policydb = &state->ss->policydb; + struct selinux_policy *policy; int rc; - read_lock(&state->ss->policy_rwlock); - rc = ebitmap_get_bit(&policydb->policycaps, req_cap); - read_unlock(&state->ss->policy_rwlock); + if (!selinux_initialized(state)) + return 0; + + rcu_read_lock(); + policy = rcu_dereference(state->policy); + rc = ebitmap_get_bit(&policy->policydb.policycaps, req_cap); + rcu_read_unlock(); return rc; } @@ -3321,7 +3460,8 @@ void selinux_audit_rule_free(void *vrule) int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule) { struct selinux_state *state = &selinux_state; - struct policydb *policydb = &state->ss->policydb; + struct selinux_policy *policy; + struct policydb *policydb; struct selinux_audit_rule *tmprule; 
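/*
 * Illustrative userspace sketch, not part of this patch: the services.c
 * conversion above replaces the policy rwlock with an RCU-published
 * pointer -- readers take rcu_read_lock() and rcu_dereference() the
 * policy, updaters build a new selinux_policy, rcu_assign_pointer() it
 * and synchronize_rcu() before freeing the old one.  The snippet below
 * models only the two ideas visible at this point in the code with C11
 * atomics: taking a snapshot of a published pointer, and detecting
 * staleness via a latest_granting-style sequence number (as in au_seqno
 * and the -ESTALE check in selinux_audit_rule_match()).  Reclamation,
 * which synchronize_rcu() handles in the kernel, is deliberately
 * omitted: the old snapshot is simply kept around.  All names here are
 * hypothetical.
 */
#include <stdatomic.h>
#include <stdio.h>

struct policy_snap {
	unsigned int latest_granting;	/* bumped on every policy change */
};

static struct policy_snap snaps[2] = { { 1 }, { 2 } };
static _Atomic(struct policy_snap *) live = &snaps[0];

/* reader side: like rcu_dereference() plus caching au_seqno in a rule */
static unsigned int rule_init_seqno(void)
{
	struct policy_snap *p = atomic_load_explicit(&live, memory_order_acquire);

	return p->latest_granting;
}

/* later check: like selinux_audit_rule_match() reporting a stale rule */
static int rule_is_stale(unsigned int au_seqno)
{
	struct policy_snap *p = atomic_load_explicit(&live, memory_order_acquire);

	return au_seqno < p->latest_granting;
}

int main(void)
{
	unsigned int seqno = rule_init_seqno();

	/* updater side: publish the new snapshot, like rcu_assign_pointer() */
	atomic_store_explicit(&live, &snaps[1], memory_order_release);

	printf("stale=%d\n", rule_is_stale(seqno));	/* prints stale=1 */
	return 0;
}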
struct role_datum *roledatum; struct type_datum *typedatum; @@ -3364,9 +3504,11 @@ int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule) context_init(&tmprule->au_ctxt); - read_lock(&state->ss->policy_rwlock); + rcu_read_lock(); + policy = rcu_dereference(state->policy); + policydb = &policy->policydb; - tmprule->au_seqno = state->ss->latest_granting; + tmprule->au_seqno = policy->latest_granting; switch (field) { case AUDIT_SUBJ_USER: @@ -3405,7 +3547,7 @@ int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule) } rc = 0; out: - read_unlock(&state->ss->policy_rwlock); + rcu_read_unlock(); if (rc) { selinux_audit_rule_free(tmprule); @@ -3445,6 +3587,7 @@ int selinux_audit_rule_known(struct audit_krule *rule) int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule) { struct selinux_state *state = &selinux_state; + struct selinux_policy *policy; struct context *ctxt; struct mls_level *level; struct selinux_audit_rule *rule = vrule; @@ -3455,14 +3598,19 @@ int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule) return -ENOENT; } - read_lock(&state->ss->policy_rwlock); + if (!selinux_initialized(state)) + return 0; - if (rule->au_seqno < state->ss->latest_granting) { + rcu_read_lock(); + + policy = rcu_dereference(state->policy); + + if (rule->au_seqno < policy->latest_granting) { match = -ESTALE; goto out; } - ctxt = sidtab_search(state->ss->sidtab, sid); + ctxt = sidtab_search(policy->sidtab, sid); if (unlikely(!ctxt)) { WARN_ONCE(1, "selinux_audit_rule_match: unrecognized SID %d\n", sid); @@ -3546,7 +3694,7 @@ int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule) } out: - read_unlock(&state->ss->policy_rwlock); + rcu_read_unlock(); return match; } @@ -3624,8 +3772,9 @@ int security_netlbl_secattr_to_sid(struct selinux_state *state, struct netlbl_lsm_secattr *secattr, u32 *sid) { - struct policydb *policydb = &state->ss->policydb; - struct sidtab *sidtab = state->ss->sidtab; + struct selinux_policy *policy; + struct policydb *policydb; + struct sidtab *sidtab; int rc; struct context *ctx; struct context ctx_new; @@ -3635,7 +3784,10 @@ int security_netlbl_secattr_to_sid(struct selinux_state *state, return 0; } - read_lock(&state->ss->policy_rwlock); + rcu_read_lock(); + policy = rcu_dereference(state->policy); + policydb = &policy->policydb; + sidtab = policy->sidtab; if (secattr->flags & NETLBL_SECATTR_CACHE) *sid = *(u32 *)secattr->cache->data; @@ -3671,12 +3823,12 @@ int security_netlbl_secattr_to_sid(struct selinux_state *state, } else *sid = SECSID_NULL; - read_unlock(&state->ss->policy_rwlock); + rcu_read_unlock(); return 0; out_free: ebitmap_destroy(&ctx_new.range.level[0].cat); out: - read_unlock(&state->ss->policy_rwlock); + rcu_read_unlock(); return rc; } @@ -3693,17 +3845,20 @@ out: int security_netlbl_sid_to_secattr(struct selinux_state *state, u32 sid, struct netlbl_lsm_secattr *secattr) { - struct policydb *policydb = &state->ss->policydb; + struct selinux_policy *policy; + struct policydb *policydb; int rc; struct context *ctx; if (!selinux_initialized(state)) return 0; - read_lock(&state->ss->policy_rwlock); + rcu_read_lock(); + policy = rcu_dereference(state->policy); + policydb = &policy->policydb; rc = -ENOENT; - ctx = sidtab_search(state->ss->sidtab, sid); + ctx = sidtab_search(policy->sidtab, sid); if (ctx == NULL) goto out; @@ -3718,7 +3873,7 @@ int security_netlbl_sid_to_secattr(struct selinux_state *state, mls_export_netlbl_lvl(policydb, ctx, secattr); rc = 
mls_export_netlbl_cat(policydb, ctx, secattr); out: - read_unlock(&state->ss->policy_rwlock); + rcu_read_unlock(); return rc; } #endif /* CONFIG_NETLABEL */ @@ -3732,15 +3887,16 @@ out: int security_read_policy(struct selinux_state *state, void **data, size_t *len) { - struct policydb *policydb = &state->ss->policydb; + struct selinux_policy *policy; int rc; struct policy_file fp; - if (!selinux_initialized(state)) + policy = rcu_dereference_protected( + state->policy, lockdep_is_held(&state->policy_mutex)); + if (!policy) return -EINVAL; - *len = security_policydb_len(state); - + *len = policy->policydb.len; *data = vmalloc_user(*len); if (!*data) return -ENOMEM; @@ -3748,10 +3904,7 @@ int security_read_policy(struct selinux_state *state, fp.data = *data; fp.len = *len; - read_lock(&state->ss->policy_rwlock); - rc = policydb_write(policydb, &fp); - read_unlock(&state->ss->policy_rwlock); - + rc = policydb_write(&policy->policydb, &fp); if (rc) return rc; diff --git a/security/selinux/ss/services.h b/security/selinux/ss/services.h index a06f3d835216..9555ad074303 100644 --- a/security/selinux/ss/services.h +++ b/security/selinux/ss/services.h @@ -22,12 +22,11 @@ struct selinux_map { u16 size; /* array size of mapping */ }; -struct selinux_ss { +struct selinux_policy { struct sidtab *sidtab; struct policydb policydb; - rwlock_t policy_rwlock; - u32 latest_granting; struct selinux_map map; + u32 latest_granting; } __randomize_layout; void services_compute_xperms_drivers(struct extended_perms *xperms, diff --git a/security/selinux/ss/sidtab.c b/security/selinux/ss/sidtab.c index eb6d27b5aeb4..5ee190bd30f5 100644 --- a/security/selinux/ss/sidtab.c +++ b/security/selinux/ss/sidtab.c @@ -464,6 +464,16 @@ int sidtab_convert(struct sidtab *s, struct sidtab_convert_params *params) return 0; } +void sidtab_cancel_convert(struct sidtab *s) +{ + unsigned long flags; + + /* cancelling policy load - disable live convert of sidtab */ + spin_lock_irqsave(&s->lock, flags); + s->convert = NULL; + spin_unlock_irqrestore(&s->lock, flags); +} + static void sidtab_destroy_entry(struct sidtab_entry *entry) { context_destroy(&entry->context); diff --git a/security/selinux/ss/sidtab.h b/security/selinux/ss/sidtab.h index f2a84560b8b3..80c744d07ad6 100644 --- a/security/selinux/ss/sidtab.h +++ b/security/selinux/ss/sidtab.h @@ -123,6 +123,8 @@ static inline struct context *sidtab_search_force(struct sidtab *s, u32 sid) int sidtab_convert(struct sidtab *s, struct sidtab_convert_params *params); +void sidtab_cancel_convert(struct sidtab *s); + int sidtab_context_to_sid(struct sidtab *s, struct context *context, u32 *sid); void sidtab_destroy(struct sidtab *s); diff --git a/security/smack/smack.h b/security/smack/smack.h index e9e817d09785..a9768b12716b 100644 --- a/security/smack/smack.h +++ b/security/smack/smack.h @@ -100,7 +100,12 @@ struct socket_smack { struct smack_known *smk_out; /* outbound label */ struct smack_known *smk_in; /* inbound label */ struct smack_known *smk_packet; /* TCP peer label */ + int smk_state; /* netlabel socket states */ }; +#define SMK_NETLBL_UNSET 0 +#define SMK_NETLBL_UNLABELED 1 +#define SMK_NETLBL_LABELED 2 +#define SMK_NETLBL_REQSKB 3 /* * Inode smack data @@ -196,19 +201,6 @@ enum { #define SMACK_DELETE_OPTION "-DELETE" #define SMACK_CIPSO_OPTION "-CIPSO" -/* - * How communications on this socket are treated. 
- * Usually it's determined by the underlying netlabel code - * but there are certain cases, including single label hosts - * and potentially single label interfaces for which the - * treatment can not be known in advance. - * - * The possibility of additional labeling schemes being - * introduced in the future exists as well. - */ -#define SMACK_UNLABELED_SOCKET 0 -#define SMACK_CIPSO_SOCKET 1 - /* * CIPSO defaults. */ @@ -305,6 +297,7 @@ struct smack_known *smk_find_entry(const char *); bool smack_privileged(int cap); bool smack_privileged_cred(int cap, const struct cred *cred); void smk_destroy_label_list(struct list_head *list); +int smack_populate_secattr(struct smack_known *skp); /* * Shared data. diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c index 38ac3da4e791..efe2406a3960 100644 --- a/security/smack/smack_access.c +++ b/security/smack/smack_access.c @@ -510,6 +510,42 @@ int smk_netlbl_mls(int level, char *catset, struct netlbl_lsm_secattr *sap, return 0; } +/** + * smack_populate_secattr - fill in the smack_known netlabel information + * @skp: pointer to the structure to fill + * + * Populate the netlabel secattr structure for a Smack label. + * + * Returns 0 unless creating the category mapping fails + */ +int smack_populate_secattr(struct smack_known *skp) +{ + int slen; + + skp->smk_netlabel.attr.secid = skp->smk_secid; + skp->smk_netlabel.domain = skp->smk_known; + skp->smk_netlabel.cache = netlbl_secattr_cache_alloc(GFP_ATOMIC); + if (skp->smk_netlabel.cache != NULL) { + skp->smk_netlabel.flags |= NETLBL_SECATTR_CACHE; + skp->smk_netlabel.cache->free = NULL; + skp->smk_netlabel.cache->data = skp; + } + skp->smk_netlabel.flags |= NETLBL_SECATTR_SECID | + NETLBL_SECATTR_MLS_LVL | + NETLBL_SECATTR_DOMAIN; + /* + * If direct labeling works use it. + * Otherwise use mapped labeling. + */ + slen = strlen(skp->smk_known); + if (slen < SMK_CIPSOLEN) + return smk_netlbl_mls(smack_cipso_direct, skp->smk_known, + &skp->smk_netlabel, slen); + + return smk_netlbl_mls(smack_cipso_mapped, (char *)&skp->smk_secid, + &skp->smk_netlabel, sizeof(skp->smk_secid)); +} + /** * smk_import_entry - import a label, return the list entry * @string: a text string that might be a Smack label @@ -523,7 +559,6 @@ struct smack_known *smk_import_entry(const char *string, int len) { struct smack_known *skp; char *smack; - int slen; int rc; smack = smk_parse_smack(string, len); @@ -544,21 +579,8 @@ struct smack_known *smk_import_entry(const char *string, int len) skp->smk_known = smack; skp->smk_secid = smack_next_secid++; - skp->smk_netlabel.domain = skp->smk_known; - skp->smk_netlabel.flags = - NETLBL_SECATTR_DOMAIN | NETLBL_SECATTR_MLS_LVL; - /* - * If direct labeling works use it. - * Otherwise use mapped labeling. - */ - slen = strlen(smack); - if (slen < SMK_CIPSOLEN) - rc = smk_netlbl_mls(smack_cipso_direct, skp->smk_known, - &skp->smk_netlabel, slen); - else - rc = smk_netlbl_mls(smack_cipso_mapped, (char *)&skp->smk_secid, - &skp->smk_netlabel, sizeof(skp->smk_secid)); + rc = smack_populate_secattr(skp); if (rc >= 0) { INIT_LIST_HEAD(&skp->smk_rules); mutex_init(&skp->smk_rules_lock); @@ -569,9 +591,6 @@ struct smack_known *smk_import_entry(const char *string, int len) smk_insert_entry(skp); goto unlockout; } - /* - * smk_netlbl_mls failed. 
- */ kfree(skp); skp = ERR_PTR(rc); freeout: diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index d4edd63a1402..fe8d33b287f5 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -2384,38 +2384,31 @@ static struct smack_known *smack_ipv6host_label(struct sockaddr_in6 *sip) } /** - * smack_netlabel - Set the secattr on a socket + * smack_netlbl_add - Set the secattr on a socket * @sk: the socket - * @labeled: socket label scheme * - * Convert the outbound smack value (smk_out) to a - * secattr and attach it to the socket. + * Attach the outbound smack value (smk_out) to the socket. * * Returns 0 on success or an error code */ -static int smack_netlabel(struct sock *sk, int labeled) +static int smack_netlbl_add(struct sock *sk) { - struct smack_known *skp; struct socket_smack *ssp = sk->sk_security; - int rc = 0; + struct smack_known *skp = ssp->smk_out; + int rc; - /* - * Usually the netlabel code will handle changing the - * packet labeling based on the label. - * The case of a single label host is different, because - * a single label host should never get a labeled packet - * even though the label is usually associated with a packet - * label. - */ local_bh_disable(); bh_lock_sock_nested(sk); - if (ssp->smk_out == smack_net_ambient || - labeled == SMACK_UNLABELED_SOCKET) - netlbl_sock_delattr(sk); - else { - skp = ssp->smk_out; - rc = netlbl_sock_setattr(sk, sk->sk_family, &skp->smk_netlabel); + rc = netlbl_sock_setattr(sk, sk->sk_family, &skp->smk_netlabel); + switch (rc) { + case 0: + ssp->smk_state = SMK_NETLBL_LABELED; + break; + case -EDESTADDRREQ: + ssp->smk_state = SMK_NETLBL_REQSKB; + rc = 0; + break; } bh_unlock_sock(sk); @@ -2425,7 +2418,31 @@ static int smack_netlabel(struct sock *sk, int labeled) } /** - * smack_netlbel_send - Set the secattr on a socket and perform access checks + * smack_netlbl_delete - Remove the secattr from a socket + * @sk: the socket + * + * Remove the outbound smack value from a socket + */ +static void smack_netlbl_delete(struct sock *sk) +{ + struct socket_smack *ssp = sk->sk_security; + + /* + * Take the label off the socket if one is set. + */ + if (ssp->smk_state != SMK_NETLBL_LABELED) + return; + + local_bh_disable(); + bh_lock_sock_nested(sk); + netlbl_sock_delattr(sk); + bh_unlock_sock(sk); + local_bh_enable(); + ssp->smk_state = SMK_NETLBL_UNLABELED; +} + +/** + * smk_ipv4_check - Perform IPv4 host access checks * @sk: the socket * @sap: the destination address * @@ -2435,11 +2452,10 @@ static int smack_netlabel(struct sock *sk, int labeled) * Returns 0 on success or an error code. * */ -static int smack_netlabel_send(struct sock *sk, struct sockaddr_in *sap) +static int smk_ipv4_check(struct sock *sk, struct sockaddr_in *sap) { struct smack_known *skp; - int rc; - int sk_lbl; + int rc = 0; struct smack_known *hkp; struct socket_smack *ssp = sk->sk_security; struct smk_audit_info ad; @@ -2455,19 +2471,18 @@ static int smack_netlabel_send(struct sock *sk, struct sockaddr_in *sap) ad.a.u.net->dport = sap->sin_port; ad.a.u.net->v4info.daddr = sap->sin_addr.s_addr; #endif - sk_lbl = SMACK_UNLABELED_SOCKET; skp = ssp->smk_out; rc = smk_access(skp, hkp, MAY_WRITE, &ad); rc = smk_bu_note("IPv4 host check", skp, hkp, MAY_WRITE, rc); - } else { - sk_lbl = SMACK_CIPSO_SOCKET; - rc = 0; + /* + * Clear the socket netlabel if it's set. 
+ */ + if (!rc) + smack_netlbl_delete(sk); } rcu_read_unlock(); - if (rc != 0) - return rc; - return smack_netlabel(sk, sk_lbl); + return rc; } /** @@ -2704,7 +2719,7 @@ static int smack_inode_setsecurity(struct inode *inode, const char *name, else if (strcmp(name, XATTR_SMACK_IPOUT) == 0) { ssp->smk_out = skp; if (sock->sk->sk_family == PF_INET) { - rc = smack_netlabel(sock->sk, SMACK_CIPSO_SOCKET); + rc = smack_netlbl_add(sock->sk); if (rc != 0) printk(KERN_WARNING "Smack: \"%s\" netlbl error %d.\n", @@ -2755,7 +2770,7 @@ static int smack_socket_post_create(struct socket *sock, int family, /* * Set the outbound netlbl. */ - return smack_netlabel(sock->sk, SMACK_CIPSO_SOCKET); + return smack_netlbl_add(sock->sk); } /** @@ -2846,7 +2861,7 @@ static int smack_socket_connect(struct socket *sock, struct sockaddr *sap, } if (sap->sa_family != AF_INET || addrlen < sizeof(struct sockaddr_in)) return 0; - rc = smack_netlabel_send(sock->sk, (struct sockaddr_in *)sap); + rc = smk_ipv4_check(sock->sk, (struct sockaddr_in *)sap); return rc; } @@ -3664,7 +3679,7 @@ static int smack_socket_sendmsg(struct socket *sock, struct msghdr *msg, if (msg->msg_namelen < sizeof(struct sockaddr_in) || sip->sin_family != AF_INET) return -EINVAL; - rc = smack_netlabel_send(sock->sk, sip); + rc = smk_ipv4_check(sock->sk, sip); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: @@ -3701,6 +3716,18 @@ static struct smack_known *smack_from_secattr(struct netlbl_lsm_secattr *sap, int acat; int kcat; + /* + * Netlabel found it in the cache. + */ + if ((sap->flags & NETLBL_SECATTR_CACHE) != 0) + return (struct smack_known *)sap->cache->data; + + if ((sap->flags & NETLBL_SECATTR_SECID) != 0) + /* + * Looks like a fallback, which gives us a secid. + */ + return smack_from_secid(sap->attr.secid); + if ((sap->flags & NETLBL_SECATTR_MLS_LVL) != 0) { /* * Looks like a CIPSO packet. @@ -3748,11 +3775,6 @@ static struct smack_known *smack_from_secattr(struct netlbl_lsm_secattr *sap, return &smack_known_web; return &smack_known_star; } - if ((sap->flags & NETLBL_SECATTR_SECID) != 0) - /* - * Looks like a fallback, which gives us a secid. - */ - return smack_from_secid(sap->attr.secid); /* * Without guidance regarding the smack value * for the packet fall back on the network @@ -3811,6 +3833,62 @@ static int smk_skb_to_addr_ipv6(struct sk_buff *skb, struct sockaddr_in6 *sip) } #endif /* CONFIG_IPV6 */ +/** + * smack_from_skb - Smack data from the secmark in an skb + * @skb: packet + * + * Returns smack_known of the secmark or NULL if that won't work. + */ +#ifdef CONFIG_NETWORK_SECMARK +static struct smack_known *smack_from_skb(struct sk_buff *skb) +{ + if (skb == NULL || skb->secmark == 0) + return NULL; + + return smack_from_secid(skb->secmark); +} +#else +static inline struct smack_known *smack_from_skb(struct sk_buff *skb) +{ + return NULL; +} +#endif + +/** + * smack_from_netlbl - Smack data from the IP options in an skb + * @sk: socket data came in on + * @family: address family + * @skb: packet + * + * Find the Smack label in the IP options. If it hasn't been + * added to the netlabel cache, add it here. + * + * Returns smack_known of the IP options or NULL if that won't work. 
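The smack_inode_setsecurity() change above is what makes smack_netlbl_add() reachable from userspace: writing the SMACK64IPOUT attribute on a socket relabels its outbound packets. A minimal sketch, assuming a Smack-enabled kernel and CAP_MAC_ADMIN; the label "Snap" is only a placeholder:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/xattr.h>

int main(void)
{
        const char *label = "Snap";     /* hypothetical Smack label */
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0) {
                perror("socket");
                return 1;
        }
        /* For PF_INET sockets this ends up in smack_netlbl_add(). */
        if (fsetxattr(fd, "security.SMACK64IPOUT", label, strlen(label), 0)) {
                perror("fsetxattr");
                return 1;
        }
        return 0;
}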
+ */ +static struct smack_known *smack_from_netlbl(struct sock *sk, u16 family, + struct sk_buff *skb) +{ + struct netlbl_lsm_secattr secattr; + struct socket_smack *ssp = NULL; + struct smack_known *skp = NULL; + int rc; + + netlbl_secattr_init(&secattr); + + if (sk) + ssp = sk->sk_security; + + if (netlbl_skbuff_getattr(skb, family, &secattr) == 0) { + skp = smack_from_secattr(&secattr, ssp); + if (secattr.flags & NETLBL_SECATTR_CACHEABLE) + rc = netlbl_cache_add(skb, family, &skp->smk_netlabel); + } + + netlbl_secattr_destroy(&secattr); + + return skp; +} + /** * smack_socket_sock_rcv_skb - Smack packet delivery access check * @sk: socket @@ -3820,7 +3898,6 @@ static int smk_skb_to_addr_ipv6(struct sk_buff *skb, struct sockaddr_in6 *sip) */ static int smack_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) { - struct netlbl_lsm_secattr secattr; struct socket_smack *ssp = sk->sk_security; struct smack_known *skp = NULL; int rc = 0; @@ -3839,33 +3916,18 @@ static int smack_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) switch (family) { case PF_INET: -#ifdef CONFIG_SECURITY_SMACK_NETFILTER /* * If there is a secmark use it rather than the CIPSO label. * If there is no secmark fall back to CIPSO. * The secmark is assumed to reflect policy better. */ - if (skb && skb->secmark != 0) { - skp = smack_from_secid(skb->secmark); - goto access_check; + skp = smack_from_skb(skb); + if (skp == NULL) { + skp = smack_from_netlbl(sk, family, skb); + if (skp == NULL) + skp = smack_net_ambient; } -#endif /* CONFIG_SECURITY_SMACK_NETFILTER */ - /* - * Translate what netlabel gave us. - */ - netlbl_secattr_init(&secattr); - rc = netlbl_skbuff_getattr(skb, family, &secattr); - if (rc == 0) - skp = smack_from_secattr(&secattr, ssp); - else - skp = smack_net_ambient; - - netlbl_secattr_destroy(&secattr); - -#ifdef CONFIG_SECURITY_SMACK_NETFILTER -access_check: -#endif #ifdef CONFIG_AUDIT smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net); ad.a.u.net->family = family; @@ -3891,16 +3953,14 @@ access_check: proto != IPPROTO_TCP && proto != IPPROTO_DCCP) break; #ifdef SMACK_IPV6_SECMARK_LABELING - if (skb && skb->secmark != 0) - skp = smack_from_secid(skb->secmark); - else if (smk_ipv6_localhost(&sadd)) - break; - else + skp = smack_from_skb(skb); + if (skp == NULL) { + if (smk_ipv6_localhost(&sadd)) + break; skp = smack_ipv6host_label(&sadd); - if (skp == NULL) - skp = smack_net_ambient; - if (skb == NULL) - break; + if (skp == NULL) + skp = smack_net_ambient; + } #ifdef CONFIG_AUDIT smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net); ad.a.u.net->family = family; @@ -3972,12 +4032,11 @@ static int smack_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) { - struct netlbl_lsm_secattr secattr; struct socket_smack *ssp = NULL; struct smack_known *skp; + struct sock *sk = NULL; int family = PF_UNSPEC; u32 s = 0; /* 0 is the invalid secid */ - int rc; if (skb != NULL) { if (skb->protocol == htons(ETH_P_IP)) @@ -3996,27 +4055,25 @@ static int smack_socket_getpeersec_dgram(struct socket *sock, s = ssp->smk_out->smk_secid; break; case PF_INET: -#ifdef CONFIG_SECURITY_SMACK_NETFILTER - s = skb->secmark; - if (s != 0) + skp = smack_from_skb(skb); + if (skp) { + s = skp->smk_secid; break; -#endif + } /* * Translate what netlabel gave us. 
*/ - if (sock != NULL && sock->sk != NULL) - ssp = sock->sk->sk_security; - netlbl_secattr_init(&secattr); - rc = netlbl_skbuff_getattr(skb, family, &secattr); - if (rc == 0) { - skp = smack_from_secattr(&secattr, ssp); + if (sock != NULL) + sk = sock->sk; + skp = smack_from_netlbl(sk, family, skb); + if (skp != NULL) s = skp->smk_secid; - } - netlbl_secattr_destroy(&secattr); break; case PF_INET6: #ifdef SMACK_IPV6_SECMARK_LABELING - s = skb->secmark; + skp = smack_from_skb(skb); + if (skp) + s = skp->smk_secid; #endif break; } @@ -4064,7 +4121,6 @@ static int smack_inet_conn_request(struct sock *sk, struct sk_buff *skb, u16 family = sk->sk_family; struct smack_known *skp; struct socket_smack *ssp = sk->sk_security; - struct netlbl_lsm_secattr secattr; struct sockaddr_in addr; struct iphdr *hdr; struct smack_known *hskp; @@ -4088,29 +4144,17 @@ static int smack_inet_conn_request(struct sock *sk, struct sk_buff *skb, } #endif /* CONFIG_IPV6 */ -#ifdef CONFIG_SECURITY_SMACK_NETFILTER /* * If there is a secmark use it rather than the CIPSO label. * If there is no secmark fall back to CIPSO. * The secmark is assumed to reflect policy better. */ - if (skb && skb->secmark != 0) { - skp = smack_from_secid(skb->secmark); - goto access_check; + skp = smack_from_skb(skb); + if (skp == NULL) { + skp = smack_from_netlbl(sk, family, skb); + if (skp == NULL) + skp = &smack_known_huh; } -#endif /* CONFIG_SECURITY_SMACK_NETFILTER */ - - netlbl_secattr_init(&secattr); - rc = netlbl_skbuff_getattr(skb, family, &secattr); - if (rc == 0) - skp = smack_from_secattr(&secattr, ssp); - else - skp = &smack_known_huh; - netlbl_secattr_destroy(&secattr); - -#ifdef CONFIG_SECURITY_SMACK_NETFILTER -access_check: -#endif #ifdef CONFIG_AUDIT smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net); diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index 9c4308077574..e567b4baf3a0 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -922,6 +922,10 @@ static ssize_t smk_set_cipso(struct file *file, const char __user *buf, skp->smk_netlabel.attr.mls.cat = ncats.attr.mls.cat; skp->smk_netlabel.attr.mls.lvl = ncats.attr.mls.lvl; rc = count; + /* + * This mapping may have been cached, so clear the cache. 
+ */ + netlbl_cache_invalidate(); } out: @@ -2950,15 +2954,6 @@ static struct file_system_type smk_fs_type = { static struct vfsmount *smackfs_mount; -static int __init smk_preset_netlabel(struct smack_known *skp) -{ - skp->smk_netlabel.domain = skp->smk_known; - skp->smk_netlabel.flags = - NETLBL_SECATTR_DOMAIN | NETLBL_SECATTR_MLS_LVL; - return smk_netlbl_mls(smack_cipso_direct, skp->smk_known, - &skp->smk_netlabel, strlen(skp->smk_known)); -} - /** * init_smk_fs - get the smackfs superblock * @@ -2997,19 +2992,19 @@ static int __init init_smk_fs(void) smk_cipso_doi(); smk_unlbl_ambient(NULL); - rc = smk_preset_netlabel(&smack_known_floor); + rc = smack_populate_secattr(&smack_known_floor); if (err == 0 && rc < 0) err = rc; - rc = smk_preset_netlabel(&smack_known_hat); + rc = smack_populate_secattr(&smack_known_hat); if (err == 0 && rc < 0) err = rc; - rc = smk_preset_netlabel(&smack_known_huh); + rc = smack_populate_secattr(&smack_known_huh); if (err == 0 && rc < 0) err = rc; - rc = smk_preset_netlabel(&smack_known_star); + rc = smack_populate_secattr(&smack_known_star); if (err == 0 && rc < 0) err = rc; - rc = smk_preset_netlabel(&smack_known_web); + rc = smack_populate_secattr(&smack_known_web); if (err == 0 && rc < 0) err = rc; diff --git a/security/tomoyo/util.c b/security/tomoyo/util.c index eba0b3395851..a40abb0b91ee 100644 --- a/security/tomoyo/util.c +++ b/security/tomoyo/util.c @@ -143,6 +143,8 @@ char *tomoyo_read_token(struct tomoyo_acl_param *param) return pos; } +static bool tomoyo_correct_path2(const char *filename, const size_t len); + /** * tomoyo_get_domainname - Read a domainname from a line. * @@ -157,10 +159,10 @@ const struct tomoyo_path_info *tomoyo_get_domainname char *pos = start; while (*pos) { - if (*pos++ != ' ' || *pos++ == '/') + if (*pos++ != ' ' || + tomoyo_correct_path2(pos, strchrnul(pos, ' ') - pos)) continue; - pos -= 2; - *pos++ = '\0'; + *(pos - 1) = '\0'; break; } param->data = pos; @@ -513,6 +515,22 @@ bool tomoyo_correct_word(const char *string) return tomoyo_correct_word2(string, strlen(string)); } +/** + * tomoyo_correct_path2 - Check whether the given pathname follows the naming rules. + * + * @filename: The pathname to check. + * @len: Length of @filename. + * + * Returns true if @filename follows the naming rules, false otherwise. + */ +static bool tomoyo_correct_path2(const char *filename, const size_t len) +{ + const char *cp1 = memchr(filename, '/', len); + const char *cp2 = memchr(filename, '.', len); + + return cp1 && (!cp2 || (cp1 < cp2)) && tomoyo_correct_word2(filename, len); +} + /** * tomoyo_correct_path - Validate a pathname. 
* @@ -523,7 +541,7 @@ bool tomoyo_correct_word(const char *string) */ bool tomoyo_correct_path(const char *filename) { - return *filename == '/' && tomoyo_correct_word(filename); + return tomoyo_correct_path2(filename, strlen(filename)); } /** @@ -545,8 +563,7 @@ bool tomoyo_correct_domain(const unsigned char *domainname) if (!cp) break; - if (*domainname != '/' || - !tomoyo_correct_word2(domainname, cp - domainname)) + if (!tomoyo_correct_path2(domainname, cp - domainname)) return false; domainname = cp + 1; } diff --git a/tools/testing/nvdimm/dax-dev.c b/tools/testing/nvdimm/dax-dev.c index 7e5d979e73cb..fb342a8c98d3 100644 --- a/tools/testing/nvdimm/dax-dev.c +++ b/tools/testing/nvdimm/dax-dev.c @@ -9,12 +9,19 @@ phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, unsigned long size) { - struct resource *res = &dev_dax->region->res; - phys_addr_t addr; + int i; - addr = pgoff * PAGE_SIZE + res->start; - if (addr >= res->start && addr <= res->end) { - if (addr + size - 1 <= res->end) { + for (i = 0; i < dev_dax->nr_range; i++) { + struct dev_dax_range *dax_range = &dev_dax->ranges[i]; + struct range *range = &dax_range->range; + unsigned long long pgoff_end; + phys_addr_t addr; + + pgoff_end = dax_range->pgoff + PHYS_PFN(range_len(range)) - 1; + if (pgoff < dax_range->pgoff || pgoff > pgoff_end) + continue; + addr = PFN_PHYS(pgoff - dax_range->pgoff) + range->start; + if (addr + size - 1 <= range->end) { if (get_nfit_res(addr)) { struct page *page; @@ -23,9 +30,10 @@ phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, page = vmalloc_to_page((void *)addr); return PFN_PHYS(page_to_pfn(page)); - } else - return addr; + } + return addr; } + break; } return -1; } diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index 03e40b3b0106..c62d372d426f 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -126,7 +126,7 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref) void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { int error; - resource_size_t offset = pgmap->res.start; + resource_size_t offset = pgmap->range.start; struct nfit_test_resource *nfit_res = get_nfit_res(offset); if (!nfit_res) diff --git a/tools/testing/selftests/clone3/clone3.c b/tools/testing/selftests/clone3/clone3.c index b7e6dec36173..42be3b925830 100644 --- a/tools/testing/selftests/clone3/clone3.c +++ b/tools/testing/selftests/clone3/clone3.c @@ -20,13 +20,6 @@ #include "../kselftest.h" #include "clone3_selftests.h" -/* - * Different sizes of struct clone_args - */ -#ifndef CLONE3_ARGS_SIZE_V0 -#define CLONE3_ARGS_SIZE_V0 64 -#endif - enum test_mode { CLONE3_ARGS_NO_TEST, CLONE3_ARGS_ALL_0, @@ -38,13 +31,13 @@ enum test_mode { static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode) { - struct clone_args args = { + struct __clone_args args = { .flags = flags, .exit_signal = SIGCHLD, }; struct clone_args_extended { - struct clone_args args; + struct __clone_args args; __aligned_u64 excess_space[2]; } args_ext; @@ -52,11 +45,11 @@ static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode) int status; memset(&args_ext, 0, sizeof(args_ext)); - if (size > sizeof(struct clone_args)) + if (size > sizeof(struct __clone_args)) args_ext.excess_space[1] = 1; if (size == 0) - size = sizeof(struct clone_args); + size = sizeof(struct __clone_args); switch (test_mode) { case CLONE3_ARGS_ALL_0: @@ -77,9 +70,9 @@ static int call_clone3(uint64_t flags, 
size_t size, enum test_mode test_mode) break; } - memcpy(&args_ext.args, &args, sizeof(struct clone_args)); + memcpy(&args_ext.args, &args, sizeof(struct __clone_args)); - pid = sys_clone3((struct clone_args *)&args_ext, size); + pid = sys_clone3((struct __clone_args *)&args_ext, size); if (pid < 0) { ksft_print_msg("%s - Failed to create new process\n", strerror(errno)); @@ -144,14 +137,14 @@ int main(int argc, char *argv[]) else ksft_test_result_skip("Skipping clone3() with CLONE_NEWPID\n"); - /* Do a clone3() with CLONE3_ARGS_SIZE_V0. */ - test_clone3(0, CLONE3_ARGS_SIZE_V0, 0, CLONE3_ARGS_NO_TEST); + /* Do a clone3() with CLONE_ARGS_SIZE_VER0. */ + test_clone3(0, CLONE_ARGS_SIZE_VER0, 0, CLONE3_ARGS_NO_TEST); - /* Do a clone3() with CLONE3_ARGS_SIZE_V0 - 8 */ - test_clone3(0, CLONE3_ARGS_SIZE_V0 - 8, -EINVAL, CLONE3_ARGS_NO_TEST); + /* Do a clone3() with CLONE_ARGS_SIZE_VER0 - 8 */ + test_clone3(0, CLONE_ARGS_SIZE_VER0 - 8, -EINVAL, CLONE3_ARGS_NO_TEST); /* Do a clone3() with sizeof(struct clone_args) + 8 */ - test_clone3(0, sizeof(struct clone_args) + 8, 0, CLONE3_ARGS_NO_TEST); + test_clone3(0, sizeof(struct __clone_args) + 8, 0, CLONE3_ARGS_NO_TEST); /* Do a clone3() with exit_signal having highest 32 bits non-zero */ test_clone3(0, 0, -EINVAL, CLONE3_ARGS_INVAL_EXIT_SIGNAL_BIG); @@ -165,31 +158,31 @@ int main(int argc, char *argv[]) /* Do a clone3() with NSIG < exit_signal < CSIG */ test_clone3(0, 0, -EINVAL, CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG); - test_clone3(0, sizeof(struct clone_args) + 8, 0, CLONE3_ARGS_ALL_0); + test_clone3(0, sizeof(struct __clone_args) + 8, 0, CLONE3_ARGS_ALL_0); - test_clone3(0, sizeof(struct clone_args) + 16, -E2BIG, + test_clone3(0, sizeof(struct __clone_args) + 16, -E2BIG, CLONE3_ARGS_ALL_0); - test_clone3(0, sizeof(struct clone_args) * 2, -E2BIG, + test_clone3(0, sizeof(struct __clone_args) * 2, -E2BIG, CLONE3_ARGS_ALL_0); /* Do a clone3() with > page size */ test_clone3(0, getpagesize() + 8, -E2BIG, CLONE3_ARGS_NO_TEST); - /* Do a clone3() with CLONE3_ARGS_SIZE_V0 in a new PID NS. */ + /* Do a clone3() with CLONE_ARGS_SIZE_VER0 in a new PID NS. 
*/ if (uid == 0) - test_clone3(CLONE_NEWPID, CLONE3_ARGS_SIZE_V0, 0, + test_clone3(CLONE_NEWPID, CLONE_ARGS_SIZE_VER0, 0, CLONE3_ARGS_NO_TEST); else ksft_test_result_skip("Skipping clone3() with CLONE_NEWPID\n"); - /* Do a clone3() with CLONE3_ARGS_SIZE_V0 - 8 in a new PID NS */ - test_clone3(CLONE_NEWPID, CLONE3_ARGS_SIZE_V0 - 8, -EINVAL, + /* Do a clone3() with CLONE_ARGS_SIZE_VER0 - 8 in a new PID NS */ + test_clone3(CLONE_NEWPID, CLONE_ARGS_SIZE_VER0 - 8, -EINVAL, CLONE3_ARGS_NO_TEST); /* Do a clone3() with sizeof(struct clone_args) + 8 in a new PID NS */ if (uid == 0) - test_clone3(CLONE_NEWPID, sizeof(struct clone_args) + 8, 0, + test_clone3(CLONE_NEWPID, sizeof(struct __clone_args) + 8, 0, CLONE3_ARGS_NO_TEST); else ksft_test_result_skip("Skipping clone3() with CLONE_NEWPID\n"); diff --git a/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c b/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c index 9562425aa0a9..55bd387ce7ec 100644 --- a/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c +++ b/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c @@ -44,13 +44,13 @@ static int call_clone3_set_tid(struct __test_metadata *_metadata, int status; pid_t pid = -1; - struct clone_args args = { + struct __clone_args args = { .exit_signal = SIGCHLD, .set_tid = ptr_to_u64(set_tid), .set_tid_size = set_tid_size, }; - pid = sys_clone3(&args, sizeof(struct clone_args)); + pid = sys_clone3(&args, sizeof(args)); if (pid < 0) { TH_LOG("%s - Failed to create new process", strerror(errno)); return -errno; diff --git a/tools/testing/selftests/clone3/clone3_clear_sighand.c b/tools/testing/selftests/clone3/clone3_clear_sighand.c index db5fc9c5edcf..47a8c0fc3676 100644 --- a/tools/testing/selftests/clone3/clone3_clear_sighand.c +++ b/tools/testing/selftests/clone3/clone3_clear_sighand.c @@ -47,7 +47,7 @@ static void test_clone3_clear_sighand(void) { int ret; pid_t pid; - struct clone_args args = {}; + struct __clone_args args = {}; struct sigaction act; /* diff --git a/tools/testing/selftests/clone3/clone3_selftests.h b/tools/testing/selftests/clone3/clone3_selftests.h index 91c1a78ddb39..e81ffaaee02b 100644 --- a/tools/testing/selftests/clone3/clone3_selftests.h +++ b/tools/testing/selftests/clone3/clone3_selftests.h @@ -19,13 +19,11 @@ #define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. 
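Outside the kselftest harness, the CLONE_ARGS_SIZE_VER0 cases above boil down to invoking clone3() directly with the 64-byte v0 layout. A minimal sketch, assuming a v5.3+ kernel and headers that provide __NR_clone3 and struct clone_args; with flags left at zero it behaves like fork():

#define _GNU_SOURCE
#include <linux/sched.h>        /* struct clone_args, CLONE_ARGS_SIZE_VER0 */
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        struct clone_args args = {
                .exit_signal = SIGCHLD,
        };
        pid_t pid = syscall(__NR_clone3, &args, CLONE_ARGS_SIZE_VER0);

        if (pid < 0) {
                perror("clone3");
                return 1;
        }
        if (pid == 0)                   /* child */
                _exit(0);
        waitpid(pid, NULL, 0);          /* parent reaps the child */
        return 0;
}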
*/ #endif -#ifndef CLONE_ARGS_SIZE_VER0 -#define CLONE_ARGS_SIZE_VER0 64 -#endif - #ifndef __NR_clone3 #define __NR_clone3 -1 -struct clone_args { +#endif + +struct __clone_args { __aligned_u64 flags; __aligned_u64 pidfd; __aligned_u64 child_tid; @@ -34,15 +32,21 @@ struct clone_args { __aligned_u64 stack; __aligned_u64 stack_size; __aligned_u64 tls; -#define CLONE_ARGS_SIZE_VER1 80 +#ifndef CLONE_ARGS_SIZE_VER0 +#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ +#endif __aligned_u64 set_tid; __aligned_u64 set_tid_size; -#define CLONE_ARGS_SIZE_VER2 88 +#ifndef CLONE_ARGS_SIZE_VER1 +#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ +#endif __aligned_u64 cgroup; +#ifndef CLONE_ARGS_SIZE_VER2 +#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */ +#endif }; -#endif /* __NR_clone3 */ -static pid_t sys_clone3(struct clone_args *args, size_t size) +static pid_t sys_clone3(struct __clone_args *args, size_t size) { fflush(stdout); fflush(stderr); @@ -52,7 +56,7 @@ static pid_t sys_clone3(struct clone_args *args, size_t size) static inline void test_clone3_supported(void) { pid_t pid; - struct clone_args args = {}; + struct __clone_args args = {}; if (__NR_clone3 < 0) ksft_exit_skip("clone3() syscall is not supported\n"); diff --git a/tools/testing/selftests/clone3/clone3_set_tid.c b/tools/testing/selftests/clone3/clone3_set_tid.c index 5831c1082d6d..0229e9ebb995 100644 --- a/tools/testing/selftests/clone3/clone3_set_tid.c +++ b/tools/testing/selftests/clone3/clone3_set_tid.c @@ -46,14 +46,14 @@ static int call_clone3_set_tid(pid_t *set_tid, int status; pid_t pid = -1; - struct clone_args args = { + struct __clone_args args = { .flags = flags, .exit_signal = SIGCHLD, .set_tid = ptr_to_u64(set_tid), .set_tid_size = set_tid_size, }; - pid = sys_clone3(&args, sizeof(struct clone_args)); + pid = sys_clone3(&args, sizeof(args)); if (pid < 0) { ksft_print_msg("%s - Failed to create new process\n", strerror(errno)); diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c index 7dca1aa4672d..1f085b922c6e 100644 --- a/tools/testing/selftests/pidfd/pidfd_setns_test.c +++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c @@ -75,7 +75,7 @@ static int sys_waitid(int which, pid_t pid, int options) pid_t create_child(int *pidfd, unsigned flags) { - struct clone_args args = { + struct __clone_args args = { .flags = CLONE_PIDFD | flags, .exit_signal = SIGCHLD, .pidfd = ptr_to_u64(pidfd), diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 7a6d40286a42..4a180439ee9e 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -774,8 +774,15 @@ void *kill_thread(void *data) return (void *)SIBLING_EXIT_UNKILLED; } +enum kill_t { + KILL_THREAD, + KILL_PROCESS, + RET_UNKNOWN +}; + /* Prepare a thread that will kill itself or both of us. */ -void kill_thread_or_group(struct __test_metadata *_metadata, bool kill_process) +void kill_thread_or_group(struct __test_metadata *_metadata, + enum kill_t kill_how) { pthread_t thread; void *status; @@ -791,11 +798,12 @@ void kill_thread_or_group(struct __test_metadata *_metadata, bool kill_process) .len = (unsigned short)ARRAY_SIZE(filter_thread), .filter = filter_thread, }; + int kill = kill_how == KILL_PROCESS ? 
SECCOMP_RET_KILL_PROCESS : 0xAAAAAAAAA; struct sock_filter filter_process[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1), - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_PROCESS), + BPF_STMT(BPF_RET|BPF_K, kill), BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), }; struct sock_fprog prog_process = { @@ -808,13 +816,15 @@ void kill_thread_or_group(struct __test_metadata *_metadata, bool kill_process) } ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, - kill_process ? &prog_process : &prog_thread)); + kill_how == KILL_THREAD ? &prog_thread + : &prog_process)); /* * Add the KILL_THREAD rule again to make sure that the KILL_PROCESS * flag cannot be downgraded by a new filter. */ - ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog_thread)); + if (kill_how == KILL_PROCESS) + ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog_thread)); /* Start a thread that will exit immediately. */ ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)false)); @@ -842,7 +852,7 @@ TEST(KILL_thread) child_pid = fork(); ASSERT_LE(0, child_pid); if (child_pid == 0) { - kill_thread_or_group(_metadata, false); + kill_thread_or_group(_metadata, KILL_THREAD); _exit(38); } @@ -861,7 +871,7 @@ TEST(KILL_process) child_pid = fork(); ASSERT_LE(0, child_pid); if (child_pid == 0) { - kill_thread_or_group(_metadata, true); + kill_thread_or_group(_metadata, KILL_PROCESS); _exit(38); } @@ -872,6 +882,27 @@ TEST(KILL_process) ASSERT_EQ(SIGSYS, WTERMSIG(status)); } +TEST(KILL_unknown) +{ + int status; + pid_t child_pid; + + child_pid = fork(); + ASSERT_LE(0, child_pid); + if (child_pid == 0) { + kill_thread_or_group(_metadata, RET_UNKNOWN); + _exit(38); + } + + ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0)); + + /* If the entire process was killed, we'll see SIGSYS. */ + EXPECT_TRUE(WIFSIGNALED(status)) { + TH_LOG("Unknown SECCOMP_RET is only killing the thread?"); + } + ASSERT_EQ(SIGSYS, WTERMSIG(status)); +} + /* TODO(wad) add 64-bit versus 32-bit arg tests. 
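Stripped of the harness, the filter that kill_thread_or_group() installs for KILL_PROCESS is an ordinary classic-BPF program keyed on the syscall number. A standalone sketch, assuming Linux v4.14+ for SECCOMP_RET_KILL_PROCESS (like the test it matches only nr, not arch); after loading it, the next prctl() call kills the whole thread group with SIGSYS:

#include <linux/filter.h>
#include <linux/seccomp.h>
#include <stddef.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        struct sock_filter filter[] = {
                BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
                         offsetof(struct seccomp_data, nr)),
                BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_prctl, 0, 1),
                BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
                BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
        };
        struct sock_fprog prog = {
                .len = (unsigned short)(sizeof(filter) / sizeof(filter[0])),
                .filter = filter,
        };

        /* Required to load a filter without CAP_SYS_ADMIN. */
        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
                perror("PR_SET_NO_NEW_PRIVS");
        if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, 0, &prog))
                perror("seccomp");

        printf("filter loaded; the next prctl() raises SIGSYS\n");
        return 0;
}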
*/ TEST(arg_out_of_range) { @@ -1667,70 +1698,148 @@ TEST_F(TRACE_poke, getpid_runs_normally) } #if defined(__x86_64__) -# define ARCH_REGS struct user_regs_struct -# define SYSCALL_NUM orig_rax -# define SYSCALL_RET rax +# define ARCH_REGS struct user_regs_struct +# define SYSCALL_NUM(_regs) (_regs).orig_rax +# define SYSCALL_RET(_regs) (_regs).rax #elif defined(__i386__) -# define ARCH_REGS struct user_regs_struct -# define SYSCALL_NUM orig_eax -# define SYSCALL_RET eax +# define ARCH_REGS struct user_regs_struct +# define SYSCALL_NUM(_regs) (_regs).orig_eax +# define SYSCALL_RET(_regs) (_regs).eax #elif defined(__arm__) -# define ARCH_REGS struct pt_regs -# define SYSCALL_NUM ARM_r7 -# define SYSCALL_RET ARM_r0 +# define ARCH_REGS struct pt_regs +# define SYSCALL_NUM(_regs) (_regs).ARM_r7 +# ifndef PTRACE_SET_SYSCALL +# define PTRACE_SET_SYSCALL 23 +# endif +# define SYSCALL_NUM_SET(_regs, _nr) \ + EXPECT_EQ(0, ptrace(PTRACE_SET_SYSCALL, tracee, NULL, _nr)) +# define SYSCALL_RET(_regs) (_regs).ARM_r0 #elif defined(__aarch64__) -# define ARCH_REGS struct user_pt_regs -# define SYSCALL_NUM regs[8] -# define SYSCALL_RET regs[0] +# define ARCH_REGS struct user_pt_regs +# define SYSCALL_NUM(_regs) (_regs).regs[8] +# ifndef NT_ARM_SYSTEM_CALL +# define NT_ARM_SYSTEM_CALL 0x404 +# endif +# define SYSCALL_NUM_SET(_regs, _nr) \ + do { \ + struct iovec __v; \ + typeof(_nr) __nr = (_nr); \ + __v.iov_base = &__nr; \ + __v.iov_len = sizeof(__nr); \ + EXPECT_EQ(0, ptrace(PTRACE_SETREGSET, tracee, \ + NT_ARM_SYSTEM_CALL, &__v)); \ + } while (0) +# define SYSCALL_RET(_regs) (_regs).regs[0] #elif defined(__riscv) && __riscv_xlen == 64 -# define ARCH_REGS struct user_regs_struct -# define SYSCALL_NUM a7 -# define SYSCALL_RET a0 +# define ARCH_REGS struct user_regs_struct +# define SYSCALL_NUM(_regs) (_regs).a7 +# define SYSCALL_RET(_regs) (_regs).a0 #elif defined(__csky__) -# define ARCH_REGS struct pt_regs -#if defined(__CSKYABIV2__) -# define SYSCALL_NUM regs[3] -#else -# define SYSCALL_NUM regs[9] -#endif -# define SYSCALL_RET a0 +# define ARCH_REGS struct pt_regs +# if defined(__CSKYABIV2__) +# define SYSCALL_NUM(_regs) (_regs).regs[3] +# else +# define SYSCALL_NUM(_regs) (_regs).regs[9] +# endif +# define SYSCALL_RET(_regs) (_regs).a0 #elif defined(__hppa__) -# define ARCH_REGS struct user_regs_struct -# define SYSCALL_NUM gr[20] -# define SYSCALL_RET gr[28] +# define ARCH_REGS struct user_regs_struct +# define SYSCALL_NUM(_regs) (_regs).gr[20] +# define SYSCALL_RET(_regs) (_regs).gr[28] #elif defined(__powerpc__) -# define ARCH_REGS struct pt_regs -# define SYSCALL_NUM gpr[0] -# define SYSCALL_RET gpr[3] +# define ARCH_REGS struct pt_regs +# define SYSCALL_NUM(_regs) (_regs).gpr[0] +# define SYSCALL_RET(_regs) (_regs).gpr[3] +# define SYSCALL_RET_SET(_regs, _val) \ + do { \ + typeof(_val) _result = (_val); \ + /* \ + * A syscall error is signaled by CR0 SO bit \ + * and the code is stored as a positive value. 
\ + */ \ + if (_result < 0) { \ + SYSCALL_RET(_regs) = -result; \ + (_regs).ccr |= 0x10000000; \ + } else { \ + SYSCALL_RET(_regs) = result; \ + (_regs).ccr &= ~0x10000000; \ + } \ + } while (0) +# define SYSCALL_RET_SET_ON_PTRACE_EXIT #elif defined(__s390__) -# define ARCH_REGS s390_regs -# define SYSCALL_NUM gprs[2] -# define SYSCALL_RET gprs[2] -# define SYSCALL_NUM_RET_SHARE_REG +# define ARCH_REGS s390_regs +# define SYSCALL_NUM(_regs) (_regs).gprs[2] +# define SYSCALL_RET_SET(_regs, _val) \ + TH_LOG("Can't modify syscall return on this architecture") #elif defined(__mips__) -# define ARCH_REGS struct pt_regs -# define SYSCALL_NUM regs[2] -# define SYSCALL_SYSCALL_NUM regs[4] -# define SYSCALL_RET regs[2] -# define SYSCALL_NUM_RET_SHARE_REG +# include +# include +# include +# define ARCH_REGS struct pt_regs +# define SYSCALL_NUM(_regs) \ + ({ \ + typeof((_regs).regs[2]) _nr; \ + if ((_regs).regs[2] == __NR_O32_Linux) \ + _nr = (_regs).regs[4]; \ + else \ + _nr = (_regs).regs[2]; \ + _nr; \ + }) +# define SYSCALL_NUM_SET(_regs, _nr) \ + do { \ + if ((_regs).regs[2] == __NR_O32_Linux) \ + (_regs).regs[4] = _nr; \ + else \ + (_regs).regs[2] = _nr; \ + } while (0) +# define SYSCALL_RET_SET(_regs, _val) \ + TH_LOG("Can't modify syscall return on this architecture") #elif defined(__xtensa__) -# define ARCH_REGS struct user_pt_regs -# define SYSCALL_NUM syscall +# define ARCH_REGS struct user_pt_regs +# define SYSCALL_NUM(_regs) (_regs).syscall /* * On xtensa syscall return value is in the register * a2 of the current window which is not fixed. */ -#define SYSCALL_RET(reg) a[(reg).windowbase * 4 + 2] +#define SYSCALL_RET(_regs) (_regs).a[(_regs).windowbase * 4 + 2] #elif defined(__sh__) -# define ARCH_REGS struct pt_regs -# define SYSCALL_NUM gpr[3] -# define SYSCALL_RET gpr[0] +# define ARCH_REGS struct pt_regs +# define SYSCALL_NUM(_regs) (_regs).gpr[3] +# define SYSCALL_RET(_regs) (_regs).gpr[0] #else # error "Do not know how to find your architecture's registers and syscalls" #endif +/* + * Most architectures can change the syscall by just updating the + * associated register. This is the default if not defined above. + */ +#ifndef SYSCALL_NUM_SET +# define SYSCALL_NUM_SET(_regs, _nr) \ + do { \ + SYSCALL_NUM(_regs) = (_nr); \ + } while (0) +#endif +/* + * Most architectures can change the syscall return value by just + * writing to the SYSCALL_RET register. This is the default if not + * defined above. If an architecture cannot set the return value + * (for example when the syscall and return value register is + * shared), report it with TH_LOG() in an arch-specific definition + * of SYSCALL_RET_SET() above, and leave SYSCALL_RET undefined. + */ +#if !defined(SYSCALL_RET) && !defined(SYSCALL_RET_SET) +# error "One of SYSCALL_RET or SYSCALL_RET_SET is needed for this arch" +#endif +#ifndef SYSCALL_RET_SET +# define SYSCALL_RET_SET(_regs, _val) \ + do { \ + SYSCALL_RET(_regs) = (_val); \ + } while (0) +#endif + /* When the syscall return can't be changed, stub out the tests for it. */ -#ifdef SYSCALL_NUM_RET_SHARE_REG +#ifndef SYSCALL_RET # define EXPECT_SYSCALL_RETURN(val, action) EXPECT_EQ(-1, action) #else # define EXPECT_SYSCALL_RETURN(val, action) \ @@ -1745,116 +1854,92 @@ TEST_F(TRACE_poke, getpid_runs_normally) } while (0) #endif -/* Use PTRACE_GETREGS and PTRACE_SETREGS when available. This is useful for +/* + * Some architectures (e.g. powerpc) can only set syscall + * return values on syscall exit during ptrace. 
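One nit in the powerpc SYSCALL_RET_SET() above: its body stores -result and result, while the temporary it declares is _result, so the expansion cannot compile as written. A version with the references spelled consistently would be:

# define SYSCALL_RET_SET(_regs, _val)                          \
        do {                                                   \
                typeof(_val) _result = (_val);                 \
                /*                                             \
                 * A syscall error is signaled by CR0 SO bit   \
                 * and the code is stored as a positive value. \
                 */                                            \
                if (_result < 0) {                             \
                        SYSCALL_RET(_regs) = -_result;         \
                        (_regs).ccr |= 0x10000000;             \
                } else {                                       \
                        SYSCALL_RET(_regs) = _result;          \
                        (_regs).ccr &= ~0x10000000;            \
                }                                              \
        } while (0)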
+ */ +const bool ptrace_entry_set_syscall_nr = true; +const bool ptrace_entry_set_syscall_ret = +#ifndef SYSCALL_RET_SET_ON_PTRACE_EXIT + true; +#else + false; +#endif + +/* + * Use PTRACE_GETREGS and PTRACE_SETREGS when available. This is useful for * architectures without HAVE_ARCH_TRACEHOOK (e.g. User-mode Linux). */ #if defined(__x86_64__) || defined(__i386__) || defined(__mips__) -#define HAVE_GETREGS +# define ARCH_GETREGS(_regs) ptrace(PTRACE_GETREGS, tracee, 0, &(_regs)) +# define ARCH_SETREGS(_regs) ptrace(PTRACE_SETREGS, tracee, 0, &(_regs)) +#else +# define ARCH_GETREGS(_regs) ({ \ + struct iovec __v; \ + __v.iov_base = &(_regs); \ + __v.iov_len = sizeof(_regs); \ + ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &__v); \ + }) +# define ARCH_SETREGS(_regs) ({ \ + struct iovec __v; \ + __v.iov_base = &(_regs); \ + __v.iov_len = sizeof(_regs); \ + ptrace(PTRACE_SETREGSET, tracee, NT_PRSTATUS, &__v); \ + }) #endif /* Architecture-specific syscall fetching routine. */ int get_syscall(struct __test_metadata *_metadata, pid_t tracee) { ARCH_REGS regs; -#ifdef HAVE_GETREGS - EXPECT_EQ(0, ptrace(PTRACE_GETREGS, tracee, 0, ®s)) { - TH_LOG("PTRACE_GETREGS failed"); + + EXPECT_EQ(0, ARCH_GETREGS(regs)) { return -1; } -#else - struct iovec iov; - iov.iov_base = ®s; - iov.iov_len = sizeof(regs); - EXPECT_EQ(0, ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &iov)) { - TH_LOG("PTRACE_GETREGSET failed"); - return -1; - } -#endif - -#if defined(__mips__) - if (regs.SYSCALL_NUM == __NR_O32_Linux) - return regs.SYSCALL_SYSCALL_NUM; -#endif - return regs.SYSCALL_NUM; + return SYSCALL_NUM(regs); } /* Architecture-specific syscall changing routine. */ -void change_syscall(struct __test_metadata *_metadata, - pid_t tracee, int syscall, int result) +void __change_syscall(struct __test_metadata *_metadata, + pid_t tracee, long *syscall, long *ret) { - int ret; - ARCH_REGS regs; -#ifdef HAVE_GETREGS - ret = ptrace(PTRACE_GETREGS, tracee, 0, ®s); -#else - struct iovec iov; - iov.iov_base = ®s; - iov.iov_len = sizeof(regs); - ret = ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &iov); -#endif - EXPECT_EQ(0, ret) {} + ARCH_REGS orig, regs; -#if defined(__x86_64__) || defined(__i386__) || defined(__powerpc__) || \ - defined(__s390__) || defined(__hppa__) || defined(__riscv) || \ - defined(__xtensa__) || defined(__csky__) || defined(__sh__) - { - regs.SYSCALL_NUM = syscall; - } -#elif defined(__mips__) - { - if (regs.SYSCALL_NUM == __NR_O32_Linux) - regs.SYSCALL_SYSCALL_NUM = syscall; - else - regs.SYSCALL_NUM = syscall; + /* Do not get/set registers if we have nothing to do. */ + if (!syscall && !ret) + return; + + EXPECT_EQ(0, ARCH_GETREGS(regs)) { + return; } + orig = regs; -#elif defined(__arm__) -# ifndef PTRACE_SET_SYSCALL -# define PTRACE_SET_SYSCALL 23 -# endif - { - ret = ptrace(PTRACE_SET_SYSCALL, tracee, NULL, syscall); - EXPECT_EQ(0, ret); - } + if (syscall) + SYSCALL_NUM_SET(regs, *syscall); -#elif defined(__aarch64__) -# ifndef NT_ARM_SYSTEM_CALL -# define NT_ARM_SYSTEM_CALL 0x404 -# endif - { - iov.iov_base = &syscall; - iov.iov_len = sizeof(syscall); - ret = ptrace(PTRACE_SETREGSET, tracee, NT_ARM_SYSTEM_CALL, - &iov); - EXPECT_EQ(0, ret); - } + if (ret) + SYSCALL_RET_SET(regs, *ret); -#else - ASSERT_EQ(1, 0) { - TH_LOG("How is the syscall changed on this architecture?"); - } -#endif + /* Flush any register changes made. */ + if (memcmp(&orig, ®s, sizeof(orig)) != 0) + EXPECT_EQ(0, ARCH_SETREGS(regs)); +} - /* If syscall is skipped, change return value. 
*/ - if (syscall == -1) -#ifdef SYSCALL_NUM_RET_SHARE_REG - TH_LOG("Can't modify syscall return on this architecture"); +/* Change only syscall number. */ +void change_syscall_nr(struct __test_metadata *_metadata, + pid_t tracee, long syscall) +{ + __change_syscall(_metadata, tracee, &syscall, NULL); +} -#elif defined(__xtensa__) - regs.SYSCALL_RET(regs) = result; -#else - regs.SYSCALL_RET = result; -#endif +/* Change syscall return value (and set syscall number to -1). */ +void change_syscall_ret(struct __test_metadata *_metadata, + pid_t tracee, long ret) +{ + long syscall = -1; -#ifdef HAVE_GETREGS - ret = ptrace(PTRACE_SETREGS, tracee, 0, ®s); -#else - iov.iov_base = ®s; - iov.iov_len = sizeof(regs); - ret = ptrace(PTRACE_SETREGSET, tracee, NT_PRSTATUS, &iov); -#endif - EXPECT_EQ(0, ret); + __change_syscall(_metadata, tracee, &syscall, &ret); } void tracer_seccomp(struct __test_metadata *_metadata, pid_t tracee, @@ -1872,17 +1957,17 @@ void tracer_seccomp(struct __test_metadata *_metadata, pid_t tracee, case 0x1002: /* change getpid to getppid. */ EXPECT_EQ(__NR_getpid, get_syscall(_metadata, tracee)); - change_syscall(_metadata, tracee, __NR_getppid, 0); + change_syscall_nr(_metadata, tracee, __NR_getppid); break; case 0x1003: /* skip gettid with valid return code. */ EXPECT_EQ(__NR_gettid, get_syscall(_metadata, tracee)); - change_syscall(_metadata, tracee, -1, 45000); + change_syscall_ret(_metadata, tracee, 45000); break; case 0x1004: /* skip openat with error. */ EXPECT_EQ(__NR_openat, get_syscall(_metadata, tracee)); - change_syscall(_metadata, tracee, -1, -ESRCH); + change_syscall_ret(_metadata, tracee, -ESRCH); break; case 0x1005: /* do nothing (allow getppid) */ @@ -1897,12 +1982,21 @@ void tracer_seccomp(struct __test_metadata *_metadata, pid_t tracee, } +FIXTURE(TRACE_syscall) { + struct sock_fprog prog; + pid_t tracer, mytid, mypid, parent; + long syscall_nr; +}; + void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee, int status, void *args) { - int ret, nr; + int ret; unsigned long msg; static bool entry; + long syscall_nr_val, syscall_ret_val; + long *syscall_nr = NULL, *syscall_ret = NULL; + FIXTURE_DATA(TRACE_syscall) *self = args; /* * The traditional way to tell PTRACE_SYSCALL entry/exit @@ -1916,24 +2010,48 @@ void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee, EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY : PTRACE_EVENTMSG_SYSCALL_EXIT, msg); - if (!entry) + /* + * Some architectures only support setting return values during + * syscall exit under ptrace, and on exit the syscall number may + * no longer be available. Therefore, save the initial sycall + * number here, so it can be examined during both entry and exit + * phases. + */ + if (entry) + self->syscall_nr = get_syscall(_metadata, tracee); + + /* + * Depending on the architecture's syscall setting abilities, we + * pick which things to set during this phase (entry or exit). + */ + if (entry == ptrace_entry_set_syscall_nr) + syscall_nr = &syscall_nr_val; + if (entry == ptrace_entry_set_syscall_ret) + syscall_ret = &syscall_ret_val; + + /* Now handle the actual rewriting cases. */ + switch (self->syscall_nr) { + case __NR_getpid: + syscall_nr_val = __NR_getppid; + /* Never change syscall return for this case. */ + syscall_ret = NULL; + break; + case __NR_gettid: + syscall_nr_val = -1; + syscall_ret_val = 45000; + break; + case __NR_openat: + syscall_nr_val = -1; + syscall_ret_val = -ESRCH; + break; + default: + /* Unhandled, do nothing. 
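The entry/exit bookkeeping above reduces, on a single architecture, to a small tracer loop. A minimal x86-64-only sketch of the 0x1002 case (rewrite getpid() to getppid()); error handling and PTRACE_O_TRACESYSGOOD handling are omitted, and other architectures need the register accessors from the table above:

#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/syscall.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        pid_t child = fork();

        if (child == 0) {
                ptrace(PTRACE_TRACEME, 0, NULL, NULL);
                raise(SIGSTOP);         /* wait for the tracer */
                printf("getpid() returned %ld\n", syscall(__NR_getpid));
                return 0;
        }

        waitpid(child, NULL, 0);        /* child stopped itself */
        for (;;) {
                struct user_regs_struct regs;
                int status;

                ptrace(PTRACE_SYSCALL, child, NULL, NULL);
                waitpid(child, &status, 0);
                if (WIFEXITED(status))
                        break;
                ptrace(PTRACE_GETREGS, child, NULL, &regs);
                if (regs.orig_rax == __NR_getpid) {
                        regs.orig_rax = __NR_getppid;   /* rewrite the syscall */
                        ptrace(PTRACE_SETREGS, child, NULL, &regs);
                }
        }
        return 0;
}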
*/ return; + } - nr = get_syscall(_metadata, tracee); - - if (nr == __NR_getpid) - change_syscall(_metadata, tracee, __NR_getppid, 0); - if (nr == __NR_gettid) - change_syscall(_metadata, tracee, -1, 45000); - if (nr == __NR_openat) - change_syscall(_metadata, tracee, -1, -ESRCH); + __change_syscall(_metadata, tracee, syscall_nr, syscall_ret); } -FIXTURE(TRACE_syscall) { - struct sock_fprog prog; - pid_t tracer, mytid, mypid, parent; -}; - FIXTURE_VARIANT(TRACE_syscall) { /* * All of the SECCOMP_RET_TRACE behaviors can be tested with either @@ -1992,7 +2110,7 @@ FIXTURE_SETUP(TRACE_syscall) self->tracer = setup_trace_fixture(_metadata, variant->use_ptrace ? tracer_ptrace : tracer_seccomp, - NULL, variant->use_ptrace); + self, variant->use_ptrace); ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); ASSERT_EQ(0, ret); @@ -3142,11 +3260,11 @@ skip: static int user_notif_syscall(int nr, unsigned int flags) { struct sock_filter filter[] = { - BPF_STMT(BPF_LD+BPF_W+BPF_ABS, + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), - BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1), - BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF), - BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, nr, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_USER_NOTIF), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), }; struct sock_fprog prog = { @@ -3699,7 +3817,7 @@ TEST(user_notification_filter_empty) long ret; int status; struct pollfd pollfd; - struct clone_args args = { + struct __clone_args args = { .flags = CLONE_FILES, .exit_signal = SIGCHLD, }; @@ -3715,7 +3833,7 @@ TEST(user_notification_filter_empty) if (pid == 0) { int listener; - listener = user_notif_syscall(__NR_mknod, SECCOMP_FILTER_FLAG_NEW_LISTENER); + listener = user_notif_syscall(__NR_mknodat, SECCOMP_FILTER_FLAG_NEW_LISTENER); if (listener < 0) _exit(EXIT_FAILURE); @@ -3753,7 +3871,7 @@ TEST(user_notification_filter_empty_threaded) long ret; int status; struct pollfd pollfd; - struct clone_args args = { + struct __clone_args args = { .flags = CLONE_FILES, .exit_signal = SIGCHLD, }; diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index a9026706d597..30873b19d04b 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -3,6 +3,23 @@ uname_M := $(shell uname -m 2>/dev/null || echo not) MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/') +# Without this, failed build products remain, with up-to-date timestamps, +# thus tricking Make (and you!) into believing that All Is Well, in subsequent +# make invocations: +.DELETE_ON_ERROR: + +# Avoid accidental wrong builds, due to built-in rules working just a little +# bit too well--but not quite as well as required for our situation here. +# +# In other words, "make userfaultfd" is supposed to fail to build at all, +# because this Makefile only supports either "make" (all), or "make /full/path". +# However, the built-in rules, if not suppressed, will pick up CFLAGS and the +# initial LDLIBS (but not the target-specific LDLIBS, because those are only +# set for the full path target!). This causes it to get pretty far into building +# things despite using incorrect values such as an *occasionally* incomplete +# LDLIBS. 
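user_notif_syscall() above returns the new-listener file descriptor that a supervisor then services. A compact sketch of that round trip, assuming Linux v5.0+ so that <linux/seccomp.h> provides SECCOMP_IOCTL_NOTIF_RECV/SEND: a worker thread traps its own getppid() and the main thread answers with a spoofed return value (build with -lpthread):

#define _GNU_SOURCE
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <pthread.h>
#include <semaphore.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int listener = -1;
static sem_t ready;

static void *worker(void *arg)
{
        struct sock_filter filter[] = {
                BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
                         offsetof(struct seccomp_data, nr)),
                BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getppid, 0, 1),
                BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_USER_NOTIF),
                BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
        };
        struct sock_fprog prog = {
                .len = (unsigned short)(sizeof(filter) / sizeof(filter[0])),
                .filter = filter,
        };

        prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
        /* No TSYNC, so the filter applies to this thread only. */
        listener = syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
                           SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
        sem_post(&ready);
        /* Blocks until the supervisor answers the notification. */
        printf("getppid() spoofed to %ld\n", syscall(__NR_getppid));
        return NULL;
}

int main(void)
{
        struct seccomp_notif req;
        struct seccomp_notif_resp resp;
        pthread_t thread;

        sem_init(&ready, 0, 0);
        pthread_create(&thread, NULL, worker, NULL);
        sem_wait(&ready);
        if (listener < 0) {
                perror("seccomp");
                return 1;
        }

        memset(&req, 0, sizeof(req));   /* the kernel expects a zeroed buffer */
        if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req)) {
                perror("NOTIF_RECV");
                return 1;
        }
        memset(&resp, 0, sizeof(resp));
        resp.id = req.id;
        resp.val = 12345;               /* fake return value for the worker */
        if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp))
                perror("NOTIF_SEND");

        pthread_join(thread, NULL);
        return 0;
}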
+MAKEFLAGS += --no-builtin-rules + CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS) LDLIBS = -lrt TEST_GEN_FILES = compaction_test diff --git a/tools/testing/selftests/vm/compaction_test.c b/tools/testing/selftests/vm/compaction_test.c index bcec71250873..9b420140ba2b 100644 --- a/tools/testing/selftests/vm/compaction_test.c +++ b/tools/testing/selftests/vm/compaction_test.c @@ -18,7 +18,8 @@ #include "../kselftest.h" -#define MAP_SIZE 1048576 +#define MAP_SIZE_MB 100 +#define MAP_SIZE (MAP_SIZE_MB * 1024 * 1024) struct map_list { void *map; @@ -165,7 +166,7 @@ int main(int argc, char **argv) void *map = NULL; unsigned long mem_free = 0; unsigned long hugepage_size = 0; - unsigned long mem_fragmentable = 0; + long mem_fragmentable_MB = 0; if (prereq() != 0) { printf("Either the sysctl compact_unevictable_allowed is not\n" @@ -190,9 +191,9 @@ int main(int argc, char **argv) return -1; } - mem_fragmentable = mem_free * 0.8 / 1024; + mem_fragmentable_MB = mem_free * 0.8 / 1024; - while (mem_fragmentable > 0) { + while (mem_fragmentable_MB > 0) { map = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_LOCKED, -1, 0); if (map == MAP_FAILED) @@ -213,7 +214,7 @@ int main(int argc, char **argv) for (i = 0; i < MAP_SIZE; i += page_size) *(unsigned long *)(map + i) = (unsigned long)map + i; - mem_fragmentable--; + mem_fragmentable_MB -= MAP_SIZE_MB; } for (entry = list; entry != NULL; entry = entry->next) { diff --git a/tools/testing/selftests/vm/gup_benchmark.c b/tools/testing/selftests/vm/gup_benchmark.c index 43b4dfe161a2..31f8bb086907 100644 --- a/tools/testing/selftests/vm/gup_benchmark.c +++ b/tools/testing/selftests/vm/gup_benchmark.c @@ -15,12 +15,12 @@ #define PAGE_SIZE sysconf(_SC_PAGESIZE) #define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) -#define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark) -#define GUP_BENCHMARK _IOWR('g', 3, struct gup_benchmark) +#define GUP_BENCHMARK _IOWR('g', 2, struct gup_benchmark) /* Similar to above, but use FOLL_PIN instead of FOLL_GET. */ -#define PIN_FAST_BENCHMARK _IOWR('g', 4, struct gup_benchmark) -#define PIN_BENCHMARK _IOWR('g', 5, struct gup_benchmark) +#define PIN_FAST_BENCHMARK _IOWR('g', 3, struct gup_benchmark) +#define PIN_BENCHMARK _IOWR('g', 4, struct gup_benchmark) +#define PIN_LONGTERM_BENCHMARK _IOWR('g', 5, struct gup_benchmark) /* Just the flags we need, copied from mm.h: */ #define FOLL_WRITE 0x01 /* check pte is writable */ @@ -52,6 +52,9 @@ int main(int argc, char **argv) case 'b': cmd = PIN_BENCHMARK; break; + case 'L': + cmd = PIN_LONGTERM_BENCHMARK; + break; case 'm': size = atoi(optarg) * MB; break; @@ -67,9 +70,6 @@ int main(int argc, char **argv) case 'T': thp = 0; break; - case 'L': - cmd = GUP_LONGTERM_BENCHMARK; - break; case 'U': cmd = GUP_BENCHMARK; break; diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c index 93fc5cadce61..0a28a6a29581 100644 --- a/tools/testing/selftests/vm/hmm-tests.c +++ b/tools/testing/selftests/vm/hmm-tests.c @@ -680,7 +680,7 @@ TEST_F(hmm, anon_write_hugetlbfs) n = gethugepagesizes(pagesizes, 4); if (n <= 0) - return; + SKIP(return, "Huge page size could not be determined"); for (idx = 0; --n > 0; ) { if (pagesizes[n] < pagesizes[idx]) idx = n; @@ -694,7 +694,7 @@ TEST_F(hmm, anon_write_hugetlbfs) buffer->ptr = get_hugepage_region(size, GHR_STRICT); if (buffer->ptr == NULL) { free(buffer); - return; + SKIP(return, "Huge page could not be allocated"); } buffer->fd = -1;