android_kernel_xiaomi_sm8450/mm/hugetlb.c
Greg Kroah-Hartman d0782c9411 Merge tag 'android12-5.10.160_r00' into android12-5.10
This merges the upstream LTS release 5.10.160 into the
android12-5.10 branch.

It contains the following commits:

003c389455 Merge 5.10.160 into android12-5.10-lts
a2428a8dcb Linux 5.10.160
54c15f67cb ASoC: ops: Correct bounds check for second channel on SX controls
74b139c63f nvme-pci: clear the prp2 field when not used
77ebf88e00 ASoC: cs42l51: Correct PGA Volume minimum value
4db1d19b74 can: mcba_usb: Fix termination command argument
683837f2f6 can: sja1000: fix size of OCR_MODE_MASK define
434b523671 pinctrl: meditatek: Startup with the IRQs disabled
5cb4abb0ca libbpf: Use page size as max_entries when probing ring buffer map
50b5f6d4d9 ASoC: ops: Check bounds for second channel in snd_soc_put_volsw_sx()
344739dc56 ASoC: fsl_micfil: explicitly clear CHnF flags
a49c1a7307 ASoC: fsl_micfil: explicitly clear software reset bit
75454b4bbf io_uring: add missing item types for splice request
17f386e6b7 fuse: always revalidate if exclusive create
eb6313c129 nfp: fix use-after-free in area_cache_get()
965d93fb39 vfs: fix copy_file_range() averts filesystem freeze protection
ed96733949 vfs: fix copy_file_range() regression in cross-fs copies
970862a96c x86/smpboot: Move rcu_cpu_starting() earlier
32e45c58a0 Merge "Merge 5.10.159 into android12-5.10-lts" into android12-5.10-lts
d31626cbea ANDROID: usb: gadget: uvc: remove duplicate code in unbind
01ef2d0b53 Merge 5.10.159 into android12-5.10-lts
931578be69 Linux 5.10.159
4fd6f84e0a can: esd_usb: Allow REC and TEC to return to zero
cf0e423106 macsec: add missing attribute validation for offload
6b03e41767 net: mvneta: Fix an out of bounds check
8208d7e56b ipv6: avoid use-after-free in ip6_fragment()
3d59adad12 net: plip: don't call kfree_skb/dev_kfree_skb() under spin_lock_irq()
a00444e25b xen/netback: fix build warning
87277bdf2c ethernet: aeroflex: fix potential skb leak in greth_init_rings()
cc668fddde tipc: call tipc_lxc_xmit without holding node_read_lock
4be43e46c3 net: dsa: sja1105: fix memory leak in sja1105_setup_devlink_regions()
8e3f9ac009 ipv4: Fix incorrect route flushing when table ID 0 is used
5211e5ff9d ipv4: Fix incorrect route flushing when source address is deleted
36e248269a tipc: Fix potential OOB in tipc_link_proto_rcv()
93aaa4bb72 net: hisilicon: Fix potential use-after-free in hix5hd2_rx()
296a50aa8b net: hisilicon: Fix potential use-after-free in hisi_femac_rx()
8d1aed7a11 net: thunderx: Fix missing destroy_workqueue of nicvf_rx_mode_wq
a5cfbc1995 ip_gre: do not report erspan version on GRE interface
696e34d54c net: stmmac: fix "snps,axi-config" node property parsing
ca26f45083 nvme initialize core quirks before calling nvme_init_subsystem
27eb2d7a1b NFC: nci: Bounds check struct nfc_target arrays
a2506b19d7 i40e: Disallow ip4 and ip6 l4_4_bytes
8329b65e34 i40e: Fix for VF MAC address 0
215f3ac53b i40e: Fix not setting default xps_cpus after reset
146ebee8fc net: mvneta: Prevent out of bounds read in mvneta_config_rss()
e6860c889f xen-netfront: Fix NULL sring after live migration
3d3b30718a net: encx24j600: Fix invalid logic in reading of MISTAT register
51ba1820e7 net: encx24j600: Add parentheses to fix precedence
42c319635c mac802154: fix missing INIT_LIST_HEAD in ieee802154_if_add()
4c693330ce selftests: rtnetlink: correct xfrm policy rule in kci_test_ipsec_offload
bccda3ad07 net: dsa: ksz: Check return value
e7b9504581 Bluetooth: Fix not cleanup led when bt_init fails
1717354d77 Bluetooth: 6LoWPAN: add missing hci_dev_put() in get_l2cap_conn()
80c69b31aa vmxnet3: correctly report encapsulated LRO packet
575a6266f6 af_unix: Get user_ns from in_skb in unix_diag_get_exact().
6c788c0a25 drm: bridge: dw_hdmi: fix preference of RGB modes over YUV420
de918d9738 igb: Allocate MSI-X vector when testing
6595c9208d e1000e: Fix TX dispatch condition
5ee6413d3d gpio: amd8111: Fix PCI device reference count leak
b9aca69a6c drm/bridge: ti-sn65dsi86: Fix output polarity setting bug
b46e8c50c3 netfilter: ctnetlink: fix compilation warning after data race fixes in ct mark
0a8e66e375 ca8210: Fix crash by zero initializing data
27c71825ff ieee802154: cc2520: Fix error return code in cc2520_hw_init()
a0418d0a6b netfilter: nft_set_pipapo: Actually validate intervals in fields after the first one
cb283cca1d rtc: mc146818-lib: fix signedness bug in mc146818_get_time()
5c432383b6 rtc: mc146818-lib: fix locking in mc146818_set_time
5e26531d81 rtc: cmos: Disable irq around direct invocation of cmos_interrupt()
fccee93eb2 mm/hugetlb: fix races when looking up a CONT-PTE/PMD size hugetlb page
c42221efb1 can: af_can: fix NULL pointer dereference in can_rcv_filter
bc03f809da HID: core: fix shift-out-of-bounds in hid_report_raw_event
959a23a4d1 HID: hid-lg4ff: Add check for empty lbuf
4dde75945a HID: usbhid: Add ALWAYS_POLL quirk for some mice
11e95d85c3 drm/shmem-helper: Avoid vm_open error paths
6a4da05acd drm/shmem-helper: Remove errant put in error path
007f561f59 drm/vmwgfx: Don't use screen objects when SEV is active
3cb78c3925 KVM: s390: vsie: Fix the initialization of the epoch extension (epdx) field
549b46f813 Bluetooth: Fix crash when replugging CSR fake controllers
380d183e99 Bluetooth: btusb: Add debug message for CSR controllers
f1cf856123 mm/gup: fix gup_pud_range() for dax
f1f7f36cf6 memcg: fix possible use-after-free in memcg_write_event_control()
32f01f0306 media: v4l2-dv-timings.c: fix too strict blanking sanity checks
043b2bc96c Revert "ARM: dts: imx7: Fix NAND controller size-cells"
abfb8ae69b media: videobuf2-core: take mmap_lock in vb2_get_unmapped_area()
83632fc414 xen/netback: don't call kfree_skb() with interrupts disabled
3eecd2bc10 xen/netback: do some code cleanup
49e07c0768 xen/netback: Ensure protocol headers don't fall in the non-linear area
db44a9443e rtc: mc146818: Reduce spinlock section in mc146818_set_time()
17293d630f rtc: cmos: Replace spin_lock_irqsave with spin_lock in hard IRQ
acfd8ef683 rtc: cmos: avoid UIP when reading alarm time
949bae0282 rtc: cmos: avoid UIP when writing alarm time
33ac73a41a rtc: mc146818-lib: extract mc146818_avoid_UIP
8bb5fe5830 rtc: mc146818-lib: fix RTC presence check
775d4661f1 rtc: Check return value from mc146818_get_time()
b9a5c470e0 rtc: mc146818-lib: change return values of mc146818_get_time()
94eaf9966e rtc: cmos: remove stale REVISIT comments
f5b51f8550 rtc: mc146818: Dont test for bit 0-5 in Register D
3736972360 rtc: mc146818: Detect and handle broken RTCs
7c7075c88d rtc: mc146818: Prevent reading garbage
7f445ca2e0 mm/khugepaged: invoke MMU notifiers in shmem/file collapse paths
4a1cdb49d0 mm/khugepaged: fix GUP-fast interaction by sending IPI
cdfd3739b2 mm/khugepaged: take the right locks for page table retraction
1c0eec6a1d net: usb: qmi_wwan: add u-blox 0x1342 composition
a8c5ffb4df 9p/xen: check logical size for buffer size
ec36ebae36 usb: dwc3: gadget: Disable GUSB2PHYCFG.SUSPHY for End Transfer
d9b53caf01 fbcon: Use kzalloc() in fbcon_prepare_logo()
8b130c770d regulator: twl6030: fix get status of twl6032 regulators
f6f45e5383 ASoC: soc-pcm: Add NULL check in BE reparenting
688a45aff2 btrfs: send: avoid unaligned encoded writes when attempting to clone range
15c42ab8d4 ALSA: seq: Fix function prototype mismatch in snd_seq_expand_var_event
d38e021416 regulator: slg51000: Wait after asserting CS pin
1331bcfcac 9p/fd: Use P9_HDRSZ for header size
96b43f36a5 ARM: dts: rockchip: disable arm_global_timer on rk3066 and rk3188
ddf58f5939 ASoC: wm8962: Wait for updated value of WM8962_CLOCKING1 register
dbd78abd69 ARM: 9266/1: mm: fix no-MMU ZERO_PAGE() implementation
bb1866cf1e ARM: 9251/1: perf: Fix stacktraces for tracepoint events in THUMB2 kernels
b1f40a0cdf ARM: dts: rockchip: rk3188: fix lcdc1-rgb24 node name
5f9474d07b arm64: dts: rockchip: fix ir-receiver node names
060d58924a ARM: dts: rockchip: fix ir-receiver node names
3e0c466771 arm: dts: rockchip: fix node name for hym8563 rtc
3ada63a876 arm64: dts: rockchip: keep I2S1 disabled for GPIO function on ROCK Pi 4 series
202ee06349 Revert "mmc: sdhci: Fix voltage switch delay"
0b0939466f ANDROID: gki_defconfig: add CONFIG_FUNCTION_ERROR_INJECTION
5ab4c6b843 Merge 5.10.158 into android12-5.10-lts
592346d5dc Linux 5.10.158
cc1b4718cc ipc/sem: Fix dangling sem_array access in semtimedop race
d072a10c81 v4l2: don't fall back to follow_pfn() if pin_user_pages_fast() fails
9ba389863a proc: proc_skip_spaces() shouldn't think it is working on C strings
4aa32aaef6 proc: avoid integer type confusion in get_proc_long
5f2f775605 block: unhash blkdev part inode when the part is deleted
a82869ac52 Input: raydium_ts_i2c - fix memory leak in raydium_i2c_send()
4e0d6c687c char: tpm: Protect tpm_pm_suspend with locks
5a6f935ef3 Revert "clocksource/drivers/riscv: Events are stopped during CPU suspend"
f075cf139f ACPI: HMAT: Fix initiator registration for single-initiator systems
f3b76b4d38 ACPI: HMAT: remove unnecessary variable initialization
63e72417a1 i2c: imx: Only DMA messages with I2C_M_DMA_SAFE flag set
df76136598 i2c: npcm7xx: Fix error handling in npcm_i2c_init()
7462cd2443 x86/pm: Add enumeration check before spec MSRs save/restore setup
5e3d4a68e2 x86/tsx: Add a feature bit for TSX control MSR support
b7f7a0402e Revert "tty: n_gsm: avoid call of sleeping functions from atomic context"
481f9ed8eb ipv4: Fix route deletion when nexthop info is not specified
0b5394229e ipv4: Handle attempt to delete multipath route when fib_info contains an nh reference
4919503426 selftests: net: fix nexthop warning cleanup double ip typo
7ca14c5f24 selftests: net: add delete nexthop route warning test
f09ac62f0e Kconfig.debug: provide a little extra FRAME_WARN leeway when KASAN is enabled
19d91d3798 parisc: Increase FRAME_WARN to 2048 bytes on parisc
fcf20da099 xtensa: increase size of gcc stack frame check
a1877001ed parisc: Increase size of gcc stack frame check
a5c65cd56a iommu/vt-d: Fix PCI device refcount leak in dmar_dev_scope_init()
10ed7655a1 iommu/vt-d: Fix PCI device refcount leak in has_external_pci()
302edce1dd pinctrl: single: Fix potential division by zero
b50c964189 ASoC: ops: Fix bounds check for _sx controls
a2efc46524 io_uring: don't hold uring_lock when calling io_run_task_work*
be111ebd88 tracing: Free buffers when a used dynamic event is removed
648b92e576 drm/i915: Never return 0 if not all requests retired
8649c023c4 drm/amdgpu: temporarily disable broken Clang builds due to blown stack-frame
940b774069 mmc: sdhci: Fix voltage switch delay
ed19662453 mmc: sdhci-sprd: Fix no reset data and command after voltage switch
ef767907e7 mmc: sdhci-esdhc-imx: correct CQHCI exit halt state check
46ee041cd6 mmc: core: Fix ambiguous TRIM and DISCARD arg
b79be962b5 mmc: mmc_test: Fix removal of debugfs file
d4fc344c0d net: stmmac: Set MAC's flow control register to reflect current settings
549e24409a pinctrl: intel: Save and restore pins in "direct IRQ" mode
471fb7b735 x86/bugs: Make sure MSR_SPEC_CTRL is updated properly upon resume from S3
e858917ab7 nilfs2: fix NULL pointer dereference in nilfs_palloc_commit_free_entry()
6ddf788400 tools/vm/slabinfo-gnuplot: use "grep -E" instead of "egrep"
c099d12c55 error-injection: Add prompt for function error injection
26b6f927bb riscv: vdso: fix section overlapping under some conditions
2b1d8f27e2 net/mlx5: DR, Fix uninitialized var warning
c40db1e5f3 hwmon: (coretemp) fix pci device refcount leak in nv1a_ram_new()
f06e0cd01e hwmon: (coretemp) Check for null before removing sysfs attrs
d93522d04f net: ethernet: renesas: ravb: Fix promiscuous mode after system resumed
176ee6c673 sctp: fix memory leak in sctp_stream_outq_migrate()
1c38c88acc packet: do not set TP_STATUS_CSUM_VALID on CHECKSUM_COMPLETE
5f442e1d40 net: tun: Fix use-after-free in tun_detach()
5fa0fc5876 afs: Fix fileserver probe RTT handling
7ca81a161e net: hsr: Fix potential use-after-free
a1ba595e35 tipc: re-fetch skb cb after tipc_msg_validate
4621bdfff5 dsa: lan9303: Correct stat name
45752af024 net: ethernet: nixge: fix NULL dereference
e01c154237 net/9p: Fix a potential socket leak in p9_socket_open
b080d4668f net: net_netdev: Fix error handling in ntb_netdev_init_module()
fe6bc99c27 net: phy: fix null-ptr-deref while probe() failed
0184ede0ec wifi: mac8021: fix possible oob access in ieee80211_get_rate_duration
e2ed90fd3a wifi: cfg80211: don't allow multi-BSSID in S1G
9e6b79a3cd wifi: cfg80211: fix buffer overflow in elem comparison
6922948c2e aquantia: Do not purge addresses when setting the number of rings
fa59d49a49 qlcnic: fix sleep-in-atomic-context bugs caused by msleep
d753f554f2 can: cc770: cc770_isa_probe(): add missing free_cc770dev()
e74746bf04 can: sja1000_isa: sja1000_isa_probe(): add missing free_sja1000dev()
0d2f9d95d9 net/mlx5e: Fix use-after-free when reverting termination table
2cb84ff349 net/mlx5: Fix uninitialized variable bug in outlen_write()
b775f37d94 e100: Fix possible use after free in e100_xmit_prepare
086f656e44 e100: switch from 'pci_' to 'dma_' API
971c55f076 iavf: Fix error handling in iavf_init_module()
d389a4c698 iavf: remove redundant ret variable
fd4960ea53 fm10k: Fix error handling in fm10k_init_module()
dd425cec79 i40e: Fix error handling in i40e_init_module()
f166c62cad ixgbevf: Fix resource leak in ixgbevf_init_module()
8f7047f418 of: property: decrement node refcount in of_fwnode_get_reference_args()
be006212bd bpf: Do not copy spin lock field from user in bpf_selem_alloc
90907cd4d1 hwmon: (ibmpex) Fix possible UAF when ibmpex_register_bmc() fails
7649bba263 hwmon: (i5500_temp) fix missing pci_disable_device()
dddfc03f04 hwmon: (ina3221) Fix shunt sum critical calculation
984fcd3ec1 hwmon: (ltc2947) fix temperature scaling
8a549ab672 libbpf: Handle size overflow for ringbuf mmap
cc140c729c ARM: at91: rm9200: fix usb device clock id
592724b14d scripts/faddr2line: Fix regression in name resolution on ppc64le
353c3aaaf3 bpf, perf: Use subprog name when reporting subprog ksymbol
d48f6a5784 iio: light: rpr0521: add missing Kconfig dependencies
5eb114f55b iio: health: afe4404: Fix oob read in afe4404_[read|write]_raw
b1756af172 iio: health: afe4403: Fix oob read in afe4403_read_raw
01d7c41eac btrfs: qgroup: fix sleep from invalid context bug in btrfs_qgroup_inherit()
d3f5be8246 drm/amdgpu: Partially revert "drm/amdgpu: update drm_display_info correctly when the edid is read"
00570fafc2 drm/amdgpu: update drm_display_info correctly when the edid is read
44b204730b drm/display/dp_mst: Fix drm_dp_mst_add_affected_dsc_crtcs() return code
1faf21bdd1 btrfs: move QUOTA_ENABLED check to rescan_should_stop from btrfs_qgroup_rescan_worker
6050872f9f spi: spi-imx: Fix spi_bus_clk if requested clock is higher than input clock
7b020665d4 btrfs: free btrfs_path before copying inodes to userspace
d5b7a34379 btrfs: sink iterator parameter to btrfs_ioctl_logical_to_ino
f3226d86f8 Revert "xfrm: fix "disable_policy" on ipv4 early demux"
982d7f3eb8 Merge 5.10.157 into android12-5.10-lts
37d3df60cb ANDROID: CRC ABI fixups in ip.h and ipv6.h
f4245f0538 Linux 5.10.157
4801672fb0 fuse: lock inode unconditionally in fuse_fallocate()
86f0082fb9 drm/i915: fix TLB invalidation for Gen12 video and compute engines
feb97cf45e drm/amdgpu: always register an MMU notifier for userptr
596b7d55d7 drm/amd/dc/dce120: Fix audio register mapping, stop triggering KASAN
c86c1a7037 btrfs: sysfs: normalize the error handling branch in btrfs_init_sysfs()
1581830c0e btrfs: free btrfs_path before copying subvol info to userspace
0bdb8f7ef8 btrfs: free btrfs_path before copying fspath to userspace
24a37ba2cb btrfs: free btrfs_path before copying root refs to userspace
b56d6e5585 genirq: Take the proposed affinity at face value if force==true
9d90a2b98e irqchip/gic-v3: Always trust the managed affinity provided by the core code
e0d2c59ee9 genirq: Always limit the affinity to online CPUs
f8f80d532f genirq/msi: Shutdown managed interrupts with unsatifiable affinities
3eb6b89a4e wifi: wilc1000: validate number of channels
5a068535c0 wifi: wilc1000: validate length of IEEE80211_P2P_ATTR_CHANNEL_LIST attribute
905f886eae wifi: wilc1000: validate length of IEEE80211_P2P_ATTR_OPER_CHANNEL attribute
7c6535fb4d wifi: wilc1000: validate pairwise and authentication suite offsets
64b7f9a7dd dm integrity: clear the journal on suspend
d306f73079 dm integrity: flush the journal on suspend
79d9a11679 gpu: host1x: Avoid trying to use GART on Tegra20
a7f30b5b8d net: usb: qmi_wwan: add Telit 0x103a composition
7e8eaa939e tcp: configurable source port perturb table size
0acc008cf9 platform/x86: hp-wmi: Ignore Smart Experience App event
0964b77bab zonefs: fix zone report size in __zonefs_io_error()
a5937dae66 platform/x86: acer-wmi: Enable SW_TABLET_MODE on Switch V 10 (SW5-017)
52fb7bcea0 platform/x86: asus-wmi: add missing pci_dev_put() in asus_wmi_set_xusb2pr()
4fa717ba2d xen/platform-pci: add missing free_irq() in error path
f45a5a6c9f xen-pciback: Allow setting PCI_MSIX_FLAGS_MASKALL too
9bbb587472 Input: soc_button_array - add Acer Switch V 10 to dmi_use_low_level_irq[]
4ea4316dff Input: soc_button_array - add use_low_level_irq module parameter
c1620e996d Input: goodix - try resetting the controller when no config is set
f4db050958 serial: 8250: 8250_omap: Avoid RS485 RTS glitch on ->set_termios()
7c3e39ccf5 ASoC: Intel: bytcht_es8316: Add quirk for the Nanote UMPC-01
36e0b97619 Input: synaptics - switch touchpad on HP Laptop 15-da3001TU to RMI mode
ae9e0cc973 binder: Gracefully handle BINDER_TYPE_FDA objects with num_fds=0
017de84253 binder: Address corner cases in deferred copy and fixup
2e3c27f241 binder: fix pointer cast warning
c9d3f25a7f binder: defer copies of pre-patched txn data
5204296fc7 binder: read pre-translated fds from sender buffer
23e9d815fa binder: avoid potential data leakage when copying txn
22870431cd x86/ioremap: Fix page aligned size calculation in __ioremap_caller()
3fdeacf087 KVM: x86: remove exit_int_info warning in svm_handle_exit
7e5cb13091 KVM: x86: nSVM: leave nested mode on vCPU free
d925dd3e44 mm: vmscan: fix extreme overreclaim and swap floods
a4a62a23fa gcov: clang: fix the buffer overflow issue
e7f21d10e9 nilfs2: fix nilfs_sufile_mark_dirty() not set segment usage as dirty
f06b7e6a77 usb: dwc3: gadget: Clear ep descriptor last
cff7523ab8 usb: dwc3: gadget: Return -ESHUTDOWN on ep disable
a32635528d usb: dwc3: gadget: conditionally remove requests
ca3a08e9d9 ceph: fix NULL pointer dereference for req->r_session
00c004c070 ceph: Use kcalloc for allocating multiple elements
69263bf781 ceph: fix possible NULL pointer dereference for req->r_session
8e137ace53 ceph: put the requests/sessions when it fails to alloc memory
38993788f4 ceph: fix off by one bugs in unsafe_request_wait()
8a31ae7f77 ceph: flush the mdlog before waiting on unsafe reqs
78b2f546f7 ceph: flush mdlog before umounting
d94ba7b3b7 ceph: make iterate_sessions a global symbol
9ac038d3c2 ceph: make ceph_create_session_msg a global symbol
8382cdf0ab usb: cdns3: Add support for DRD CDNSP
57112da86b mmc: sdhci-brcmstb: Fix SDHCI_RESET_ALL for CQHCI
b5d770977b mmc: sdhci-brcmstb: Enable Clock Gating to save power
049194538c mmc: sdhci-brcmstb: Re-organize flags
fbe955be26 nios2: add FORCE for vmlinuz.gz
c0a9c9973d init/Kconfig: fix CC_HAS_ASM_GOTO_TIED_OUTPUT test with dash
456e895fd0 iio: core: Fix entry not deleted when iio_register_sw_trigger_type() fails
fa9efcbfbf iio: light: apds9960: fix wrong register for gesture gain
bd1b8041c2 arm64: dts: rockchip: lower rk3399-puma-haikou SD controller clock frequency
86ba9c8595 ext4: fix use-after-free in ext4_ext_shift_extents
350e98a08a usb: dwc3: exynos: Fix remove() function
d21d26e65b lib/vdso: use "grep -E" instead of "egrep"
c0cf8bc259 net: enetc: preserve TX ring priority across reconfiguration
de4dd4f9b3 net: enetc: cache accesses to &priv->si->hw
1f080b8caa net: enetc: manage ENETC_F_QBV in priv->active_offloads only when enabled
1d840c5d67 s390/crashdump: fix TOD programmable field size
11052f1188 net: thunderx: Fix the ACPI memory leak
b034fe2a08 nfc: st-nci: fix memory leaks in EVT_TRANSACTION
e14583073f nfc: st-nci: fix incorrect validating logic in EVT_TRANSACTION
9cc863d523 arcnet: fix potential memory leak in com20020_probe()
4d2be0cf27 net: arcnet: Fix RESET flag handling
e61b00374a s390/dasd: fix no record found for raw_track_access
aeebb07499 ipv4: Fix error return code in fib_table_insert()
c0af4d005a dccp/tcp: Reset saddr on failure after inet6?_hash_connect().
b8e494240e netfilter: flowtable_offload: add missing locking
af9de5cdcb dma-buf: fix racing conflict of dma_heap_add()
c40b76dfa7 bnx2x: fix pci device refcount leak in bnx2x_vf_is_pcie_pending()
f81e9c0510 regulator: twl6030: re-add TWL6032_SUBCLASS
32b944b9c4 NFC: nci: fix memory leak in nci_rx_data_packet()
68a7aec3f4 net: sched: allow act_ct to be built without NF_NAT
8e2664e12b sfc: fix potential memleak in __ef100_hard_start_xmit()
6b638a16ea xfrm: Fix ignored return value in xfrm6_init()
c7788361a6 tipc: check skb_linearize() return value in tipc_disc_rcv()
4058e3b74a tipc: add an extra conn_get in tipc_conn_alloc
e87a077d09 tipc: set con sock in tipc_conn_alloc
891daa95b0 net/mlx5: Fix handling of entry refcount when command is not issued to FW
e06ff9f8fe net/mlx5: Fix FW tracer timestamp calculation
5689eba90a netfilter: ipset: regression in ip_set_hash_ip.c
e62e62ea91 netfilter: ipset: Limit the maximal range of consecutive elements to add/delete
8dca384970 Drivers: hv: vmbus: fix possible memory leak in vmbus_device_register()
909186cf34 Drivers: hv: vmbus: fix double free in the error path of vmbus_add_channel_work()
f42802e14a macsec: Fix invalid error code set
72be055615 nfp: add port from netdev validation for EEPROM access
ce41e03cac nfp: fill splittable of devlink_port_attrs correctly
0b553ded34 net: pch_gbe: fix pci device refcount leak while module exiting
2c59ef9ab6 net/qla3xxx: fix potential memleak in ql3xxx_send()
a24d5f6c8b net/mlx4: Check retval of mlx4_bitmap_init
da86a63479 net: ethernet: mtk_eth_soc: fix error handling in mtk_open()
756534f7cf ARM: dts: imx6q-prti6q: Fix ref/tcxo-clock-frequency properties
290a71ff72 ARM: mxs: fix memory leak in mxs_machine_init()
5c97af75f5 netfilter: conntrack: Fix data-races around ct mark
459332f8db 9p/fd: fix issue of list_del corruption in p9_fd_cancel()
26bb8f6aaa net: pch_gbe: fix potential memleak in pch_gbe_tx_queue()
398a860a44 nfc/nci: fix race with opening and closing
3535c632e6 rxrpc: Fix race between conn bundle lookup and bundle removal [ZDI-CAN-15975]
23c03ee0ee rxrpc: Use refcount_t rather than atomic_t
bddde342c6 rxrpc: Allow list of in-use local UDP endpoints to be viewed in /proc
a2d5dba2fc net: liquidio: simplify if expression
8124a02e17 ARM: dts: at91: sam9g20ek: enable udc vbus gpio pinctrl
b547bf71fa tee: optee: fix possible memory leak in optee_register_device()
b76c5a99f4 bus: sunxi-rsb: Support atomic transfers
0c059b7d2a regulator: core: fix UAF in destroy_regulator()
fcb2d28636 spi: dw-dma: decrease reference count in dw_spi_dma_init_mfld()
0b6441abfa regulator: core: fix kobject release warning and memory leak in regulator_register()
26d3d3ffa8 scsi: storvsc: Fix handling of srb_status and capacity change events
c34db0d6b8 ASoC: soc-pcm: Don't zero TDM masks in __soc_pcm_open()
4f6c7344ab ASoC: sgtl5000: Reset the CHIP_CLK_CTRL reg on remove
164a5b50d1 ASoC: hdac_hda: fix hda pcm buffer overflow issue
7cfb4b8579 ARM: dts: am335x-pcm-953: Define fixed regulators in root node
b7000254c1 af_key: Fix send_acquire race with pfkey_register
51969d679b xfrm: replay: Fix ESN wrap around for GSO
497653f6d2 xfrm: fix "disable_policy" on ipv4 early demux
836bbdfcf8 MIPS: pic32: treat port as signed integer
c0bb600f07 RISC-V: vdso: Do not add missing symbols to version section in linker script
81cc6d8400 arm64/syscall: Include asm/ptrace.h in syscall_wrapper header.
fa5f2c72d3 block, bfq: fix null pointer dereference in bfq_bio_bfqg()
d29bde8689 drm: panel-orientation-quirks: Add quirk for Acer Switch V 10 (SW5-017)
f7ce6fb04e scsi: scsi_debug: Make the READ CAPACITY response compliant with ZBC
2574903ee2 scsi: ibmvfc: Avoid path failures during live migration
7fc62181c1 platform/x86: touchscreen_dmi: Add info for the RCA Cambio W101 v2 2-in-1
f54a11b6bf Revert "net: macsec: report real_dev features when HW offloading is enabled"
f4b8c0710a selftests/bpf: Add verifier test for release_reference()
361a165098 spi: stm32: fix stm32_spi_prepare_mbr() that halves spi clk for every run
2c1ca23555 wifi: mac80211: Fix ack frame idr leak when mesh has no route
8d39913158 wifi: airo: do not assign -1 to unsigned char
8552e6048e audit: fix undefined behavior in bit shift for AUDIT_BIT
1c9eb641d1 riscv: dts: sifive unleashed: Add PWM controlled LEDs
92ae6facd1 wifi: mac80211_hwsim: fix debugfs attribute ps with rc table support
2fcc593b50 wifi: mac80211: fix memory free error when registering wiphy fail
044bc6d3c2 ceph: avoid putting the realm twice when decoding snaps fails
d43219bb33 ceph: do not update snapshot context when there is no new snapshot
49c71b6814 iio: pressure: ms5611: fixed value compensation bug
879139bc7a iio: ms5611: Simplify IO callback parameters
80c825e1e3 nvme-pci: add NVME_QUIRK_BOGUS_NID for Micron Nitro
f4066fb910 nvme: add a bogus subsystem NQN quirk for Micron MTFDKBA2T0TFH
4f0cea018e drm/display: Don't assume dual mode adaptors support i2c sub-addressing
347f1793b5 bridge: switchdev: Fix memory leaks when changing VLAN protocol
89a7f155e6 bridge: switchdev: Notify about VLAN protocol changes
f5cbd86ebf ata: libata-core: do not issue non-internal commands once EH is pending
4034d06a4d ata: libata-scsi: simplify __ata_scsi_queuecmd()
03aabcb88a scsi: scsi_transport_sas: Fix error handling in sas_phy_add()
d9b90a99f3 Merge 5.10.156 into android12-5.10-lts
25af5a11f1 Merge 5.10.155 into android12-5.10-lts
e5d2cd6ad8 ANDROID: abi preservation for fscrypt change in 5.10.154
5bc3ece380 Revert "serial: 8250: Let drivers request full 16550A feature probing"
f466ca1247 Merge 5.10.154 into android12-5.10-lts
6d46ef50b1 Linux 5.10.156
7be134eb69 Revert "net: broadcom: Fix BCMGENET Kconfig"
957732a09c ntfs: check overflow when iterating ATTR_RECORDs
6322dda483 ntfs: fix out-of-bounds read in ntfs_attr_find()
b825bfbbaa ntfs: fix use-after-free in ntfs_attr_find()
294ef12dcc mm: fs: initialize fsdata passed to write_begin/write_end interface
a8e2fc8f7b 9p/trans_fd: always use O_NONBLOCK read/write
a5da76df46 gfs2: Switch from strlcpy to strscpy
5fa30be7ba gfs2: Check sb_bsize_shift after reading superblock
f14858bc77 9p: trans_fd/p9_conn_cancel: drop client lock earlier
4154b6afa2 kcm: close race conditions on sk_receive_queue
7deb7a9d33 kcm: avoid potential race in kcm_tx_work
35309be06b tcp: cdg: allow tcp_cdg_release() to be called multiple times
e929ec98c0 macvlan: enforce a consistent minimal mtu
95ebea5a15 uapi/linux/stddef.h: Add include guards
3f25add5ec Input: i8042 - fix leaking of platform device on module removal
7d606ae1ab kprobes: Skip clearing aggrprobe's post_handler in kprobe-on-ftrace case
89ece5ff7d scsi: scsi_debug: Fix possible UAF in sdebug_add_host_helper()
75205f1b47 scsi: target: tcm_loop: Fix possible name leak in tcm_loop_setup_hba_bus()
6e9334436d net: use struct_group to copy ip/ipv6 header addresses
9fd7bdaffe stddef: Introduce struct_group() helper macro
47c3bdd955 usbnet: smsc95xx: Fix deadlock on runtime resume
8208c266fe ring-buffer: Include dropped pages in counting dirty patches
36b5095b07 net: fix a concurrency bug in l2tp_tunnel_register()
023435a095 nvme: ensure subsystem reset is single threaded
b9a5ecf241 nvme: restrict management ioctls to admin
5e2f14d772 perf/x86/intel/pt: Fix sampling using single range output
62634b43d3 misc/vmw_vmci: fix an infoleak in vmci_host_do_receive_datagram()
c1eb46a65b docs: update mediator contact information in CoC doc
4423866d31 mmc: sdhci-pci: Fix possible memory leak caused by missing pci_dev_put()
440653a180 mmc: sdhci-pci-o2micro: fix card detect fail issue caused by CD# debounce timeout
8e70b14131 mmc: core: properly select voltage range without power cycle
05b0f6624d firmware: coreboot: Register bus in module init
deda86a0d8 iommu/vt-d: Set SRE bit only when hardware has SRS cap
d2c7d8f58e scsi: zfcp: Fix double free of FSF request when qdio send fails
db744288af maccess: Fix writing offset in case of fault in strncpy_from_kernel_nofault()
24cc679abb Input: iforce - invert valid length check when fetching device IDs
5f4611fe01 serial: 8250_lpss: Configure DMA also w/o DMA filter
8679087e93 serial: 8250: Flush DMA Rx on RLSI
a5eaad87bf serial: 8250: Fall back to non-DMA Rx if IIR_RDI occurs
f59f5a269c dm ioctl: fix misbehavior if list_versions races with module loading
67a75a9480 iio: pressure: ms5611: changed hardcoded SPI speed to value limited
d95b85c508 iio: adc: mp2629: fix potential array out of bound access
46b8bc62c5 iio: adc: mp2629: fix wrong comparison of channel
8dddf2699d iio: trigger: sysfs: fix possible memory leak in iio_sysfs_trig_init()
85d2a8b287 iio: adc: at91_adc: fix possible memory leak in at91_adc_allocate_trigger()
85cc1a2fd8 usb: typec: mux: Enter safe mode only when pins need to be reconfigured
efaab05520 usb: chipidea: fix deadlock in ci_otg_del_timer
143ba5c2d2 usb: add NO_LPM quirk for Realforce 87U Keyboard
249cef723f USB: serial: option: add Fibocom FM160 0x0111 composition
5c44c60358 USB: serial: option: add u-blox LARA-L6 modem
0e88a3cfa6 USB: serial: option: add u-blox LARA-R6 00B modem
de707957d9 USB: serial: option: remove old LARA-R6 PID
878227a3dd USB: serial: option: add Sierra Wireless EM9191
25c652811d USB: bcma: Make GPIO explicitly optional
eb3af3ea5b speakup: fix a segfault caused by switching consoles
8cbaf4ed53 slimbus: stream: correct presence rate frequencies
15155f7c0e Revert "usb: dwc3: disable USB core PHY management"
100d1e53bb ALSA: hda/realtek: Fix the speaker output on Samsung Galaxy Book Pro 360
c7dcc89482 ALSA: hda/realtek: fix speakers for Samsung Galaxy Book Pro
a80369c8ca ALSA: usb-audio: Drop snd_BUG_ON() from snd_usbmidi_output_open()
28a54854a9 tracing: kprobe: Fix potential null-ptr-deref on trace_array in kprobe_event_gen_test_exit()
bb70fcae41 tracing: kprobe: Fix potential null-ptr-deref on trace_event_file in kprobe_event_gen_test_exit()
315b149f08 tracing: Fix wild-memory-access in register_synth_event()
65ba7e7c24 tracing: Fix memory leak in test_gen_synth_cmd() and test_empty_synth_event()
5d4cc7bc1a tracing/ring-buffer: Have polling block on watermark
5fdebbeca5 ring_buffer: Do not deactivate non-existant pages
6a14828cad ftrace: Fix null pointer dereference in ftrace_add_mod()
6ed60c60ec ftrace: Optimize the allocation for mcount entries
9569eed79b ftrace: Fix the possible incorrect kernel message
5fc19c8313 cifs: add check for returning value of SMB2_set_info_init
0aeb0de528 net: thunderbolt: Fix error handling in tbnet_init()
e13ef43813 cifs: Fix wrong return value checking when GETFLAGS
9f00da9c86 net/x25: Fix skb leak in x25_lapb_receive_frame()
94822d2331 net: ag71xx: call phylink_disconnect_phy if ag71xx_hw_enable() fail in ag71xx_open()
3aeb13bc3d cifs: add check for returning value of SMB2_close_init
c24013273e platform/x86/intel: pmc: Don't unconditionally attach Intel PMC when virtualized
9ed51414ae drbd: use after free in drbd_create_device()
6b23a4b252 net: ena: Fix error handling in ena_init()
2d5a495501 net: ionic: Fix error handling in ionic_init_module()
bb9924a6ed xen/pcpu: fix possible memory leak in register_pcpu()
d6a561bd4c bnxt_en: Remove debugfs when pci_register_driver failed
389738f5db net: caif: fix double disconnect client in chnl_net_open()
fb5ee1560b net: macvlan: Use built-in RCU list checking
709aa1f73d mISDN: fix misuse of put_device() in mISDN_register_device()
417f2d2edf net: liquidio: release resources when liquidio driver open failed
4cba73f2d6 net: hinic: Fix error handling in hinic_module_init()
083a2c9ef8 mISDN: fix possible memory leak in mISDN_dsp_element_register()
6b23993d5b net: bgmac: Drop free_netdev() from bgmac_enet_remove()
1f6a73b25d bpf: Initialize same number of free nodes for each pcpu_freelist
ef2ac07ab8 ata: libata-transport: fix error handling in ata_tdev_add()
7377a14598 ata: libata-transport: fix error handling in ata_tlink_add()
b5362dc163 ata: libata-transport: fix error handling in ata_tport_add()
ac471468f7 ata: libata-transport: fix double ata_host_put() in ata_tport_add()
ac4f404c25 arm64: dts: imx8mn: Fix NAND controller size-cells
30ece7dbee arm64: dts: imx8mm: Fix NAND controller size-cells
f68a9efd78 ARM: dts: imx7: Fix NAND controller size-cells
1d160dfb3f drm: Fix potential null-ptr-deref in drm_vblank_destroy_worker()
c47a823ea1 drm/drv: Fix potential memory leak in drm_dev_init()
c776a49d09 drm/panel: simple: set bpc field for logic technologies displays
777430aa4d pinctrl: devicetree: fix null pointer dereferencing in pinctrl_dt_to_map
bce3e6fe8b parport_pc: Avoid FIFO port location truncation
a4b5423f88 siox: fix possible memory leak in siox_device_add()
0679f571d3 arm64: Fix bit-shifting UB in the MIDR_CPU_MODEL() macro
58636b5ff3 block: sed-opal: kmalloc the cmd/resp buffers
e27458b18b sctp: clear out_curr if all frag chunks of current msg are pruned
0b4c259b63 sctp: remove the unnecessary sinfo_stream check in sctp_prsctp_prune_unsent
7360e7c29d ASoC: soc-utils: Remove __exit for snd_soc_util_exit()
e60f37a1d3 bpf, test_run: Fix alignment problem in bpf_prog_test_run_skb()
b8fe1a5aa7 tty: n_gsm: fix sleep-in-atomic-context bug in gsm_control_send
0a3160f4ff serial: imx: Add missing .thaw_noirq hook
7e1f908e65 serial: 8250: omap: Flush PM QOS work on remove
d833cba201 serial: 8250: omap: Fix unpaired pm_runtime_put_sync() in omap8250_remove()
b0b6ea651e serial: 8250_omap: remove wait loop from Errata i202 workaround
f14c312c21 serial: 8250: omap: Fix missing PM runtime calls for omap8250_set_mctrl()
85cdbf04b4 serial: 8250: Remove serial_rs485 sanitization from em485
f5dedad405 ASoC: tas2764: Fix set_tdm_slot in case of single slot
9e82d78fbe ASoC: tas2770: Fix set_tdm_slot in case of single slot
8d21554ec7 ASoC: core: Fix use-after-free in snd_soc_exit()
38ca9bd336 spi: stm32: Print summary 'callbacks suppressed' message
a180da5564 drm/amdgpu: disable BACO on special BEIGE_GOBY card
f3adf0adf3 drm/amd/pm: disable BACO entry/exit completely on several sienna cichlid cards
b0faeff69a drm/amd/pm: Read BIF STRAP also for BACO check
6958556285 drm/amd/pm: support power source switch on Sienna Cichlid
7daab001a6 mmc: sdhci-esdhc-imx: use the correct host caps for MMC_CAP_8_BIT_DATA
65ac4d1807 spi: intel: Use correct mask for flash and protected regions
23793518a7 mtd: spi-nor: intel-spi: Disable write protection only if asked
a326fffdc7 ALSA: hda/realtek: fix speakers and micmute on HP 855 G8
24839d027c ASoC: codecs: jz4725b: Fix spelling mistake "Sourc" -> "Source", "Routee" -> "Route"
bd48793240 Bluetooth: L2CAP: Fix l2cap_global_chan_by_psm
ce75e90859 btrfs: remove pointless and double ulist frees in error paths of qgroup tests
16743c4bf3 drm/imx: imx-tve: Fix return type of imx_tve_connector_mode_valid
df2747f295 i2c: i801: add lis3lv02d's I2C address for Vostro 5568
959cb0fd69 i2c: tegra: Allocate DMA memory for DMA engine
6cb657722e NFSv4: Retry LOCK on OLD_STATEID during delegation return
f0187227e2 drm/amd/display: Remove wrong pipe control lock
bb3edbd092 ASoC: rt1308-sdw: add the default value of some registers
b1619f0307 selftests/intel_pstate: fix build for ARCH=x86_64
fdf6807606 selftests/futex: fix build for clang
c1f0defecb ASoC: codecs: jz4725b: fix capture selector naming
aeb7e8bc0d ASoC: codecs: jz4725b: use right control for Capture Volume
c87945c173 ASoC: codecs: jz4725b: fix reported volume for Master ctl
9aae00961a ASoC: codecs: jz4725b: add missed Line In power control bit
0b4d650f90 spi: intel: Fix the offset to get the 64K erase opcode
6910e7279f ASoC: wm8962: Add an event handler for TEMP_HP and TEMP_SPK
c7432616f6 ASoC: mt6660: Keep the pm_runtime enables before component stuff in mt6660_i2c_probe
a47606064c ASoC: wm8997: Revert "ASoC: wm8997: Fix PM disable depth imbalance in wm8997_probe"
f8f254c8b5 ASoC: wm5110: Revert "ASoC: wm5110: Fix PM disable depth imbalance in wm5110_probe"
c73aa2cc41 ASoC: wm5102: Revert "ASoC: wm5102: Fix PM disable depth imbalance in wm5102_probe"
673a7341bd Merge 5.10.153 into android12-5.10-lts
27b36ba7c2 Merge 5.10.152 into android12-5.10-lts
bf759deb0f Merge 5.10.151 into android12-5.10-lts
6b31c548a1 ANDROID: fix up struct sk_buf ABI breakage
bd66e91ad2 ANDROID: fix up CRC issue with struct tcp_sock
3905cfd1d6 Revert "serial: 8250: Toggle IER bits on only after irq has been set up"
41217963b1 Linux 5.10.155
0f544353fe io_uring: kill goto error handling in io_sqpoll_wait_sq()
154d744fbe x86/cpu: Restore AMD's DE_CFG MSR after resume
e7294b01de mmc: sdhci-esdhc-imx: Convert the driver to DT-only
534762e261 net: tun: call napi_schedule_prep() to ensure we own a napi
367bc0fa98 dmaengine: at_hdmac: Check return code of dma_async_device_register
85f97c97ef dmaengine: at_hdmac: Fix impossible condition
f53a233eaa dmaengine: at_hdmac: Don't allow CPU to reorder channel enable
f451285522 dmaengine: at_hdmac: Fix completion of unissued descriptor in case of errors
6be4ab08c8 dmaengine: at_hdmac: Fix descriptor handling when issuing it to hardware
a35dd5dd98 dmaengine: at_hdmac: Fix concurrency over the active list
0f603bf553 dmaengine: at_hdmac: Free the memset buf without holding the chan lock
7f07cecc74 dmaengine: at_hdmac: Fix concurrency over descriptor
1582cc3b48 dmaengine: at_hdmac: Fix concurrency problems by removing atc_complete_all()
9b69060a72 dmaengine: at_hdmac: Protect atchan->status with the channel lock
ee35682261 dmaengine: at_hdmac: Do not call the complete callback on device_terminate_all
7078e935b4 dmaengine: at_hdmac: Fix premature completion of desc in issue_pending
ad4cbe8e9c dmaengine: at_hdmac: Start transfer for cyclic channels in issue_pending
24f9e93e50 dmaengine: at_hdmac: Don't start transactions at tx_submit level
4b51cce72a dmaengine: at_hdmac: Fix at_lli struct definition
d37dfb9357 cert host tools: Stop complaining about deprecated OpenSSL functions
f8e0edeaa0 can: j1939: j1939_send_one(): fix missing CAN header initialization
0b692d41ee mm/memremap.c: map FS_DAX device memory as decrypted
03f9582a6a udf: Fix a slab-out-of-bounds write bug in udf_find_entry()
4ea3aa3b98 mms: sdhci-esdhc-imx: Fix SDHCI_RESET_ALL for CQHCI
9c0accfa5a btrfs: selftests: fix wrong error check in btrfs_free_dummy_root()
8fa0c22ef8 platform/x86: hp_wmi: Fix rfkill causing soft blocked wifi
b5ee579fcb drm/i915/dmabuf: fix sg_table handling in map_dma_buf
4feedde548 nilfs2: fix use-after-free bug of ns_writer on remount
1d4ff73062 nilfs2: fix deadlock in nilfs_count_free_blocks()
344ddbd688 ata: libata-scsi: fix SYNCHRONIZE CACHE (16) command failure
516f9f2300 vmlinux.lds.h: Fix placement of '.data..decrypted' section
f6896fb69d ALSA: usb-audio: Add DSD support for Accuphase DAC-60
2032c2d32b ALSA: usb-audio: Add quirk entry for M-Audio Micro
a414a6d6ef ALSA: hda/realtek: Add Positivo C6300 model quirk
3a79f9568d ALSA: hda: fix potential memleak in 'add_widget_node'
380d64168d ALSA: hda/ca0132: add quirk for EVGA Z390 DARK
181cfff57b ALSA: hda/hdmi - enable runtime pm for more AMD display audio
ea6787e482 mmc: sdhci-tegra: Fix SDHCI_RESET_ALL for CQHCI
0a8d4531a0 mmc: sdhci_am654: Fix SDHCI_RESET_ALL for CQHCI
3f558930ad mmc: sdhci-of-arasan: Fix SDHCI_RESET_ALL for CQHCI
b55e64d0a3 mmc: cqhci: Provide helper for resetting both SDHCI and CQHCI
4631cb0406 MIPS: jump_label: Fix compat branch range check
475fd3991a arm64: efi: Fix handling of misaligned runtime regions and drop warning
94ab8f88fe riscv: fix reserved memory setup
0cf9cb0614 riscv: Separate memory init from paging init
d7716240bc riscv: Enable CMA support
ecf78af514 riscv: vdso: fix build with llvm
e56d18a976 riscv: process: fix kernel info leakage
956e0216a1 net: macvlan: fix memory leaks of macvlan_common_newlink
59ec132386 ethernet: tundra: free irq when alloc ring failed in tsi108_open()
dd7beaec8b net: mv643xx_eth: disable napi when init rxq or txq failed in mv643xx_eth_open()
56d3b5531b ethernet: s2io: disable napi when start nic failed in s2io_card_up()
05b2228434 net: atlantic: macsec: clear encryption keys from the stack
1a4e495edf net: phy: mscc: macsec: clear encryption keys when freeing a flow
4ad684ba02 cxgb4vf: shut down the adapter when t4vf_update_port_info() failed in cxgb4vf_open()
38aa7ed8c2 net: cxgb3_main: disable napi when bind qsets failed in cxgb_up()
fd52dd2d6e net: cpsw: disable napi in cpsw_ndo_open()
3b27e20601 net/mlx5e: E-Switch, Fix comparing termination table instance
eb6fa0ac2a net/mlx5: Allow async trigger completion execution on single CPU systems
bdd282bba7 net: nixge: disable napi when enable interrupts failed in nixge_open()
5333cf1b7f net: marvell: prestera: fix memory leak in prestera_rxtx_switch_init()
cf4853880e perf stat: Fix printing os->prefix in CSV metrics output
3a4a3c3b1f drivers: net: xgene: disable napi when register irq failed in xgene_enet_open()
0b7ee3d50f dmaengine: mv_xor_v2: Fix a resource leak in mv_xor_v2_remove()
6e2ffae69d dmaengine: pxa_dma: use platform_get_irq_optional
f31dd15858 tipc: fix the msg->req tlv len check in tipc_nl_compat_name_table_dump_header
fbb4e8e6dc net: broadcom: Fix BCMGENET Kconfig
cb6d639bb1 net: stmmac: dwmac-meson8b: fix meson8b_devm_clk_prepare_enable()
d68fa77ee3 can: af_can: fix NULL pointer dereference in can_rx_register()
a033b86c7f ipv6: addrlabel: fix infoleak when sending struct ifaddrlblmsg to network
02f8dfee75 tcp: prohibit TCP_REPAIR_OPTIONS if data was already sent
f3aa8a7d95 drm/vc4: Fix missing platform_unregister_drivers() call in vc4_drm_register()
bcb3bb1069 hamradio: fix issue of dev reference count leakage in bpq_device_event()
bc4591a86b net: lapbether: fix issue of dev reference count leakage in lapbeth_device_event()
2bf8b1c111 KVM: s390: pv: don't allow userspace to set the clock under PV
a60cc64db7 KVM: s390x: fix SCK locking
fcbd2b3368 capabilities: fix undefined behavior in bit shift for CAP_TO_MASK
8aae24b0ed net: fman: Unregister ethernet device on removal
e2c5ee3b62 bnxt_en: fix potentially incorrect return value for ndo_rx_flow_steer
38147073c9 bnxt_en: Fix possible crash in bnxt_hwrm_set_coal()
3401f96402 net: tun: Fix memory leaks of napi_get_frags
adaa0f180d macsec: clear encryption keys from the stack after setting up offload
9dc7503bae macsec: fix detection of RXSCs when toggling offloading
7f4456f011 macsec: fix secy->n_rx_sc accounting
3b05d9073a macsec: delete new rxsc when offload fails
50868de7dc net: gso: fix panic on frag_list with mixed head alloc types
cedd4f01f6 bpf: Fix wrong reg type conversion in release_reference()
9069db2579 bpf: Add helper macro bpf_for_each_reg_in_vstate
95b6ec7337 bpf: Support for pointers beyond pkt_end.
8597b59e3d HID: hyperv: fix possible memory leak in mousevsc_probe()
8c80b2fca4 bpftool: Fix NULL pointer dereference when pin {PROG, MAP, LINK} without FILE
cc21dc48a7 bpf, sockmap: Fix the sk->sk_forward_alloc warning of sk_stream_kill_queues
e1e1218032 wifi: cfg80211: fix memory leak in query_regdb_file()
914cb94e73 wifi: cfg80211: silence a sparse RCU warning
72ea2fc299 phy: stm32: fix an error code in probe
925bf1ba76 hwspinlock: qcom: correct MMIO max register for newer SoCs
76eba54f0d fuse: fix readdir cache race
7bcea6c5c9 ANDROID: gki_defconfig: remove CONFIG_INIT_STACK_ALL_ZERO=y
d2bc3376cd Revert "serial: 8250: Fix restoring termios speed after suspend"
0b500f5b16 Merge 5.10.150 into android12-5.10-lts
f5b40c0eb9 Linux 5.10.154
bf506e366d ipc: remove memcg accounting for sops objects in do_semtimedop()
c6678c8f4f wifi: brcmfmac: Fix potential buffer overflow in brcmf_fweh_event_worker()
a6c57adec5 drm/i915/sdvo: Setup DDC fully before output init
b86830cc95 drm/i915/sdvo: Filter out invalid outputs more sensibly
9f3b867808 drm/rockchip: dsi: Force synchronous probe
23f1fc7ce5 ext4,f2fs: fix readahead of verity data
e5cef906cb KVM: x86: emulator: update the emulation mode after CR0 write
ce9261accc KVM: x86: emulator: introduce emulator_recalc_and_set_mode
c8a2fd7a71 KVM: x86: emulator: em_sysexit should update ctxt->mode
e0c7410378 KVM: x86: Mask off reserved bits in CPUID.80000001H
9302ebc1c2 KVM: x86: Mask off reserved bits in CPUID.80000008H
cc40c5f3e9 KVM: x86: Mask off reserved bits in CPUID.8000001AH
bd64a88f36 KVM: x86: Mask off reserved bits in CPUID.80000006H
156451a67b ext4: fix BUG_ON() when directory entry has invalid rec_len
5370b965b7 ext4: fix warning in 'ext4_da_release_space'
c9598cf629 parisc: Avoid printing the hardware path twice
98f836e80d parisc: Export iosapic_serial_irq() symbol for serial port driver
814af9a32b parisc: Make 8250_gsc driver dependend on CONFIG_PARISC
29d106d086 perf/x86/intel: Add Cooper Lake stepping to isolation_ucodes[]
98f6e7c337 perf/x86/intel: Fix pebs event constraints for ICL
3be2d66822 efi: random: Use 'ACPI reclaim' memory for random seed
83294f7c77 efi: random: reduce seed size to 32 bytes
f8e8cda869 fuse: add file_modified() to fallocate
cdf01c807e capabilities: fix potential memleak on error path from vfs_getxattr_alloc()
ff32d8a099 tracing/histogram: Update document for KEYS_MAX size
533bfacbac tools/nolibc/string: Fix memcmp() implementation
f100a02748 kprobe: reverse kp->flags when arm_kprobe failed
bef08acbe5 tracing: kprobe: Fix memory leak in test_gen_kprobe/kretprobe_cmd()
2bf33b5ea4 tcp/udp: Make early_demux back namespacified.
ea5f2fd464 ftrace: Fix use-after-free for dynamic ftrace_ops
06de93a47c btrfs: fix type of parameter generation in btrfs_get_dentry
e33ce54cef coresight: cti: Fix hang in cti_disable_hw()
015ac18be7 binder: fix UAF of alloc->vma in race with munmap()
836686e1a0 memcg: enable accounting of ipc resources
e4e4b24b42 mtd: rawnand: gpmi: Set WAIT_FOR_READY timeout based on program/erase times
818c36b988 tcp/udp: Fix memory leak in ipv6_renew_options().
29997a6fa6 fscrypt: fix keyring memory leak on mount failure
391cceee6d fscrypt: stop using keyrings subsystem for fscrypt_master_key
092401142b fscrypt: simplify master key locking
54c13d3520 ALSA: usb-audio: Add quirks for MacroSilicon MS2100/MS2106 devices
a0e2577cf3 block, bfq: protect 'bfqd->queued' by 'bfqd->lock'
26ca2ac091 Bluetooth: L2CAP: Fix attempting to access uninitialized memory
6b6f94fb9a Bluetooth: L2CAP: Fix accepting connection request for invalid SPSM
bfd5e62f9a i2c: piix4: Fix adapter not be removed in piix4_remove()
fc3e2fa0a5 arm64: dts: juno: Add thermal critical trip points
b743ecf29c firmware: arm_scmi: Make Rx chan_setup fail on memory errors
29e8e9bfc2 firmware: arm_scmi: Suppress the driver's bind attributes
d7b1e2cbe0 ARM: dts: imx6qdl-gw59{10,13}: fix user pushbutton GPIO offset
160d8904b2 efi/tpm: Pass correct address to memblock_reserve
c40b4d604b i2c: xiic: Add platform module alias
5bf8c7798b drm/amdgpu: set vm_update_mode=0 as default for Sienna Cichlid in SRIOV case
496eb203d0 HID: saitek: add madcatz variant of MMO7 mouse device ID
ff06067b70 scsi: core: Restrict legal sdev_state transitions via sysfs
9edf20e5a1 ACPI: APEI: Fix integer overflow in ghes_estatus_pool_init()
be6e22f546 media: meson: vdec: fix possible refcount leak in vdec_probe()
c5fd54a65c media: dvb-frontends/drxk: initialize err to 0
7fdc58d8c2 media: cros-ec-cec: limit msg.len to CEC_MAX_MSG_SIZE
1609231f86 media: s5p_cec: limit msg.len to CEC_MAX_MSG_SIZE
c46759e370 media: rkisp1: Zero v4l2_subdev_format fields in when validating links
3144ce5574 media: rkisp1: Initialize color space on resizer sink and source pads
6b24d9c2ac s390/boot: add secure boot trailer
efc6420d65 xhci-pci: Set runtime PM as default policy on all xHC 1.2 or later devices
37bb57908d mtd: parsers: bcm47xxpart: Fix halfblock reads
85e458369c mtd: parsers: bcm47xxpart: print correct offset on read error
ec54104feb fbdev: stifb: Fall back to cfb_fillrect() on 32-bit HCRX cards
f8c86d7829 video/fbdev/stifb: Implement the stifb_fillrect() function
e975d7aeca mmc: sdhci-pci-core: Disable ES for ASUS BIOS on Jasper Lake
afeae13b8a mmc: sdhci-pci: Avoid comma separated statements
a06721767c mmc: sdhci-esdhc-imx: Propagate ESDHC_FLAG_HS400* only on 8bit bus
59400c9b0d drm/msm/hdmi: fix IRQ lifetime
8225bdaec5 drm/msm/hdmi: Remove spurious IRQF_ONESHOT flag
5dbb47ee89 ipv6: fix WARNING in ip6_route_net_exit_late()
1c89642e7f net, neigh: Fix null-ptr-deref in neigh_table_clear()
634f066d02 net: mdio: fix undefined behavior in bit shift for __mdiobus_register
d9ec6e2fbd Bluetooth: L2CAP: fix use-after-free in l2cap_conn_del()
cb1c012099 Bluetooth: L2CAP: Fix use-after-free caused by l2cap_reassemble_sdu
0a0dead4ad btrfs: fix ulist leaks in error paths of qgroup self tests
61e0612811 btrfs: fix inode list leak during backref walking at find_parent_nodes()
a52e24c7fc btrfs: fix inode list leak during backref walking at resolve_indirect_refs()
81204283ea isdn: mISDN: netjet: fix wrong check of device registration
e77d213843 mISDN: fix possible memory leak in mISDN_register_device()
f06186e527 rose: Fix NULL pointer dereference in rose_send_frame()
2c8d81bdb2 ipvs: fix WARNING in ip_vs_app_net_cleanup()
931f56d59c ipvs: fix WARNING in __ip_vs_cleanup_batch()
d69328cdb9 ipvs: use explicitly signed chars
b2d7a92aff netfilter: nf_tables: release flow rule object from commit path
3583826b44 net: tun: fix bugs for oversize packet when napi frags enabled
5960b9081b net: sched: Fix use after free in red_enqueue()
24f9c41435 ata: pata_legacy: fix pdc20230_set_piomode()
c85ee1c3cb net: fec: fix improper use of NETDEV_TX_BUSY
52438e734c nfc: nfcmrvl: Fix potential memory leak in nfcmrvl_i2c_nci_send()
0acfcd2aed nfc: s3fwrn5: Fix potential memory leak in s3fwrn5_nci_send()
9ae2c9a91f nfc: nxp-nci: Fix potential memory leak in nxp_nci_send()
eecea068bf NFC: nxp-nci: remove unnecessary labels
e8c11ee2d0 nfc: fdp: Fix potential memory leak in fdp_nci_send()
31b83d6990 nfc: fdp: drop ftrace-like debugging messages
4e1e4485b2 RDMA/qedr: clean up work queue on failure in qedr_alloc_resources()
d360e875c0 RDMA/core: Fix null-ptr-deref in ib_core_cleanup()
37a098fc9b net: dsa: Fix possible memory leaks in dsa_loop_init()
45aea4fbf6 nfs4: Fix kmemleak when allocate slot failed
f0f1c74fa6 NFSv4.1: We must always send RECLAIM_COMPLETE after a reboot
10c554d722 NFSv4.1: Handle RECLAIM_COMPLETE trunking errors
4813dd737d NFSv4: Fix a potential state reclaim deadlock
7c4260f8f1 IB/hfi1: Correctly move list in sc_disable()
87ac93c8dd RDMA/cma: Use output interface for net_dev check
4dbb739eb2 KVM: x86: Add compat handler for KVM_X86_SET_MSR_FILTER
bb584caee8 KVM: x86: Copy filter arg outside kvm_vm_ioctl_set_msr_filter()
9faacf442d KVM: x86: Protect the unused bits in MSR exiting flags
5bdbccc79c x86/topology: Fix duplicated core ID within a package
6c31fc028a x86/topology: Fix multiple packages shown on a single-package system
f5ad52da14 x86/topology: Set cpu_die_id only if DIE_TYPE found
570fa3bcd2 KVM: x86: Treat #DBs from the emulator as fault-like (code and DR7.GD=1)
e5d7c6786b KVM: x86: Trace re-injected exceptions
8364786152 KVM: nVMX: Don't propagate vmcs12's PERF_GLOBAL_CTRL settings to vmcs02
523e1dd9f8 KVM: nVMX: Pull KVM L0's desired controls directly from vmcs01
028fcabd8a serial: ar933x: Deassert Transmit Enable on ->rs485_config()
e6da7808c9 serial: 8250: Let drivers request full 16550A feature probing
95aa34f721 Linux 5.10.153
26a2b9c468 serial: Deassert Transmit Enable on probe in driver-specific way
4a230f65d6 serial: core: move RS485 configuration tasks from drivers into core
eb69c07eca can: rcar_canfd: rcar_canfd_handle_global_receive(): fix IRQ storm on global FIFO receive
d5924531dd arm64/kexec: Test page size support with new TGRAN range values
c911f03f8d arm64/mm: Fix __enable_mmu() for new TGRAN range values
d523384766 scsi: sd: Revert "scsi: sd: Remove a local variable"
52a43b8200 arm64: Add AMPERE1 to the Spectre-BHB affected list
9889ca7efa net: enetc: survive memory pressure without crashing
fdba224ab0 net/mlx5: Fix crash during sync firmware reset
bbcc06933f net/mlx5: Fix possible use-after-free in async command interface
16376ba5cf net/mlx5e: Do not increment ESN when updating IPsec ESN state
0d88359092 nh: fix scope used to find saddr when adding non gw nh
3519b5ddac net: ehea: fix possible memory leak in ehea_register_port()
79631daa5a openvswitch: switch from WARN to pr_warn
00d6f33f67 ALSA: aoa: Fix I2S device accounting
ce6fd1c382 ALSA: aoa: i2sbus: fix possible memory leak in i2sbus_add_dev()
97262705c0 net: fec: limit register access on i.MX6UL
df67a8e625 PM: domains: Fix handling of unavailable/disabled idle states
1f262d8088 net: ksz884x: fix missing pci_disable_device() on error in pcidev_init()
6170b4579f i40e: Fix flow-type by setting GL_HASH_INSET registers
9abae363af i40e: Fix VF hang when reset is triggered on another VF
23d5599058 i40e: Fix ethtool rx-flow-hash setting for X722
44affe7ede ipv6: ensure sane device mtu in tunnels
905f05c0ab media: vivid: set num_in/outputs to 0 if not supported
b6c7446d0a media: videodev2.h: V4L2_DV_BT_BLANKING_HEIGHT should check 'interlaced'
683015ae16 media: v4l2-dv-timings: add sanity checks for blanking values
147b8f1892 media: vivid: dev->bitmap_cap wasn't freed in all cases
1cf51d5158 media: vivid: s_fbuf: add more sanity checks
3221c2701d PM: hibernate: Allow hybrid sleep to work with s2idle
0eb19ecbd0 can: mcp251x: mcp251x_can_probe(): add missing unregister_candev() in error path
6b2d07fc0b can: mscan: mpc5xxx: mpc5xxx_can_probe(): add missing put_clock() in error path
1634d5d39c tcp: fix indefinite deferral of RTO with SACK reneging
4f23cb2be5 tcp: fix a signed-integer-overflow bug in tcp_add_backlog()
49713d7c38 tcp: minor optimization in tcp_add_backlog()
aab883bd60 net: lantiq_etop: don't free skb when returning NETDEV_TX_BUSY
c3edc6e808 net: fix UAF issue in nfqnl_nf_hook_drop() when ops_init() failed
e2a28807b1 kcm: annotate data-races around kcm->rx_wait
c325f92d8d kcm: annotate data-races around kcm->rx_psock
af7879529e atlantic: fix deadlock at aq_nic_stop
d7ccd49c4d amd-xgbe: add the bit rate quirk for Molex cables
17350734fd amd-xgbe: fix the SFP compliance codes check for DAC cables
b55d6ea965 x86/unwind/orc: Fix unreliable stack dump with gcov
0ce1ef3353 net: hinic: fix the issue of double release MBOX callback of VF
6603843c80 net: hinic: fix the issue of CMDQ memory leaks
bb01910763 net: hinic: fix memory leak when reading function table
ce605b68db net: hinic: fix incorrect assignment issue in hinic_set_interrupt_cfg()
62f0a08e82 net: netsec: fix error handling in netsec_register_mdio()
32a3d4660b tipc: fix a null-ptr-deref in tipc_topsrv_accept
fb94152aae perf/x86/intel/lbr: Use setup_clear_cpu_cap() instead of clear_cpu_cap()
bfce730886 ALSA: ac97: fix possible memory leak in snd_ac97_dev_register()
2663b16c76 ASoC: qcom: lpass-cpu: Mark HDMI TX parity register as volatile
a527557299 arc: iounmap() arg is volatile
648ac633e7 ASoC: qcom: lpass-cpu: mark HDMI TX registers as volatile
6571f6ca8a drm/msm: Fix return type of mdp4_lvds_connector_mode_valid
4953a989b7 media: v4l2: Fix v4l2_i2c_subdev_set_name function documentation
9d00384270 net: ieee802154: fix error return code in dgram_bind()
568e3812b1 mm,hugetlb: take hugetlb_lock before decrementing h->resv_huge_pages
935a8b6202 mm/memory: add non-anonymous page check in the copy_present_page()
49db6cb814 xen/gntdev: Prevent leaking grants
a3f2cc11d6 Xen/gntdev: don't ignore kernel unmapping error
467230b9ef s390/pci: add missing EX_TABLE entries to __pcistg_mio_inuser()/__pcilg_mio_inuser()
fe187c801a s390/futex: add missing EX_TABLE entry to __futex_atomic_op()
449070996c perf auxtrace: Fix address filter symbol name match for modules
6f72a3977b kernfs: fix use-after-free in __kernfs_remove
0bcd1ab3e8 counter: microchip-tcb-capture: Handle Signal1 read and Synapse
8bf037279b mmc: core: Fix kernel panic when remove non-standard SDIO card
5684808b26 mmc: sdhci_am654: 'select', not 'depends' REGMAP_MMIO
b686ffc0ac drm/msm/dp: fix IRQ lifetime
08c7375fa2 drm/msm/hdmi: fix memory corruption with too many bridges
21c4679af0 drm/msm/dsi: fix memory corruption with too many bridges
44a86d96fa scsi: qla2xxx: Use transport-defined speed mask for supported_speeds
c368f751da mac802154: Fix LQI recording
9ba2990f4e exec: Copy oldsighand->action under spin-lock
7062153004 fs/binfmt_elf: Fix memory leak in load_elf_binary()
d9ddfeb01f fbdev: smscufx: Fix several use-after-free bugs
f19f1a75d3 iio: temperature: ltc2983: allocate iio channels once
af236da855 iio: light: tsl2583: Fix module unloading
90ff5bef2b tools: iio: iio_utils: fix digit calculation
678d2cc204 xhci: Remove device endpoints from bandwidth list when freeing the device
3b250824b6 xhci: Add quirk to reset host back to default state at shutdown
63c7df3c81 mtd: rawnand: marvell: Use correct logic for nand-keep-config
228101fc83 usb: xhci: add XHCI_SPURIOUS_SUCCESS to ASM1042 despite being a V0.96 controller
2bc4f99ee2 usb: bdc: change state when port disconnected
e440957f9c usb: dwc3: gadget: Don't set IMI for no_interrupt
fb074d622c usb: dwc3: gadget: Stop processing more requests on IMI
c29fcef579 USB: add RESET_RESUME quirk for NVIDIA Jetson devices in RCM
4cc7a360ec ALSA: rme9652: use explicitly signed char
8959092300 ALSA: au88x0: use explicitly signed char
2bf5b16315 ALSA: Use del_timer_sync() before freeing timer
ca1034bff8 can: kvaser_usb: Fix possible completions during init_completion
370be31cde can: j1939: transport: j1939_session_skb_drop_old(): spin_unlock_irqrestore() before kfree_skb()
7d51b4c67c Linux 5.10.152
43d5109296 udp: Update reuse->has_conns under reuseport_lock.
a50ed2d287 mm: /proc/pid/smaps_rollup: fix no vma's null-deref
31b1570677 blk-wbt: fix that 'rwb->wc' is always set to 1 in wbt_init()
e2f9b62ead mmc: core: Add SD card quirk for broken discard
3a260e9844 Makefile.debug: re-enable debug info for .S files
6ab2287b26 x86/Kconfig: Drop check for -mabi=ms for CONFIG_EFI_STUB
67dafece56 ACPI: video: Force backlight native for more TongFang devices
dcaf631320 hv_netvsc: Fix race between VF offering and VF association message from host
da54c5f4b5 perf/x86/intel/pt: Relax address filter validation
79c3482fbe riscv: topology: fix default topology reporting
a6e770733d arm64: topology: move store_cpu_topology() to shared code
cb1024d8a4 arm64: dts: qcom: sc7180-trogdor: Fixup modem memory region
f687e2111b fcntl: fix potential deadlocks for &fown_struct.lock
b1efc19644 fcntl: make F_GETOWN(EX) return 0 on dead owner task
ca4c498382 perf: Skip and warn on unknown format 'configN' attrs
dea47fefa6 perf pmu: Validate raw event with sysfs exported format bits
86e995f964 riscv: always honor the CONFIG_CMDLINE_FORCE when parsing dtb
0e4c06ae7c riscv: Add machine name to kernel boot log and stack dump output
7fba4a389d mmc: sdhci-tegra: Use actual clock rate for SW tuning correction
3c6a888e35 xen/gntdev: Accommodate VMA splitting
5232411f37 xen: assume XENFEAT_gnttab_map_avail_bits being set for pv guests
ea82edad0a tracing: Do not free snapshot if tracer is on cmdline
bd6af07e79 tracing: Simplify conditional compilation code in tracing_set_tracer()
4e3a15ca24 dmaengine: mxs: use platform_driver_register
1da5d24970 dmaengine: mxs-dma: Remove the unused .id_table
1414e9bf3c drm/virtio: Use appropriate atomic state in virtio_gpu_plane_cleanup_fb()
d74196bb27 iommu/vt-d: Clean up si_domain in the init_dmars() error path
ef11e8ec00 iommu/vt-d: Allow NVS regions in arch_rmrr_sanity_check()
35c92435be net: phy: dp83822: disable MDI crossover status change interrupt
7aa3d623c1 net: sched: fix race condition in qdisc_graft()
2974f3b330 net: hns: fix possible memory leak in hnae_ae_register()
3032e316e0 sfc: include vport_id in filter spec hash and equal()
ded86c4191 net: sched: sfb: fix null pointer access issue when sfb_init() fails
305aa36b62 net: sched: delete duplicate cleanup of backlog and qlen
ae48bee283 net: sched: cake: fix null pointer access issue when cake_init() fails
2008ad08a2 nvme-hwmon: kmalloc the NVME SMART log buffer
770b7e3a2c nvme-hwmon: consistently ignore errors from nvme_hwmon_init
67106ac272 nvme-hwmon: Return error code when registration fails
bc17f727b0 nvme-hwmon: rework to avoid devm allocation
191d71c635 ionic: catch NULL pointer issue on reconfig
ff7ba76675 net: hsr: avoid possible NULL deref in skb_clone()
7286f87551 cifs: Fix xid leak in cifs_ses_add_channel()
2d08311aa3 cifs: Fix xid leak in cifs_flock()
bf49d4fe4a cifs: Fix xid leak in cifs_copy_file_range()
05cc22c008 net: phy: dp83867: Extend RX strap quirk for SGMII mode
118f412bed net/atm: fix proc_mpc_write incorrect return value
c8310a99e7 sfc: Change VF mac via PF as first preference if available.
39d10f0dfb HID: magicmouse: Do not set BTN_MOUSE on double report
ed5baf3d0a i40e: Fix DMA mappings leak
e558e14893 tipc: fix an information leak in tipc_topsrv_kern_subscr
1f4ed95ce6 tipc: Fix recognition of trial period
fc8c6b8bb2 ACPI: extlog: Handle multiple records
57e157749a btrfs: fix processing of delayed tree block refs during backref walking
590929ef69 btrfs: fix processing of delayed data refs during backref walking
cc841a8a70 r8152: add PID for the Lenovo OneLink+ Dock
51b96ecaed arm64: errata: Remove AES hwcap for COMPAT tasks
910ba49b33 blk-wbt: call rq_qos_add() after wb_normal is initialized
392536023d block: wbt: Remove unnecessary invoking of wbt_update_limits in wbt_init
ab6aaa8210 media: venus: dec: Handle the case where find_format fails
bce5808fc9 media: mceusb: set timeout to at least timeout provided
6d725672ce KVM: arm64: vgic: Fix exit condition in scan_its_table()
34db701dc6 kvm: Add support for arch compat vm ioctls
e55feb31df cpufreq: qcom: fix memory leak in error path
303d0f7614 ata: ahci: Match EM_MAX_SLOTS with SATA_PMP_MAX_PORTS
6a2aadcb01 ata: ahci-imx: Fix MODULE_ALIAS
d9f0159da0 hwmon/coretemp: Handle large core ID value
0fb04676c4 x86/microcode/AMD: Apply the patch early on every logical thread
6dcf1f0802 i2c: qcom-cci: Fix ordering of pm_runtime_xx and i2c_add_adapter
794ded0bc4 cpufreq: qcom: fix writes in read-only memory region
2723875e9d selinux: enable use of both GFP_KERNEL and GFP_ATOMIC in convert_context()
0d65f040fd ocfs2: fix BUG when iput after ocfs2_mknod fails
b838dcfda1 ocfs2: clear dinode links count in case of error
c34d1b22fe Linux 5.10.151
ecad331211 kbuild: Add skip_encoding_btf_enum64 option to pahole
c5006abb80 kbuild: Unify options for BTF generation for vmlinux and modules
f5f413cb3e kbuild: skip per-CPU BTF generation for pahole v1.18-v1.21
06481cd9f7 kbuild: Quote OBJCOPY var to avoid a pahole call break the build
bbaea0f1cd bpf: Generate BTF_KIND_FLOAT when linking vmlinux
a10a57a224 Linux 5.10.150
243c8f42ba Revert "drm/amdgpu: make sure to init common IP before gmc"
8026d58b49 gcov: support GCC 12.1 and newer compilers
cbf2c43b36 f2fs: fix wrong condition to trigger background checkpoint correctly
7b19858803 thermal: intel_powerclamp: Use first online CPU as control_cpu
f039b43cba inet: fully convert sk->sk_rx_dst to RCU rules
67de22cb0b ext4: continue to expand file system when the target size doesn't reach
357db159e9 Revert "drm/amdgpu: use dirty framebuffer helper"
98ab15bfdc Revert "drm/amdgpu: move nbio sdma_doorbell_range() into sdma code for vega"
791489a5c5 net/ieee802154: don't warn zero-sized raw_sendmsg()
a96336a5f2 Revert "net/ieee802154: reject zero-sized raw_sendmsg()"
dc54ff9fc4 net: ieee802154: return -EINVAL for unknown addr type
45c3396675 mm: hugetlb: fix UAF in hugetlb_handle_userfault
c378c479c5 io_uring/af_unix: defer registered files gc to io_uring release
67cbc8865a io_uring: correct pinned_vm accounting
904f881b57 arm64: topology: fix possible overflow in amu_fie_setup()
b5dc2f2578 perf intel-pt: Fix segfault in intel_pt_print_info() with uClibc
9b4e849777 clk: bcm2835: Make peripheral PLLC critical
b8bbae3236 usb: idmouse: fix an uninit-value in idmouse_open
d5bb45f47b nvmet-tcp: add bounds check on Transfer Tag
b79da0080d nvme: copy firmware_rev on each init
e6cc39db24 staging: rtl8723bs: fix a potential memory leak in rtw_init_cmd_priv()
3a5a34ed9d Revert "usb: storage: Add quirk for Samsung Fit flash"
acf0006f2b usb: musb: Fix musb_gadget.c rxstate overflow bug
91271a3e77 usb: host: xhci: Fix potential memory leak in xhci_alloc_stream_info()
782b3e71c9 md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d
dbcca76435 HID: roccat: Fix use-after-free in roccat_read()
f00c049ede soundwire: intel: fix error handling on dai registration issues
f04a673d4a soundwire: cadence: Don't overwrite msg->buf during write commands
c263516c2c bcache: fix set_at_max_writeback_rate() for multiple attached devices
fcad2ac863 ata: libahci_platform: Sanity check the DT child nodes number
19c010ae44 blk-throttle: prevent overflow while calculating wait time
1b3cebeca9 staging: vt6655: fix potential memory leak
89f305a714 power: supply: adp5061: fix out-of-bounds read in adp5061_get_chg_type()
b2700f98b3 nbd: Fix hung when signal interrupts nbd_start_device_ioctl()
5942e5c63d scsi: 3w-9xxx: Avoid disabling device if failing to enable it
48727117bd usb: host: xhci-plat: suspend/resume clks for brcm
c13d0d2f5a usb: host: xhci-plat: suspend and resume clocks
12d31182de clk: zynqmp: pll: rectify rate rounding in zynqmp_pll_round_rate
c2257c8a50 media: cx88: Fix a null-ptr-deref bug in buffer_prepare()
d9e2585c3b clk: zynqmp: Fix stack-out-of-bounds in strncpy`
70f8b48d0b btrfs: scrub: try to fix super block errors
8f554dd23c arm64: dts: imx8mq-librem5: Add bq25895 as max17055's power supply
451ce2521c kselftest/arm64: Fix validatation termination record after EXTRA_CONTEXT
017cabfb3f ARM: dts: imx6sx: add missing properties for sram
9d3ca48722 ARM: dts: imx6sll: add missing properties for sram
9735f2b62b ARM: dts: imx6sl: add missing properties for sram
2829b6ad30 ARM: dts: imx6qp: add missing properties for sram
0c3a0b3d5e ARM: dts: imx6dl: add missing properties for sram
2763a3b43a ARM: dts: imx6q: add missing properties for sram
82e0d91484 ARM: dts: imx7d-sdb: config the max pressure for tsc2046
166feb964f drm/amd/display: Remove interface for periodic interrupt 1
1bb6f4a8db drm/dp: Don't rewrite link config when setting phy test pattern
bb91c06b0b mmc: sdhci-msm: add compatible string check for sdm670
8a427a2283 drm/meson: explicitly remove aggregate driver at module unload time
1c7d957c5d drm/amdgpu: fix initial connector audio value
69130888b2 ASoC: SOF: pci: Change DMI match info to support all Chrome platforms
54f2585e2d platform/x86: msi-laptop: Change DMI match / alias strings to fix module autoloading
a9d6a7c9b6 platform/chrome: cros_ec: Notify the PM of wake events during resume
e29d20deaf drm: panel-orientation-quirks: Add quirk for Anbernic Win600
bfdb391d57 drm/vc4: vec: Fix timings for VEC modes
b70f8abc1a drm: bridge: dw_hdmi: only trigger hotplug event on link change
bbe2f6f903 udmabuf: Set ubuf->sg = NULL if the creation of sg table fails
0a4fddc95c drm/amd/display: fix overflow on MIN_I64 definition
3959e8faf8 gpu: lontium-lt9611: Fix NULL pointer dereference in lt9611_connector_init()
c28a8082b2 drm: Prevent drm_copy_field() to attempt copying a NULL pointer
e7d7018003 drm: Use size_t type for len variable in drm_copy_field()
3339a51bcd drm/nouveau/nouveau_bo: fix potential memory leak in nouveau_bo_alloc()
484400d433 r8152: Rate limit overflow messages
0c108cf3ad Bluetooth: L2CAP: Fix user-after-free
65029aaedd net: If sock is dead don't access sock's sk_wq in sk_stream_wait_memory
4851303c85 wifi: rt2x00: correctly set BBP register 86 for MT7620
a016144479 wifi: rt2x00: set SoC wmac clock register
5aa0461d11 wifi: rt2x00: set VGC gain for both chains of MT7620
8d9c00979a wifi: rt2x00: set correct TX_SW_CFG1 MAC register for MT7620
27ed98e8a9 wifi: rt2x00: don't run Rt5592 IQ calibration on MT7620
3d67986e72 can: bcm: check the result of can_send() in bcm_can_tx()
7b674dce41 Bluetooth: hci_sysfs: Fix attempting to call device_add multiple times
e25ca9af8a Bluetooth: L2CAP: initialize delayed works at l2cap_chan_create()
b051d9bf98 regulator: core: Prevent integer underflow
e01d96494a wifi: brcmfmac: fix use-after-free bug in brcmf_netdev_start_xmit()
be81c44242 xfrm: Update ipcomp_scratches with NULL when freed
9661724f62 wifi: ath9k: avoid uninit memory read in ath9k_htc_rx_msg()
0958e487e8 tcp: annotate data-race around tcp_md5sig_pool_populated
129ca0db95 openvswitch: Fix overreporting of drops in dropwatch
4398e8a7fd openvswitch: Fix double reporting of drops in dropwatch
e3c9b94734 bpftool: Clear errno after libcap's checks
50e45034c5 wifi: brcmfmac: fix invalid address access when enabling SCAN log level
bbacfcde5f NFSD: fix use-after-free on source server when doing inter-server copy
3de402a524 NFSD: Return nfserr_serverfault if splice_ok but buf->pages have data
1f730d4ae6 x86/entry: Work around Clang __bdos() bug
513943bf87 thermal: intel_powerclamp: Use get_cpu() instead of smp_processor_id() to avoid crash
708b9abe1b powercap: intel_rapl: fix UBSAN shift-out-of-bounds issue
b434edb0e9 MIPS: BCM47XX: Cast memcmp() of function to (void *)
6c61a37ea7 ACPI: video: Add Toshiba Satellite/Portege Z830 quirk
0dd025483f rcu-tasks: Convert RCU_LOCKDEP_WARN() to WARN_ONCE()
36d4ffbedf rcu: Back off upon fill_page_cache_func() allocation failure
278d8ba2b2 selftest: tpm2: Add Client.__del__() to close /dev/tpm* handle
b60aa21e2f f2fs: fix to account FS_CP_DATA_IO correctly
0b8230d44c f2fs: fix to avoid REQ_TIME and CP_TIME collision
ecbd95958c f2fs: fix race condition on setting FI_NO_EXTENT flag
110146ce8f ACPI: APEI: do not add task_work to kernel thread to avoid memory leak
dce07e87ee thermal/drivers/qcom/tsens-v0_1: Fix MSM8939 fourth sensor hw_id
3a720eb890 crypto: cavium - prevent integer overflow loading firmware
7bfa7d6773 crypto: marvell/octeontx - prevent integer overflows
cdd42eb468 kbuild: rpm-pkg: fix breakage when V=1 is used
6d1aef17e7 kbuild: remove the target in signal traps when interrupted
8d76dd5080 tracing: kprobe: Make gen test module work in arm and riscv
c6512a6f0c tracing: kprobe: Fix kprobe event gen test module on exit
9e6ba62d41 iommu/iova: Fix module config properly
426d5bc089 crypto: qat - fix DMA transfer direction
a43babc059 crypto: qat - use pre-allocated buffers in datapath
a91af50850 crypto: qat - fix use of 'dma_map_single'
8a4ed09ed8 crypto: inside-secure - Change swab to swab32
d33935e666 crypto: ccp - Release dma channels before dmaengine unrgister
a1354bdd19 crypto: akcipher - default implementation for setting a private key
2fee0dbfae iommu/omap: Fix buffer overflow in debugfs
cfde58a8e4 cgroup/cpuset: Enable update_tasks_cpumask() on top_cpuset
ab2485eb5d hwrng: imx-rngc - Moving IRQ handler registering after imx_rngc_irq_mask_clear()
d88b88514e crypto: hisilicon/zip - fix mismatch in get/set sgl_sge_nr
25f1342473 crypto: sahara - don't sleep when in softirq
2d285164fb powerpc: Fix SPE Power ISA properties for e500v1 platforms
2bde4e1e4f powerpc/64s: Fix GENERIC_CPU build flags for PPC970 / G5
7ae8bed908 x86/hyperv: Fix 'struct hv_enlightened_vmcs' definition
6315998170 powerpc/powernv: add missing of_node_put() in opal_export_attrs()
434db6d17b powerpc/pci_dn: Add missing of_node_put()
718e2d8023 powerpc/sysdev/fsl_msi: Add missing of_node_put()
592d283a65 powerpc/math_emu/efp: Include module.h
44c26ceffa mailbox: bcm-ferxrm-mailbox: Fix error check for dma_map_sg
b1616599c9 clk: ast2600: BCLK comes from EPLL
6d01017247 clk: ti: dra7-atl: Fix reference leak in of_dra7_atl_clk_probe
9b65fd6513 clk: bcm2835: fix bcm2835_clock_rate_from_divisor declaration
9a6087a438 clk: baikal-t1: Add SATA internal ref clock buffer
5f143f3bc2 clk: baikal-t1: Add shared xGMAC ref/ptp clocks internal parent
823fd52391 clk: baikal-t1: Fix invalid xGMAC PTP clock divider
2f19a1050e clk: vc5: Fix 5P49V6901 outputs disabling when enabling FOD
92f52770a7 spmi: pmic-arb: correct duplicate APID to PPID mapping logic
a01c0c1600 dmaengine: ioat: stop mod_timer from resurrecting deleted timer in __cleanup()
1dd5148445 clk: mediatek: mt8183: mfgcfg: Propagate rate changes to parent
6e58f2469e mfd: sm501: Add check for platform_driver_register()
3469dd8e22 mfd: fsl-imx25: Fix check for platform_get_irq() errors
b425e03c96 mfd: lp8788: Fix an error handling path in lp8788_irq_init() and lp8788_irq_init()
f7b4388636 mfd: lp8788: Fix an error handling path in lp8788_probe()
08d4051803 mfd: fsl-imx25: Fix an error handling path in mx25_tsadc_setup_irq()
28868b940b mfd: intel_soc_pmic: Fix an error handling path in intel_soc_pmic_i2c_probe()
382a5fc49e fsi: core: Check error number after calling ida_simple_get
ed8e6011b9 clk: qcom: apss-ipq6018: mark apcs_alias0_core_clk as critical
884a788f06 scsi: iscsi: iscsi_tcp: Fix null-ptr-deref while calling getpeername()
a9e5176ead scsi: libsas: Fix use-after-free bug in smp_execute_task_sg()
8f740c11d8 serial: 8250: Fix restoring termios speed after suspend
ab5a3e7144 firmware: google: Test spinlock on panic path to avoid lockups
95ac62e854 staging: vt6655: fix some erroneous memory clean-up loops
878f987166 phy: qualcomm: call clk_disable_unprepare in the error handling
9a56ade124 tty: serial: fsl_lpuart: disable dma rx/tx use flags in lpuart_dma_shutdown
572fb97fce serial: 8250: Toggle IER bits on only after irq has been set up
3fbfa5e3cc serial: 8250: Add an empty line and remove some useless {}
71ffe5111f drivers: serial: jsm: fix some leaks in probe
7efdd91d54 usb: gadget: function: fix dangling pnp_string in f_printer.c
cc952e3bf6 xhci: Don't show warning for reinit on known broken suspend
dac769dd7d IB: Set IOVA/LENGTH on IB_MR in core/uverbs layers
360386e11c RDMA/cm: Use SLID in the work completion as the DLID in responder side
a1263294b5 md/raid5: Ensure stripe_fill happens on non-read IO with journal
76694e9ce0 md: Replace snprintf with scnprintf
7bd5f3b4a8 mtd: rawnand: meson: fix bit map use in meson_nfc_ecc_correct()
f5325f3202 ata: fix ata_id_has_dipm()
f5a6fa1877 ata: fix ata_id_has_ncq_autosense()
3c34a91c8a ata: fix ata_id_has_devslp()
fc61a0c820 ata: fix ata_id_sense_reporting_enabled() and ata_id_has_sense_reporting()
e3917c85f4 RDMA/siw: Always consume all skbuf data in sk_data_ready() upcall.
3a9d7d8dcf mtd: rawnand: fsl_elbc: Fix none ECC mode
f87f720811 mtd: devices: docg3: check the return value of devm_ioremap() in the probe
d06cc0e11d dyndbg: drop EXPORTed dynamic_debug_exec_queries
1d65985589 dyndbg: let query-modname override actual module name
c0e206da44 dyndbg: fix module.dyndbg handling
5047bd3bd7 dyndbg: fix static_branch manipulation
af12e209a9 dmaengine: hisilicon: Add multi-thread support for a DMA channel
d3fd838536 dmaengine: hisilicon: Fix CQ head update
d5065ca461 dmaengine: hisilicon: Disable channels when unregister hisi_dma
f59861946f fpga: prevent integer overflow in dfl_feature_ioctl_set_irq()
7ba19a60c7 misc: ocxl: fix possible refcount leak in afu_ioctl()
cf3bb86edd RDMA/rxe: Fix the error caused by qp->sk
cdce36a88d RDMA/rxe: Fix "kernel NULL pointer dereference" error
2630cc8832 media: xilinx: vipp: Fix refcount leak in xvip_graph_dma_init
40aa0999a3 media: meson: vdec: add missing clk_disable_unprepare on error in vdec_hevc_start()
551b87976a tty: xilinx_uartps: Fix the ignore_status
28cdf6c6fb media: exynos4-is: fimc-is: Add of_node_put() when breaking out of loop
1f683bff1a HSI: omap_ssi_port: Fix dma_map_sg error check
962f22e7f7 HSI: omap_ssi: Fix refcount leak in ssi_probe
70f0a0a27d clk: tegra20: Fix refcount leak in tegra20_clock_init
c01bfd23cc clk: tegra: Fix refcount leak in tegra114_clock_init
f487137a53 clk: tegra: Fix refcount leak in tegra210_clock_init
59e90c4d98 clk: sprd: Hold reference returned by of_get_parent()
57141b1dd6 clk: berlin: Add of_node_put() for of_get_parent()
dc190b46c6 clk: qoriq: Hold reference returned by of_get_parent()
baadc6f58f clk: oxnas: Hold reference returned by of_get_parent()
b95f4f9054 clk: meson: Hold reference returned by of_get_parent()
beec2f0255 usb: common: debug: Check non-standard control requests
9d965a22f6 usb: common: move function's kerneldoc next to its definition
20b63631a3 usb: common: add function to get interval expressed in us unit
c1ef8c66a3 usb: common: Parse for USB SSP genXxY
ffffb159e1 usb: ch9: Add USB 3.2 SSP attributes
aa7aada4b7 iio: ABI: Fix wrong format of differential capacitance channel ABI.
b9a0526cd0 iio: inkern: only release the device node when done with it
44ec4b04fc iio: adc: at91-sama5d2_adc: disable/prepare buffer on suspend/resume
513c72d76d iio: adc: at91-sama5d2_adc: lock around oversampling and sample freq
d259b90f0c iio: adc: at91-sama5d2_adc: check return status for pressure and touch
bc2b97e177 iio: adc: at91-sama5d2_adc: fix AT91_SAMA5D2_MR_TRACKTIM_MAX
5b9bb0cbd9 ARM: dts: exynos: fix polarity of VBUS GPIO of Origen
657de36c72 arm64: ftrace: fix module PLTs with mcount
40e966a404 ARM: Drop CMDLINE_* dependency on ATAGS
477dbf9d1b ARM: dts: exynos: correct s5k6a3 reset polarity on Midas family
5bbd3dd7f9 soc/tegra: fuse: Drop Kconfig dependency on TEGRA20_APB_DMA
09c35f1520 ia64: export memory_add_physaddr_to_nid to fix cxl build error
e31c0e14cf ARM: dts: kirkwood: lsxl: remove first ethernet port
df4f05b356 ARM: dts: kirkwood: lsxl: fix serial line
43faaedf3a ARM: dts: turris-omnia: Fix mpp26 pin name and comment
d5c2051898 soc: qcom: smem_state: Add refcounting for the 'state->of_node'
39781c98ad soc: qcom: smsm: Fix refcount leak bugs in qcom_smsm_probe()
1d312c12c9 memory: of: Fix refcount leak bug in of_lpddr3_get_ddr_timings()
daaec4b3fe memory: of: Fix refcount leak bug in of_get_ddr_timings()
fde46754d5 memory: pl353-smc: Fix refcount leak bug in pl353_smc_probe()
2c442b0c06 ALSA: hda/hdmi: Don't skip notification handling during PM operation
f182de42d7 ASoC: mt6660: Fix PM disable depth imbalance in mt6660_i2c_probe
37e3e01c9a ASoC: wm5102: Fix PM disable depth imbalance in wm5102_probe
fb23569699 ASoC: wm5110: Fix PM disable depth imbalance in wm5110_probe
c1b269dda1 ASoC: wm8997: Fix PM disable depth imbalance in wm8997_probe
71704c2e1b mmc: wmt-sdmmc: Fix an error handling path in wmt_mci_probe()
c940636d9c ALSA: dmaengine: increment buffer pointer atomically
4993c1511d ASoC: da7219: Fix an error handling path in da7219_register_dai_clks()
ef59819976 drm/msm/dp: correct 1.62G link rate at dp_catalog_ctrl_config_msa()
598d8f7d86 drm/msm/dpu: index dpu_kms->hw_vbif using vbif_idx
a9a60d6405 ASoC: eureka-tlv320: Hold reference returned from of_find_xxx API
ad0b8ed172 mmc: au1xmmc: Fix an error handling path in au1xmmc_probe()
1f340e1c1c drm/omap: dss: Fix refcount leak bugs
cbe37857dd ALSA: hda: beep: Simplify keep-power-at-enable behavior
f0fb0817eb ASoC: rsnd: Add check for rsnd_mod_power_on
877e92e9b1 drm/bridge: megachips: Fix a null pointer dereference bug
c577b4e972 drm: fix drm_mipi_dbi build errors
804d8e59f3 platform/x86: msi-laptop: Fix resource cleanup
c21c08fab7 platform/x86: msi-laptop: Fix old-ec check for backlight registering
b77755f58e ASoC: tas2764: Fix mute/unmute
2e6b64df54 ASoC: tas2764: Drop conflicting set_bias_level power setting
c2c6022e10 ASoC: tas2764: Allow mono streams
868fc93b61 platform/chrome: fix memory corruption in ioctl
84da5cdf43 platform/chrome: fix double-free in chromeos_laptop_prepare()
5e25bfcd12 drm:pl111: Add of_node_put() when breaking out of for_each_available_child_of_node()
ad06d6bed5 drm/dp_mst: fix drm_dp_dpcd_read return value checks
3f5889fd65 drm/bridge: parade-ps8640: Fix regulator supply order
45120fa5e5 drm/mipi-dsi: Detach devices when removing the host
050b650507 drm/bridge: Avoid uninitialized variable warning
7839f2b349 drm: bridge: adv7511: fix CEC power down control register offset
29f50bcf0f net: mvpp2: fix mvpp2 debugfs leak
6cb54f2162 once: add DO_ONCE_SLOW() for sleepable contexts
67cb80a9d2 net/ieee802154: reject zero-sized raw_sendmsg()
6cc0e2afc6 bnx2x: fix potential memory leak in bnx2x_tpa_stop()
da349221c4 net: rds: don't hold sock lock when cancelling work from rds_tcp_reset_callbacks()
d9e25dc053 spi: Ensure that sg_table won't be used after being freed
96a3ddb870 tcp: fix tcp_cwnd_validate() to not forget is_cwnd_limited
f65955340e sctp: handle the error returned from sctp_auth_asoc_init_active_key
2a1d036320 mISDN: fix use-after-free bugs in l1oip timer handlers
b4a5905fd2 vhost/vsock: Use kvmalloc/kvfree for larger packets.
d2b5dc3a53 wifi: rtl8xxxu: Fix AIFS written to REG_EDCA_*_PARAM
17196f2f98 spi: s3c64xx: Fix large transfers with DMA
b284e1fe15 netfilter: nft_fib: Fix for rpath check with VRF devices
b384e8fb16 Bluetooth: hci_core: Fix not handling link timeouts propertly
129f01116b i2c: mlxbf: support lock mechanism
534909fe3c spi/omap100k:Fix PM disable depth imbalance in omap1_spi100k_probe
9da61e7b59 spi: dw: Fix PM disable depth imbalance in dw_spi_bt1_probe
1ef5798638 x86/cpu: Include the header of init_ia32_feat_ctl()'s prototype
6ed7b05a35 x86/microcode/AMD: Track patch allocation size explicitly
07299e52e5 wifi: ath11k: fix number of VHT beamformee spatial streams
d7cc0d51ff Bluetooth: hci_{ldisc,serdev}: check percpu_init_rwsem() failure
ed403bcd97 bpf: Ensure correct locking around vulnerable function find_vpid()
2a1c29dc9b net: fs_enet: Fix wrong check in do_pd_setup
795954d751 wifi: rtl8xxxu: Remove copy-paste leftover in gen2_update_rate_mask
226e6f2412 wifi: rtl8xxxu: gen2: Fix mistake in path B IQ calibration
0a60ac7a0d bpf: btf: fix truncated last_member_type_id in btf_struct_resolve
8398a45d3d spi: meson-spicc: do not rely on busy flag in pow2 clk ops
351cf55595 wifi: rtl8xxxu: Fix skb misuse in TX queue selection
1e91179057 spi: qup: add missing clk_disable_unprepare on error in spi_qup_pm_resume_runtime()
7b83d11d48 spi: qup: add missing clk_disable_unprepare on error in spi_qup_resume()
5576008305 selftests/xsk: Avoid use-after-free on ctx
c823df0679 wifi: rtl8xxxu: tighten bounds checking in rtl8xxxu_read_efuse()
ea1b6b5409 Bluetooth: btusb: mediatek: fix WMT failure during runtime suspend
07194ccbb1 Bluetooth: btusb: fix excessive stack usage
cdadf95435 Bluetooth: btusb: Fine-tune mt7663 mechanism.
294395caac x86/resctrl: Fix to restore to original value when re-enabling hardware prefetch register
029a1de92c spi: mt7621: Fix an error message in mt7621_spi_probe()
2afb93e4e4 bpftool: Fix a wrong type cast in btf_dumper_int
61905bbb61 wifi: mac80211: allow bw change during channel switch in mesh
7565207066 leds: lm3601x: Don't use mutex after it was destroyed
08faf07717 wifi: ath10k: add peer map clean up for peer delete in ath10k_sta_state()
e060c4b9f3 nfsd: Fix a memory leak in an error handling path
730191a098 objtool: Preserve special st_shndx indexes in elf_update_symbol
84837738d4 ARM: 9247/1: mm: set readonly for MT_MEMORY_RO with ARM_LPAE
f1d6edeaa8 ARM: 9244/1: dump: Fix wrong pg_level in walk_pmd()
da2aecef86 MIPS: SGI-IP27: Fix platform-device leak in bridge_platform_create()
0c667858c0 MIPS: SGI-IP27: Free some unused memory
3598445698 sh: machvec: Use char[] for section boundaries
6e4be747f1 userfaultfd: open userfaultfds with O_RDONLY
28d9b39733 selinux: use "grep -E" instead of "egrep"
d11e09953c smb3: must initialize two ACL struct fields to zero
abd13b2100 drm/i915: Fix watermark calculations for gen12+ MC CCS modifier
fd37286f39 drm/i915: Fix watermark calculations for gen12+ RC CCS modifier
5d6093c49c drm/nouveau: fix a use-after-free in nouveau_gem_prime_import_sg_table()
57f1a89a8e drm/nouveau/kms/nv140-: Disable interlacing
d0febad83e staging: greybus: audio_helper: remove unused and wrong debugfs usage
ceeb8d4a43 KVM: VMX: Drop bits 31:16 when shoving exception error code into VMCS
83fe0b009b KVM: nVMX: Unconditionally purge queued/injected events on nested "exit"
085ca1d33b KVM: x86/emulator: Fix handing of POP SS to correctly set interruptibility
bda8120e5b media: cedrus: Set the platform driver data earlier
dbdd3b1448 efi: libstub: drop pointless get_memory_map() call
68158654b5 thunderbolt: Explicitly enable lane adapter hotplug events at startup
fc08f84381 tracing: Disable interrupt or preemption before acquiring arch_spinlock_t
0cf6c09daf ring-buffer: Fix race between reset page and reading page
588f02f8b9 ring-buffer: Add ring_buffer_wake_waiters()
586f02c500 ring-buffer: Check pending waiters when doing wake ups as well
6617e5132c ring-buffer: Have the shortest_full queue be the shortest not longest
4a3bbd40e4 ring-buffer: Allow splice to read previous partially read pages
f2ca4609d0 ftrace: Properly unset FTRACE_HASH_FL_MOD
846f041203 livepatch: fix race between fork and KLP transition
2189756eab ext4: update 'state->fc_regions_size' after successful memory allocation
2cfb769d60 ext4: fix potential memory leak in ext4_fc_record_regions()
c9ce7766dc ext4: fix potential memory leak in ext4_fc_record_modified_inode()
d575fb52c4 ext4: fix miss release buffer head in ext4_fc_write_inode
74d2a398d2 ext4: place buffer head allocation before handle start
fbb0e601bd ext4: ext4_read_bh_lock() should submit IO if the buffer isn't uptodate
0e1764ad71 ext4: don't increase iversion counter for ea_inodes
483831ad04 ext4: fix check for block being out of directory size
ac66db1a43 ext4: make ext4_lazyinit_thread freezable
f34ab95162 ext4: fix null-ptr-deref in ext4_write_info
fb98cb61ef ext4: avoid crash when inline data creation follows DIO write
e65506ff18 jbd2: add miss release buffer head in fc_do_one_pass()
1d4d16daec jbd2: fix potential use-after-free in jbd2_fc_wait_bufs
7a33dde572 jbd2: fix potential buffer head reference count leak
eea3e455a3 jbd2: wake up journal waiters in FIFO order, not LIFO
ba52e685d2 hardening: Remove Clang's enable flag for -ftrivial-auto-var-init=zero
bdcb1d7cf2 hardening: Avoid harmless Clang option under CONFIG_INIT_STACK_ALL_ZERO
d621a87064 hardening: Clarify Kconfig text for auto-var-init
4a8e8bf280 f2fs: fix to do sanity check on summary info
73fb4bd2c0 f2fs: fix to do sanity check on destination blkaddr during recovery
12014eaf1b f2fs: increase the limit for reserve_root
47b5ffe863 btrfs: fix race between quota enable and quota rescan ioctl
e504729496 fbdev: smscufx: Fix use-after-free in ufx_ops_open()
9931bd05bb scsi: qedf: Populate sysfs attributes for vport
102c4b6e8c powerpc/boot: Explicitly disable usage of SPE instructions
7db60fd46e powercap: intel_rapl: Use standard Energy Unit for SPR Dram RAPL domain
9119a92ad9 PCI: Sanitise firmware BAR assignments behind a PCI-PCI bridge
a3c08c0217 mm/mmap: undo ->mmap() when arch_validate_flags() fails
7d551b7d61 block: fix inflight statistics of part0
0a12979089 drm/udl: Restore display mode on resume
f134f261d7 drm/virtio: Check whether transferred 2D BO is shmem
303436e301 nvme-pci: set min_align_mask before calculating max_hw_sectors
6a73e6edcb UM: cpuinfo: Fix a warning for CONFIG_CPUMASK_OFFSTACK
1a053f597f riscv: Pass -mno-relax only on lld < 15.0.0
d15dca1d46 riscv: Make VM_WRITE imply VM_READ
d8c6f9b2e1 riscv: Allow PROT_WRITE-only mmap()
a6dcc6cfa2 parisc: fbdev/stifb: Align graphics memory size to 4MB
2ce9fab94b RISC-V: Make port I/O string accessors actually work
ffb571e123 regulator: qcom_rpm: Fix circular deferral regression
85909424a1 hwmon: (gsc-hwmon) Call of_node_get() before of_find_xxx API
8ef0e1c0ae ASoC: wcd934x: fix order of Slimbus unprepare/disable
9b2c82af65 ASoC: wcd9335: fix order of Slimbus unprepare/disable
1c20d672e3 platform/chrome: cros_ec_proto: Update version on GET_NEXT_EVENT failure
6b7ae4a904 quota: Check next/prev free block number after reading from quota file
5b1a56beb6 HID: multitouch: Add memory barriers
bfe60d7641 fs: dlm: handle -EBUSY first in lock arg validation
0b2d8e4db4 fs: dlm: fix race between test_bit() and queue_work()
057d5838c7 mmc: sdhci-sprd: Fix minimum clock limit
448fffc1ae can: kvaser_usb_leaf: Fix CAN state after restart
a3776e09b3 can: kvaser_usb_leaf: Fix TX queue out of sync after restart
0f8c88978d can: kvaser_usb_leaf: Fix overread with an invalid command
5d1cb7bfad can: kvaser_usb: Fix use of uninitialized completion
b239a0993a usb: add quirks for Lenovo OneLink+ Dock
afbbf305db iio: pressure: dps310: Reset chip after timeout
9daadd1d10 iio: pressure: dps310: Refactor startup procedure
ae49d80400 iio: adc: ad7923: fix channel readings for some variants
ea4dcd3d6a iio: ltc2497: Fix reading conversion results
30e1bd0d3e iio: dac: ad5593r: Fix i2c read protocol requirements
9312e04b6c cifs: Fix the error length of VALIDATE_NEGOTIATE_INFO message
64f23e5430 cifs: destage dirty pages before re-reading them for cache=none
50d3d89537 mtd: rawnand: atmel: Unmap streaming DMA mappings
e8eb44eeee ALSA: hda/realtek: Add Intel Reference SSID to support headset keys
4491fbd0a7 ALSA: hda/realtek: Add quirk for ASUS GV601R laptop
4285d06d12 ALSA: hda/realtek: Correct pin configs for ASUS G533Z
768cd2cd1a ALSA: hda/realtek: remove ALC289_FIXUP_DUAL_SPK for Dell 5530
3e29645fba ALSA: usb-audio: Fix NULL dererence at error path
bc1d16d282 ALSA: usb-audio: Fix potential memory leaks
ef1658bc48 ALSA: rawmidi: Drop register_mutex in snd_rawmidi_free()
026fcb6336 ALSA: oss: Fix potential deadlock at unregistration

Also update the .xml file to handle the few ABI changes in this merge
that required an update due to private pointers changing types and ABI
padding structures being used to preserve the ABI:

Leaf changes summary: 4 artifacts changed (1 filtered out)
Changed leaf types summary: 4 (1 filtered out) leaf types changed
Removed/Changed/Added functions summary: 0 Removed, 0 Changed, 0 Added function
Removed/Changed/Added variables summary: 0 Removed, 0 Changed, 0 Added variable

'struct fscrypt_info at fscrypt_private.h:195:1' changed:
  type size hasn't changed
  there are data member changes:
    type 'key*' of 'fscrypt_info::ci_master_key' changed:
      pointer type changed from: 'key*' to: 'fscrypt_master_key*'
  5197 impacted interfaces

'struct sk_buff at skbuff.h:717:1' changed:
  type size hasn't changed
  there are data member changes:
    data member u64 android_kabi_reserved1 at offset 1472 (in bits) became anonymous data member 'union {struct {__u8 scm_io_uring; __u8 android_kabi_reserved1_padding1; __u16 android_kabi_reserved1_padding2; __u32 android_kabi_reserved1_padding3;}; struct {u64 android_kabi_reserved1;}; union {};}'
  5197 impacted interfaces

'struct super_block at fs.h:1450:1' changed:
  type size hasn't changed
  there are data member changes:
    type 'key*' of 'super_block::s_master_keys' changed:
      pointer type changed from: 'key*' to: 'fscrypt_keyring*'
  5197 impacted interfaces

'struct tcp_sock at tcp.h:146:1' changed:
  type size hasn't changed
  one impacted interface

Change-Id: I6f2a7b91e1df96bede8aafa944a04b3e08ed33a1
Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
2023-01-21 12:06:54 +00:00

// SPDX-License-Identifier: GPL-2.0-only
/*
* Generic hugetlb support.
* (C) Nadia Yvette Chambers, April 2004
*/
#include <linux/list.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/mmu_notifier.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/memblock.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
#include <linux/string_helpers.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/jhash.h>
#include <linux/numa.h>
#include <linux/llist.h>
#include <linux/cma.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include <linux/page_owner.h>
#include "internal.h"
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
#ifdef CONFIG_CMA
static struct cma *hugetlb_cma[MAX_NUMNODES];
#endif
static unsigned long hugetlb_cma_size __initdata;
/*
* Minimum page order among possible hugepage sizes, set to a proper value
* at boot time.
*/
static unsigned int minimum_order __read_mostly = UINT_MAX;
__initdata LIST_HEAD(huge_boot_pages);
/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static bool __initdata parsed_valid_hugepagesz = true;
static bool __initdata parsed_default_hugepagesz;
/*
* Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
* free_huge_pages, and surplus_huge_pages.
*/
DEFINE_SPINLOCK(hugetlb_lock);
/*
* Serializes faults on the same logical page. This is used to
* prevent spurious OOMs when the hugepage pool is fully utilized.
*/
static int num_fault_mutexes;
struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
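/*
* These helpers track whether a hugetlb page sits on a free list by storing
* -1 in the page_private field of the subpage at head + 4.
*/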
static inline bool PageHugeFreed(struct page *head)
{
return page_private(head + 4) == -1UL;
}
static inline void SetPageHugeFreed(struct page *head)
{
set_page_private(head + 4, -1UL);
}
static inline void ClearPageHugeFreed(struct page *head)
{
set_page_private(head + 4, 0);
}
/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);
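/*
* Helper for subpool put operations. Called with spool->lock held; drops the
* lock and, if the subpool is no longer referenced and has no pages in use,
* returns its minimum-size reservation to the global pool and frees it.
*/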
static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
bool free = (spool->count == 0) && (spool->used_hpages == 0);
spin_unlock(&spool->lock);
/* If no pages are used, and no other handles to the subpool
* remain, give up any reservations based on minimum size and
* free the subpool */
if (free) {
if (spool->min_hpages != -1)
hugetlb_acct_memory(spool->hstate,
-spool->min_hpages);
kfree(spool);
}
}
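/*
* Allocate and initialise a subpool with the given maximum and minimum sizes
* (in huge pages). The minimum size is charged against the global pool up
* front; returns NULL if that charge or the allocation fails.
*/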
struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
long min_hpages)
{
struct hugepage_subpool *spool;
spool = kzalloc(sizeof(*spool), GFP_KERNEL);
if (!spool)
return NULL;
spin_lock_init(&spool->lock);
spool->count = 1;
spool->max_hpages = max_hpages;
spool->hstate = h;
spool->min_hpages = min_hpages;
if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
kfree(spool);
return NULL;
}
spool->rsv_hpages = min_hpages;
return spool;
}
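/* Drop a reference on a subpool; the last reference releases it. */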
void hugepage_put_subpool(struct hugepage_subpool *spool)
{
spin_lock(&spool->lock);
BUG_ON(!spool->count);
spool->count--;
unlock_or_release_subpool(spool);
}
/*
* Subpool accounting for allocating and reserving pages.
* Return -ENOMEM if there are not enough resources to satisfy the
* request. Otherwise, return the number of pages by which the
* global pools must be adjusted (upward). The returned value may
* only be different than the passed value (delta) in the case where
* a subpool minimum size must be maintained.
*/
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
long delta)
{
long ret = delta;
if (!spool)
return ret;
spin_lock(&spool->lock);
if (spool->max_hpages != -1) { /* maximum size accounting */
if ((spool->used_hpages + delta) <= spool->max_hpages)
spool->used_hpages += delta;
else {
ret = -ENOMEM;
goto unlock_ret;
}
}
/* minimum size accounting */
if (spool->min_hpages != -1 && spool->rsv_hpages) {
if (delta > spool->rsv_hpages) {
/*
* Asking for more reserves than those already taken on
* behalf of subpool. Return difference.
*/
ret = delta - spool->rsv_hpages;
spool->rsv_hpages = 0;
} else {
ret = 0; /* reserves already accounted for */
spool->rsv_hpages -= delta;
}
}
unlock_ret:
spin_unlock(&spool->lock);
return ret;
}
/*
* Subpool accounting for freeing and unreserving pages.
* Return the number of global page reservations that must be dropped.
* The return value may only be different than the passed value (delta)
* in the case where a subpool minimum size must be maintained.
*/
static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
long delta)
{
long ret = delta;
if (!spool)
return delta;
spin_lock(&spool->lock);
if (spool->max_hpages != -1) /* maximum size accounting */
spool->used_hpages -= delta;
/* minimum size accounting */
if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
if (spool->rsv_hpages + delta <= spool->min_hpages)
ret = 0;
else
ret = spool->rsv_hpages + delta - spool->min_hpages;
spool->rsv_hpages += delta;
if (spool->rsv_hpages > spool->min_hpages)
spool->rsv_hpages = spool->min_hpages;
}
/*
* If hugetlbfs_put_super couldn't free spool due to an outstanding
* quota reference, free it now.
*/
unlock_or_release_subpool(spool);
return ret;
}
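/* Look up the subpool backing a hugetlbfs inode or VMA. */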
static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
{
return HUGETLBFS_SB(inode->i_sb)->spool;
}
static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
{
return subpool_inode(file_inode(vma->vm_file));
}
/* Helper that removes a struct file_region from the resv_map cache and returns
* it for use.
*/
static struct file_region *
get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
{
struct file_region *nrg = NULL;
VM_BUG_ON(resv->region_cache_count <= 0);
resv->region_cache_count--;
nrg = list_first_entry(&resv->region_cache, struct file_region, link);
list_del(&nrg->link);
nrg->from = from;
nrg->to = to;
return nrg;
}
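/*
* Copy the hugetlb_cgroup uncharge info from one file_region to another,
* taking an extra css reference when cgroup accounting is in use.
*/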
static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
struct file_region *rg)
{
#ifdef CONFIG_CGROUP_HUGETLB
nrg->reservation_counter = rg->reservation_counter;
nrg->css = rg->css;
if (rg->css)
css_get(rg->css);
#endif
}
/* Helper that records hugetlb_cgroup uncharge info. */
static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
struct hstate *h,
struct resv_map *resv,
struct file_region *nrg)
{
#ifdef CONFIG_CGROUP_HUGETLB
if (h_cg) {
nrg->reservation_counter =
&h_cg->rsvd_hugepage[hstate_index(h)];
nrg->css = &h_cg->css;
/*
* The caller holds exactly one h_cg->css reference for the
* whole contiguous reservation region. But this area might be
* scattered when there are already some file_regions residing
* in it. As a result, many file_regions may share only one css
* reference. To ensure that each file_region holds exactly one
* h_cg->css reference, do a css_get for each file_region and
* leave the reference held by the caller untouched.
*/
css_get(&h_cg->css);
if (!resv->pages_per_hpage)
resv->pages_per_hpage = pages_per_huge_page(h);
/* pages_per_hpage should be the same for all entries in
* a resv_map.
*/
VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
} else {
nrg->reservation_counter = NULL;
nrg->css = NULL;
}
#endif
}
static void put_uncharge_info(struct file_region *rg)
{
#ifdef CONFIG_CGROUP_HUGETLB
if (rg->css)
css_put(rg->css);
#endif
}
static bool has_same_uncharge_info(struct file_region *rg,
struct file_region *org)
{
#ifdef CONFIG_CGROUP_HUGETLB
return rg && org &&
rg->reservation_counter == org->reservation_counter &&
rg->css == org->css;
#else
return true;
#endif
}
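/*
* Merge a newly inserted file_region with its neighbours when they are
* adjacent and carry the same cgroup uncharge info.
*/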
static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
{
struct file_region *nrg = NULL, *prg = NULL;
prg = list_prev_entry(rg, link);
if (&prg->link != &resv->regions && prg->to == rg->from &&
has_same_uncharge_info(prg, rg)) {
prg->to = rg->to;
list_del(&rg->link);
put_uncharge_info(rg);
kfree(rg);
rg = prg;
}
nrg = list_next_entry(rg, link);
if (&nrg->link != &resv->regions && nrg->from == rg->to &&
has_same_uncharge_info(nrg, rg)) {
nrg->from = rg->from;
list_del(&rg->link);
put_uncharge_info(rg);
kfree(rg);
}
}
/*
* Must be called with resv->lock held.
*
* Calling this with regions_needed != NULL will count the number of pages
* to be added but will not modify the linked list. And regions_needed will
* indicate the number of file_regions needed in the cache to carry out the
* addition of the regions for this range.
*/
static long add_reservation_in_range(struct resv_map *resv, long f, long t,
struct hugetlb_cgroup *h_cg,
struct hstate *h, long *regions_needed)
{
long add = 0;
struct list_head *head = &resv->regions;
long last_accounted_offset = f;
struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;
if (regions_needed)
*regions_needed = 0;
/* In this loop, we essentially handle an entry for the range
* [last_accounted_offset, rg->from), at every iteration, with some
* bounds checking.
*/
list_for_each_entry_safe(rg, trg, head, link) {
/* Skip irrelevant regions that start before our range. */
if (rg->from < f) {
/* If this region ends after the last accounted offset,
* then we need to update last_accounted_offset.
*/
if (rg->to > last_accounted_offset)
last_accounted_offset = rg->to;
continue;
}
/* When we find a region that starts beyond our range, we've
* finished.
*/
if (rg->from > t)
break;
/* Add an entry for last_accounted_offset -> rg->from, and
* update last_accounted_offset.
*/
if (rg->from > last_accounted_offset) {
add += rg->from - last_accounted_offset;
if (!regions_needed) {
nrg = get_file_region_entry_from_cache(
resv, last_accounted_offset, rg->from);
record_hugetlb_cgroup_uncharge_info(h_cg, h,
resv, nrg);
list_add(&nrg->link, rg->link.prev);
coalesce_file_region(resv, nrg);
} else
*regions_needed += 1;
}
last_accounted_offset = rg->to;
}
/* Handle the case where our range extends beyond
* last_accounted_offset.
*/
if (last_accounted_offset < t) {
add += t - last_accounted_offset;
if (!regions_needed) {
nrg = get_file_region_entry_from_cache(
resv, last_accounted_offset, t);
record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg);
list_add(&nrg->link, rg->link.prev);
coalesce_file_region(resv, nrg);
} else
*regions_needed += 1;
}
VM_BUG_ON(add < 0);
return add;
}
/* Must be called with resv->lock acquired. Will drop lock to allocate entries.
*/
static int allocate_file_region_entries(struct resv_map *resv,
int regions_needed)
__must_hold(&resv->lock)
{
struct list_head allocated_regions;
int to_allocate = 0, i = 0;
struct file_region *trg = NULL, *rg = NULL;
VM_BUG_ON(regions_needed < 0);
INIT_LIST_HEAD(&allocated_regions);
/*
* Check for sufficient descriptors in the cache to accommodate
* the number of in progress add operations plus regions_needed.
*
* This is a while loop because when we drop the lock, some other call
* to region_add or region_del may have consumed some region_entries,
* so we keep looping here until we finally have enough entries for
* (adds_in_progress + regions_needed).
*/
while (resv->region_cache_count <
(resv->adds_in_progress + regions_needed)) {
to_allocate = resv->adds_in_progress + regions_needed -
resv->region_cache_count;
/* At this point, we should have enough entries in the cache
* for all the existing adds_in_progress. We should only need
* to allocate for regions_needed.
*/
VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
spin_unlock(&resv->lock);
for (i = 0; i < to_allocate; i++) {
trg = kmalloc(sizeof(*trg), GFP_KERNEL);
if (!trg)
goto out_of_memory;
list_add(&trg->link, &allocated_regions);
}
spin_lock(&resv->lock);
list_splice(&allocated_regions, &resv->region_cache);
resv->region_cache_count += to_allocate;
}
return 0;
out_of_memory:
list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
list_del(&rg->link);
kfree(rg);
}
return -ENOMEM;
}
/*
* Add the huge page range represented by [f, t) to the reserve
* map. Regions will be taken from the cache to fill in this range.
* Sufficient regions should exist in the cache due to the previous
* call to region_chg with the same range, but in some cases the cache will not
* have sufficient entries due to races with other code doing region_add or
* region_del. The extra needed entries will be allocated.
*
* regions_needed is the out value provided by a previous call to region_chg.
*
* Return the number of new huge pages added to the map. This number is greater
* than or equal to zero. If file_region entries needed to be allocated for
* this operation and we were not able to allocate, it returns -ENOMEM.
* region_add of regions of length 1 never allocates file_regions and cannot
* fail; region_chg will always allocate at least 1 entry and a region_add for
* 1 page will only require at most 1 entry.
*/
static long region_add(struct resv_map *resv, long f, long t,
long in_regions_needed, struct hstate *h,
struct hugetlb_cgroup *h_cg)
{
long add = 0, actual_regions_needed = 0;
spin_lock(&resv->lock);
retry:
/* Count how many regions are actually needed to execute this add. */
add_reservation_in_range(resv, f, t, NULL, NULL,
&actual_regions_needed);
/*
* Check for sufficient descriptors in the cache to accommodate
* this add operation. Note that actual_regions_needed may be greater
* than in_regions_needed, as the resv_map may have been modified since
* the region_chg call. In this case, we need to make sure that we
* allocate extra entries, such that we have enough for all the
* existing adds_in_progress, plus the excess needed for this
* operation.
*/
if (actual_regions_needed > in_regions_needed &&
resv->region_cache_count <
resv->adds_in_progress +
(actual_regions_needed - in_regions_needed)) {
/* region_add operation of range 1 should never need to
* allocate file_region entries.
*/
VM_BUG_ON(t - f <= 1);
if (allocate_file_region_entries(
resv, actual_regions_needed - in_regions_needed)) {
return -ENOMEM;
}
goto retry;
}
add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);
resv->adds_in_progress -= in_regions_needed;
spin_unlock(&resv->lock);
VM_BUG_ON(add < 0);
return add;
}
/*
* Examine the existing reserve map and determine how many
* huge pages in the specified range [f, t) are NOT currently
* represented. This routine is called before a subsequent
* call to region_add that will actually modify the reserve
* map to add the specified range [f, t). region_chg does
* not change the number of huge pages represented by the
* map. A number of new file_region structures are added to the cache as
* placeholders for the subsequent region_add call to use. At least 1
* file_region structure is added.
*
* out_regions_needed is the number of regions added to the
* resv->adds_in_progress. This value needs to be provided to a follow up call
* to region_add or region_abort for proper accounting.
*
* Returns the number of huge pages that need to be added to the existing
* reservation map for the range [f, t). This number is greater than or equal
* to zero. -ENOMEM is returned if a new file_region structure or cache entry
* is needed and can not be allocated.
*/
static long region_chg(struct resv_map *resv, long f, long t,
long *out_regions_needed)
{
long chg = 0;
spin_lock(&resv->lock);
/* Count how many hugepages in this range are NOT represented. */
chg = add_reservation_in_range(resv, f, t, NULL, NULL,
out_regions_needed);
if (*out_regions_needed == 0)
*out_regions_needed = 1;
if (allocate_file_region_entries(resv, *out_regions_needed))
return -ENOMEM;
resv->adds_in_progress += *out_regions_needed;
spin_unlock(&resv->lock);
return chg;
}
/*
* Abort the in progress add operation. The adds_in_progress field
* of the resv_map keeps track of the operations in progress between
* calls to region_chg and region_add. Operations are sometimes
* aborted after the call to region_chg. In such cases, region_abort
* is called to decrement the adds_in_progress counter. regions_needed
* is the value returned by the region_chg call; it is used to decrement
* the adds_in_progress counter.
*
* NOTE: The range arguments [f, t) are not needed or used in this
* routine. They are kept to make reading the calling code easier as
* arguments will match the associated region_chg call.
*/
static void region_abort(struct resv_map *resv, long f, long t,
long regions_needed)
{
spin_lock(&resv->lock);
VM_BUG_ON(!resv->region_cache_count);
resv->adds_in_progress -= regions_needed;
spin_unlock(&resv->lock);
}
/*
* Delete the specified range [f, t) from the reserve map. If the
* t parameter is LONG_MAX, this indicates that ALL regions after f
* should be deleted. Locate the regions which intersect [f, t)
* and either trim, delete or split the existing regions.
*
* Returns the number of huge pages deleted from the reserve map.
* In the normal case, the return value is zero or more. In the
* case where a region must be split, a new region descriptor must
* be allocated. If the allocation fails, -ENOMEM will be returned.
* NOTE: If the parameter t == LONG_MAX, then we will never split
* a region and possibly return -ENOMEM. Callers specifying
* t == LONG_MAX do not need to check for -ENOMEM error.
*/
static long region_del(struct resv_map *resv, long f, long t)
{
struct list_head *head = &resv->regions;
struct file_region *rg, *trg;
struct file_region *nrg = NULL;
long del = 0;
retry:
spin_lock(&resv->lock);
list_for_each_entry_safe(rg, trg, head, link) {
/*
* Skip regions before the range to be deleted. file_region
* ranges are normally of the form [from, to). However, there
* may be a "placeholder" entry in the map which is of the form
* (from, to) with from == to. Check for placeholder entries
* at the beginning of the range to be deleted.
*/
if (rg->to <= f && (rg->to != rg->from || rg->to != f))
continue;
if (rg->from >= t)
break;
if (f > rg->from && t < rg->to) { /* Must split region */
/*
* Check for an entry in the cache before dropping
* lock and attempting allocation.
*/
if (!nrg &&
resv->region_cache_count > resv->adds_in_progress) {
nrg = list_first_entry(&resv->region_cache,
struct file_region,
link);
list_del(&nrg->link);
resv->region_cache_count--;
}
if (!nrg) {
spin_unlock(&resv->lock);
nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
if (!nrg)
return -ENOMEM;
goto retry;
}
del += t - f;
hugetlb_cgroup_uncharge_file_region(
resv, rg, t - f, false);
/* New entry for end of split region */
nrg->from = t;
nrg->to = rg->to;
copy_hugetlb_cgroup_uncharge_info(nrg, rg);
INIT_LIST_HEAD(&nrg->link);
/* Original entry is trimmed */
rg->to = f;
list_add(&nrg->link, &rg->link);
nrg = NULL;
break;
}
if (f <= rg->from && t >= rg->to) { /* Remove entire region */
del += rg->to - rg->from;
hugetlb_cgroup_uncharge_file_region(resv, rg,
rg->to - rg->from, true);
list_del(&rg->link);
kfree(rg);
continue;
}
if (f <= rg->from) { /* Trim beginning of region */
hugetlb_cgroup_uncharge_file_region(resv, rg,
t - rg->from, false);
del += t - rg->from;
rg->from = t;
} else { /* Trim end of region */
hugetlb_cgroup_uncharge_file_region(resv, rg,
rg->to - f, false);
del += rg->to - f;
rg->to = f;
}
}
spin_unlock(&resv->lock);
kfree(nrg);
return del;
}
/*
* A rare out-of-memory error was encountered which prevented removal of
* the reserve map region for a page. The huge page itself was freed
* and removed from the page cache. This routine will adjust the subpool
* usage count, and the global reserve count if needed. By incrementing
* these counts, the reserve map entry which could not be deleted will
* appear as a "reserved" entry instead of simply dangling with incorrect
* counts.
*/
void hugetlb_fix_reserve_counts(struct inode *inode)
{
struct hugepage_subpool *spool = subpool_inode(inode);
long rsv_adjust;
bool reserved = false;
rsv_adjust = hugepage_subpool_get_pages(spool, 1);
if (rsv_adjust > 0) {
struct hstate *h = hstate_inode(inode);
if (!hugetlb_acct_memory(h, 1))
reserved = true;
} else if (!rsv_adjust) {
reserved = true;
}
if (!reserved)
pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
}
/*
* Count and return the number of huge pages in the reserve map
* that intersect with the range [f, t).
*/
static long region_count(struct resv_map *resv, long f, long t)
{
struct list_head *head = &resv->regions;
struct file_region *rg;
long chg = 0;
spin_lock(&resv->lock);
/* Locate each segment we overlap with, and count that overlap. */
list_for_each_entry(rg, head, link) {
long seg_from;
long seg_to;
if (rg->to <= f)
continue;
if (rg->from >= t)
break;
seg_from = max(rg->from, f);
seg_to = min(rg->to, t);
chg += seg_to - seg_from;
}
spin_unlock(&resv->lock);
return chg;
}
/*
* Convert the address within this vma to the page offset within
* the mapping, in pagecache page units; huge pages here.
*/
static pgoff_t vma_hugecache_offset(struct hstate *h,
struct vm_area_struct *vma, unsigned long address)
{
return ((address - vma->vm_start) >> huge_page_shift(h)) +
(vma->vm_pgoff >> huge_page_order(h));
}
pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
unsigned long address)
{
return vma_hugecache_offset(hstate_vma(vma), vma, address);
}
EXPORT_SYMBOL_GPL(linear_hugepage_index);
/*
* Return the size of the pages allocated when backing a VMA. In the majority
* of cases this will be the same size as that used by the page table entries.
*/
unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
{
if (vma->vm_ops && vma->vm_ops->pagesize)
return vma->vm_ops->pagesize(vma);
return PAGE_SIZE;
}
EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
/*
* Return the page size being used by the MMU to back a VMA. In the majority
* of cases, the page size used by the kernel matches the MMU size. On
* architectures where it differs, an architecture-specific 'strong'
* version of this symbol is required.
*/
__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
return vma_kernel_pagesize(vma);
}
/*
* Flags for MAP_PRIVATE reservations. These are stored in the bottom
* bits of the reservation map pointer, which are always clear due to
* alignment.
*/
#define HPAGE_RESV_OWNER (1UL << 0)
#define HPAGE_RESV_UNMAPPED (1UL << 1)
#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
/*
* These helpers are used to track how many pages are reserved for
* faults in a MAP_PRIVATE mapping. Only the process that called mmap()
* is guaranteed to have its future faults succeed.
*
* With the exception of reset_vma_resv_huge_pages() which is called at fork(),
* the reserve counters are updated with the hugetlb_lock held. It is safe
* to reset the VMA at fork() time as it is not in use yet and there is no
* chance of the global counters getting corrupted as a result of the values.
*
* The private mapping reservation is represented in a subtly different
* manner to a shared mapping. A shared mapping has a region map associated
* with the underlying file, this region map represents the backing file
* pages which have ever had a reservation assigned which this persists even
* after the page is instantiated. A private mapping has a region map
* associated with the original mmap which is attached to all VMAs which
* reference it, this region map represents those offsets which have consumed
* reservation ie. where pages have been instantiated.
*/
static unsigned long get_vma_private_data(struct vm_area_struct *vma)
{
return (unsigned long)vma->vm_private_data;
}
static void set_vma_private_data(struct vm_area_struct *vma,
unsigned long value)
{
vma->vm_private_data = (void *)value;
}
static void
resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
struct hugetlb_cgroup *h_cg,
struct hstate *h)
{
#ifdef CONFIG_CGROUP_HUGETLB
if (!h_cg || !h) {
resv_map->reservation_counter = NULL;
resv_map->pages_per_hpage = 0;
resv_map->css = NULL;
} else {
resv_map->reservation_counter =
&h_cg->rsvd_hugepage[hstate_index(h)];
resv_map->pages_per_hpage = pages_per_huge_page(h);
resv_map->css = &h_cg->css;
}
#endif
}
struct resv_map *resv_map_alloc(void)
{
struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
if (!resv_map || !rg) {
kfree(resv_map);
kfree(rg);
return NULL;
}
kref_init(&resv_map->refs);
spin_lock_init(&resv_map->lock);
INIT_LIST_HEAD(&resv_map->regions);
resv_map->adds_in_progress = 0;
/*
* Initialize these to 0. On shared mappings, 0's here indicate these
* fields don't do cgroup accounting. On private mappings, these will be
* re-initialized to the proper values to indicate that hugetlb cgroup
* reservations are to be uncharged from here.
*/
resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);
INIT_LIST_HEAD(&resv_map->region_cache);
list_add(&rg->link, &resv_map->region_cache);
resv_map->region_cache_count = 1;
return resv_map;
}
void resv_map_release(struct kref *ref)
{
struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
struct list_head *head = &resv_map->region_cache;
struct file_region *rg, *trg;
/* Clear out any active regions before we release the map. */
region_del(resv_map, 0, LONG_MAX);
/* ... and any entries left in the cache */
list_for_each_entry_safe(rg, trg, head, link) {
list_del(&rg->link);
kfree(rg);
}
VM_BUG_ON(resv_map->adds_in_progress);
kfree(resv_map);
}
static inline struct resv_map *inode_resv_map(struct inode *inode)
{
/*
* At inode evict time, i_mapping may not point to the original
* address space within the inode. This original address space
* contains the pointer to the resv_map. So, always use the
* address space embedded within the inode.
* The VERY common case is inode->mapping == &inode->i_data, but
* this may not be true for device special inodes.
*/
return (struct resv_map *)(&inode->i_data)->private_data;
}
static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
{
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
if (vma->vm_flags & VM_MAYSHARE) {
struct address_space *mapping = vma->vm_file->f_mapping;
struct inode *inode = mapping->host;
return inode_resv_map(inode);
} else {
return (struct resv_map *)(get_vma_private_data(vma) &
~HPAGE_RESV_MASK);
}
}
static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
{
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
set_vma_private_data(vma, (get_vma_private_data(vma) &
HPAGE_RESV_MASK) | (unsigned long)map);
}
static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
{
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
set_vma_private_data(vma, get_vma_private_data(vma) | flags);
}
static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
{
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
return (get_vma_private_data(vma) & flag) != 0;
}
/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
if (!(vma->vm_flags & VM_MAYSHARE))
vma->vm_private_data = (void *)0;
}
/* Returns true if the VMA has associated reserve pages */
static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
{
if (vma->vm_flags & VM_NORESERVE) {
/*
* This address is already reserved by another process (chg == 0),
* so we should decrement the reserved count. Without decrementing,
* the reserve count remains after the inode is released, because this
* allocated page will go into the page cache and be regarded as
* coming from the reserved pool in the release step. Currently, we
* don't have any better solution to deal with this situation
* properly, so add a work-around here.
*/
if (vma->vm_flags & VM_MAYSHARE && chg == 0)
return true;
else
return false;
}
/* Shared mappings always use reserves */
if (vma->vm_flags & VM_MAYSHARE) {
/*
* We know VM_NORESERVE is not set. Therefore, there SHOULD
* be a region map for all pages. The only situation where
* there is no region map is if a hole was punched via
* fallocate. In this case, there really are no reserves to
* use. This situation is indicated if chg != 0.
*/
if (chg)
return false;
else
return true;
}
/*
* Only the process that called mmap() has reserves for
* private mappings.
*/
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
/*
* Like the shared case above, a hole punch or truncate
* could have been performed on the private mapping.
* Examine the value of chg to determine if reserves
* actually exist or were previously consumed.
* Very Subtle - The value of chg comes from a previous
* call to vma_needs_reserves(). The reserve map for
* private mappings has different (opposite) semantics
* than that of shared mappings. vma_needs_reserves()
* has already taken this difference in semantics into
* account. Therefore, the meaning of chg is the same
* as in the shared case above. Code could easily be
* combined, but keeping it separate draws attention to
* subtle differences.
*/
if (chg)
return false;
else
return true;
}
return false;
}
static void enqueue_huge_page(struct hstate *h, struct page *page)
{
int nid = page_to_nid(page);
list_move(&page->lru, &h->hugepage_freelists[nid]);
h->free_huge_pages++;
h->free_huge_pages_node[nid]++;
SetPageHugeFreed(page);
}
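/*
* Take a free huge page off the given node's free list, skipping
* hwpoisoned pages (and CMA pages when the caller must avoid CMA),
* move it to the active list and hand it out with a reference held.
* Callers hold hugetlb_lock.
*/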
static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
{
struct page *page;
bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA);
list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
if (nocma && is_migrate_cma_page(page))
continue;
if (PageHWPoison(page))
continue;
list_move(&page->lru, &h->hugepage_activelist);
set_page_refcounted(page);
ClearPageHugeFreed(page);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
return page;
}
return NULL;
}
static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
nodemask_t *nmask)
{
unsigned int cpuset_mems_cookie;
struct zonelist *zonelist;
struct zone *zone;
struct zoneref *z;
int node = NUMA_NO_NODE;
zonelist = node_zonelist(nid, gfp_mask);
retry_cpuset:
cpuset_mems_cookie = read_mems_allowed_begin();
for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
struct page *page;
if (!cpuset_zone_allowed(zone, gfp_mask))
continue;
/*
* No need to ask again on the same node. The pool is node aware
* rather than zone aware.
*/
if (zone_to_nid(zone) == node)
continue;
node = zone_to_nid(zone);
page = dequeue_huge_page_node_exact(h, node);
if (page)
return page;
}
if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
return NULL;
}
static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address, int avoid_reserve,
long chg)
{
struct page *page;
struct mempolicy *mpol;
gfp_t gfp_mask;
nodemask_t *nodemask;
int nid;
/*
* A child process with MAP_PRIVATE mappings created by its parent
* has no page reserves. This check ensures that reservations are
* not "stolen". The child may still get SIGKILLed.
*/
if (!vma_has_reserves(vma, chg) &&
h->free_huge_pages - h->resv_huge_pages == 0)
goto err;
/* If reserves cannot be used, ensure enough pages are in the pool */
if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
goto err;
gfp_mask = htlb_alloc_mask(h);
nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
SetPagePrivate(page);
h->resv_huge_pages--;
}
mpol_cond_put(mpol);
return page;
err:
return NULL;
}
/*
* common helper functions for hstate_next_node_to_{alloc|free}.
* We may have allocated or freed a huge page based on a different
* nodes_allowed previously, so h->next_node_to_{alloc|free} might
* be outside of *nodes_allowed. Ensure that we use an allowed
* node for alloc or free.
*/
static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
{
nid = next_node_in(nid, *nodes_allowed);
VM_BUG_ON(nid >= MAX_NUMNODES);
return nid;
}
static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
{
if (!node_isset(nid, *nodes_allowed))
nid = next_node_allowed(nid, nodes_allowed);
return nid;
}
/*
* Return the previously saved node ["this node"] from which to
* allocate a persistent huge page for the pool, and advance the
* next node from which to allocate, handling wrap at the end of the
* node mask.
*/
static int hstate_next_node_to_alloc(struct hstate *h,
nodemask_t *nodes_allowed)
{
int nid;
VM_BUG_ON(!nodes_allowed);
nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
return nid;
}
/*
* helper for free_pool_huge_page() - return the previously saved
* node ["this node"] from which to free a huge page. Advance the
* next node id whether or not we find a free huge page to free so
* that the next attempt to free addresses the next node.
*/
static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
{
int nid;
VM_BUG_ON(!nodes_allowed);
nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
return nid;
}
#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
for (nr_nodes = nodes_weight(*mask); \
nr_nodes > 0 && \
((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
nr_nodes--)
#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
for (nr_nodes = nodes_weight(*mask); \
nr_nodes > 0 && \
((node = hstate_next_node_to_free(hs, mask)) || 1); \
nr_nodes--)
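/*
* Usage sketch: a caller writes
*	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed)
* to visit each allowed node once, starting from the saved
* h->next_nid_to_alloc, so successive calls spread allocations (or
* frees, for the _to_free variant) round-robin across allowed nodes.
*/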
#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
static void destroy_compound_gigantic_page(struct page *page,
unsigned int order)
{
int i;
int nr_pages = 1 << order;
struct page *p = page + 1;
atomic_set(compound_mapcount_ptr(page), 0);
atomic_set(compound_pincount_ptr(page), 0);
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
clear_compound_head(p);
set_page_refcounted(p);
}
set_compound_order(page, 0);
page[1].compound_nr = 0;
__ClearPageHead(page);
}
static void free_gigantic_page(struct page *page, unsigned int order)
{
/*
* If the page isn't allocated using the cma allocator,
* cma_release() returns false.
*/
#ifdef CONFIG_CMA
if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
return;
#endif
free_contig_range(page_to_pfn(page), 1 << order);
}
#ifdef CONFIG_CONTIG_ALLOC
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
unsigned long nr_pages = 1UL << huge_page_order(h);
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
#ifdef CONFIG_CMA
{
struct page *page;
int node;
if (hugetlb_cma[nid]) {
page = cma_alloc(hugetlb_cma[nid], nr_pages,
huge_page_order(h),
GFP_KERNEL | __GFP_NOWARN);
if (page)
return page;
}
if (!(gfp_mask & __GFP_THISNODE)) {
for_each_node_mask(node, *nodemask) {
if (node == nid || !hugetlb_cma[node])
continue;
page = cma_alloc(hugetlb_cma[node], nr_pages,
huge_page_order(h),
GFP_KERNEL | __GFP_NOWARN);
if (page)
return page;
}
}
}
#endif
return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
}
#else /* !CONFIG_CONTIG_ALLOC */
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
return NULL;
}
#endif /* CONFIG_CONTIG_ALLOC */
#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
return NULL;
}
static inline void free_gigantic_page(struct page *page, unsigned int order) { }
static inline void destroy_compound_gigantic_page(struct page *page,
unsigned int order) { }
#endif
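/*
* Remove a huge page from its hstate's accounting, clear hugetlb specific
* page state and return the memory to the page allocator (or to CMA /
* the contiguous range allocator for gigantic pages). Called with
* hugetlb_lock held; the lock is temporarily dropped for gigantic pages.
*/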
static void update_and_free_page(struct hstate *h, struct page *page)
{
int i;
struct page *subpage = page;
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
h->nr_huge_pages--;
h->nr_huge_pages_node[page_to_nid(page)]--;
for (i = 0; i < pages_per_huge_page(h);
i++, subpage = mem_map_next(subpage, page, i)) {
subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
1 << PG_referenced | 1 << PG_dirty |
1 << PG_active | 1 << PG_private |
1 << PG_writeback);
}
VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
set_page_refcounted(page);
if (hstate_is_gigantic(h)) {
/*
* Temporarily drop the hugetlb_lock, because
* we might block in free_gigantic_page().
*/
spin_unlock(&hugetlb_lock);
destroy_compound_gigantic_page(page, huge_page_order(h));
free_gigantic_page(page, huge_page_order(h));
spin_lock(&hugetlb_lock);
} else {
__free_pages(page, huge_page_order(h));
}
}
struct hstate *size_to_hstate(unsigned long size)
{
struct hstate *h;
for_each_hstate(h) {
if (huge_page_size(h) == size)
return h;
}
return NULL;
}
/*
* Test to determine whether the hugepage is "active/in-use" (i.e. being linked
* to hstate->hugepage_activelist.)
*
* This function can be called for tail pages, but never returns true for them.
*/
bool page_huge_active(struct page *page)
{
return PageHeadHuge(page) && PagePrivate(&page[1]);
}
/* never called for tail page */
void set_page_huge_active(struct page *page)
{
VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
SetPagePrivate(&page[1]);
}
static void clear_page_huge_active(struct page *page)
{
VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
ClearPagePrivate(&page[1]);
}
/*
* Internal hugetlb specific page flag. Do not use outside of the hugetlb
* code
*/
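/*
* "Temporary" huge pages are allocated as migration targets or to back
* out of a surplus race; they are not accounted as surplus and are
* returned directly to the allocator on their last reference (see
* __free_huge_page() and alloc_migrate_huge_page()).
*/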
static inline bool PageHugeTemporary(struct page *page)
{
if (!PageHuge(page))
return false;
return (unsigned long)page[2].mapping == -1U;
}
static inline void SetPageHugeTemporary(struct page *page)
{
page[2].mapping = (void *)-1U;
}
static inline void ClearPageHugeTemporary(struct page *page)
{
page[2].mapping = NULL;
}
static void __free_huge_page(struct page *page)
{
/*
* Can't pass hstate in here because it is called from the
* compound page destructor.
*/
struct hstate *h = page_hstate(page);
int nid = page_to_nid(page);
struct hugepage_subpool *spool =
(struct hugepage_subpool *)page_private(page);
bool restore_reserve;
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(page_mapcount(page), page);
set_page_private(page, 0);
page->mapping = NULL;
restore_reserve = PagePrivate(page);
ClearPagePrivate(page);
/*
* If PagePrivate() was set on the page, page allocation consumed a
* reservation. If the page was associated with a subpool, there
* would have been a page reserved in the subpool before allocation
* via hugepage_subpool_get_pages(). Since we are 'restoring' the
* reservation, do not call hugepage_subpool_put_pages() as this will
* remove the reserved page from the subpool.
*/
if (!restore_reserve) {
/*
* A return code of zero implies that the subpool will be
* under its minimum size if the reservation is not restored
* after the page is freed. Therefore, force the restore_reserve
* operation.
*/
if (hugepage_subpool_put_pages(spool, 1) == 0)
restore_reserve = true;
}
spin_lock(&hugetlb_lock);
clear_page_huge_active(page);
hugetlb_cgroup_uncharge_page(hstate_index(h),
pages_per_huge_page(h), page);
hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
pages_per_huge_page(h), page);
if (restore_reserve)
h->resv_huge_pages++;
if (PageHugeTemporary(page)) {
list_del(&page->lru);
ClearPageHugeTemporary(page);
update_and_free_page(h, page);
} else if (h->surplus_huge_pages_node[nid]) {
/* remove the page from active list */
list_del(&page->lru);
update_and_free_page(h, page);
h->surplus_huge_pages--;
h->surplus_huge_pages_node[nid]--;
} else {
arch_clear_hugepage_flags(page);
enqueue_huge_page(h, page);
}
spin_unlock(&hugetlb_lock);
}
/*
* As free_huge_page() can be called from a non-task context, we have
* to defer the actual freeing in a workqueue to prevent potential
* hugetlb_lock deadlock.
*
* free_hpage_workfn() locklessly retrieves the linked list of pages to
* be freed and frees them one-by-one. As the page->mapping pointer is
* going to be cleared in __free_huge_page() anyway, it is reused as the
* llist_node structure of a lockless linked list of huge pages to be freed.
*/
static LLIST_HEAD(hpage_freelist);
static void free_hpage_workfn(struct work_struct *work)
{
struct llist_node *node;
struct page *page;
node = llist_del_all(&hpage_freelist);
while (node) {
page = container_of((struct address_space **)node,
struct page, mapping);
node = node->next;
__free_huge_page(page);
}
}
static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
void free_huge_page(struct page *page)
{
/*
* Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
*/
if (!in_task()) {
/*
* Only call schedule_work() if hpage_freelist was previously
* empty. Otherwise, schedule_work() has already been called but
* the workfn hasn't retrieved the list yet.
*/
if (llist_add((struct llist_node *)&page->mapping,
&hpage_freelist))
schedule_work(&free_hpage_work);
return;
}
__free_huge_page(page);
}
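/*
* Turn a freshly allocated compound page into a hugetlb page: set the
* hugetlb compound destructor, clear cgroup state and add the page to
* the hstate's counts under hugetlb_lock.
*/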
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
set_hugetlb_cgroup(page, NULL);
set_hugetlb_cgroup_rsvd(page, NULL);
spin_lock(&hugetlb_lock);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
ClearPageHugeFreed(page);
spin_unlock(&hugetlb_lock);
}
static void prep_compound_gigantic_page(struct page *page, unsigned int order)
{
int i;
int nr_pages = 1 << order;
struct page *p = page + 1;
/* we rely on prep_new_huge_page to set the destructor */
set_compound_order(page, order);
__ClearPageReserved(page);
__SetPageHead(page);
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
/*
* For gigantic hugepages allocated through bootmem at
* boot, it's safer to be consistent with the not-gigantic
* hugepages and clear the PG_reserved bit from all tail pages
* too. Otherwise drivers using get_user_pages() to access tail
* pages may get the reference counting wrong if they see
* PG_reserved set on a tail page (despite the head page not
* having PG_reserved set). Enforcing this consistency between
* head and tail pages allows drivers to optimize away a check
* on the head page when they need to know if put_page() is needed
* after get_user_pages().
*/
__ClearPageReserved(p);
set_page_count(p, 0);
set_compound_head(p, page);
}
atomic_set(compound_mapcount_ptr(page), -1);
atomic_set(compound_pincount_ptr(page), 0);
}
/*
* PageHuge() only returns true for hugetlbfs pages, but not for normal or
* transparent huge pages. See the PageTransHuge() documentation for more
* details.
*/
int PageHuge(struct page *page)
{
if (!PageCompound(page))
return 0;
page = compound_head(page);
return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
}
EXPORT_SYMBOL_GPL(PageHuge);
/*
* PageHeadHuge() only returns true for a hugetlbfs head page, but not for
* normal or transparent huge pages.
*/
int PageHeadHuge(struct page *page_head)
{
if (!PageHead(page_head))
return 0;
return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
}
/*
* Find and lock address space (mapping) in write mode.
*
* Upon entry, the page is locked which means that page_mapping() is
* stable. Due to locking order, we can only trylock_write. If we can
* not get the lock, simply return NULL to caller.
*/
struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
{
struct address_space *mapping = page_mapping(hpage);
if (!mapping)
return mapping;
if (i_mmap_trylock_write(mapping))
return mapping;
return NULL;
}
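/*
* Return the index of @page within its mapping in base (PAGE_SIZE) page
* units. Illustrative arithmetic: for a 2MB hstate (compound order 9),
* the 5th subpage of the huge page at pagecache index 3 yields
* (3 << 9) + 5 = 1541.
*/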
pgoff_t hugetlb_basepage_index(struct page *page)
{
struct page *page_head = compound_head(page);
pgoff_t index = page_index(page_head);
unsigned long compound_idx;
if (compound_order(page_head) >= MAX_ORDER)
compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
else
compound_idx = page - page_head;
return (index << compound_order(page_head)) + compound_idx;
}
static struct page *alloc_buddy_huge_page(struct hstate *h,
gfp_t gfp_mask, int nid, nodemask_t *nmask,
nodemask_t *node_alloc_noretry)
{
int order = huge_page_order(h);
struct page *page;
bool alloc_try_hard = true;
/*
* By default we always try hard to allocate the page with
* __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in
* a loop (to adjust global huge page counts) and previous allocation
* failed, do not continue to try hard on the same node. Use the
* node_alloc_noretry bitmap to manage this state information.
*/
if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
alloc_try_hard = false;
gfp_mask |= __GFP_COMP|__GFP_NOWARN;
if (alloc_try_hard)
gfp_mask |= __GFP_RETRY_MAYFAIL;
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
if (page)
__count_vm_event(HTLB_BUDDY_PGALLOC);
else
__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
/*
* If we did not specify __GFP_RETRY_MAYFAIL, but still got a page, this
* indicates an overall state change. Clear the bit so that we resume
* normal 'try hard' allocations.
*/
if (node_alloc_noretry && page && !alloc_try_hard)
node_clear(nid, *node_alloc_noretry);
/*
* If we tried hard to get a page but failed, set bit so that
* subsequent attempts will not try as hard until there is an
* overall state change.
*/
if (node_alloc_noretry && !page && alloc_try_hard)
node_set(nid, *node_alloc_noretry);
return page;
}
/*
* Common helper to allocate a fresh hugetlb page. All specific allocators
* should use this function to get new hugetlb pages
*/
static struct page *alloc_fresh_huge_page(struct hstate *h,
gfp_t gfp_mask, int nid, nodemask_t *nmask,
nodemask_t *node_alloc_noretry)
{
struct page *page;
if (hstate_is_gigantic(h))
page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
else
page = alloc_buddy_huge_page(h, gfp_mask,
nid, nmask, node_alloc_noretry);
if (!page)
return NULL;
if (hstate_is_gigantic(h))
prep_compound_gigantic_page(page, huge_page_order(h));
prep_new_huge_page(h, page, page_to_nid(page));
return page;
}
/*
* Allocates a fresh page to the hugetlb allocator pool in a node-interleaved
* manner.
*/
static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
nodemask_t *node_alloc_noretry)
{
struct page *page;
int nr_nodes, node;
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
node_alloc_noretry);
if (page)
break;
}
if (!page)
return 0;
put_page(page); /* free it into the hugepage allocator */
return 1;
}
/*
* Free a huge page from the pool, taken from the next node to free from.
* Attempt to keep persistent huge pages more or less
* balanced over the allowed nodes.
* Called with hugetlb_lock held.
*/
static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
bool acct_surplus)
{
int nr_nodes, node;
int ret = 0;
for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
/*
* If we're returning unused surplus pages, only examine
* nodes with surplus pages.
*/
if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
!list_empty(&h->hugepage_freelists[node])) {
struct page *page =
list_entry(h->hugepage_freelists[node].next,
struct page, lru);
list_del(&page->lru);
h->free_huge_pages--;
h->free_huge_pages_node[node]--;
if (acct_surplus) {
h->surplus_huge_pages--;
h->surplus_huge_pages_node[node]--;
}
update_and_free_page(h, page);
ret = 1;
break;
}
}
return ret;
}
/*
* Dissolve a given free hugepage into free buddy pages. This function does
* nothing for in-use hugepages and non-hugepages.
* This function returns the following values:
*
* -EBUSY: failed to dissolve the free hugepage, or the hugepage is in use
* (allocated or reserved)
* 0: successfully dissolved the free hugepage, or the page is not a
* hugepage (considered as already dissolved)
*/
int dissolve_free_huge_page(struct page *page)
{
int rc = -EBUSY;
retry:
/* Do not disrupt the normal path by needlessly taking hugetlb_lock */
if (!PageHuge(page))
return 0;
spin_lock(&hugetlb_lock);
if (!PageHuge(page)) {
rc = 0;
goto out;
}
if (!page_count(page)) {
struct page *head = compound_head(page);
struct hstate *h = page_hstate(head);
int nid = page_to_nid(head);
if (h->free_huge_pages - h->resv_huge_pages == 0)
goto out;
/*
* We should make sure that the page is already on the free list
* when it is dissolved.
*/
if (unlikely(!PageHugeFreed(head))) {
spin_unlock(&hugetlb_lock);
cond_resched();
/*
* Theoretically, we should return -EBUSY when we
* encounter this race. In fact, we have a chance
* to successfully dissolve the page if we retry,
* because the race window is quite small. Seizing
* this opportunity is an optimization that increases
* the success rate of dissolving the page.
*/
goto retry;
}
/*
* Move the PageHWPoison flag from the head page to the raw error page,
* which makes any subpages other than the error page reusable.
*/
if (PageHWPoison(head) && page != head) {
SetPageHWPoison(page);
ClearPageHWPoison(head);
}
list_del(&head->lru);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
h->max_huge_pages--;
update_and_free_page(h, head);
rc = 0;
}
out:
spin_unlock(&hugetlb_lock);
return rc;
}
/*
* Dissolve free hugepages in a given pfn range. Used by memory hotplug to
* make specified memory blocks removable from the system.
* Note that this will dissolve a free gigantic hugepage completely, if any
* part of it lies within the given range.
* Also note that if dissolve_free_huge_page() returns with an error, all
* free hugepages that were dissolved before that error are lost.
*/
int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pfn;
struct page *page;
int rc = 0;
if (!hugepages_supported())
return rc;
for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
page = pfn_to_page(pfn);
rc = dissolve_free_huge_page(page);
if (rc)
break;
}
return rc;
}
/*
* Allocates a fresh surplus page from the page allocator.
*/
static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nmask)
{
struct page *page = NULL;
if (hstate_is_gigantic(h))
return NULL;
spin_lock(&hugetlb_lock);
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
goto out_unlock;
spin_unlock(&hugetlb_lock);
page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
if (!page)
return NULL;
spin_lock(&hugetlb_lock);
/*
* We could have raced with a pool size change.
* Double check that and simply deallocate the new page
* if we would end up overcommitting the surplus. Abuse
* the temporary page mechanism to work around the nasty
* free_huge_page code flow.
*/
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
SetPageHugeTemporary(page);
spin_unlock(&hugetlb_lock);
put_page(page);
return NULL;
} else {
h->surplus_huge_pages++;
h->surplus_huge_pages_node[page_to_nid(page)]++;
}
out_unlock:
spin_unlock(&hugetlb_lock);
return page;
}
static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nmask)
{
struct page *page;
if (hstate_is_gigantic(h))
return NULL;
page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
if (!page)
return NULL;
/*
* We do not account these pages as surplus because they are only
* temporary and will be released properly on the last reference
*/
SetPageHugeTemporary(page);
return page;
}
/*
* Use the VMA's mpolicy to allocate a huge page from the buddy.
*/
static
struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
struct page *page;
struct mempolicy *mpol;
gfp_t gfp_mask = htlb_alloc_mask(h);
int nid;
nodemask_t *nodemask;
nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
mpol_cond_put(mpol);
return page;
}
/* page migration callback function */
struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
nodemask_t *nmask, gfp_t gfp_mask)
{
spin_lock(&hugetlb_lock);
if (h->free_huge_pages - h->resv_huge_pages > 0) {
struct page *page;
page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
if (page) {
spin_unlock(&hugetlb_lock);
return page;
}
}
spin_unlock(&hugetlb_lock);
return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
}
/* mempolicy aware migration callback */
struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
unsigned long address)
{
struct mempolicy *mpol;
nodemask_t *nodemask;
struct page *page;
gfp_t gfp_mask;
int node;
gfp_mask = htlb_alloc_mask(h);
node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
mpol_cond_put(mpol);
return page;
}
/*
* Increase the hugetlb pool such that it can accommodate a reservation
* of size 'delta'.
*/
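/*
* Illustrative numbers: with resv_huge_pages == 10, free_huge_pages == 8
* and delta == 4, needed == (10 + 4) - 8 == 6, so six surplus pages must
* be allocated before the additional reservation can be committed.
*/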
static int gather_surplus_pages(struct hstate *h, int delta)
__must_hold(&hugetlb_lock)
{
struct list_head surplus_list;
struct page *page, *tmp;
int ret, i;
int needed, allocated;
bool alloc_ok = true;
needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
if (needed <= 0) {
h->resv_huge_pages += delta;
return 0;
}
allocated = 0;
INIT_LIST_HEAD(&surplus_list);
ret = -ENOMEM;
retry:
spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) {
page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
NUMA_NO_NODE, NULL);
if (!page) {
alloc_ok = false;
break;
}
list_add(&page->lru, &surplus_list);
cond_resched();
}
allocated += i;
/*
* After retaking hugetlb_lock, we need to recalculate 'needed'
* because either resv_huge_pages or free_huge_pages may have changed.
*/
spin_lock(&hugetlb_lock);
needed = (h->resv_huge_pages + delta) -
(h->free_huge_pages + allocated);
if (needed > 0) {
if (alloc_ok)
goto retry;
/*
* We were not able to allocate enough pages to
* satisfy the entire reservation so we free what
* we've allocated so far.
*/
goto free;
}
/*
* The surplus_list now contains _at_least_ the number of extra pages
* needed to accommodate the reservation. Add the appropriate number
* of pages to the hugetlb pool and free the extras back to the buddy
* allocator. Commit the entire reservation here to prevent another
* process from stealing the pages as they are added to the pool but
* before they are reserved.
*/
needed += allocated;
h->resv_huge_pages += delta;
ret = 0;
/* Free the needed pages to the hugetlb pool */
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
if ((--needed) < 0)
break;
/*
* This page is now managed by the hugetlb allocator and has
* no users -- drop the buddy allocator's reference.
*/
put_page_testzero(page);
VM_BUG_ON_PAGE(page_count(page), page);
enqueue_huge_page(h, page);
}
free:
spin_unlock(&hugetlb_lock);
/* Free unnecessary surplus pages to the buddy allocator */
list_for_each_entry_safe(page, tmp, &surplus_list, lru)
put_page(page);
spin_lock(&hugetlb_lock);
return ret;
}
/*
* This routine has two main purposes:
* 1) Decrement the reservation count (resv_huge_pages) by the value passed
* in unused_resv_pages. This corresponds to the prior adjustments made
* to the associated reservation map.
* 2) Free any unused surplus pages that may have been allocated to satisfy
* the reservation. As many as unused_resv_pages may be freed.
*
* Called with hugetlb_lock held. However, the lock could be dropped (and
* reacquired) during calls to cond_resched_lock. Whenever dropping the lock,
* we must make sure nobody else can claim pages we are in the process of
* freeing. Do this by ensuring resv_huge_pages is always greater than the
* number of huge pages we still plan to free when dropping the lock.
*/
static void return_unused_surplus_pages(struct hstate *h,
unsigned long unused_resv_pages)
{
unsigned long nr_pages;
/* Cannot return gigantic pages currently */
if (hstate_is_gigantic(h))
goto out;
/*
* Part (or even all) of the reservation could have been backed
* by pre-allocated pages. Only free surplus pages.
*/
nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
/*
* We want to release as many surplus pages as possible, spread
* evenly across all nodes with memory. Iterate across these nodes
* until we can no longer free unreserved surplus pages. This occurs
* when the nodes with surplus pages have no free pages.
* free_pool_huge_page() will balance the freed pages across the
* on-line nodes with memory and will handle the hstate accounting.
*
* Note that we decrement resv_huge_pages as we free the pages. If
* we drop the lock, resv_huge_pages will still be sufficiently large
* to cover subsequent pages we may free.
*/
while (nr_pages--) {
h->resv_huge_pages--;
unused_resv_pages--;
if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
goto out;
cond_resched_lock(&hugetlb_lock);
}
out:
/* Fully uncommit the reservation */
h->resv_huge_pages -= unused_resv_pages;
}
/*
* vma_needs_reservation, vma_commit_reservation and vma_end_reservation
* are used by the huge page allocation routines to manage reservations.
*
* vma_needs_reservation is called to determine if the huge page at addr
* within the vma has an associated reservation. If a reservation is
* needed, the value 1 is returned. The caller is then responsible for
* managing the global reservation and subpool usage counts. After
* the huge page has been allocated, vma_commit_reservation is called
* to add the page to the reservation map. If the page allocation fails,
* the reservation must be ended instead of committed. vma_end_reservation
* is called in such cases.
*
* In the normal case, vma_commit_reservation returns the same value
* as the preceding vma_needs_reservation call. The only time this
* is not the case is if a reserve map was changed between calls. It
* is the responsibility of the caller to notice the difference and
* take appropriate action.
*
* vma_add_reservation is used in error paths where a reservation must
* be restored when a newly allocated huge page must be freed. It is
* to be called after calling vma_needs_reservation to determine if a
* reservation exists.
*/
enum vma_resv_mode {
VMA_NEEDS_RESV,
VMA_COMMIT_RESV,
VMA_END_RESV,
VMA_ADD_RESV,
};
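/*
* Each mode is reached through a thin wrapper below: vma_needs_reservation()
* passes VMA_NEEDS_RESV, vma_commit_reservation() VMA_COMMIT_RESV,
* vma_end_reservation() VMA_END_RESV and vma_add_reservation() VMA_ADD_RESV.
*/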
static long __vma_reservation_common(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr,
enum vma_resv_mode mode)
{
struct resv_map *resv;
pgoff_t idx;
long ret;
long dummy_out_regions_needed;
resv = vma_resv_map(vma);
if (!resv)
return 1;
idx = vma_hugecache_offset(h, vma, addr);
switch (mode) {
case VMA_NEEDS_RESV:
ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed);
/* We assume that vma_reservation_* routines always operate on
* 1 page, and that adding a 1 page entry to the resv map can only
* ever require 1 region.
*/
VM_BUG_ON(dummy_out_regions_needed != 1);
break;
case VMA_COMMIT_RESV:
ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
/* region_add calls of range 1 should never fail. */
VM_BUG_ON(ret < 0);
break;
case VMA_END_RESV:
region_abort(resv, idx, idx + 1, 1);
ret = 0;
break;
case VMA_ADD_RESV:
if (vma->vm_flags & VM_MAYSHARE) {
ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
/* region_add calls of range 1 should never fail. */
VM_BUG_ON(ret < 0);
} else {
region_abort(resv, idx, idx + 1, 1);
ret = region_del(resv, idx, idx + 1);
}
break;
default:
BUG();
}
if (vma->vm_flags & VM_MAYSHARE)
return ret;
else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
/*
* In most cases, reserves always exist for private mappings.
* However, a file associated with the mapping could have been
* hole punched or truncated after reserves were consumed, so
* a subsequent fault on such a range will not use reserves.
* Subtle - The reserve map for private mappings has the
* opposite meaning than that of shared mappings. If NO
* entry is in the reserve map, it means a reservation exists.
* If an entry exists in the reserve map, it means the
* reservation has already been consumed. As a result, the
* return value of this routine is the opposite of the
* value returned from reserve map manipulation routines above.
*/
if (ret)
return 0;
else
return 1;
}
else
return ret < 0 ? ret : 0;
}
static long vma_needs_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
}
static long vma_commit_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
}
static void vma_end_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
(void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
}
static long vma_add_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
}
/*
* This routine is called to restore a reservation on error paths. In the
* specific error paths, a huge page was allocated (via alloc_huge_page)
* and is about to be freed. If a reservation for the page existed,
* alloc_huge_page would have consumed the reservation and set PagePrivate
* in the newly allocated page. When the page is freed via free_huge_page,
* the global reservation count will be incremented if PagePrivate is set.
* However, free_huge_page cannot adjust the reserve map. Adjust the
* reserve map here to be consistent with global reserve count adjustments
* to be made by free_huge_page.
*/
static void restore_reserve_on_error(struct hstate *h,
struct vm_area_struct *vma, unsigned long address,
struct page *page)
{
if (unlikely(PagePrivate(page))) {
long rc = vma_needs_reservation(h, vma, address);
if (unlikely(rc < 0)) {
/*
* Rare out of memory condition in reserve map
* manipulation. Clear PagePrivate so that
* global reserve count will not be incremented
* by free_huge_page. This will make it appear
* as though the reservation for this page was
* consumed. This may prevent the task from
* faulting in the page at a later time. This
* is better than inconsistent global huge page
* accounting of reserve counts.
*/
ClearPagePrivate(page);
} else if (rc) {
rc = vma_add_reservation(h, vma, address);
if (unlikely(rc < 0))
/*
* See above comment about rare out of
* memory condition.
*/
ClearPagePrivate(page);
} else
vma_end_reservation(h, vma, address);
}
}
struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve)
{
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct page *page;
long map_chg, map_commit;
long gbl_chg;
int ret, idx;
struct hugetlb_cgroup *h_cg;
bool deferred_reserve;
idx = hstate_index(h);
/*
* Examine the region/reserve map to determine if the process
* has a reservation for the page to be allocated. A return
* code of zero indicates a reservation exists (no change).
*/
map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
if (map_chg < 0)
return ERR_PTR(-ENOMEM);
/*
* Processes that did not create the mapping will have no
* reserves as indicated by the region/reserve map. Check
* that the allocation will not exceed the subpool limit.
* Allocations for MAP_NORESERVE mappings also need to be
* checked against any subpool limit.
*/
if (map_chg || avoid_reserve) {
gbl_chg = hugepage_subpool_get_pages(spool, 1);
if (gbl_chg < 0) {
vma_end_reservation(h, vma, addr);
return ERR_PTR(-ENOSPC);
}
/*
* Even though there was no reservation in the region/reserve
* map, there could be reservations associated with the
* subpool that can be used. This would be indicated if the
* return value of hugepage_subpool_get_pages() is zero.
* However, if avoid_reserve is specified we still avoid even
* the subpool reservations.
*/
if (avoid_reserve)
gbl_chg = 1;
}
/* If this allocation is not consuming a reservation, charge it now.
*/
deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma);
if (deferred_reserve) {
ret = hugetlb_cgroup_charge_cgroup_rsvd(
idx, pages_per_huge_page(h), &h_cg);
if (ret)
goto out_subpool_put;
}
ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
if (ret)
goto out_uncharge_cgroup_reservation;
spin_lock(&hugetlb_lock);
/*
* gbl_chg is passed to indicate whether or not a page must be taken
* from the global free pool (global change). gbl_chg == 0 indicates
* a reservation exists for the allocation.
*/
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
if (!page) {
spin_unlock(&hugetlb_lock);
page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
if (!page)
goto out_uncharge_cgroup;
spin_lock(&hugetlb_lock);
if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
SetPagePrivate(page);
h->resv_huge_pages--;
}
list_add(&page->lru, &h->hugepage_activelist);
/* Fall through */
}
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
/* If allocation is not consuming a reservation, also store the
* hugetlb_cgroup pointer on the page.
*/
if (deferred_reserve) {
hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
h_cg, page);
}
spin_unlock(&hugetlb_lock);
set_page_private(page, (unsigned long)spool);
map_commit = vma_commit_reservation(h, vma, addr);
if (unlikely(map_chg > map_commit)) {
/*
* The page was added to the reservation map between
* vma_needs_reservation and vma_commit_reservation.
* This indicates a race with hugetlb_reserve_pages.
* Adjust for the subpool count incremented above AND
* in hugetlb_reserve_pages for the same page. Also,
* the reservation count added in hugetlb_reserve_pages
* no longer applies.
*/
long rsv_adjust;
rsv_adjust = hugepage_subpool_put_pages(spool, 1);
hugetlb_acct_memory(h, -rsv_adjust);
if (deferred_reserve)
hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
pages_per_huge_page(h), page);
}
return page;
out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
out_uncharge_cgroup_reservation:
if (deferred_reserve)
hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
h_cg);
out_subpool_put:
if (map_chg || avoid_reserve)
hugepage_subpool_put_pages(spool, 1);
vma_end_reservation(h, vma, addr);
return ERR_PTR(-ENOSPC);
}
int alloc_bootmem_huge_page(struct hstate *h)
__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
int __alloc_bootmem_huge_page(struct hstate *h)
{
struct huge_bootmem_page *m;
int nr_nodes, node;
for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
void *addr;
addr = memblock_alloc_try_nid_raw(
huge_page_size(h), huge_page_size(h),
0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
if (addr) {
/*
* Use the beginning of the huge page to store the
* huge_bootmem_page struct (until gather_bootmem
* puts them into the mem_map).
*/
m = addr;
goto found;
}
}
return 0;
found:
BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
/* Put them into a private list first because mem_map is not up yet */
INIT_LIST_HEAD(&m->list);
list_add(&m->list, &huge_boot_pages);
m->hstate = h;
return 1;
}
/*
* Put bootmem huge pages into the standard lists after mem_map is up.
* Note: This only applies to gigantic (order >= MAX_ORDER) pages.
*/
static void __init gather_bootmem_prealloc(void)
{
struct huge_bootmem_page *m;
list_for_each_entry(m, &huge_boot_pages, list) {
struct page *page = virt_to_page(m);
struct hstate *h = m->hstate;
VM_BUG_ON(!hstate_is_gigantic(h));
WARN_ON(page_count(page) != 1);
prep_compound_gigantic_page(page, huge_page_order(h));
WARN_ON(PageReserved(page));
prep_new_huge_page(h, page, page_to_nid(page));
put_page(page); /* free it into the hugepage allocator */
/*
* We need to restore the 'stolen' pages to totalram_pages
* in order to fix confusing memory reports from free(1) and
* other side-effects, like CommitLimit going negative.
*/
adjust_managed_page_count(page, pages_per_huge_page(h));
cond_resched();
}
}
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
{
unsigned long i;
nodemask_t *node_alloc_noretry;
if (!hstate_is_gigantic(h)) {
/*
* Bit mask controlling how hard we retry per-node allocations.
* Ignore errors as lower level routines can deal with
* node_alloc_noretry == NULL. If this kmalloc fails at boot
* time, we are likely in bigger trouble.
*/
node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
GFP_KERNEL);
} else {
/* allocations done at boot time */
node_alloc_noretry = NULL;
}
/* bit mask controlling how hard we retry per-node allocations */
if (node_alloc_noretry)
nodes_clear(*node_alloc_noretry);
for (i = 0; i < h->max_huge_pages; ++i) {
if (hstate_is_gigantic(h)) {
if (hugetlb_cma_size) {
pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
goto free;
}
if (!alloc_bootmem_huge_page(h))
break;
} else if (!alloc_pool_huge_page(h,
&node_states[N_MEMORY],
node_alloc_noretry))
break;
cond_resched();
}
if (i < h->max_huge_pages) {
char buf[32];
string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n",
h->max_huge_pages, buf, i);
h->max_huge_pages = i;
}
free:
kfree(node_alloc_noretry);
}
static void __init hugetlb_init_hstates(void)
{
struct hstate *h;
for_each_hstate(h) {
if (minimum_order > huge_page_order(h))
minimum_order = huge_page_order(h);
/* oversize hugepages were init'ed in early boot */
if (!hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
}
VM_BUG_ON(minimum_order == UINT_MAX);
}
static void __init report_hugepages(void)
{
struct hstate *h;
for_each_hstate(h) {
char buf[32];
string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
buf, h->free_huge_pages);
}
}
#ifdef CONFIG_HIGHMEM
static void try_to_free_low(struct hstate *h, unsigned long count,
nodemask_t *nodes_allowed)
{
int i;
if (hstate_is_gigantic(h))
return;
for_each_node_mask(i, *nodes_allowed) {
struct page *page, *next;
struct list_head *freel = &h->hugepage_freelists[i];
list_for_each_entry_safe(page, next, freel, lru) {
if (count >= h->nr_huge_pages)
return;
if (PageHighMem(page))
continue;
list_del(&page->lru);
update_and_free_page(h, page);
h->free_huge_pages--;
h->free_huge_pages_node[page_to_nid(page)]--;
}
}
}
#else
static inline void try_to_free_low(struct hstate *h, unsigned long count,
nodemask_t *nodes_allowed)
{
}
#endif
/*
* Increment or decrement surplus_huge_pages. Keep node-specific counters
* balanced by operating on them in a round-robin fashion.
* Returns 1 if an adjustment was made.
*/
static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
int delta)
{
int nr_nodes, node;
VM_BUG_ON(delta != -1 && delta != 1);
if (delta < 0) {
for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
if (h->surplus_huge_pages_node[node])
goto found;
}
} else {
for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
if (h->surplus_huge_pages_node[node] <
h->nr_huge_pages_node[node])
goto found;
}
}
return 0;
found:
h->surplus_huge_pages += delta;
h->surplus_huge_pages_node[node] += delta;
return 1;
}
#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
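/*
* Resize the persistent huge page pool of @h to @count pages, restricted
* to the nodes in @nodes_allowed (@nid selects a node specific request,
* NUMA_NO_NODE means a global one). Returns 0 on success, -ENOMEM if the
* retry bitmask cannot be allocated, or -EINVAL for unsupported gigantic
* page growth.
*/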
static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
nodemask_t *nodes_allowed)
{
unsigned long min_count, ret;
NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
/*
* Bit mask controlling how hard we retry per-node allocations.
* If we can not allocate the bit mask, do not attempt to allocate
* the requested huge pages.
*/
if (node_alloc_noretry)
nodes_clear(*node_alloc_noretry);
else
return -ENOMEM;
spin_lock(&hugetlb_lock);
/*
* Check for a node specific request.
* Changing node specific huge page count may require a corresponding
* change to the global count. In any case, the passed node mask
* (nodes_allowed) will restrict alloc/free to the specified node.
*/
if (nid != NUMA_NO_NODE) {
unsigned long old_count = count;
count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
/*
* User may have specified a large count value which caused the
* above calculation to overflow. In this case, they wanted
* to allocate as many huge pages as possible. Set count to
* largest possible value to align with their intention.
*/
if (count < old_count)
count = ULONG_MAX;
}
/*
* Runtime allocation of gigantic pages depends on the ability to allocate
* large contiguous page ranges.
* If the system does not provide this feature, return an error when
* the user tries to allocate gigantic pages, but let the user free the
* boot-time allocated gigantic pages.
*/
if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
if (count > persistent_huge_pages(h)) {
spin_unlock(&hugetlb_lock);
NODEMASK_FREE(node_alloc_noretry);
return -EINVAL;
}
/* Fall through to decrease pool */
}
/*
* Increase the pool size
* First take pages out of surplus state. Then make up the
* remaining difference by allocating fresh huge pages.
*
* We might race with alloc_surplus_huge_page() here and be unable
* to convert a surplus huge page to a normal huge page. That is
* not critical, though, it just means the overall size of the
* pool might be one hugepage larger than it needs to be, but
* within all the constraints specified by the sysctls.
*/
while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
if (!adjust_pool_surplus(h, nodes_allowed, -1))
break;
}
while (count > persistent_huge_pages(h)) {
/*
* If this allocation races such that we no longer need the
* page, free_huge_page will handle it by freeing the page
* and reducing the surplus.
*/
spin_unlock(&hugetlb_lock);
/* yield cpu to avoid soft lockup */
cond_resched();
ret = alloc_pool_huge_page(h, nodes_allowed,
node_alloc_noretry);
spin_lock(&hugetlb_lock);
if (!ret)
goto out;
/* Bail for signals. Probably ctrl-c from user */
if (signal_pending(current))
goto out;
}
/*
* Decrease the pool size
* First return free pages to the buddy allocator (being careful
* to keep enough around to satisfy reservations). Then place
* pages into surplus state as needed so the pool will shrink
* to the desired size as pages become free.
*
* By placing pages into the surplus state independent of the
* overcommit value, we are allowing the surplus pool size to
* exceed overcommit. There are few sane options here. Since
* alloc_surplus_huge_page() is checking the global counter,
* though, we'll note that we're not allowed to exceed surplus
* and won't grow the pool anywhere else. Not until one of the
* sysctls is changed, or the surplus pages go out of use.
*/
min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
min_count = max(count, min_count);
try_to_free_low(h, min_count, nodes_allowed);
while (min_count < persistent_huge_pages(h)) {
if (!free_pool_huge_page(h, nodes_allowed, 0))
break;
cond_resched_lock(&hugetlb_lock);
}
while (count < persistent_huge_pages(h)) {
if (!adjust_pool_surplus(h, nodes_allowed, 1))
break;
}
out:
h->max_huge_pages = persistent_huge_pages(h);
spin_unlock(&hugetlb_lock);
NODEMASK_FREE(node_alloc_noretry);
return 0;
}
#define HSTATE_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define HSTATE_ATTR(_name) \
static struct kobj_attribute _name##_attr = \
__ATTR(_name, 0644, _name##_show, _name##_store)
static struct kobject *hugepages_kobj;
static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
{
int i;
for (i = 0; i < HUGE_MAX_HSTATE; i++)
if (hstate_kobjs[i] == kobj) {
if (nidp)
*nidp = NUMA_NO_NODE;
return &hstates[i];
}
return kobj_to_node_hstate(kobj, nidp);
}
static ssize_t nr_hugepages_show_common(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
struct hstate *h;
unsigned long nr_huge_pages;
int nid;
h = kobj_to_hstate(kobj, &nid);
if (nid == NUMA_NO_NODE)
nr_huge_pages = h->nr_huge_pages;
else
nr_huge_pages = h->nr_huge_pages_node[nid];
return sprintf(buf, "%lu\n", nr_huge_pages);
}
static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
struct hstate *h, int nid,
unsigned long count, size_t len)
{
int err;
nodemask_t nodes_allowed, *n_mask;
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return -EINVAL;
if (nid == NUMA_NO_NODE) {
/*
* global hstate attribute
*/
if (!(obey_mempolicy &&
init_nodemask_of_mempolicy(&nodes_allowed)))
n_mask = &node_states[N_MEMORY];
else
n_mask = &nodes_allowed;
} else {
/*
* Node specific request. count adjustment happens in
* set_max_huge_pages() after acquiring hugetlb_lock.
*/
init_nodemask_of_node(&nodes_allowed, nid);
n_mask = &nodes_allowed;
}
err = set_max_huge_pages(h, count, nid, n_mask);
return err ? err : len;
}
static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
struct kobject *kobj, const char *buf,
size_t len)
{
struct hstate *h;
unsigned long count;
int nid;
int err;
err = kstrtoul(buf, 10, &count);
if (err)
return err;
h = kobj_to_hstate(kobj, &nid);
return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
}
static ssize_t nr_hugepages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return nr_hugepages_show_common(kobj, attr, buf);
}
static ssize_t nr_hugepages_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t len)
{
return nr_hugepages_store_common(false, kobj, buf, len);
}
HSTATE_ATTR(nr_hugepages);
#ifdef CONFIG_NUMA
/*
* hstate attribute for optionally mempolicy-based constraint on persistent
* huge page alloc/free.
*/
static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return nr_hugepages_show_common(kobj, attr, buf);
}
static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t len)
{
return nr_hugepages_store_common(true, kobj, buf, len);
}
HSTATE_ATTR(nr_hugepages_mempolicy);
#endif
static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
struct hstate *h = kobj_to_hstate(kobj, NULL);
return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
}
static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count)
{
int err;
unsigned long input;
struct hstate *h = kobj_to_hstate(kobj, NULL);
if (hstate_is_gigantic(h))
return -EINVAL;
err = kstrtoul(buf, 10, &input);
if (err)
return err;
spin_lock(&hugetlb_lock);
h->nr_overcommit_huge_pages = input;
spin_unlock(&hugetlb_lock);
return count;
}
HSTATE_ATTR(nr_overcommit_hugepages);
static ssize_t free_hugepages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
struct hstate *h;
unsigned long free_huge_pages;
int nid;
h = kobj_to_hstate(kobj, &nid);
if (nid == NUMA_NO_NODE)
free_huge_pages = h->free_huge_pages;
else
free_huge_pages = h->free_huge_pages_node[nid];
return sprintf(buf, "%lu\n", free_huge_pages);
}
HSTATE_ATTR_RO(free_hugepages);
static ssize_t resv_hugepages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
struct hstate *h = kobj_to_hstate(kobj, NULL);
return sprintf(buf, "%lu\n", h->resv_huge_pages);
}
HSTATE_ATTR_RO(resv_hugepages);
static ssize_t surplus_hugepages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
struct hstate *h;
unsigned long surplus_huge_pages;
int nid;
h = kobj_to_hstate(kobj, &nid);
if (nid == NUMA_NO_NODE)
surplus_huge_pages = h->surplus_huge_pages;
else
surplus_huge_pages = h->surplus_huge_pages_node[nid];
return sprintf(buf, "%lu\n", surplus_huge_pages);
}
HSTATE_ATTR_RO(surplus_hugepages);
static struct attribute *hstate_attrs[] = {
&nr_hugepages_attr.attr,
&nr_overcommit_hugepages_attr.attr,
&free_hugepages_attr.attr,
&resv_hugepages_attr.attr,
&surplus_hugepages_attr.attr,
#ifdef CONFIG_NUMA
&nr_hugepages_mempolicy_attr.attr,
#endif
NULL,
};
static const struct attribute_group hstate_attr_group = {
.attrs = hstate_attrs,
};
static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
struct kobject **hstate_kobjs,
const struct attribute_group *hstate_attr_group)
{
int retval;
int hi = hstate_index(h);
hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
if (!hstate_kobjs[hi])
return -ENOMEM;
retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
if (retval) {
kobject_put(hstate_kobjs[hi]);
hstate_kobjs[hi] = NULL;
}
return retval;
}
static void __init hugetlb_sysfs_init(void)
{
struct hstate *h;
int err;
hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
if (!hugepages_kobj)
return;
for_each_hstate(h) {
err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
hstate_kobjs, &hstate_attr_group);
if (err)
pr_err("HugeTLB: Unable to add hstate %s", h->name);
}
}
#ifdef CONFIG_NUMA
/*
* node_hstate/s - associate per node hstate attributes, via their kobjects,
* with node devices in node_devices[] using a parallel array. The array
* index of a node device or _hstate == node id.
* This is here to avoid any static dependency of the node device driver, in
* the base kernel, on the hugetlb module.
*/
struct node_hstate {
struct kobject *hugepages_kobj;
struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
};
static struct node_hstate node_hstates[MAX_NUMNODES];
/*
* A subset of global hstate attributes for node devices
*/
static struct attribute *per_node_hstate_attrs[] = {
&nr_hugepages_attr.attr,
&free_hugepages_attr.attr,
&surplus_hugepages_attr.attr,
NULL,
};
static const struct attribute_group per_node_hstate_attr_group = {
.attrs = per_node_hstate_attrs,
};
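/*
 * When registered by hugetlb_register_node(), this subset appears per
 * node, e.g.:
 *
 *   /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
 *   /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages
 *   /sys/devices/system/node/node0/hugepages/hugepages-2048kB/surplus_hugepages
 */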
/*
* kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
* Returns node id via non-NULL nidp.
*/
static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
{
int nid;
for (nid = 0; nid < nr_node_ids; nid++) {
struct node_hstate *nhs = &node_hstates[nid];
int i;
for (i = 0; i < HUGE_MAX_HSTATE; i++)
if (nhs->hstate_kobjs[i] == kobj) {
if (nidp)
*nidp = nid;
return &hstates[i];
}
}
BUG();
return NULL;
}
/*
* Unregister hstate attributes from a single node device.
* No-op if no hstate attributes attached.
*/
static void hugetlb_unregister_node(struct node *node)
{
struct hstate *h;
struct node_hstate *nhs = &node_hstates[node->dev.id];
if (!nhs->hugepages_kobj)
return; /* no hstate attributes */
for_each_hstate(h) {
int idx = hstate_index(h);
if (nhs->hstate_kobjs[idx]) {
kobject_put(nhs->hstate_kobjs[idx]);
nhs->hstate_kobjs[idx] = NULL;
}
}
kobject_put(nhs->hugepages_kobj);
nhs->hugepages_kobj = NULL;
}
/*
* Register hstate attributes for a single node device.
* No-op if attributes already registered.
*/
static void hugetlb_register_node(struct node *node)
{
struct hstate *h;
struct node_hstate *nhs = &node_hstates[node->dev.id];
int err;
if (nhs->hugepages_kobj)
return; /* already allocated */
nhs->hugepages_kobj = kobject_create_and_add("hugepages",
&node->dev.kobj);
if (!nhs->hugepages_kobj)
return;
for_each_hstate(h) {
err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
nhs->hstate_kobjs,
&per_node_hstate_attr_group);
if (err) {
pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
h->name, node->dev.id);
hugetlb_unregister_node(node);
break;
}
}
}
/*
* hugetlb init time: register hstate attributes for all registered node
* devices of nodes that have memory. All on-line nodes should have
* registered their associated device by this time.
*/
static void __init hugetlb_register_all_nodes(void)
{
int nid;
for_each_node_state(nid, N_MEMORY) {
struct node *node = node_devices[nid];
if (node->dev.id == nid)
hugetlb_register_node(node);
}
/*
* Let the node device driver know we're here so it can
* [un]register hstate attributes on node hotplug.
*/
register_hugetlbfs_with_node(hugetlb_register_node,
hugetlb_unregister_node);
}
#else /* !CONFIG_NUMA */
static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
{
BUG();
if (nidp)
*nidp = -1;
return NULL;
}
static void hugetlb_register_all_nodes(void) { }
#endif
static int __init hugetlb_init(void)
{
int i;
if (!hugepages_supported()) {
if (hugetlb_max_hstate || default_hstate_max_huge_pages)
pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
return 0;
}
/*
* Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some
* architectures depend on setup being done here.
*/
hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
if (!parsed_default_hugepagesz) {
/*
* If we did not parse a default huge page size, set
* default_hstate_idx to HPAGE_SIZE hstate. And, if the
* number of huge pages for this default size was implicitly
* specified, set that here as well.
* Note that the implicit setting will overwrite an explicit
* setting. A warning will be printed in this case.
*/
default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
if (default_hstate_max_huge_pages) {
if (default_hstate.max_huge_pages) {
char buf[32];
string_get_size(huge_page_size(&default_hstate),
1, STRING_UNITS_2, buf, 32);
pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
default_hstate.max_huge_pages, buf);
pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
default_hstate_max_huge_pages);
}
default_hstate.max_huge_pages =
default_hstate_max_huge_pages;
}
}
hugetlb_cma_check();
hugetlb_init_hstates();
gather_bootmem_prealloc();
report_hugepages();
hugetlb_sysfs_init();
hugetlb_register_all_nodes();
hugetlb_cgroup_file_init();
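/*
 * Size the fault mutex table from the number of possible CPUs and
 * round it up to a power of two so hugetlb_fault_mutex_hash() can
 * reduce the hash value with a simple mask.
 */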
#ifdef CONFIG_SMP
num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
#else
num_fault_mutexes = 1;
#endif
hugetlb_fault_mutex_table =
kmalloc_array(num_fault_mutexes, sizeof(struct mutex),
GFP_KERNEL);
BUG_ON(!hugetlb_fault_mutex_table);
for (i = 0; i < num_fault_mutexes; i++)
mutex_init(&hugetlb_fault_mutex_table[i]);
return 0;
}
subsys_initcall(hugetlb_init);
/* Overwritten by architectures with more huge page sizes */
bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
{
return size == HPAGE_SIZE;
}
void __init hugetlb_add_hstate(unsigned int order)
{
struct hstate *h;
unsigned long i;
if (size_to_hstate(PAGE_SIZE << order)) {
return;
}
BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
BUG_ON(order == 0);
h = &hstates[hugetlb_max_hstate++];
h->order = order;
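/* Mask that keeps only the huge-page-aligned bits of an address. */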
h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
h->nr_huge_pages = 0;
h->free_huge_pages = 0;
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
INIT_LIST_HEAD(&h->hugepage_activelist);
h->next_nid_to_alloc = first_memory_node;
h->next_nid_to_free = first_memory_node;
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/1024);
parsed_hstate = h;
}
/*
* hugepages command line processing
* hugepages normally follows a valid hugepagesz or default_hugepagesz
* specification. If not, ignore the hugepages value. hugepages can also
* be the first huge page command line option in which case it implicitly
* specifies the number of huge pages for the default size.
*/
static int __init hugepages_setup(char *s)
{
unsigned long *mhp;
static unsigned long *last_mhp;
if (!parsed_valid_hugepagesz) {
pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
parsed_valid_hugepagesz = true;
return 0;
}
/*
* !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter
* yet, so this hugepages= parameter goes to the "default hstate".
* Otherwise, it goes with the previously parsed hugepagesz or
* default_hugepagesz.
*/
else if (!hugetlb_max_hstate)
mhp = &default_hstate_max_huge_pages;
else
mhp = &parsed_hstate->max_huge_pages;
if (mhp == last_mhp) {
pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
return 0;
}
if (sscanf(s, "%lu", mhp) <= 0)
*mhp = 0;
/*
* Global state is always initialized later in hugetlb_init.
* But we need to allocate >= MAX_ORDER hstates here early to still
* use the bootmem allocator.
*/
if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
hugetlb_hstate_alloc_pages(parsed_hstate);
last_mhp = mhp;
return 1;
}
__setup("hugepages=", hugepages_setup);
/*
* hugepagesz command line processing
* A specific huge page size can only be specified once with hugepagesz.
* hugepagesz is followed by hugepages on the command line. The global
* variable 'parsed_valid_hugepagesz' is used to determine if prior
* hugepagesz argument was valid.
*/
static int __init hugepagesz_setup(char *s)
{
unsigned long size;
struct hstate *h;
parsed_valid_hugepagesz = false;
size = (unsigned long)memparse(s, NULL);
if (!arch_hugetlb_valid_size(size)) {
pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
return 0;
}
h = size_to_hstate(size);
if (h) {
/*
* hstate for this size already exists. This is normally
* an error, but is allowed if the existing hstate is the
* default hstate. More specifically, it is only allowed if
* the number of huge pages for the default hstate was not
* previously specified.
*/
if (!parsed_default_hugepagesz || h != &default_hstate ||
default_hstate.max_huge_pages) {
pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
return 0;
}
/*
* No need to call hugetlb_add_hstate() as hstate already
* exists. But, do set parsed_hstate so that a following
* hugepages= parameter will be applied to this hstate.
*/
parsed_hstate = h;
parsed_valid_hugepagesz = true;
return 1;
}
hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
parsed_valid_hugepagesz = true;
return 1;
}
__setup("hugepagesz=", hugepagesz_setup);
/*
* default_hugepagesz command line input
* Only one instance of default_hugepagesz allowed on command line.
*/
static int __init default_hugepagesz_setup(char *s)
{
unsigned long size;
parsed_valid_hugepagesz = false;
if (parsed_default_hugepagesz) {
pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
return 0;
}
size = (unsigned long)memparse(s, NULL);
if (!arch_hugetlb_valid_size(size)) {
pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
return 0;
}
hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
parsed_valid_hugepagesz = true;
parsed_default_hugepagesz = true;
default_hstate_idx = hstate_index(size_to_hstate(size));
/*
* The number of default huge pages (for this size) could have been
* specified as the first hugetlb parameter: hugepages=X. If so,
* then default_hstate_max_huge_pages is set. If the default huge
* page size is gigantic (>= MAX_ORDER), then the pages must be
* allocated here from bootmem allocator.
*/
if (default_hstate_max_huge_pages) {
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
if (hstate_is_gigantic(&default_hstate))
hugetlb_hstate_alloc_pages(&default_hstate);
default_hstate_max_huge_pages = 0;
}
return 1;
}
__setup("default_hugepagesz=", default_hugepagesz_setup);
static unsigned int allowed_mems_nr(struct hstate *h)
{
int node;
unsigned int nr = 0;
nodemask_t *mpol_allowed;
unsigned int *array = h->free_huge_pages_node;
gfp_t gfp_mask = htlb_alloc_mask(h);
mpol_allowed = policy_nodemask_current(gfp_mask);
for_each_node_mask(node, cpuset_current_mems_allowed) {
if (!mpol_allowed ||
(mpol_allowed && node_isset(node, *mpol_allowed)))
nr += array[node];
}
return nr;
}
#ifdef CONFIG_SYSCTL
static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write,
void *buffer, size_t *length,
loff_t *ppos, unsigned long *out)
{
struct ctl_table dup_table;
/*
* In order to avoid races with __do_proc_doulongvec_minmax(), we
* can duplicate the @table and alter the duplicate of it.
*/
dup_table = *table;
dup_table.data = out;
return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos);
}
static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
struct hstate *h = &default_hstate;
unsigned long tmp = h->max_huge_pages;
int ret;
if (!hugepages_supported())
return -EOPNOTSUPP;
ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
&tmp);
if (ret)
goto out;
if (write)
ret = __nr_hugepages_store_common(obey_mempolicy, h,
NUMA_NO_NODE, tmp, *length);
out:
return ret;
}
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
return hugetlb_sysctl_handler_common(false, table, write,
buffer, length, ppos);
}
#ifdef CONFIG_NUMA
int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
return hugetlb_sysctl_handler_common(true, table, write,
buffer, length, ppos);
}
#endif /* CONFIG_NUMA */
int hugetlb_overcommit_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
struct hstate *h = &default_hstate;
unsigned long tmp;
int ret;
if (!hugepages_supported())
return -EOPNOTSUPP;
tmp = h->nr_overcommit_huge_pages;
if (write && hstate_is_gigantic(h))
return -EINVAL;
ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
&tmp);
if (ret)
goto out;
if (write) {
spin_lock(&hugetlb_lock);
h->nr_overcommit_huge_pages = tmp;
spin_unlock(&hugetlb_lock);
}
out:
return ret;
}
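/*
 * The handlers above back the hugetlb sysctls, typically accessed as
 * /proc/sys/vm/nr_hugepages, /proc/sys/vm/nr_hugepages_mempolicy and
 * /proc/sys/vm/nr_overcommit_hugepages.
 */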
#endif /* CONFIG_SYSCTL */
void hugetlb_report_meminfo(struct seq_file *m)
{
struct hstate *h;
unsigned long total = 0;
if (!hugepages_supported())
return;
for_each_hstate(h) {
unsigned long count = h->nr_huge_pages;
total += (PAGE_SIZE << huge_page_order(h)) * count;
if (h == &default_hstate)
seq_printf(m,
"HugePages_Total: %5lu\n"
"HugePages_Free: %5lu\n"
"HugePages_Rsvd: %5lu\n"
"HugePages_Surp: %5lu\n"
"Hugepagesize: %8lu kB\n",
count,
h->free_huge_pages,
h->resv_huge_pages,
h->surplus_huge_pages,
(PAGE_SIZE << huge_page_order(h)) / 1024);
}
seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024);
}
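/*
 * Example /proc/meminfo output from hugetlb_report_meminfo() above,
 * for a single 2 MB default hstate (values illustrative):
 *
 *   HugePages_Total:      16
 *   HugePages_Free:       10
 *   HugePages_Rsvd:        2
 *   HugePages_Surp:        0
 *   Hugepagesize:       2048 kB
 *   Hugetlb:           32768 kB
 */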
int hugetlb_report_node_meminfo(char *buf, int len, int nid)
{
struct hstate *h = &default_hstate;
if (!hugepages_supported())
return 0;
return sysfs_emit_at(buf, len,
"Node %d HugePages_Total: %5u\n"
"Node %d HugePages_Free: %5u\n"
"Node %d HugePages_Surp: %5u\n",
nid, h->nr_huge_pages_node[nid],
nid, h->free_huge_pages_node[nid],
nid, h->surplus_huge_pages_node[nid]);
}
void hugetlb_show_meminfo(void)
{
struct hstate *h;
int nid;
if (!hugepages_supported())
return;
for_each_node_state(nid, N_MEMORY)
for_each_hstate(h)
pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
nid,
h->nr_huge_pages_node[nid],
h->free_huge_pages_node[nid],
h->surplus_huge_pages_node[nid],
1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
}
void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
{
seq_printf(m, "HugetlbPages:\t%8lu kB\n",
atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
}
/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
struct hstate *h;
unsigned long nr_total_pages = 0;
for_each_hstate(h)
nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
return nr_total_pages;
}
static int hugetlb_acct_memory(struct hstate *h, long delta)
{
int ret = -ENOMEM;
spin_lock(&hugetlb_lock);
/*
* When cpuset is configured, it breaks the strict hugetlb page
* reservation as the accounting is done on a global variable. Such
* reservation is completely rubbish in the presence of cpuset because
* the reservation is not checked against page availability for the
* current cpuset. The application can still be OOM'ed by the kernel
* if there is a lack of free hugetlb pages in the cpuset that the
* task is in. Attempting to enforce strict accounting with cpuset
* is almost impossible (or too ugly) because cpusets are so fluid
* that tasks or memory nodes can be dynamically moved between them.
*
* The change of semantics for shared hugetlb mapping with cpuset is
* undesirable. However, in order to preserve some of the semantics,
* we fall back to check against current free page availability as
* a best attempt and hopefully to minimize the impact of changing
* semantics that cpuset has.
*
* Apart from cpusets, the memory policy mechanism also determines
* from which node the kernel will allocate memory in a NUMA system.
* So, similar to cpusets, we should also consider the memory policy
* of the current task, as described above.
*/
if (delta > 0) {
if (gather_surplus_pages(h, delta) < 0)
goto out;
if (delta > allowed_mems_nr(h)) {
return_unused_surplus_pages(h, delta);
goto out;
}
}
ret = 0;
if (delta < 0)
return_unused_surplus_pages(h, (unsigned long) -delta);
out:
spin_unlock(&hugetlb_lock);
return ret;
}
static void hugetlb_vm_op_open(struct vm_area_struct *vma)
{
struct resv_map *resv = vma_resv_map(vma);
/*
* This new VMA should share its siblings reservation map if present.
* The VMA will only ever have a valid reservation map pointer where
* it is being copied for another still existing VMA. As that VMA
* has a reference to the reservation map it cannot disappear until
* after this open call completes. It is therefore safe to take a
* new reference here without additional locking.
*/
if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
kref_get(&resv->refs);
}
}
static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
struct hstate *h = hstate_vma(vma);
struct resv_map *resv = vma_resv_map(vma);
struct hugepage_subpool *spool = subpool_vma(vma);
unsigned long reserve, start, end;
long gbl_reserve;
if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
return;
start = vma_hugecache_offset(h, vma, vma->vm_start);
end = vma_hugecache_offset(h, vma, vma->vm_end);
reserve = (end - start) - region_count(resv, start, end);
hugetlb_cgroup_uncharge_counter(resv, start, end);
if (reserve) {
/*
* Decrement reserve counts. The global reserve count may be
* adjusted if the subpool has a minimum size.
*/
gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
hugetlb_acct_memory(h, -gbl_reserve);
}
kref_put(&resv->refs, resv_map_release);
}
static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
{
if (addr & ~(huge_page_mask(hstate_vma(vma))))
return -EINVAL;
return 0;
}
static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
{
struct hstate *hstate = hstate_vma(vma);
return 1UL << huge_page_shift(hstate);
}
/*
* We cannot handle pagefaults against hugetlb pages at all. They cause
* handle_mm_fault() to try to instantiate regular-sized pages in the
* hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get
* this far.
*/
static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
{
BUG();
return 0;
}
/*
* When a new function is introduced to vm_operations_struct and added
* to hugetlb_vm_ops, please consider adding the function to shm_vm_ops.
* This is because under System V memory model, mappings created via
* shmget/shmat with "huge page" specified are backed by hugetlbfs files,
* their original vm_ops are overwritten with shm_vm_ops.
*/
const struct vm_operations_struct hugetlb_vm_ops = {
.fault = hugetlb_vm_op_fault,
.open = hugetlb_vm_op_open,
.close = hugetlb_vm_op_close,
.split = hugetlb_vm_op_split,
.pagesize = hugetlb_vm_op_pagesize,
};
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
int writable)
{
pte_t entry;
if (writable) {
entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
vma->vm_page_prot)));
} else {
entry = huge_pte_wrprotect(mk_huge_pte(page,
vma->vm_page_prot));
}
entry = pte_mkyoung(entry);
entry = pte_mkhuge(entry);
entry = arch_make_huge_pte(entry, vma, page, writable);
return entry;
}
static void set_huge_ptep_writable(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
pte_t entry;
entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
update_mmu_cache(vma, address, ptep);
}
bool is_hugetlb_entry_migration(pte_t pte)
{
swp_entry_t swp;
if (huge_pte_none(pte) || pte_present(pte))
return false;
swp = pte_to_swp_entry(pte);
if (is_migration_entry(swp))
return true;
else
return false;
}
static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
{
swp_entry_t swp;
if (huge_pte_none(pte) || pte_present(pte))
return false;
swp = pte_to_swp_entry(pte);
if (is_hwpoison_entry(swp))
return true;
else
return false;
}
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
{
pte_t *src_pte, *dst_pte, entry, dst_entry;
struct page *ptepage;
unsigned long addr;
int cow;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
struct address_space *mapping = vma->vm_file->f_mapping;
struct mmu_notifier_range range;
int ret = 0;
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
if (cow) {
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
vma->vm_start,
vma->vm_end);
mmu_notifier_invalidate_range_start(&range);
} else {
/*
* For shared mappings i_mmap_rwsem must be held to call
* huge_pte_alloc, otherwise the returned ptep could go
* away if part of a shared pmd and another thread calls
* huge_pmd_unshare.
*/
i_mmap_lock_read(mapping);
}
for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
src_pte = huge_pte_offset(src, addr, sz);
if (!src_pte)
continue;
dst_pte = huge_pte_alloc(dst, vma, addr, sz);
if (!dst_pte) {
ret = -ENOMEM;
break;
}
/*
* If the pagetables are shared don't copy or take references.
* dst_pte == src_pte is the common case of src/dest sharing.
*
* However, src could have 'unshared' and dst shares with
* another vma. If dst_pte !none, this implies sharing.
* Check here before taking page table lock, and once again
* after taking the lock below.
*/
dst_entry = huge_ptep_get(dst_pte);
if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
continue;
dst_ptl = huge_pte_lock(h, dst, dst_pte);
src_ptl = huge_pte_lockptr(h, src, src_pte);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
dst_entry = huge_ptep_get(dst_pte);
if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
/*
* Skip if src entry none. Also, skip in the
* unlikely case dst entry !none as this implies
* sharing with another vma.
*/
;
} else if (unlikely(is_hugetlb_entry_migration(entry) ||
is_hugetlb_entry_hwpoisoned(entry))) {
swp_entry_t swp_entry = pte_to_swp_entry(entry);
if (is_write_migration_entry(swp_entry) && cow) {
/*
* COW mappings require pages in both
* parent and child to be set to read.
*/
make_migration_entry_read(&swp_entry);
entry = swp_entry_to_pte(swp_entry);
set_huge_swap_pte_at(src, addr, src_pte,
entry, sz);
}
set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
} else {
if (cow) {
/*
* No need to notify as we are downgrading page
* table protection not changing it to point
* to a new page.
*
* See Documentation/vm/mmu_notifier.rst
*/
huge_ptep_set_wrprotect(src, addr, src_pte);
}
entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
get_page(ptepage);
page_dup_rmap(ptepage, true);
set_huge_pte_at(dst, addr, dst_pte, entry);
hugetlb_count_add(pages_per_huge_page(h), dst);
}
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
}
if (cow)
mmu_notifier_invalidate_range_end(&range);
else
i_mmap_unlock_read(mapping);
return ret;
}
void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
struct page *ref_page)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
pte_t *ptep;
pte_t pte;
spinlock_t *ptl;
struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
struct mmu_notifier_range range;
bool force_flush = false;
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
BUG_ON(end & ~huge_page_mask(h));
/*
* This is a hugetlb vma; all the pte entries should point
* to huge pages.
*/
tlb_change_page_size(tlb, sz);
tlb_start_vma(tlb, vma);
/*
* If sharing possible, alert mmu notifiers of worst case.
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
mmu_notifier_invalidate_range_start(&range);
address = start;
for (; address < end; address += sz) {
ptep = huge_pte_offset(mm, address, sz);
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
if (huge_pmd_unshare(mm, vma, &address, ptep)) {
spin_unlock(ptl);
tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
force_flush = true;
continue;
}
pte = huge_ptep_get(ptep);
if (huge_pte_none(pte)) {
spin_unlock(ptl);
continue;
}
/*
* Migrating hugepage or HWPoisoned hugepage is already
* unmapped and its refcount is dropped, so just clear pte here.
*/
if (unlikely(!pte_present(pte))) {
huge_pte_clear(mm, address, ptep, sz);
spin_unlock(ptl);
continue;
}
page = pte_page(pte);
/*
* If a reference page is supplied, it is because a specific
* page is being unmapped, not a range. Ensure the page we
* are about to unmap is the actual page of interest.
*/
if (ref_page) {
if (page != ref_page) {
spin_unlock(ptl);
continue;
}
/*
* Mark the VMA as having unmapped its page so that
* future faults in this VMA will fail rather than
* looking like data was lost
*/
set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
}
pte = huge_ptep_get_and_clear(mm, address, ptep);
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
set_page_dirty(page);
hugetlb_count_sub(pages_per_huge_page(h), mm);
page_remove_rmap(page, true);
spin_unlock(ptl);
tlb_remove_page_size(tlb, page, huge_page_size(h));
/*
* Bail out after unmapping reference page if supplied
*/
if (ref_page)
break;
}
mmu_notifier_invalidate_range_end(&range);
tlb_end_vma(tlb, vma);
/*
* If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We
* could defer the flush until now, since by holding i_mmap_rwsem we
* guaranteed that the last reference would not be dropped. But we must
* do the flushing before we return, as otherwise i_mmap_rwsem will be
* dropped and the last reference to the shared PMDs page might be
* dropped as well.
*
* In theory we could defer the freeing of the PMD pages as well, but
* huge_pmd_unshare() relies on the exact page_count for the PMD page to
* detect sharing, so we cannot defer the release of the page either.
* Instead, do flush now.
*/
if (force_flush)
tlb_flush_mmu_tlbonly(tlb);
}
void __unmap_hugepage_range_final(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start,
unsigned long end, struct page *ref_page)
{
__unmap_hugepage_range(tlb, vma, start, end, ref_page);
/*
* Clear this flag so that x86's huge_pmd_share page_table_shareable
* test will fail on a vma being torn down, and not grab a page table
* on its way out. We're lucky that the flag has such an appropriate
* name, and can in fact be safely cleared here. We could clear it
* before the __unmap_hugepage_range above, but all that's necessary
* is to clear it before releasing the i_mmap_rwsem. This works
* because in the context this is called, the VMA is about to be
* destroyed and the i_mmap_rwsem is held.
*/
vma->vm_flags &= ~VM_MAYSHARE;
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end, struct page *ref_page)
{
struct mm_struct *mm;
struct mmu_gather tlb;
unsigned long tlb_start = start;
unsigned long tlb_end = end;
/*
* If shared PMDs were possibly used within this vma range, adjust
* start/end for worst case tlb flushing.
* Note that we can not be sure if PMDs are shared until we try to
* unmap pages. However, we want to make sure TLB flushing covers
* the largest possible range.
*/
adjust_range_if_pmd_sharing_possible(vma, &tlb_start, &tlb_end);
mm = vma->vm_mm;
tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end);
__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
tlb_finish_mmu(&tlb, tlb_start, tlb_end);
}
/*
* This is called when the original mapper is failing to COW a MAP_PRIVATE
* mapping it owns the reserve page for. The intention is to unmap the page
* from other VMAs and let the children be SIGKILLed if they are faulting the
* same region.
*/
static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *page, unsigned long address)
{
struct hstate *h = hstate_vma(vma);
struct vm_area_struct *iter_vma;
struct address_space *mapping;
pgoff_t pgoff;
/*
* vm_pgoff is in PAGE_SIZE units, hence the different calculation
* from page cache lookup which is in HPAGE_SIZE units.
*/
address = address & huge_page_mask(h);
pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
vma->vm_pgoff;
mapping = vma->vm_file->f_mapping;
/*
* Take the mapping lock for the duration of the table walk. As
* this mapping should be shared between all the VMAs,
* __unmap_hugepage_range() is called as the lock is already held
*/
i_mmap_lock_write(mapping);
vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
/* Do not unmap the current VMA */
if (iter_vma == vma)
continue;
/*
* Shared VMAs have their own reserves and do not affect
* MAP_PRIVATE accounting but it is possible that a shared
* VMA is using the same page so check and skip such VMAs.
*/
if (iter_vma->vm_flags & VM_MAYSHARE)
continue;
/*
* Unmap the page from other VMAs without their own reserves.
* They get marked to be SIGKILLed if they fault in these
* areas. This is because a future no-page fault on this VMA
* could insert a zeroed page instead of the data existing
* from the time of fork. This would look like data corruption.
*/
if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
unmap_hugepage_range(iter_vma, address,
address + huge_page_size(h), page);
}
i_mmap_unlock_write(mapping);
}
/*
* Hugetlb_cow() should be called with page lock of the original hugepage held.
* Called with hugetlb_instantiation_mutex held and pte_page locked so we
* cannot race with other handlers or page migration.
* Keep the pte_same checks anyway to make transition from the mutex easier.
*/
static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
struct page *pagecache_page, spinlock_t *ptl)
{
pte_t pte;
struct hstate *h = hstate_vma(vma);
struct page *old_page, *new_page;
int outside_reserve = 0;
vm_fault_t ret = 0;
unsigned long haddr = address & huge_page_mask(h);
struct mmu_notifier_range range;
pte = huge_ptep_get(ptep);
old_page = pte_page(pte);
retry_avoidcopy:
/* If no-one else is actually using this page, avoid the copy
* and just make the page writable */
if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
page_move_anon_rmap(old_page, vma);
set_huge_ptep_writable(vma, haddr, ptep);
return 0;
}
/*
* If the process that created a MAP_PRIVATE mapping is about to
* perform a COW due to a shared page count, attempt to satisfy
* the allocation without using the existing reserves. The pagecache
* page is used to determine if the reserve at this address was
* consumed or not. If reserves were used, a partial faulted mapping
* at the time of fork() could consume its reserves on COW instead
* of the full address range.
*/
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
old_page != pagecache_page)
outside_reserve = 1;
get_page(old_page);
/*
* Drop page table lock as buddy allocator may be called. It will
* be acquired again before returning to the caller, as expected.
*/
spin_unlock(ptl);
new_page = alloc_huge_page(vma, haddr, outside_reserve);
if (IS_ERR(new_page)) {
/*
* If a process owning a MAP_PRIVATE mapping fails to COW,
* it is due to references held by a child and an insufficient
* huge page pool. To guarantee the original mappers
* reliability, unmap the page from child processes. The child
* may get SIGKILLed if it later faults.
*/
if (outside_reserve) {
struct address_space *mapping = vma->vm_file->f_mapping;
pgoff_t idx;
u32 hash;
put_page(old_page);
BUG_ON(huge_pte_none(pte));
/*
* Drop hugetlb_fault_mutex and i_mmap_rwsem before
* unmapping. unmapping needs to hold i_mmap_rwsem
* in write mode. Dropping i_mmap_rwsem in read mode
* here is OK as COW mappings do not interact with
* PMD sharing.
*
* Reacquire both after unmap operation.
*/
idx = vma_hugecache_offset(h, vma, haddr);
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
i_mmap_unlock_read(mapping);
unmap_ref_private(mm, vma, old_page, haddr);
i_mmap_lock_read(mapping);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
spin_lock(ptl);
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (likely(ptep &&
pte_same(huge_ptep_get(ptep), pte)))
goto retry_avoidcopy;
/*
* A race occurred while re-acquiring the page
* table lock, and our job is done.
*/
return 0;
}
ret = vmf_error(PTR_ERR(new_page));
goto out_release_old;
}
/*
* When the original hugepage is shared one, it does not have
* anon_vma prepared.
*/
if (unlikely(anon_vma_prepare(vma))) {
ret = VM_FAULT_OOM;
goto out_release_all;
}
copy_user_huge_page(new_page, old_page, address, vma,
pages_per_huge_page(h));
__SetPageUptodate(new_page);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
haddr + huge_page_size(h));
mmu_notifier_invalidate_range_start(&range);
/*
* Retake the page table lock to check for racing updates
* before the page tables are altered
*/
spin_lock(ptl);
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
ClearPagePrivate(new_page);
/* Break COW */
huge_ptep_clear_flush(vma, haddr, ptep);
mmu_notifier_invalidate_range(mm, range.start, range.end);
set_huge_pte_at(mm, haddr, ptep,
make_huge_pte(vma, new_page, 1));
page_remove_rmap(old_page, true);
hugepage_add_new_anon_rmap(new_page, vma, haddr);
set_page_huge_active(new_page);
/* Make the old page be freed below */
new_page = old_page;
}
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(&range);
out_release_all:
restore_reserve_on_error(h, vma, haddr, new_page);
put_page(new_page);
out_release_old:
put_page(old_page);
spin_lock(ptl); /* Caller expects lock to be held */
return ret;
}
/* Return the pagecache page at a given address within a VMA */
static struct page *hugetlbfs_pagecache_page(struct hstate *h,
struct vm_area_struct *vma, unsigned long address)
{
struct address_space *mapping;
pgoff_t idx;
mapping = vma->vm_file->f_mapping;
idx = vma_hugecache_offset(h, vma, address);
return find_lock_page(mapping, idx);
}
/*
* Return whether there is a pagecache page to back given address within VMA.
* Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
*/
static bool hugetlbfs_pagecache_present(struct hstate *h,
struct vm_area_struct *vma, unsigned long address)
{
struct address_space *mapping;
pgoff_t idx;
struct page *page;
mapping = vma->vm_file->f_mapping;
idx = vma_hugecache_offset(h, vma, address);
page = find_get_page(mapping, idx);
if (page)
put_page(page);
return page != NULL;
}
int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
pgoff_t idx)
{
struct inode *inode = mapping->host;
struct hstate *h = hstate_inode(inode);
int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
if (err)
return err;
ClearPagePrivate(page);
/*
* set page dirty so that it will not be removed from cache/file
* by non-hugetlbfs specific code paths.
*/
set_page_dirty(page);
spin_lock(&inode->i_lock);
inode->i_blocks += blocks_per_huge_page(h);
spin_unlock(&inode->i_lock);
return 0;
}
static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
struct address_space *mapping,
pgoff_t idx,
unsigned int flags,
unsigned long haddr,
unsigned long reason)
{
u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
struct vm_fault vmf = {
.vma = vma,
.address = haddr,
.flags = flags,
/*
* Hard to debug if it ends up being
* used by a callee that assumes
* something about the other
* uninitialized fields... same as in
* memory.c
*/
};
/*
* vma_lock and hugetlb_fault_mutex must be dropped
* before handling userfault. Also mmap_lock will
* be dropped while handling the userfault, so any vma
* operation must be careful from here on.
*/
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
i_mmap_unlock_read(mapping);
return handle_userfault(&vmf, VM_UFFD_MISSING);
}
static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
struct vm_area_struct *vma,
struct address_space *mapping, pgoff_t idx,
unsigned long address, pte_t *ptep, unsigned int flags)
{
struct hstate *h = hstate_vma(vma);
vm_fault_t ret = VM_FAULT_SIGBUS;
int anon_rmap = 0;
unsigned long size;
struct page *page;
pte_t new_pte;
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
bool new_page = false;
u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
/*
* Currently, we are forced to kill the process in the event the
* original mapper has unmapped pages from the child due to a failed
* COW. Warn that such a situation has occurred as it may not be obvious
*/
if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
current->pid);
goto out;
}
/*
* We can not race with truncation due to holding i_mmap_rwsem.
* i_size is modified when holding i_mmap_rwsem, so check here
* once for faults beyond end of file.
*/
size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
goto out;
retry:
page = find_lock_page(mapping, idx);
if (!page) {
/* Check for page in userfault range */
if (userfaultfd_missing(vma)) {
ret = hugetlb_handle_userfault(vma, mapping, idx,
flags, haddr,
VM_UFFD_MISSING);
goto out;
}
page = alloc_huge_page(vma, haddr, 0);
if (IS_ERR(page)) {
/*
* Returning an error will result in the faulting task being
* sent SIGBUS. The hugetlb fault mutex prevents two
* tasks from racing to fault in the same page, which
* could result in false unable to allocate errors.
* Page migration does not take the fault mutex, but
* does a clear then write of pte's under page table
* lock. Page fault code could race with migration,
* notice the clear pte and try to allocate a page
* here. Before returning error, get ptl and make
* sure there really is no pte entry.
*/
ptl = huge_pte_lock(h, mm, ptep);
if (!huge_pte_none(huge_ptep_get(ptep))) {
ret = 0;
spin_unlock(ptl);
goto out;
}
spin_unlock(ptl);
ret = vmf_error(PTR_ERR(page));
goto out;
}
clear_huge_page(page, address, pages_per_huge_page(h));
__SetPageUptodate(page);
new_page = true;
if (vma->vm_flags & VM_MAYSHARE) {
int err = huge_add_to_page_cache(page, mapping, idx);
if (err) {
put_page(page);
if (err == -EEXIST)
goto retry;
goto out;
}
} else {
lock_page(page);
if (unlikely(anon_vma_prepare(vma))) {
ret = VM_FAULT_OOM;
goto backout_unlocked;
}
anon_rmap = 1;
}
} else {
/*
* If a memory error occurs between mmap() and fault, some processes
* don't have a hwpoisoned swap entry for the errored virtual address.
* So we need to block hugepage fault by PG_hwpoison bit check.
*/
if (unlikely(PageHWPoison(page))) {
ret = VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
goto backout_unlocked;
}
/* Check for page in userfault range. */
if (userfaultfd_minor(vma)) {
unlock_page(page);
put_page(page);
ret = hugetlb_handle_userfault(vma, mapping, idx,
flags, haddr,
VM_UFFD_MINOR);
goto out;
}
}
/*
* If we are going to COW a private mapping later, we examine the
* pending reservations for this page now. This will ensure that
* any allocations necessary to record that reservation occur outside
* the spinlock.
*/
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
if (vma_needs_reservation(h, vma, haddr) < 0) {
ret = VM_FAULT_OOM;
goto backout_unlocked;
}
/* Just decrements count, does not deallocate */
vma_end_reservation(h, vma, haddr);
}
ptl = huge_pte_lock(h, mm, ptep);
ret = 0;
if (!huge_pte_none(huge_ptep_get(ptep)))
goto backout;
if (anon_rmap) {
ClearPagePrivate(page);
hugepage_add_new_anon_rmap(page, vma, haddr);
} else
page_dup_rmap(page, true);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
set_huge_pte_at(mm, haddr, ptep, new_pte);
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
}
spin_unlock(ptl);
/*
* Only make newly allocated pages active. Existing pages found
* in the pagecache could be !page_huge_active() if they have been
* isolated for migration.
*/
if (new_page)
set_page_huge_active(page);
unlock_page(page);
out:
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
i_mmap_unlock_read(mapping);
return ret;
backout:
spin_unlock(ptl);
backout_unlocked:
unlock_page(page);
restore_reserve_on_error(h, vma, haddr, page);
put_page(page);
goto out;
}
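/*
 * hugetlb_fault_mutex_hash - pick a fault serialization mutex for a
 * (mapping, index) pair. The jhash2 result is reduced with a mask,
 * which relies on num_fault_mutexes being a power of two (see
 * hugetlb_init()).
 */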
#ifdef CONFIG_SMP
u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
{
unsigned long key[2];
u32 hash;
key[0] = (unsigned long) mapping;
key[1] = idx;
hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0);
return hash & (num_fault_mutexes - 1);
}
#else
/*
* For uniprocessor systems we always use a single mutex, so just
* return 0 and avoid the hashing overhead.
*/
u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
{
return 0;
}
#endif
vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
pte_t *ptep, entry;
spinlock_t *ptl;
vm_fault_t ret;
u32 hash;
pgoff_t idx;
struct page *page = NULL;
struct page *pagecache_page = NULL;
struct hstate *h = hstate_vma(vma);
struct address_space *mapping;
int need_wait_lock = 0;
unsigned long haddr = address & huge_page_mask(h);
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (ptep) {
/*
* Since we hold no locks, ptep could be stale. That is
* OK as we are only making decisions based on content and
* not actually modifying content here.
*/
entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_migration(entry))) {
migration_entry_wait_huge(vma, mm, ptep);
return 0;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
}
/*
* Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
* until finished with ptep. This serves two purposes:
* 1) It prevents huge_pmd_unshare from being called elsewhere
* and making the ptep no longer valid.
* 2) It synchronizes us with i_size modifications during truncation.
*
* ptep could have already been assigned via huge_pte_offset. That
* is OK, as huge_pte_alloc will return the same value unless
* something has changed.
*/
mapping = vma->vm_file->f_mapping;
i_mmap_lock_read(mapping);
ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
if (!ptep) {
i_mmap_unlock_read(mapping);
return VM_FAULT_OOM;
}
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
idx = vma_hugecache_offset(h, vma, haddr);
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
entry = huge_ptep_get(ptep);
if (huge_pte_none(entry))
/*
* hugetlb_no_page will drop the vma lock and the hugetlb fault
* mutex internally, which makes us return immediately.
*/
return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
ret = 0;
/*
* entry could be a migration/hwpoison entry at this point, so this
* check prevents the kernel from going below assuming that we have
* an active hugepage in pagecache. This goto expects the 2nd page
* fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
* properly handle it.
*/
if (!pte_present(entry))
goto out_mutex;
/*
* If we are going to COW the mapping later, we examine the pending
* reservations for this page now. This will ensure that any
* allocations necessary to record that reservation occur outside the
* spinlock. For private mappings, we also lookup the pagecache
* page now as it is used to determine if a reservation has been
* consumed.
*/
if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
if (vma_needs_reservation(h, vma, haddr) < 0) {
ret = VM_FAULT_OOM;
goto out_mutex;
}
/* Just decrements count, does not deallocate */
vma_end_reservation(h, vma, haddr);
if (!(vma->vm_flags & VM_MAYSHARE))
pagecache_page = hugetlbfs_pagecache_page(h,
vma, haddr);
}
ptl = huge_pte_lock(h, mm, ptep);
/* Check for a racing update before calling hugetlb_cow */
if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
goto out_ptl;
/*
* hugetlb_cow() requires page locks of pte_page(entry) and
* pagecache_page, so here we need take the former one
* when page != pagecache_page or !pagecache_page.
*/
page = pte_page(entry);
if (page != pagecache_page)
if (!trylock_page(page)) {
need_wait_lock = 1;
goto out_ptl;
}
get_page(page);
if (flags & FAULT_FLAG_WRITE) {
if (!huge_pte_write(entry)) {
ret = hugetlb_cow(mm, vma, address, ptep,
pagecache_page, ptl);
goto out_put_page;
}
entry = huge_pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
flags & FAULT_FLAG_WRITE))
update_mmu_cache(vma, haddr, ptep);
out_put_page:
if (page != pagecache_page)
unlock_page(page);
put_page(page);
out_ptl:
spin_unlock(ptl);
if (pagecache_page) {
unlock_page(pagecache_page);
put_page(pagecache_page);
}
out_mutex:
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
i_mmap_unlock_read(mapping);
/*
* Generally it's safe to hold a refcount while waiting for the page lock.
* But here we only wait to defer the next page fault and avoid a busy
* loop; the page is not used after it is unlocked before returning from
* the current page fault. So we are safe from accessing a freed page,
* even though we wait here without taking a refcount.
*/
if (need_wait_lock)
wait_on_page_locked(page);
return ret;
}
#ifdef CONFIG_USERFAULTFD
/*
* Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with
* modifications for huge pages.
*/
int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
pte_t *dst_pte,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
enum mcopy_atomic_mode mode,
struct page **pagep)
{
bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
struct address_space *mapping;
pgoff_t idx;
unsigned long size;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
struct hstate *h = hstate_vma(dst_vma);
pte_t _dst_pte;
spinlock_t *ptl;
int ret;
struct page *page;
int writable;
mapping = dst_vma->vm_file->f_mapping;
idx = vma_hugecache_offset(h, dst_vma, dst_addr);
if (is_continue) {
ret = -EFAULT;
page = find_lock_page(mapping, idx);
if (!page)
goto out;
} else if (!*pagep) {
/* If a page already exists, then it's UFFDIO_COPY for
* a non-missing case. Return -EEXIST.
*/
if (vm_shared &&
hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
ret = -EEXIST;
goto out;
}
page = alloc_huge_page(dst_vma, dst_addr, 0);
if (IS_ERR(page)) {
ret = -ENOMEM;
goto out;
}
ret = copy_huge_page_from_user(page,
(const void __user *) src_addr,
pages_per_huge_page(h), false);
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
ret = -ENOENT;
*pagep = page;
/* don't free the page */
goto out;
}
} else {
page = *pagep;
*pagep = NULL;
}
/*
* The memory barrier inside __SetPageUptodate makes sure that
* preceding stores to the page contents become visible before
* the set_pte_at() write.
*/
__SetPageUptodate(page);
/* Add shared, newly allocated pages to the page cache. */
if (vm_shared && !is_continue) {
size = i_size_read(mapping->host) >> huge_page_shift(h);
ret = -EFAULT;
if (idx >= size)
goto out_release_nounlock;
/*
* Serialization between remove_inode_hugepages() and
* huge_add_to_page_cache() below happens through the
* hugetlb_fault_mutex_table that here must be held by
* the caller.
*/
ret = huge_add_to_page_cache(page, mapping, idx);
if (ret)
goto out_release_nounlock;
}
ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
spin_lock(ptl);
/*
* Recheck the i_size after holding PT lock to make sure not
* to leave any page mapped (as page_mapped()) beyond the end
* of the i_size (remove_inode_hugepages() is strict about
* enforcing that). If we bail out here, we'll also leave a
* page in the radix tree in the vm_shared case beyond the end
* of the i_size, but remove_inode_hugepages() will take care
* of it as soon as we drop the hugetlb_fault_mutex_table.
*/
size = i_size_read(mapping->host) >> huge_page_shift(h);
ret = -EFAULT;
if (idx >= size)
goto out_release_unlock;
ret = -EEXIST;
if (!huge_pte_none(huge_ptep_get(dst_pte)))
goto out_release_unlock;
if (vm_shared) {
page_dup_rmap(page, true);
} else {
ClearPagePrivate(page);
hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
}
/* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
if (is_continue && !vm_shared)
writable = 0;
else
writable = dst_vma->vm_flags & VM_WRITE;
_dst_pte = make_huge_pte(dst_vma, page, writable);
if (writable)
_dst_pte = huge_pte_mkdirty(_dst_pte);
_dst_pte = pte_mkyoung(_dst_pte);
set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
(void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
dst_vma->vm_flags & VM_WRITE);
hugetlb_count_add(pages_per_huge_page(h), dst_mm);
/* No need to invalidate - it was non-present before */
update_mmu_cache(dst_vma, dst_addr, dst_pte);
spin_unlock(ptl);
if (!is_continue)
set_page_huge_active(page);
if (vm_shared || is_continue)
unlock_page(page);
ret = 0;
out:
return ret;
out_release_unlock:
spin_unlock(ptl);
if (vm_shared || is_continue)
unlock_page(page);
out_release_nounlock:
put_page(page);
goto out;
}
#endif /* CONFIG_USERFAULTFD */
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, unsigned long *nr_pages,
long i, unsigned int flags, int *locked)
{
unsigned long pfn_offset;
unsigned long vaddr = *position;
unsigned long remainder = *nr_pages;
struct hstate *h = hstate_vma(vma);
int err = -EFAULT;
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;
spinlock_t *ptl = NULL;
int absent;
struct page *page;
/*
* If we have a pending SIGKILL, don't keep faulting pages and
* potentially allocating memory.
*/
if (fatal_signal_pending(current)) {
remainder = 0;
break;
}
/*
* Some archs (sparc64, sh*) have multiple pte_t entries for
* each hugepage. We have to make sure we get the
* first one, for the page indexing below to work.
*
* Note that page table lock is not held when pte is null.
*/
pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
huge_page_size(h));
if (pte)
ptl = huge_pte_lock(h, mm, pte);
absent = !pte || huge_pte_none(huge_ptep_get(pte));
/*
* When coredumping, it suits get_dump_page if we just return
* an error where there's an empty slot with no huge pagecache
* to back it. This way, we avoid allocating a hugepage, and
* the sparse dumpfile avoids allocating disk blocks, but its
* huge holes still show up with zeroes where they need to be.
*/
if (absent && (flags & FOLL_DUMP) &&
!hugetlbfs_pagecache_present(h, vma, vaddr)) {
if (pte)
spin_unlock(ptl);
remainder = 0;
break;
}
/*
* We need to call hugetlb_fault for both hugepages under migration
* (in which case hugetlb_fault waits for the migration) and
* hwpoisoned hugepages (in which case we need to prevent the
* caller from accessing them). To do this, we use is_swap_pte
* here instead of is_hugetlb_entry_migration and
* is_hugetlb_entry_hwpoisoned, because it simply covers
* both cases, and because we can't follow correct pages
* directly from any kind of swap entries.
*/
if (absent || is_swap_pte(huge_ptep_get(pte)) ||
((flags & FOLL_WRITE) &&
!huge_pte_write(huge_ptep_get(pte)))) {
vm_fault_t ret;
unsigned int fault_flags = 0;
if (pte)
spin_unlock(ptl);
if (flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
if (locked)
fault_flags |= FAULT_FLAG_ALLOW_RETRY |
FAULT_FLAG_KILLABLE;
if (flags & FOLL_NOWAIT)
fault_flags |= FAULT_FLAG_ALLOW_RETRY |
FAULT_FLAG_RETRY_NOWAIT;
if (flags & FOLL_TRIED) {
/*
* Note: FAULT_FLAG_ALLOW_RETRY and
* FAULT_FLAG_TRIED can co-exist
*/
fault_flags |= FAULT_FLAG_TRIED;
}
ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
if (ret & VM_FAULT_ERROR) {
err = vm_fault_to_errno(ret, flags);
remainder = 0;
break;
}
if (ret & VM_FAULT_RETRY) {
if (locked &&
!(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
*locked = 0;
*nr_pages = 0;
/*
* VM_FAULT_RETRY must not return an
* error, it will return zero
* instead.
*
* No need to update "position" as the
* caller will not check it after
* *nr_pages is set to 0.
*/
return i;
}
continue;
}
pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
page = pte_page(huge_ptep_get(pte));
/*
* If subpage information not requested, update counters
* and skip the same_page loop below.
*/
if (!pages && !vmas && !pfn_offset &&
(vaddr + huge_page_size(h) < vma->vm_end) &&
(remainder >= pages_per_huge_page(h))) {
vaddr += huge_page_size(h);
remainder -= pages_per_huge_page(h);
i += pages_per_huge_page(h);
spin_unlock(ptl);
continue;
}
same_page:
if (pages) {
pages[i] = mem_map_offset(page, pfn_offset);
/*
* try_grab_page() should always succeed here, because:
* a) we hold the ptl lock, and b) we've just checked
* that the huge page is present in the page tables. If
* the huge page is present, then the tail pages must
* also be present. The ptl prevents the head page and
* tail pages from being rearranged in any way. So this
* page must be available at this point, unless the page
* refcount overflowed:
*/
if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) {
spin_unlock(ptl);
remainder = 0;
err = -ENOMEM;
break;
}
}
if (vmas)
vmas[i] = vma;
vaddr += PAGE_SIZE;
++pfn_offset;
--remainder;
++i;
if (vaddr < vma->vm_end && remainder &&
pfn_offset < pages_per_huge_page(h)) {
/*
* We use pfn_offset to avoid touching the pageframes
* of this compound page.
*/
goto same_page;
}
spin_unlock(ptl);
}
*nr_pages = remainder;
/*
* setting position is actually required only if remainder is
* not zero, but it's faster not to add an "if (remainder)"
* branch.
*/
*position = vaddr;
return i ? i : err;
}
unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long start = address;
pte_t *ptep;
pte_t pte;
struct hstate *h = hstate_vma(vma);
unsigned long pages = 0;
bool shared_pmd = false;
struct mmu_notifier_range range;
/*
* In the case of shared PMDs, the area to flush could be beyond
* start/end. Set range.start/range.end to cover the maximum possible
* range if PMD sharing is possible.
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
0, vma, mm, start, end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
BUG_ON(address >= end);
flush_cache_range(vma, range.start, range.end);
mmu_notifier_invalidate_range_start(&range);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (; address < end; address += huge_page_size(h)) {
spinlock_t *ptl;
ptep = huge_pte_offset(mm, address, huge_page_size(h));
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
if (huge_pmd_unshare(mm, vma, &address, ptep)) {
pages++;
spin_unlock(ptl);
shared_pmd = true;
continue;
}
pte = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
spin_unlock(ptl);
continue;
}
if (unlikely(is_hugetlb_entry_migration(pte))) {
swp_entry_t entry = pte_to_swp_entry(pte);
if (is_write_migration_entry(entry)) {
pte_t newpte;
make_migration_entry_read(&entry);
newpte = swp_entry_to_pte(entry);
set_huge_swap_pte_at(mm, address, ptep,
newpte, huge_page_size(h));
pages++;
}
spin_unlock(ptl);
continue;
}
if (!huge_pte_none(pte)) {
pte_t old_pte;
old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
pte = arch_make_huge_pte(pte, vma, NULL, 0);
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
}
spin_unlock(ptl);
}
/*
* Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
* may have cleared our pud entry and done put_page on the page table:
* once we release i_mmap_rwsem, another task can do the final put_page
* and that page table be reused and filled with junk. If we actually
* did unshare a page of pmds, flush the range corresponding to the pud.
*/
if (shared_pmd)
flush_hugetlb_tlb_range(vma, range.start, range.end);
else
flush_hugetlb_tlb_range(vma, start, end);
/*
* No need to call mmu_notifier_invalidate_range() we are downgrading
* page table protection not changing it to point to a new page.
*
* See Documentation/vm/mmu_notifier.rst
*/
i_mmap_unlock_write(vma->vm_file->f_mapping);
mmu_notifier_invalidate_range_end(&range);
return pages << h->order;
}
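/*
 * Reserve huge pages for the range [from, to) of @inode, e.g. at mmap()
 * or fallocate() time. Returns 0 on success or a negative error code;
 * see hugetlb_unreserve_pages() for the removal side.
 */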
int hugetlb_reserve_pages(struct inode *inode,
long from, long to,
struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
long ret, chg, add = -1;
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
struct hugetlb_cgroup *h_cg = NULL;
long gbl_reserve, regions_needed = 0;
/* This should never happen */
if (from > to) {
VM_WARN(1, "%s called with a negative range\n", __func__);
return -EINVAL;
}
/*
* Only apply hugepage reservation if asked. At fault time, an
* attempt will be made for VM_NORESERVE to allocate a page
* without using reserves
*/
if (vm_flags & VM_NORESERVE)
return 0;
/*
* Shared mappings base their reservation on the number of pages that
* are already allocated on behalf of the file. Private mappings need
* to reserve the full area even if read-only as mprotect() may be
* called to make the mapping read-write. Assume !vma is a shm mapping
*/
if (!vma || vma->vm_flags & VM_MAYSHARE) {
/*
* resv_map can not be NULL as hugetlb_reserve_pages is only
* called for inodes for which resv_maps were created (see
* hugetlbfs_get_inode).
*/
resv_map = inode_resv_map(inode);
chg = region_chg(resv_map, from, to, &regions_needed);
} else {
/* Private mapping. */
resv_map = resv_map_alloc();
if (!resv_map)
return -ENOMEM;
chg = to - from;
set_vma_resv_map(vma, resv_map);
set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
}
if (chg < 0) {
ret = chg;
goto out_err;
}
ret = hugetlb_cgroup_charge_cgroup_rsvd(
hstate_index(h), chg * pages_per_huge_page(h), &h_cg);
if (ret < 0) {
ret = -ENOMEM;
goto out_err;
}
if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
/* For private mappings, the hugetlb_cgroup uncharge info hangs
* off the resv_map.
*/
resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
}
/*
* There must be enough pages in the subpool for the mapping. If
* the subpool has a minimum size, there may be some global
* reservations already in place (gbl_reserve).
*/
gbl_reserve = hugepage_subpool_get_pages(spool, chg);
if (gbl_reserve < 0) {
ret = -ENOSPC;
goto out_uncharge_cgroup;
}
/*
* Check that enough hugepages are available for the reservation.
* Hand the pages back to the subpool if there are not.
*/
ret = hugetlb_acct_memory(h, gbl_reserve);
if (ret < 0) {
goto out_put_pages;
}
/*
* Account for the reservations made. Shared mappings record regions
* that have reservations as they are shared by multiple VMAs.
* When the last VMA disappears, the region map says how much
* the reservation was and the page cache tells how much of
* the reservation was consumed. Private mappings are per-VMA and
* only the consumed reservations are tracked. When the VMA
* disappears, the original reservation is the VMA size and the
* consumed reservations are stored in the map. Hence, nothing
* else has to be done for private mappings here
*/
if (!vma || vma->vm_flags & VM_MAYSHARE) {
add = region_add(resv_map, from, to, regions_needed, h, h_cg);
if (unlikely(add < 0)) {
hugetlb_acct_memory(h, -gbl_reserve);
ret = add;
goto out_put_pages;
} else if (unlikely(chg > add)) {
/*
* pages in this range were added to the reserve
* map between region_chg and region_add. This
* indicates a race with alloc_huge_page. Adjust
* the subpool and reserve counts modified above
* based on the difference.
*/
long rsv_adjust;
/*
* hugetlb_cgroup_uncharge_cgroup_rsvd() will put the
* reference to h_cg->css. See comment below for detail.
*/
hugetlb_cgroup_uncharge_cgroup_rsvd(
hstate_index(h),
(chg - add) * pages_per_huge_page(h), h_cg);
rsv_adjust = hugepage_subpool_put_pages(spool,
chg - add);
hugetlb_acct_memory(h, -rsv_adjust);
} else if (h_cg) {
/*
* The file_regions will hold their own reference to
* h_cg->css. So we should release the reference held
* via hugetlb_cgroup_charge_cgroup_rsvd() when we are
* done.
*/
hugetlb_cgroup_put_rsvd_cgroup(h_cg);
}
}
return 0;
out_put_pages:
/* put back original number of pages, chg */
(void)hugepage_subpool_put_pages(spool, chg);
out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
chg * pages_per_huge_page(h), h_cg);
out_err:
if (!vma || vma->vm_flags & VM_MAYSHARE)
/* Only call region_abort if the region_chg succeeded but the
* region_add failed or didn't run.
*/
if (chg >= 0 && add < 0)
region_abort(resv_map, from, to, regions_needed);
if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
kref_put(&resv_map->refs, resv_map_release);
return ret;
}
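/*
 * Undo reservations for the range [start, end) of a hugetlbfs inode and
 * adjust the inode block count for the @freed pages that were released.
 * Returns 0 on success or a negative error if region_del() fails.
 */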
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
long freed)
{
struct hstate *h = hstate_inode(inode);
struct resv_map *resv_map = inode_resv_map(inode);
long chg = 0;
struct hugepage_subpool *spool = subpool_inode(inode);
long gbl_reserve;
/*
* Since this routine can be called in the evict inode path for all
* hugetlbfs inodes, resv_map could be NULL.
*/
if (resv_map) {
chg = region_del(resv_map, start, end);
/*
* region_del() can fail in the rare case where a region
* must be split and another region descriptor can not be
* allocated. If end == LONG_MAX, it will not fail.
*/
if (chg < 0)
return chg;
}
spin_lock(&inode->i_lock);
inode->i_blocks -= (blocks_per_huge_page(h) * freed);
spin_unlock(&inode->i_lock);
/*
* If the subpool has a minimum size, the number of global
* reservations to be released may be adjusted.
*/
gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
hugetlb_acct_memory(h, -gbl_reserve);
return 0;
}
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
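/*
 * Check whether @vma can share a PUD-sized page table page with @svma for
 * the range containing @addr: the corresponding range in @svma must be
 * fully mapped, have matching flags (ignoring VM_LOCKED bits) and the same
 * pmd index. Returns the matching address in @svma, or 0 if sharing is not
 * possible.
 */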
static unsigned long page_table_shareable(struct vm_area_struct *svma,
struct vm_area_struct *vma,
unsigned long addr, pgoff_t idx)
{
unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
svma->vm_start;
unsigned long sbase = saddr & PUD_MASK;
unsigned long s_end = sbase + PUD_SIZE;
/* Allow segments to share if only one is marked locked */
unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
/*
* Match the virtual addresses, permissions and the alignment of the
* page table page.
*/
if (pmd_index(addr) != pmd_index(saddr) ||
vm_flags != svm_flags ||
sbase < svma->vm_start || svma->vm_end < s_end)
return 0;
return saddr;
}
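/*
 * A vma can take part in PMD sharing for @addr only if it is VM_MAYSHARE
 * and the whole PUD-sized range containing @addr lies within the vma.
 */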
static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
{
unsigned long base = addr & PUD_MASK;
unsigned long end = base + PUD_SIZE;
/*
* Check for proper vm_flags and page table alignment.
*/
if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
return true;
return false;
}
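/*
 * PMD sharing is attempted only for shareable vmas, and never when
 * userfaultfd has disabled it for this vma.
 */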
bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
{
#ifdef CONFIG_USERFAULTFD
if (uffd_disable_huge_pmd_share(vma))
return false;
#endif
return vma_shareable(vma, addr);
}
/*
* Determine if start,end range within vma could be mapped by shared pmd.
* If yes, adjust start and end to cover range associated with possible
* shared pmd mappings.
*/
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
/*
* The vma needs to span at least one aligned PUD size, and the start,end
* range must be at least partially within it.
*/
if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
(*end <= v_start) || (*start >= v_end))
return;
/* Extend the range to be PUD aligned for a worst case scenario */
if (*start > v_start)
*start = ALIGN_DOWN(*start, PUD_SIZE);
if (*end < v_end)
*end = ALIGN(*end, PUD_SIZE);
}
/*
* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
* and returns the corresponding pte. While this is not necessary for the
* !shared pmd case because we can allocate the pmd later as well, it makes the
* code much cleaner.
*
* This routine must be called with i_mmap_rwsem held in at least read mode if
* sharing is possible. For hugetlbfs, this prevents removal of any page
* table entries associated with the address space. This is important as we
* are setting up sharing based on existing page table entries (mappings).
*
* NOTE: This routine is only called from huge_pte_alloc. Some callers of
* huge_pte_alloc know that sharing is not possible and do not take
* i_mmap_rwsem as a performance optimization. This is handled by the
* want_pmd_share()/vma_shareable() check that huge_pte_alloc performs
* before calling this routine; i_mmap_rwsem is only required for
* subsequent processing.
*/
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pud_t *pud)
{
struct address_space *mapping = vma->vm_file->f_mapping;
pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
vma->vm_pgoff;
struct vm_area_struct *svma;
unsigned long saddr;
pte_t *spte = NULL;
pte_t *pte;
spinlock_t *ptl;
i_mmap_assert_locked(mapping);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
saddr = page_table_shareable(svma, vma, addr, idx);
if (saddr) {
spte = huge_pte_offset(svma->vm_mm, saddr,
vma_mmu_pagesize(svma));
if (spte) {
get_page(virt_to_page(spte));
break;
}
}
}
if (!spte)
goto out;
ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
if (pud_none(*pud)) {
pud_populate(mm, pud,
(pmd_t *)((unsigned long)spte & PAGE_MASK));
mm_inc_nr_pmds(mm);
} else {
put_page(virt_to_page(spte));
}
spin_unlock(ptl);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
return pte;
}
/*
* unmap huge page backed by shared pte.
*
* Hugetlb pte page is ref counted at the time of mapping. If pte is shared,
* as indicated by page_count > 1, unmap is achieved by clearing pud and
* decrementing the ref count. If count == 1, the pte page is not shared.
*
* Called with page table lock held and i_mmap_rwsem held in write mode.
*
* returns: 1 successfully unmapped a shared pte page
* 0 the underlying pte page is not shared, or it is the last user
*/
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long *addr, pte_t *ptep)
{
pgd_t *pgd = pgd_offset(mm, *addr);
p4d_t *p4d = p4d_offset(pgd, *addr);
pud_t *pud = pud_offset(p4d, *addr);
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
BUG_ON(page_count(virt_to_page(ptep)) == 0);
if (page_count(virt_to_page(ptep)) == 1)
return 0;
pud_clear(pud);
put_page(virt_to_page(ptep));
mm_dec_nr_pmds(mm);
/*
* This update of the passed address optimizes loops that sequentially
* process addresses in increments of huge page size (PMD_SIZE
* in this case). By clearing the pud, a PUD_SIZE area is unmapped.
* Update address to the 'last page' in the cleared area so that
* calling loop can move to first page past this area.
*/
*addr |= PUD_SIZE - PMD_SIZE;
return 1;
}
#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pud_t *pud)
{
return NULL;
}
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long *addr, pte_t *ptep)
{
return 0;
}
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
}
bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
{
return false;
}
#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
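/*
 * Allocate the page table entry needed to map a huge page of size @sz at
 * @addr. PUD-sized pages use the pud entry directly; PMD-sized pages get
 * a pmd, shared with another mapping of the same file when possible.
 */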
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long sz)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pte_t *pte = NULL;
pgd = pgd_offset(mm, addr);
p4d = p4d_alloc(mm, pgd, addr);
if (!p4d)
return NULL;
pud = pud_alloc(mm, p4d, addr);
if (pud) {
if (sz == PUD_SIZE) {
pte = (pte_t *)pud;
} else {
BUG_ON(sz != PMD_SIZE);
if (want_pmd_share(vma, addr) && pud_none(*pud))
pte = huge_pmd_share(mm, vma, addr, pud);
else
pte = (pte_t *)pmd_alloc(mm, pud, addr);
}
}
BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
return pte;
}
/*
* huge_pte_offset() - Walk the page table to resolve the hugepage
* entry at address @addr
*
* Return: Pointer to page table entry (PUD or PMD) for
* address @addr, or NULL if a !p*d_present() entry is encountered and the
* size @sz doesn't match the hugepage size at this level of the page
* table.
*/
pte_t *huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pgd = pgd_offset(mm, addr);
if (!pgd_present(*pgd))
return NULL;
p4d = p4d_offset(pgd, addr);
if (!p4d_present(*p4d))
return NULL;
pud = pud_offset(p4d, addr);
if (sz == PUD_SIZE)
/* must be pud huge, non-present or none */
return (pte_t *)pud;
if (!pud_present(*pud))
return NULL;
/* must have a valid entry and size to go further */
pmd = pmd_offset(pud, addr);
/* must be pmd huge, non-present or none */
return (pte_t *)pmd;
}
#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
/*
* These functions can be overridden if your architecture needs its own
* behavior.
*/
struct page * __weak
follow_huge_addr(struct mm_struct *mm, unsigned long address,
int write)
{
return ERR_PTR(-EINVAL);
}
struct page * __weak
follow_huge_pd(struct vm_area_struct *vma,
unsigned long address, hugepd_t hpd, int flags, int pdshift)
{
WARN(1, "hugepd follow called with no support for hugepage directory format\n");
return NULL;
}
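/*
 * Resolve the page backing the huge mapping at @address, taking a
 * reference according to @flags. Waits for migration entries to be
 * resolved and retries; returns NULL for none or hwpoisoned entries.
 */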
struct page * __weak
follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags)
{
struct hstate *h = hstate_vma(vma);
struct mm_struct *mm = vma->vm_mm;
struct page *page = NULL;
spinlock_t *ptl;
pte_t *ptep, pte;
/* FOLL_GET and FOLL_PIN are mutually exclusive. */
if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
(FOLL_PIN | FOLL_GET)))
return NULL;
retry:
ptep = huge_pte_offset(mm, address, huge_page_size(h));
if (!ptep)
return NULL;
ptl = huge_pte_lock(h, mm, ptep);
pte = huge_ptep_get(ptep);
if (pte_present(pte)) {
page = pte_page(pte) +
((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
/*
* try_grab_page() should always succeed here, because: a) we
* hold the pmd (ptl) lock, and b) we've just checked that the
* huge pmd (head) page is present in the page tables. The ptl
* prevents the head page and tail pages from being rearranged
* in any way. So this page must be available at this point,
* unless the page refcount overflowed:
*/
if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
page = NULL;
goto out;
}
} else {
if (is_hugetlb_entry_migration(pte)) {
spin_unlock(ptl);
__migration_entry_wait(mm, ptep, ptl);
goto retry;
}
/*
* hwpoisoned entry is treated as no_page_table in
* follow_page_mask().
*/
}
out:
spin_unlock(ptl);
return page;
}
struct page * __weak
follow_huge_pud(struct mm_struct *mm, unsigned long address,
pud_t *pud, int flags)
{
if (flags & (FOLL_GET | FOLL_PIN))
return NULL;
return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
}
struct page * __weak
follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
{
if (flags & (FOLL_GET | FOLL_PIN))
return NULL;
return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
}
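/*
 * Isolate a huge page for migration by moving it from its hstate's active
 * list onto @list. Fails if the page is not an active huge page or its
 * refcount has already dropped to zero.
 */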
bool isolate_huge_page(struct page *page, struct list_head *list)
{
bool ret = true;
spin_lock(&hugetlb_lock);
if (!PageHeadHuge(page) || !page_huge_active(page) ||
!get_page_unless_zero(page)) {
ret = false;
goto unlock;
}
clear_page_huge_active(page);
list_move_tail(&page->lru, list);
unlock:
spin_unlock(&hugetlb_lock);
return ret;
}
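/*
 * Return a previously isolated huge page to its hstate's active list and
 * drop the reference taken when it was isolated.
 */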
void putback_active_hugepage(struct page *page)
{
VM_BUG_ON_PAGE(!PageHead(page), page);
spin_lock(&hugetlb_lock);
set_page_huge_active(page);
list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
spin_unlock(&hugetlb_lock);
put_page(page);
}
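/*
 * Transfer hugetlb-specific state from @oldpage to @newpage after
 * migration: cgroup accounting, page owner migrate reason, and the
 * temporary status together with the per-node surplus counts.
 */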
void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
{
struct hstate *h = page_hstate(oldpage);
hugetlb_cgroup_migrate(oldpage, newpage);
set_page_owner_migrate_reason(newpage, reason);
/*
* Transfer the temporary state of the new huge page. This is the
* reverse of other transitions because the newpage is going to be
* final while the old one will be freed, so it takes over the
* temporary status.
*
* Also note that we have to transfer the per-node surplus state
* here as well, otherwise the global surplus count will not match
* the per-node counts.
*/
if (PageHugeTemporary(newpage)) {
int old_nid = page_to_nid(oldpage);
int new_nid = page_to_nid(newpage);
SetPageHugeTemporary(oldpage);
ClearPageHugeTemporary(newpage);
spin_lock(&hugetlb_lock);
if (h->surplus_huge_pages_node[old_nid]) {
h->surplus_huge_pages_node[old_nid]--;
h->surplus_huge_pages_node[new_nid]++;
}
spin_unlock(&hugetlb_lock);
}
}
/*
* This function will unconditionally remove all the shared pmd pgtable entries
* within the specific vma for a hugetlbfs memory range.
*/
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
{
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
struct mm_struct *mm = vma->vm_mm;
struct mmu_notifier_range range;
unsigned long address, start, end;
spinlock_t *ptl;
pte_t *ptep;
if (!(vma->vm_flags & VM_MAYSHARE))
return;
start = ALIGN(vma->vm_start, PUD_SIZE);
end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
if (start >= end)
return;
flush_cache_range(vma, start, end);
/*
* No need to call adjust_range_if_pmd_sharing_possible(), because
* we have already done the PUD_SIZE alignment.
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
start, end);
mmu_notifier_invalidate_range_start(&range);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (address = start; address < end; address += PUD_SIZE) {
unsigned long tmp = address;
ptep = huge_pte_offset(mm, address, sz);
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
/* We don't want 'address' to be changed */
huge_pmd_unshare(mm, vma, &tmp, ptep);
spin_unlock(ptl);
}
flush_hugetlb_tlb_range(vma, start, end);
i_mmap_unlock_write(vma->vm_file->f_mapping);
/*
* No need to call mmu_notifier_invalidate_range(), see
* Documentation/vm/mmu_notifier.rst.
*/
mmu_notifier_invalidate_range_end(&range);
}
#ifdef CONFIG_CMA
static bool cma_reserve_called __initdata;
static int __init cmdline_parse_hugetlb_cma(char *p)
{
hugetlb_cma_size = memparse(p, &p);
return 0;
}
early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
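/*
 * Reserve CMA areas for gigantic pages of the given @order at boot,
 * spreading hugetlb_cma_size as evenly as possible across the online
 * NUMA nodes.
 */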
void __init hugetlb_cma_reserve(int order)
{
unsigned long size, reserved, per_node;
int nid;
cma_reserve_called = true;
if (!hugetlb_cma_size)
return;
if (hugetlb_cma_size < (PAGE_SIZE << order)) {
pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
(PAGE_SIZE << order) / SZ_1M);
return;
}
/*
* If a 3 GB area is requested on a machine with 4 NUMA nodes,
* allocate 1 GB on each of the first three nodes and ignore the last one.
*/
per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
reserved = 0;
for_each_node_state(nid, N_ONLINE) {
int res;
char name[CMA_MAX_NAME];
size = min(per_node, hugetlb_cma_size - reserved);
size = round_up(size, PAGE_SIZE << order);
snprintf(name, sizeof(name), "hugetlb%d", nid);
res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
0, false, name,
&hugetlb_cma[nid], nid);
if (res) {
pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
res, nid);
continue;
}
reserved += size;
pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
size / SZ_1M, nid);
if (reserved >= hugetlb_cma_size)
break;
}
}
void __init hugetlb_cma_check(void)
{
if (!hugetlb_cma_size || cma_reserve_called)
return;
pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
}
#endif /* CONFIG_CMA */