Greg Kroah-Hartman d0782c9411 Merge tag 'android12-5.10.160_r00' into android12-5.10
This is the merge of the upstream LTS release of 5.10.160 into the
android12-5.10 branch.

It contains the following commits:

003c389455 Merge 5.10.160 into android12-5.10-lts
a2428a8dcb Linux 5.10.160
54c15f67cb ASoC: ops: Correct bounds check for second channel on SX controls
74b139c63f nvme-pci: clear the prp2 field when not used
77ebf88e00 ASoC: cs42l51: Correct PGA Volume minimum value
4db1d19b74 can: mcba_usb: Fix termination command argument
683837f2f6 can: sja1000: fix size of OCR_MODE_MASK define
434b523671 pinctrl: meditatek: Startup with the IRQs disabled
5cb4abb0ca libbpf: Use page size as max_entries when probing ring buffer map
50b5f6d4d9 ASoC: ops: Check bounds for second channel in snd_soc_put_volsw_sx()
344739dc56 ASoC: fsl_micfil: explicitly clear CHnF flags
a49c1a7307 ASoC: fsl_micfil: explicitly clear software reset bit
75454b4bbf io_uring: add missing item types for splice request
17f386e6b7 fuse: always revalidate if exclusive create
eb6313c129 nfp: fix use-after-free in area_cache_get()
965d93fb39 vfs: fix copy_file_range() averts filesystem freeze protection
ed96733949 vfs: fix copy_file_range() regression in cross-fs copies
970862a96c x86/smpboot: Move rcu_cpu_starting() earlier
32e45c58a0 Merge "Merge 5.10.159 into android12-5.10-lts" into android12-5.10-lts
d31626cbea ANDROID: usb: gadget: uvc: remove duplicate code in unbind
01ef2d0b53 Merge 5.10.159 into android12-5.10-lts
931578be69 Linux 5.10.159
4fd6f84e0a can: esd_usb: Allow REC and TEC to return to zero
cf0e423106 macsec: add missing attribute validation for offload
6b03e41767 net: mvneta: Fix an out of bounds check
8208d7e56b ipv6: avoid use-after-free in ip6_fragment()
3d59adad12 net: plip: don't call kfree_skb/dev_kfree_skb() under spin_lock_irq()
a00444e25b xen/netback: fix build warning
87277bdf2c ethernet: aeroflex: fix potential skb leak in greth_init_rings()
cc668fddde tipc: call tipc_lxc_xmit without holding node_read_lock
4be43e46c3 net: dsa: sja1105: fix memory leak in sja1105_setup_devlink_regions()
8e3f9ac009 ipv4: Fix incorrect route flushing when table ID 0 is used
5211e5ff9d ipv4: Fix incorrect route flushing when source address is deleted
36e248269a tipc: Fix potential OOB in tipc_link_proto_rcv()
93aaa4bb72 net: hisilicon: Fix potential use-after-free in hix5hd2_rx()
296a50aa8b net: hisilicon: Fix potential use-after-free in hisi_femac_rx()
8d1aed7a11 net: thunderx: Fix missing destroy_workqueue of nicvf_rx_mode_wq
a5cfbc1995 ip_gre: do not report erspan version on GRE interface
696e34d54c net: stmmac: fix "snps,axi-config" node property parsing
ca26f45083 nvme initialize core quirks before calling nvme_init_subsystem
27eb2d7a1b NFC: nci: Bounds check struct nfc_target arrays
a2506b19d7 i40e: Disallow ip4 and ip6 l4_4_bytes
8329b65e34 i40e: Fix for VF MAC address 0
215f3ac53b i40e: Fix not setting default xps_cpus after reset
146ebee8fc net: mvneta: Prevent out of bounds read in mvneta_config_rss()
e6860c889f xen-netfront: Fix NULL sring after live migration
3d3b30718a net: encx24j600: Fix invalid logic in reading of MISTAT register
51ba1820e7 net: encx24j600: Add parentheses to fix precedence
42c319635c mac802154: fix missing INIT_LIST_HEAD in ieee802154_if_add()
4c693330ce selftests: rtnetlink: correct xfrm policy rule in kci_test_ipsec_offload
bccda3ad07 net: dsa: ksz: Check return value
e7b9504581 Bluetooth: Fix not cleanup led when bt_init fails
1717354d77 Bluetooth: 6LoWPAN: add missing hci_dev_put() in get_l2cap_conn()
80c69b31aa vmxnet3: correctly report encapsulated LRO packet
575a6266f6 af_unix: Get user_ns from in_skb in unix_diag_get_exact().
6c788c0a25 drm: bridge: dw_hdmi: fix preference of RGB modes over YUV420
de918d9738 igb: Allocate MSI-X vector when testing
6595c9208d e1000e: Fix TX dispatch condition
5ee6413d3d gpio: amd8111: Fix PCI device reference count leak
b9aca69a6c drm/bridge: ti-sn65dsi86: Fix output polarity setting bug
b46e8c50c3 netfilter: ctnetlink: fix compilation warning after data race fixes in ct mark
0a8e66e375 ca8210: Fix crash by zero initializing data
27c71825ff ieee802154: cc2520: Fix error return code in cc2520_hw_init()
a0418d0a6b netfilter: nft_set_pipapo: Actually validate intervals in fields after the first one
cb283cca1d rtc: mc146818-lib: fix signedness bug in mc146818_get_time()
5c432383b6 rtc: mc146818-lib: fix locking in mc146818_set_time
5e26531d81 rtc: cmos: Disable irq around direct invocation of cmos_interrupt()
fccee93eb2 mm/hugetlb: fix races when looking up a CONT-PTE/PMD size hugetlb page
c42221efb1 can: af_can: fix NULL pointer dereference in can_rcv_filter
bc03f809da HID: core: fix shift-out-of-bounds in hid_report_raw_event
959a23a4d1 HID: hid-lg4ff: Add check for empty lbuf
4dde75945a HID: usbhid: Add ALWAYS_POLL quirk for some mice
11e95d85c3 drm/shmem-helper: Avoid vm_open error paths
6a4da05acd drm/shmem-helper: Remove errant put in error path
007f561f59 drm/vmwgfx: Don't use screen objects when SEV is active
3cb78c3925 KVM: s390: vsie: Fix the initialization of the epoch extension (epdx) field
549b46f813 Bluetooth: Fix crash when replugging CSR fake controllers
380d183e99 Bluetooth: btusb: Add debug message for CSR controllers
f1cf856123 mm/gup: fix gup_pud_range() for dax
f1f7f36cf6 memcg: fix possible use-after-free in memcg_write_event_control()
32f01f0306 media: v4l2-dv-timings.c: fix too strict blanking sanity checks
043b2bc96c Revert "ARM: dts: imx7: Fix NAND controller size-cells"
abfb8ae69b media: videobuf2-core: take mmap_lock in vb2_get_unmapped_area()
83632fc414 xen/netback: don't call kfree_skb() with interrupts disabled
3eecd2bc10 xen/netback: do some code cleanup
49e07c0768 xen/netback: Ensure protocol headers don't fall in the non-linear area
db44a9443e rtc: mc146818: Reduce spinlock section in mc146818_set_time()
17293d630f rtc: cmos: Replace spin_lock_irqsave with spin_lock in hard IRQ
acfd8ef683 rtc: cmos: avoid UIP when reading alarm time
949bae0282 rtc: cmos: avoid UIP when writing alarm time
33ac73a41a rtc: mc146818-lib: extract mc146818_avoid_UIP
8bb5fe5830 rtc: mc146818-lib: fix RTC presence check
775d4661f1 rtc: Check return value from mc146818_get_time()
b9a5c470e0 rtc: mc146818-lib: change return values of mc146818_get_time()
94eaf9966e rtc: cmos: remove stale REVISIT comments
f5b51f8550 rtc: mc146818: Dont test for bit 0-5 in Register D
3736972360 rtc: mc146818: Detect and handle broken RTCs
7c7075c88d rtc: mc146818: Prevent reading garbage
7f445ca2e0 mm/khugepaged: invoke MMU notifiers in shmem/file collapse paths
4a1cdb49d0 mm/khugepaged: fix GUP-fast interaction by sending IPI
cdfd3739b2 mm/khugepaged: take the right locks for page table retraction
1c0eec6a1d net: usb: qmi_wwan: add u-blox 0x1342 composition
a8c5ffb4df 9p/xen: check logical size for buffer size
ec36ebae36 usb: dwc3: gadget: Disable GUSB2PHYCFG.SUSPHY for End Transfer
d9b53caf01 fbcon: Use kzalloc() in fbcon_prepare_logo()
8b130c770d regulator: twl6030: fix get status of twl6032 regulators
f6f45e5383 ASoC: soc-pcm: Add NULL check in BE reparenting
688a45aff2 btrfs: send: avoid unaligned encoded writes when attempting to clone range
15c42ab8d4 ALSA: seq: Fix function prototype mismatch in snd_seq_expand_var_event
d38e021416 regulator: slg51000: Wait after asserting CS pin
1331bcfcac 9p/fd: Use P9_HDRSZ for header size
96b43f36a5 ARM: dts: rockchip: disable arm_global_timer on rk3066 and rk3188
ddf58f5939 ASoC: wm8962: Wait for updated value of WM8962_CLOCKING1 register
dbd78abd69 ARM: 9266/1: mm: fix no-MMU ZERO_PAGE() implementation
bb1866cf1e ARM: 9251/1: perf: Fix stacktraces for tracepoint events in THUMB2 kernels
b1f40a0cdf ARM: dts: rockchip: rk3188: fix lcdc1-rgb24 node name
5f9474d07b arm64: dts: rockchip: fix ir-receiver node names
060d58924a ARM: dts: rockchip: fix ir-receiver node names
3e0c466771 arm: dts: rockchip: fix node name for hym8563 rtc
3ada63a876 arm64: dts: rockchip: keep I2S1 disabled for GPIO function on ROCK Pi 4 series
202ee06349 Revert "mmc: sdhci: Fix voltage switch delay"
0b0939466f ANDROID: gki_defconfig: add CONFIG_FUNCTION_ERROR_INJECTION
5ab4c6b843 Merge 5.10.158 into android12-5.10-lts
592346d5dc Linux 5.10.158
cc1b4718cc ipc/sem: Fix dangling sem_array access in semtimedop race
d072a10c81 v4l2: don't fall back to follow_pfn() if pin_user_pages_fast() fails
9ba389863a proc: proc_skip_spaces() shouldn't think it is working on C strings
4aa32aaef6 proc: avoid integer type confusion in get_proc_long
5f2f775605 block: unhash blkdev part inode when the part is deleted
a82869ac52 Input: raydium_ts_i2c - fix memory leak in raydium_i2c_send()
4e0d6c687c char: tpm: Protect tpm_pm_suspend with locks
5a6f935ef3 Revert "clocksource/drivers/riscv: Events are stopped during CPU suspend"
f075cf139f ACPI: HMAT: Fix initiator registration for single-initiator systems
f3b76b4d38 ACPI: HMAT: remove unnecessary variable initialization
63e72417a1 i2c: imx: Only DMA messages with I2C_M_DMA_SAFE flag set
df76136598 i2c: npcm7xx: Fix error handling in npcm_i2c_init()
7462cd2443 x86/pm: Add enumeration check before spec MSRs save/restore setup
5e3d4a68e2 x86/tsx: Add a feature bit for TSX control MSR support
b7f7a0402e Revert "tty: n_gsm: avoid call of sleeping functions from atomic context"
481f9ed8eb ipv4: Fix route deletion when nexthop info is not specified
0b5394229e ipv4: Handle attempt to delete multipath route when fib_info contains an nh reference
4919503426 selftests: net: fix nexthop warning cleanup double ip typo
7ca14c5f24 selftests: net: add delete nexthop route warning test
f09ac62f0e Kconfig.debug: provide a little extra FRAME_WARN leeway when KASAN is enabled
19d91d3798 parisc: Increase FRAME_WARN to 2048 bytes on parisc
fcf20da099 xtensa: increase size of gcc stack frame check
a1877001ed parisc: Increase size of gcc stack frame check
a5c65cd56a iommu/vt-d: Fix PCI device refcount leak in dmar_dev_scope_init()
10ed7655a1 iommu/vt-d: Fix PCI device refcount leak in has_external_pci()
302edce1dd pinctrl: single: Fix potential division by zero
b50c964189 ASoC: ops: Fix bounds check for _sx controls
a2efc46524 io_uring: don't hold uring_lock when calling io_run_task_work*
be111ebd88 tracing: Free buffers when a used dynamic event is removed
648b92e576 drm/i915: Never return 0 if not all requests retired
8649c023c4 drm/amdgpu: temporarily disable broken Clang builds due to blown stack-frame
940b774069 mmc: sdhci: Fix voltage switch delay
ed19662453 mmc: sdhci-sprd: Fix no reset data and command after voltage switch
ef767907e7 mmc: sdhci-esdhc-imx: correct CQHCI exit halt state check
46ee041cd6 mmc: core: Fix ambiguous TRIM and DISCARD arg
b79be962b5 mmc: mmc_test: Fix removal of debugfs file
d4fc344c0d net: stmmac: Set MAC's flow control register to reflect current settings
549e24409a pinctrl: intel: Save and restore pins in "direct IRQ" mode
471fb7b735 x86/bugs: Make sure MSR_SPEC_CTRL is updated properly upon resume from S3
e858917ab7 nilfs2: fix NULL pointer dereference in nilfs_palloc_commit_free_entry()
6ddf788400 tools/vm/slabinfo-gnuplot: use "grep -E" instead of "egrep"
c099d12c55 error-injection: Add prompt for function error injection
26b6f927bb riscv: vdso: fix section overlapping under some conditions
2b1d8f27e2 net/mlx5: DR, Fix uninitialized var warning
c40db1e5f3 hwmon: (coretemp) fix pci device refcount leak in nv1a_ram_new()
f06e0cd01e hwmon: (coretemp) Check for null before removing sysfs attrs
d93522d04f net: ethernet: renesas: ravb: Fix promiscuous mode after system resumed
176ee6c673 sctp: fix memory leak in sctp_stream_outq_migrate()
1c38c88acc packet: do not set TP_STATUS_CSUM_VALID on CHECKSUM_COMPLETE
5f442e1d40 net: tun: Fix use-after-free in tun_detach()
5fa0fc5876 afs: Fix fileserver probe RTT handling
7ca81a161e net: hsr: Fix potential use-after-free
a1ba595e35 tipc: re-fetch skb cb after tipc_msg_validate
4621bdfff5 dsa: lan9303: Correct stat name
45752af024 net: ethernet: nixge: fix NULL dereference
e01c154237 net/9p: Fix a potential socket leak in p9_socket_open
b080d4668f net: net_netdev: Fix error handling in ntb_netdev_init_module()
fe6bc99c27 net: phy: fix null-ptr-deref while probe() failed
0184ede0ec wifi: mac8021: fix possible oob access in ieee80211_get_rate_duration
e2ed90fd3a wifi: cfg80211: don't allow multi-BSSID in S1G
9e6b79a3cd wifi: cfg80211: fix buffer overflow in elem comparison
6922948c2e aquantia: Do not purge addresses when setting the number of rings
fa59d49a49 qlcnic: fix sleep-in-atomic-context bugs caused by msleep
d753f554f2 can: cc770: cc770_isa_probe(): add missing free_cc770dev()
e74746bf04 can: sja1000_isa: sja1000_isa_probe(): add missing free_sja1000dev()
0d2f9d95d9 net/mlx5e: Fix use-after-free when reverting termination table
2cb84ff349 net/mlx5: Fix uninitialized variable bug in outlen_write()
b775f37d94 e100: Fix possible use after free in e100_xmit_prepare
086f656e44 e100: switch from 'pci_' to 'dma_' API
971c55f076 iavf: Fix error handling in iavf_init_module()
d389a4c698 iavf: remove redundant ret variable
fd4960ea53 fm10k: Fix error handling in fm10k_init_module()
dd425cec79 i40e: Fix error handling in i40e_init_module()
f166c62cad ixgbevf: Fix resource leak in ixgbevf_init_module()
8f7047f418 of: property: decrement node refcount in of_fwnode_get_reference_args()
be006212bd bpf: Do not copy spin lock field from user in bpf_selem_alloc
90907cd4d1 hwmon: (ibmpex) Fix possible UAF when ibmpex_register_bmc() fails
7649bba263 hwmon: (i5500_temp) fix missing pci_disable_device()
dddfc03f04 hwmon: (ina3221) Fix shunt sum critical calculation
984fcd3ec1 hwmon: (ltc2947) fix temperature scaling
8a549ab672 libbpf: Handle size overflow for ringbuf mmap
cc140c729c ARM: at91: rm9200: fix usb device clock id
592724b14d scripts/faddr2line: Fix regression in name resolution on ppc64le
353c3aaaf3 bpf, perf: Use subprog name when reporting subprog ksymbol
d48f6a5784 iio: light: rpr0521: add missing Kconfig dependencies
5eb114f55b iio: health: afe4404: Fix oob read in afe4404_[read|write]_raw
b1756af172 iio: health: afe4403: Fix oob read in afe4403_read_raw
01d7c41eac btrfs: qgroup: fix sleep from invalid context bug in btrfs_qgroup_inherit()
d3f5be8246 drm/amdgpu: Partially revert "drm/amdgpu: update drm_display_info correctly when the edid is read"
00570fafc2 drm/amdgpu: update drm_display_info correctly when the edid is read
44b204730b drm/display/dp_mst: Fix drm_dp_mst_add_affected_dsc_crtcs() return code
1faf21bdd1 btrfs: move QUOTA_ENABLED check to rescan_should_stop from btrfs_qgroup_rescan_worker
6050872f9f spi: spi-imx: Fix spi_bus_clk if requested clock is higher than input clock
7b020665d4 btrfs: free btrfs_path before copying inodes to userspace
d5b7a34379 btrfs: sink iterator parameter to btrfs_ioctl_logical_to_ino
f3226d86f8 Revert "xfrm: fix "disable_policy" on ipv4 early demux"
982d7f3eb8 Merge 5.10.157 into android12-5.10-lts
37d3df60cb ANDROID: CRC ABI fixups in ip.h and ipv6.h
f4245f0538 Linux 5.10.157
4801672fb0 fuse: lock inode unconditionally in fuse_fallocate()
86f0082fb9 drm/i915: fix TLB invalidation for Gen12 video and compute engines
feb97cf45e drm/amdgpu: always register an MMU notifier for userptr
596b7d55d7 drm/amd/dc/dce120: Fix audio register mapping, stop triggering KASAN
c86c1a7037 btrfs: sysfs: normalize the error handling branch in btrfs_init_sysfs()
1581830c0e btrfs: free btrfs_path before copying subvol info to userspace
0bdb8f7ef8 btrfs: free btrfs_path before copying fspath to userspace
24a37ba2cb btrfs: free btrfs_path before copying root refs to userspace
b56d6e5585 genirq: Take the proposed affinity at face value if force==true
9d90a2b98e irqchip/gic-v3: Always trust the managed affinity provided by the core code
e0d2c59ee9 genirq: Always limit the affinity to online CPUs
f8f80d532f genirq/msi: Shutdown managed interrupts with unsatifiable affinities
3eb6b89a4e wifi: wilc1000: validate number of channels
5a068535c0 wifi: wilc1000: validate length of IEEE80211_P2P_ATTR_CHANNEL_LIST attribute
905f886eae wifi: wilc1000: validate length of IEEE80211_P2P_ATTR_OPER_CHANNEL attribute
7c6535fb4d wifi: wilc1000: validate pairwise and authentication suite offsets
64b7f9a7dd dm integrity: clear the journal on suspend
d306f73079 dm integrity: flush the journal on suspend
79d9a11679 gpu: host1x: Avoid trying to use GART on Tegra20
a7f30b5b8d net: usb: qmi_wwan: add Telit 0x103a composition
7e8eaa939e tcp: configurable source port perturb table size
0acc008cf9 platform/x86: hp-wmi: Ignore Smart Experience App event
0964b77bab zonefs: fix zone report size in __zonefs_io_error()
a5937dae66 platform/x86: acer-wmi: Enable SW_TABLET_MODE on Switch V 10 (SW5-017)
52fb7bcea0 platform/x86: asus-wmi: add missing pci_dev_put() in asus_wmi_set_xusb2pr()
4fa717ba2d xen/platform-pci: add missing free_irq() in error path
f45a5a6c9f xen-pciback: Allow setting PCI_MSIX_FLAGS_MASKALL too
9bbb587472 Input: soc_button_array - add Acer Switch V 10 to dmi_use_low_level_irq[]
4ea4316dff Input: soc_button_array - add use_low_level_irq module parameter
c1620e996d Input: goodix - try resetting the controller when no config is set
f4db050958 serial: 8250: 8250_omap: Avoid RS485 RTS glitch on ->set_termios()
7c3e39ccf5 ASoC: Intel: bytcht_es8316: Add quirk for the Nanote UMPC-01
36e0b97619 Input: synaptics - switch touchpad on HP Laptop 15-da3001TU to RMI mode
ae9e0cc973 binder: Gracefully handle BINDER_TYPE_FDA objects with num_fds=0
017de84253 binder: Address corner cases in deferred copy and fixup
2e3c27f241 binder: fix pointer cast warning
c9d3f25a7f binder: defer copies of pre-patched txn data
5204296fc7 binder: read pre-translated fds from sender buffer
23e9d815fa binder: avoid potential data leakage when copying txn
22870431cd x86/ioremap: Fix page aligned size calculation in __ioremap_caller()
3fdeacf087 KVM: x86: remove exit_int_info warning in svm_handle_exit
7e5cb13091 KVM: x86: nSVM: leave nested mode on vCPU free
d925dd3e44 mm: vmscan: fix extreme overreclaim and swap floods
a4a62a23fa gcov: clang: fix the buffer overflow issue
e7f21d10e9 nilfs2: fix nilfs_sufile_mark_dirty() not set segment usage as dirty
f06b7e6a77 usb: dwc3: gadget: Clear ep descriptor last
cff7523ab8 usb: dwc3: gadget: Return -ESHUTDOWN on ep disable
a32635528d usb: dwc3: gadget: conditionally remove requests
ca3a08e9d9 ceph: fix NULL pointer dereference for req->r_session
00c004c070 ceph: Use kcalloc for allocating multiple elements
69263bf781 ceph: fix possible NULL pointer dereference for req->r_session
8e137ace53 ceph: put the requests/sessions when it fails to alloc memory
38993788f4 ceph: fix off by one bugs in unsafe_request_wait()
8a31ae7f77 ceph: flush the mdlog before waiting on unsafe reqs
78b2f546f7 ceph: flush mdlog before umounting
d94ba7b3b7 ceph: make iterate_sessions a global symbol
9ac038d3c2 ceph: make ceph_create_session_msg a global symbol
8382cdf0ab usb: cdns3: Add support for DRD CDNSP
57112da86b mmc: sdhci-brcmstb: Fix SDHCI_RESET_ALL for CQHCI
b5d770977b mmc: sdhci-brcmstb: Enable Clock Gating to save power
049194538c mmc: sdhci-brcmstb: Re-organize flags
fbe955be26 nios2: add FORCE for vmlinuz.gz
c0a9c9973d init/Kconfig: fix CC_HAS_ASM_GOTO_TIED_OUTPUT test with dash
456e895fd0 iio: core: Fix entry not deleted when iio_register_sw_trigger_type() fails
fa9efcbfbf iio: light: apds9960: fix wrong register for gesture gain
bd1b8041c2 arm64: dts: rockchip: lower rk3399-puma-haikou SD controller clock frequency
86ba9c8595 ext4: fix use-after-free in ext4_ext_shift_extents
350e98a08a usb: dwc3: exynos: Fix remove() function
d21d26e65b lib/vdso: use "grep -E" instead of "egrep"
c0cf8bc259 net: enetc: preserve TX ring priority across reconfiguration
de4dd4f9b3 net: enetc: cache accesses to &priv->si->hw
1f080b8caa net: enetc: manage ENETC_F_QBV in priv->active_offloads only when enabled
1d840c5d67 s390/crashdump: fix TOD programmable field size
11052f1188 net: thunderx: Fix the ACPI memory leak
b034fe2a08 nfc: st-nci: fix memory leaks in EVT_TRANSACTION
e14583073f nfc: st-nci: fix incorrect validating logic in EVT_TRANSACTION
9cc863d523 arcnet: fix potential memory leak in com20020_probe()
4d2be0cf27 net: arcnet: Fix RESET flag handling
e61b00374a s390/dasd: fix no record found for raw_track_access
aeebb07499 ipv4: Fix error return code in fib_table_insert()
c0af4d005a dccp/tcp: Reset saddr on failure after inet6?_hash_connect().
b8e494240e netfilter: flowtable_offload: add missing locking
af9de5cdcb dma-buf: fix racing conflict of dma_heap_add()
c40b76dfa7 bnx2x: fix pci device refcount leak in bnx2x_vf_is_pcie_pending()
f81e9c0510 regulator: twl6030: re-add TWL6032_SUBCLASS
32b944b9c4 NFC: nci: fix memory leak in nci_rx_data_packet()
68a7aec3f4 net: sched: allow act_ct to be built without NF_NAT
8e2664e12b sfc: fix potential memleak in __ef100_hard_start_xmit()
6b638a16ea xfrm: Fix ignored return value in xfrm6_init()
c7788361a6 tipc: check skb_linearize() return value in tipc_disc_rcv()
4058e3b74a tipc: add an extra conn_get in tipc_conn_alloc
e87a077d09 tipc: set con sock in tipc_conn_alloc
891daa95b0 net/mlx5: Fix handling of entry refcount when command is not issued to FW
e06ff9f8fe net/mlx5: Fix FW tracer timestamp calculation
5689eba90a netfilter: ipset: regression in ip_set_hash_ip.c
e62e62ea91 netfilter: ipset: Limit the maximal range of consecutive elements to add/delete
8dca384970 Drivers: hv: vmbus: fix possible memory leak in vmbus_device_register()
909186cf34 Drivers: hv: vmbus: fix double free in the error path of vmbus_add_channel_work()
f42802e14a macsec: Fix invalid error code set
72be055615 nfp: add port from netdev validation for EEPROM access
ce41e03cac nfp: fill splittable of devlink_port_attrs correctly
0b553ded34 net: pch_gbe: fix pci device refcount leak while module exiting
2c59ef9ab6 net/qla3xxx: fix potential memleak in ql3xxx_send()
a24d5f6c8b net/mlx4: Check retval of mlx4_bitmap_init
da86a63479 net: ethernet: mtk_eth_soc: fix error handling in mtk_open()
756534f7cf ARM: dts: imx6q-prti6q: Fix ref/tcxo-clock-frequency properties
290a71ff72 ARM: mxs: fix memory leak in mxs_machine_init()
5c97af75f5 netfilter: conntrack: Fix data-races around ct mark
459332f8db 9p/fd: fix issue of list_del corruption in p9_fd_cancel()
26bb8f6aaa net: pch_gbe: fix potential memleak in pch_gbe_tx_queue()
398a860a44 nfc/nci: fix race with opening and closing
3535c632e6 rxrpc: Fix race between conn bundle lookup and bundle removal [ZDI-CAN-15975]
23c03ee0ee rxrpc: Use refcount_t rather than atomic_t
bddde342c6 rxrpc: Allow list of in-use local UDP endpoints to be viewed in /proc
a2d5dba2fc net: liquidio: simplify if expression
8124a02e17 ARM: dts: at91: sam9g20ek: enable udc vbus gpio pinctrl
b547bf71fa tee: optee: fix possible memory leak in optee_register_device()
b76c5a99f4 bus: sunxi-rsb: Support atomic transfers
0c059b7d2a regulator: core: fix UAF in destroy_regulator()
fcb2d28636 spi: dw-dma: decrease reference count in dw_spi_dma_init_mfld()
0b6441abfa regulator: core: fix kobject release warning and memory leak in regulator_register()
26d3d3ffa8 scsi: storvsc: Fix handling of srb_status and capacity change events
c34db0d6b8 ASoC: soc-pcm: Don't zero TDM masks in __soc_pcm_open()
4f6c7344ab ASoC: sgtl5000: Reset the CHIP_CLK_CTRL reg on remove
164a5b50d1 ASoC: hdac_hda: fix hda pcm buffer overflow issue
7cfb4b8579 ARM: dts: am335x-pcm-953: Define fixed regulators in root node
b7000254c1 af_key: Fix send_acquire race with pfkey_register
51969d679b xfrm: replay: Fix ESN wrap around for GSO
497653f6d2 xfrm: fix "disable_policy" on ipv4 early demux
836bbdfcf8 MIPS: pic32: treat port as signed integer
c0bb600f07 RISC-V: vdso: Do not add missing symbols to version section in linker script
81cc6d8400 arm64/syscall: Include asm/ptrace.h in syscall_wrapper header.
fa5f2c72d3 block, bfq: fix null pointer dereference in bfq_bio_bfqg()
d29bde8689 drm: panel-orientation-quirks: Add quirk for Acer Switch V 10 (SW5-017)
f7ce6fb04e scsi: scsi_debug: Make the READ CAPACITY response compliant with ZBC
2574903ee2 scsi: ibmvfc: Avoid path failures during live migration
7fc62181c1 platform/x86: touchscreen_dmi: Add info for the RCA Cambio W101 v2 2-in-1
f54a11b6bf Revert "net: macsec: report real_dev features when HW offloading is enabled"
f4b8c0710a selftests/bpf: Add verifier test for release_reference()
361a165098 spi: stm32: fix stm32_spi_prepare_mbr() that halves spi clk for every run
2c1ca23555 wifi: mac80211: Fix ack frame idr leak when mesh has no route
8d39913158 wifi: airo: do not assign -1 to unsigned char
8552e6048e audit: fix undefined behavior in bit shift for AUDIT_BIT
1c9eb641d1 riscv: dts: sifive unleashed: Add PWM controlled LEDs
92ae6facd1 wifi: mac80211_hwsim: fix debugfs attribute ps with rc table support
2fcc593b50 wifi: mac80211: fix memory free error when registering wiphy fail
044bc6d3c2 ceph: avoid putting the realm twice when decoding snaps fails
d43219bb33 ceph: do not update snapshot context when there is no new snapshot
49c71b6814 iio: pressure: ms5611: fixed value compensation bug
879139bc7a iio: ms5611: Simplify IO callback parameters
80c825e1e3 nvme-pci: add NVME_QUIRK_BOGUS_NID for Micron Nitro
f4066fb910 nvme: add a bogus subsystem NQN quirk for Micron MTFDKBA2T0TFH
4f0cea018e drm/display: Don't assume dual mode adaptors support i2c sub-addressing
347f1793b5 bridge: switchdev: Fix memory leaks when changing VLAN protocol
89a7f155e6 bridge: switchdev: Notify about VLAN protocol changes
f5cbd86ebf ata: libata-core: do not issue non-internal commands once EH is pending
4034d06a4d ata: libata-scsi: simplify __ata_scsi_queuecmd()
03aabcb88a scsi: scsi_transport_sas: Fix error handling in sas_phy_add()
d9b90a99f3 Merge 5.10.156 into android12-5.10-lts
25af5a11f1 Merge 5.10.155 into android12-5.10-lts
e5d2cd6ad8 ANDROID: abi preservation for fscrypt change in 5.10.154
5bc3ece380 Revert "serial: 8250: Let drivers request full 16550A feature probing"
f466ca1247 Merge 5.10.154 into android12-5.10-lts
6d46ef50b1 Linux 5.10.156
7be134eb69 Revert "net: broadcom: Fix BCMGENET Kconfig"
957732a09c ntfs: check overflow when iterating ATTR_RECORDs
6322dda483 ntfs: fix out-of-bounds read in ntfs_attr_find()
b825bfbbaa ntfs: fix use-after-free in ntfs_attr_find()
294ef12dcc mm: fs: initialize fsdata passed to write_begin/write_end interface
a8e2fc8f7b 9p/trans_fd: always use O_NONBLOCK read/write
a5da76df46 gfs2: Switch from strlcpy to strscpy
5fa30be7ba gfs2: Check sb_bsize_shift after reading superblock
f14858bc77 9p: trans_fd/p9_conn_cancel: drop client lock earlier
4154b6afa2 kcm: close race conditions on sk_receive_queue
7deb7a9d33 kcm: avoid potential race in kcm_tx_work
35309be06b tcp: cdg: allow tcp_cdg_release() to be called multiple times
e929ec98c0 macvlan: enforce a consistent minimal mtu
95ebea5a15 uapi/linux/stddef.h: Add include guards
3f25add5ec Input: i8042 - fix leaking of platform device on module removal
7d606ae1ab kprobes: Skip clearing aggrprobe's post_handler in kprobe-on-ftrace case
89ece5ff7d scsi: scsi_debug: Fix possible UAF in sdebug_add_host_helper()
75205f1b47 scsi: target: tcm_loop: Fix possible name leak in tcm_loop_setup_hba_bus()
6e9334436d net: use struct_group to copy ip/ipv6 header addresses
9fd7bdaffe stddef: Introduce struct_group() helper macro
47c3bdd955 usbnet: smsc95xx: Fix deadlock on runtime resume
8208c266fe ring-buffer: Include dropped pages in counting dirty patches
36b5095b07 net: fix a concurrency bug in l2tp_tunnel_register()
023435a095 nvme: ensure subsystem reset is single threaded
b9a5ecf241 nvme: restrict management ioctls to admin
5e2f14d772 perf/x86/intel/pt: Fix sampling using single range output
62634b43d3 misc/vmw_vmci: fix an infoleak in vmci_host_do_receive_datagram()
c1eb46a65b docs: update mediator contact information in CoC doc
4423866d31 mmc: sdhci-pci: Fix possible memory leak caused by missing pci_dev_put()
440653a180 mmc: sdhci-pci-o2micro: fix card detect fail issue caused by CD# debounce timeout
8e70b14131 mmc: core: properly select voltage range without power cycle
05b0f6624d firmware: coreboot: Register bus in module init
deda86a0d8 iommu/vt-d: Set SRE bit only when hardware has SRS cap
d2c7d8f58e scsi: zfcp: Fix double free of FSF request when qdio send fails
db744288af maccess: Fix writing offset in case of fault in strncpy_from_kernel_nofault()
24cc679abb Input: iforce - invert valid length check when fetching device IDs
5f4611fe01 serial: 8250_lpss: Configure DMA also w/o DMA filter
8679087e93 serial: 8250: Flush DMA Rx on RLSI
a5eaad87bf serial: 8250: Fall back to non-DMA Rx if IIR_RDI occurs
f59f5a269c dm ioctl: fix misbehavior if list_versions races with module loading
67a75a9480 iio: pressure: ms5611: changed hardcoded SPI speed to value limited
d95b85c508 iio: adc: mp2629: fix potential array out of bound access
46b8bc62c5 iio: adc: mp2629: fix wrong comparison of channel
8dddf2699d iio: trigger: sysfs: fix possible memory leak in iio_sysfs_trig_init()
85d2a8b287 iio: adc: at91_adc: fix possible memory leak in at91_adc_allocate_trigger()
85cc1a2fd8 usb: typec: mux: Enter safe mode only when pins need to be reconfigured
efaab05520 usb: chipidea: fix deadlock in ci_otg_del_timer
143ba5c2d2 usb: add NO_LPM quirk for Realforce 87U Keyboard
249cef723f USB: serial: option: add Fibocom FM160 0x0111 composition
5c44c60358 USB: serial: option: add u-blox LARA-L6 modem
0e88a3cfa6 USB: serial: option: add u-blox LARA-R6 00B modem
de707957d9 USB: serial: option: remove old LARA-R6 PID
878227a3dd USB: serial: option: add Sierra Wireless EM9191
25c652811d USB: bcma: Make GPIO explicitly optional
eb3af3ea5b speakup: fix a segfault caused by switching consoles
8cbaf4ed53 slimbus: stream: correct presence rate frequencies
15155f7c0e Revert "usb: dwc3: disable USB core PHY management"
100d1e53bb ALSA: hda/realtek: Fix the speaker output on Samsung Galaxy Book Pro 360
c7dcc89482 ALSA: hda/realtek: fix speakers for Samsung Galaxy Book Pro
a80369c8ca ALSA: usb-audio: Drop snd_BUG_ON() from snd_usbmidi_output_open()
28a54854a9 tracing: kprobe: Fix potential null-ptr-deref on trace_array in kprobe_event_gen_test_exit()
bb70fcae41 tracing: kprobe: Fix potential null-ptr-deref on trace_event_file in kprobe_event_gen_test_exit()
315b149f08 tracing: Fix wild-memory-access in register_synth_event()
65ba7e7c24 tracing: Fix memory leak in test_gen_synth_cmd() and test_empty_synth_event()
5d4cc7bc1a tracing/ring-buffer: Have polling block on watermark
5fdebbeca5 ring_buffer: Do not deactivate non-existant pages
6a14828cad ftrace: Fix null pointer dereference in ftrace_add_mod()
6ed60c60ec ftrace: Optimize the allocation for mcount entries
9569eed79b ftrace: Fix the possible incorrect kernel message
5fc19c8313 cifs: add check for returning value of SMB2_set_info_init
0aeb0de528 net: thunderbolt: Fix error handling in tbnet_init()
e13ef43813 cifs: Fix wrong return value checking when GETFLAGS
9f00da9c86 net/x25: Fix skb leak in x25_lapb_receive_frame()
94822d2331 net: ag71xx: call phylink_disconnect_phy if ag71xx_hw_enable() fail in ag71xx_open()
3aeb13bc3d cifs: add check for returning value of SMB2_close_init
c24013273e platform/x86/intel: pmc: Don't unconditionally attach Intel PMC when virtualized
9ed51414ae drbd: use after free in drbd_create_device()
6b23a4b252 net: ena: Fix error handling in ena_init()
2d5a495501 net: ionic: Fix error handling in ionic_init_module()
bb9924a6ed xen/pcpu: fix possible memory leak in register_pcpu()
d6a561bd4c bnxt_en: Remove debugfs when pci_register_driver failed
389738f5db net: caif: fix double disconnect client in chnl_net_open()
fb5ee1560b net: macvlan: Use built-in RCU list checking
709aa1f73d mISDN: fix misuse of put_device() in mISDN_register_device()
417f2d2edf net: liquidio: release resources when liquidio driver open failed
4cba73f2d6 net: hinic: Fix error handling in hinic_module_init()
083a2c9ef8 mISDN: fix possible memory leak in mISDN_dsp_element_register()
6b23993d5b net: bgmac: Drop free_netdev() from bgmac_enet_remove()
1f6a73b25d bpf: Initialize same number of free nodes for each pcpu_freelist
ef2ac07ab8 ata: libata-transport: fix error handling in ata_tdev_add()
7377a14598 ata: libata-transport: fix error handling in ata_tlink_add()
b5362dc163 ata: libata-transport: fix error handling in ata_tport_add()
ac471468f7 ata: libata-transport: fix double ata_host_put() in ata_tport_add()
ac4f404c25 arm64: dts: imx8mn: Fix NAND controller size-cells
30ece7dbee arm64: dts: imx8mm: Fix NAND controller size-cells
f68a9efd78 ARM: dts: imx7: Fix NAND controller size-cells
1d160dfb3f drm: Fix potential null-ptr-deref in drm_vblank_destroy_worker()
c47a823ea1 drm/drv: Fix potential memory leak in drm_dev_init()
c776a49d09 drm/panel: simple: set bpc field for logic technologies displays
777430aa4d pinctrl: devicetree: fix null pointer dereferencing in pinctrl_dt_to_map
bce3e6fe8b parport_pc: Avoid FIFO port location truncation
a4b5423f88 siox: fix possible memory leak in siox_device_add()
0679f571d3 arm64: Fix bit-shifting UB in the MIDR_CPU_MODEL() macro
58636b5ff3 block: sed-opal: kmalloc the cmd/resp buffers
e27458b18b sctp: clear out_curr if all frag chunks of current msg are pruned
0b4c259b63 sctp: remove the unnecessary sinfo_stream check in sctp_prsctp_prune_unsent
7360e7c29d ASoC: soc-utils: Remove __exit for snd_soc_util_exit()
e60f37a1d3 bpf, test_run: Fix alignment problem in bpf_prog_test_run_skb()
b8fe1a5aa7 tty: n_gsm: fix sleep-in-atomic-context bug in gsm_control_send
0a3160f4ff serial: imx: Add missing .thaw_noirq hook
7e1f908e65 serial: 8250: omap: Flush PM QOS work on remove
d833cba201 serial: 8250: omap: Fix unpaired pm_runtime_put_sync() in omap8250_remove()
b0b6ea651e serial: 8250_omap: remove wait loop from Errata i202 workaround
f14c312c21 serial: 8250: omap: Fix missing PM runtime calls for omap8250_set_mctrl()
85cdbf04b4 serial: 8250: Remove serial_rs485 sanitization from em485
f5dedad405 ASoC: tas2764: Fix set_tdm_slot in case of single slot
9e82d78fbe ASoC: tas2770: Fix set_tdm_slot in case of single slot
8d21554ec7 ASoC: core: Fix use-after-free in snd_soc_exit()
38ca9bd336 spi: stm32: Print summary 'callbacks suppressed' message
a180da5564 drm/amdgpu: disable BACO on special BEIGE_GOBY card
f3adf0adf3 drm/amd/pm: disable BACO entry/exit completely on several sienna cichlid cards
b0faeff69a drm/amd/pm: Read BIF STRAP also for BACO check
6958556285 drm/amd/pm: support power source switch on Sienna Cichlid
7daab001a6 mmc: sdhci-esdhc-imx: use the correct host caps for MMC_CAP_8_BIT_DATA
65ac4d1807 spi: intel: Use correct mask for flash and protected regions
23793518a7 mtd: spi-nor: intel-spi: Disable write protection only if asked
a326fffdc7 ALSA: hda/realtek: fix speakers and micmute on HP 855 G8
24839d027c ASoC: codecs: jz4725b: Fix spelling mistake "Sourc" -> "Source", "Routee" -> "Route"
bd48793240 Bluetooth: L2CAP: Fix l2cap_global_chan_by_psm
ce75e90859 btrfs: remove pointless and double ulist frees in error paths of qgroup tests
16743c4bf3 drm/imx: imx-tve: Fix return type of imx_tve_connector_mode_valid
df2747f295 i2c: i801: add lis3lv02d's I2C address for Vostro 5568
959cb0fd69 i2c: tegra: Allocate DMA memory for DMA engine
6cb657722e NFSv4: Retry LOCK on OLD_STATEID during delegation return
f0187227e2 drm/amd/display: Remove wrong pipe control lock
bb3edbd092 ASoC: rt1308-sdw: add the default value of some registers
b1619f0307 selftests/intel_pstate: fix build for ARCH=x86_64
fdf6807606 selftests/futex: fix build for clang
c1f0defecb ASoC: codecs: jz4725b: fix capture selector naming
aeb7e8bc0d ASoC: codecs: jz4725b: use right control for Capture Volume
c87945c173 ASoC: codecs: jz4725b: fix reported volume for Master ctl
9aae00961a ASoC: codecs: jz4725b: add missed Line In power control bit
0b4d650f90 spi: intel: Fix the offset to get the 64K erase opcode
6910e7279f ASoC: wm8962: Add an event handler for TEMP_HP and TEMP_SPK
c7432616f6 ASoC: mt6660: Keep the pm_runtime enables before component stuff in mt6660_i2c_probe
a47606064c ASoC: wm8997: Revert "ASoC: wm8997: Fix PM disable depth imbalance in wm8997_probe"
f8f254c8b5 ASoC: wm5110: Revert "ASoC: wm5110: Fix PM disable depth imbalance in wm5110_probe"
c73aa2cc41 ASoC: wm5102: Revert "ASoC: wm5102: Fix PM disable depth imbalance in wm5102_probe"
673a7341bd Merge 5.10.153 into android12-5.10-lts
27b36ba7c2 Merge 5.10.152 into android12-5.10-lts
bf759deb0f Merge 5.10.151 into android12-5.10-lts
6b31c548a1 ANDROID: fix up struct sk_buf ABI breakage
bd66e91ad2 ANDROID: fix up CRC issue with struct tcp_sock
3905cfd1d6 Revert "serial: 8250: Toggle IER bits on only after irq has been set up"
41217963b1 Linux 5.10.155
0f544353fe io_uring: kill goto error handling in io_sqpoll_wait_sq()
154d744fbe x86/cpu: Restore AMD's DE_CFG MSR after resume
e7294b01de mmc: sdhci-esdhc-imx: Convert the driver to DT-only
534762e261 net: tun: call napi_schedule_prep() to ensure we own a napi
367bc0fa98 dmaengine: at_hdmac: Check return code of dma_async_device_register
85f97c97ef dmaengine: at_hdmac: Fix impossible condition
f53a233eaa dmaengine: at_hdmac: Don't allow CPU to reorder channel enable
f451285522 dmaengine: at_hdmac: Fix completion of unissued descriptor in case of errors
6be4ab08c8 dmaengine: at_hdmac: Fix descriptor handling when issuing it to hardware
a35dd5dd98 dmaengine: at_hdmac: Fix concurrency over the active list
0f603bf553 dmaengine: at_hdmac: Free the memset buf without holding the chan lock
7f07cecc74 dmaengine: at_hdmac: Fix concurrency over descriptor
1582cc3b48 dmaengine: at_hdmac: Fix concurrency problems by removing atc_complete_all()
9b69060a72 dmaengine: at_hdmac: Protect atchan->status with the channel lock
ee35682261 dmaengine: at_hdmac: Do not call the complete callback on device_terminate_all
7078e935b4 dmaengine: at_hdmac: Fix premature completion of desc in issue_pending
ad4cbe8e9c dmaengine: at_hdmac: Start transfer for cyclic channels in issue_pending
24f9e93e50 dmaengine: at_hdmac: Don't start transactions at tx_submit level
4b51cce72a dmaengine: at_hdmac: Fix at_lli struct definition
d37dfb9357 cert host tools: Stop complaining about deprecated OpenSSL functions
f8e0edeaa0 can: j1939: j1939_send_one(): fix missing CAN header initialization
0b692d41ee mm/memremap.c: map FS_DAX device memory as decrypted
03f9582a6a udf: Fix a slab-out-of-bounds write bug in udf_find_entry()
4ea3aa3b98 mms: sdhci-esdhc-imx: Fix SDHCI_RESET_ALL for CQHCI
9c0accfa5a btrfs: selftests: fix wrong error check in btrfs_free_dummy_root()
8fa0c22ef8 platform/x86: hp_wmi: Fix rfkill causing soft blocked wifi
b5ee579fcb drm/i915/dmabuf: fix sg_table handling in map_dma_buf
4feedde548 nilfs2: fix use-after-free bug of ns_writer on remount
1d4ff73062 nilfs2: fix deadlock in nilfs_count_free_blocks()
344ddbd688 ata: libata-scsi: fix SYNCHRONIZE CACHE (16) command failure
516f9f2300 vmlinux.lds.h: Fix placement of '.data..decrypted' section
f6896fb69d ALSA: usb-audio: Add DSD support for Accuphase DAC-60
2032c2d32b ALSA: usb-audio: Add quirk entry for M-Audio Micro
a414a6d6ef ALSA: hda/realtek: Add Positivo C6300 model quirk
3a79f9568d ALSA: hda: fix potential memleak in 'add_widget_node'
380d64168d ALSA: hda/ca0132: add quirk for EVGA Z390 DARK
181cfff57b ALSA: hda/hdmi - enable runtime pm for more AMD display audio
ea6787e482 mmc: sdhci-tegra: Fix SDHCI_RESET_ALL for CQHCI
0a8d4531a0 mmc: sdhci_am654: Fix SDHCI_RESET_ALL for CQHCI
3f558930ad mmc: sdhci-of-arasan: Fix SDHCI_RESET_ALL for CQHCI
b55e64d0a3 mmc: cqhci: Provide helper for resetting both SDHCI and CQHCI
4631cb0406 MIPS: jump_label: Fix compat branch range check
475fd3991a arm64: efi: Fix handling of misaligned runtime regions and drop warning
94ab8f88fe riscv: fix reserved memory setup
0cf9cb0614 riscv: Separate memory init from paging init
d7716240bc riscv: Enable CMA support
ecf78af514 riscv: vdso: fix build with llvm
e56d18a976 riscv: process: fix kernel info leakage
956e0216a1 net: macvlan: fix memory leaks of macvlan_common_newlink
59ec132386 ethernet: tundra: free irq when alloc ring failed in tsi108_open()
dd7beaec8b net: mv643xx_eth: disable napi when init rxq or txq failed in mv643xx_eth_open()
56d3b5531b ethernet: s2io: disable napi when start nic failed in s2io_card_up()
05b2228434 net: atlantic: macsec: clear encryption keys from the stack
1a4e495edf net: phy: mscc: macsec: clear encryption keys when freeing a flow
4ad684ba02 cxgb4vf: shut down the adapter when t4vf_update_port_info() failed in cxgb4vf_open()
38aa7ed8c2 net: cxgb3_main: disable napi when bind qsets failed in cxgb_up()
fd52dd2d6e net: cpsw: disable napi in cpsw_ndo_open()
3b27e20601 net/mlx5e: E-Switch, Fix comparing termination table instance
eb6fa0ac2a net/mlx5: Allow async trigger completion execution on single CPU systems
bdd282bba7 net: nixge: disable napi when enable interrupts failed in nixge_open()
5333cf1b7f net: marvell: prestera: fix memory leak in prestera_rxtx_switch_init()
cf4853880e perf stat: Fix printing os->prefix in CSV metrics output
3a4a3c3b1f drivers: net: xgene: disable napi when register irq failed in xgene_enet_open()
0b7ee3d50f dmaengine: mv_xor_v2: Fix a resource leak in mv_xor_v2_remove()
6e2ffae69d dmaengine: pxa_dma: use platform_get_irq_optional
f31dd15858 tipc: fix the msg->req tlv len check in tipc_nl_compat_name_table_dump_header
fbb4e8e6dc net: broadcom: Fix BCMGENET Kconfig
cb6d639bb1 net: stmmac: dwmac-meson8b: fix meson8b_devm_clk_prepare_enable()
d68fa77ee3 can: af_can: fix NULL pointer dereference in can_rx_register()
a033b86c7f ipv6: addrlabel: fix infoleak when sending struct ifaddrlblmsg to network
02f8dfee75 tcp: prohibit TCP_REPAIR_OPTIONS if data was already sent
f3aa8a7d95 drm/vc4: Fix missing platform_unregister_drivers() call in vc4_drm_register()
bcb3bb1069 hamradio: fix issue of dev reference count leakage in bpq_device_event()
bc4591a86b net: lapbether: fix issue of dev reference count leakage in lapbeth_device_event()
2bf8b1c111 KVM: s390: pv: don't allow userspace to set the clock under PV
a60cc64db7 KVM: s390x: fix SCK locking
fcbd2b3368 capabilities: fix undefined behavior in bit shift for CAP_TO_MASK
8aae24b0ed net: fman: Unregister ethernet device on removal
e2c5ee3b62 bnxt_en: fix potentially incorrect return value for ndo_rx_flow_steer
38147073c9 bnxt_en: Fix possible crash in bnxt_hwrm_set_coal()
3401f96402 net: tun: Fix memory leaks of napi_get_frags
adaa0f180d macsec: clear encryption keys from the stack after setting up offload
9dc7503bae macsec: fix detection of RXSCs when toggling offloading
7f4456f011 macsec: fix secy->n_rx_sc accounting
3b05d9073a macsec: delete new rxsc when offload fails
50868de7dc net: gso: fix panic on frag_list with mixed head alloc types
cedd4f01f6 bpf: Fix wrong reg type conversion in release_reference()
9069db2579 bpf: Add helper macro bpf_for_each_reg_in_vstate
95b6ec7337 bpf: Support for pointers beyond pkt_end.
8597b59e3d HID: hyperv: fix possible memory leak in mousevsc_probe()
8c80b2fca4 bpftool: Fix NULL pointer dereference when pin {PROG, MAP, LINK} without FILE
cc21dc48a7 bpf, sockmap: Fix the sk->sk_forward_alloc warning of sk_stream_kill_queues
e1e1218032 wifi: cfg80211: fix memory leak in query_regdb_file()
914cb94e73 wifi: cfg80211: silence a sparse RCU warning
72ea2fc299 phy: stm32: fix an error code in probe
925bf1ba76 hwspinlock: qcom: correct MMIO max register for newer SoCs
76eba54f0d fuse: fix readdir cache race
7bcea6c5c9 ANDROID: gki_defconfig: remove CONFIG_INIT_STACK_ALL_ZERO=y
d2bc3376cd Revert "serial: 8250: Fix restoring termios speed after suspend"
0b500f5b16 Merge 5.10.150 into android12-5.10-lts
f5b40c0eb9 Linux 5.10.154
bf506e366d ipc: remove memcg accounting for sops objects in do_semtimedop()
c6678c8f4f wifi: brcmfmac: Fix potential buffer overflow in brcmf_fweh_event_worker()
a6c57adec5 drm/i915/sdvo: Setup DDC fully before output init
b86830cc95 drm/i915/sdvo: Filter out invalid outputs more sensibly
9f3b867808 drm/rockchip: dsi: Force synchronous probe
23f1fc7ce5 ext4,f2fs: fix readahead of verity data
e5cef906cb KVM: x86: emulator: update the emulation mode after CR0 write
ce9261accc KVM: x86: emulator: introduce emulator_recalc_and_set_mode
c8a2fd7a71 KVM: x86: emulator: em_sysexit should update ctxt->mode
e0c7410378 KVM: x86: Mask off reserved bits in CPUID.80000001H
9302ebc1c2 KVM: x86: Mask off reserved bits in CPUID.80000008H
cc40c5f3e9 KVM: x86: Mask off reserved bits in CPUID.8000001AH
bd64a88f36 KVM: x86: Mask off reserved bits in CPUID.80000006H
156451a67b ext4: fix BUG_ON() when directory entry has invalid rec_len
5370b965b7 ext4: fix warning in 'ext4_da_release_space'
c9598cf629 parisc: Avoid printing the hardware path twice
98f836e80d parisc: Export iosapic_serial_irq() symbol for serial port driver
814af9a32b parisc: Make 8250_gsc driver dependend on CONFIG_PARISC
29d106d086 perf/x86/intel: Add Cooper Lake stepping to isolation_ucodes[]
98f6e7c337 perf/x86/intel: Fix pebs event constraints for ICL
3be2d66822 efi: random: Use 'ACPI reclaim' memory for random seed
83294f7c77 efi: random: reduce seed size to 32 bytes
f8e8cda869 fuse: add file_modified() to fallocate
cdf01c807e capabilities: fix potential memleak on error path from vfs_getxattr_alloc()
ff32d8a099 tracing/histogram: Update document for KEYS_MAX size
533bfacbac tools/nolibc/string: Fix memcmp() implementation
f100a02748 kprobe: reverse kp->flags when arm_kprobe failed
bef08acbe5 tracing: kprobe: Fix memory leak in test_gen_kprobe/kretprobe_cmd()
2bf33b5ea4 tcp/udp: Make early_demux back namespacified.
ea5f2fd464 ftrace: Fix use-after-free for dynamic ftrace_ops
06de93a47c btrfs: fix type of parameter generation in btrfs_get_dentry
e33ce54cef coresight: cti: Fix hang in cti_disable_hw()
015ac18be7 binder: fix UAF of alloc->vma in race with munmap()
836686e1a0 memcg: enable accounting of ipc resources
e4e4b24b42 mtd: rawnand: gpmi: Set WAIT_FOR_READY timeout based on program/erase times
818c36b988 tcp/udp: Fix memory leak in ipv6_renew_options().
29997a6fa6 fscrypt: fix keyring memory leak on mount failure
391cceee6d fscrypt: stop using keyrings subsystem for fscrypt_master_key
092401142b fscrypt: simplify master key locking
54c13d3520 ALSA: usb-audio: Add quirks for MacroSilicon MS2100/MS2106 devices
a0e2577cf3 block, bfq: protect 'bfqd->queued' by 'bfqd->lock'
26ca2ac091 Bluetooth: L2CAP: Fix attempting to access uninitialized memory
6b6f94fb9a Bluetooth: L2CAP: Fix accepting connection request for invalid SPSM
bfd5e62f9a i2c: piix4: Fix adapter not be removed in piix4_remove()
fc3e2fa0a5 arm64: dts: juno: Add thermal critical trip points
b743ecf29c firmware: arm_scmi: Make Rx chan_setup fail on memory errors
29e8e9bfc2 firmware: arm_scmi: Suppress the driver's bind attributes
d7b1e2cbe0 ARM: dts: imx6qdl-gw59{10,13}: fix user pushbutton GPIO offset
160d8904b2 efi/tpm: Pass correct address to memblock_reserve
c40b4d604b i2c: xiic: Add platform module alias
5bf8c7798b drm/amdgpu: set vm_update_mode=0 as default for Sienna Cichlid in SRIOV case
496eb203d0 HID: saitek: add madcatz variant of MMO7 mouse device ID
ff06067b70 scsi: core: Restrict legal sdev_state transitions via sysfs
9edf20e5a1 ACPI: APEI: Fix integer overflow in ghes_estatus_pool_init()
be6e22f546 media: meson: vdec: fix possible refcount leak in vdec_probe()
c5fd54a65c media: dvb-frontends/drxk: initialize err to 0
7fdc58d8c2 media: cros-ec-cec: limit msg.len to CEC_MAX_MSG_SIZE
1609231f86 media: s5p_cec: limit msg.len to CEC_MAX_MSG_SIZE
c46759e370 media: rkisp1: Zero v4l2_subdev_format fields in when validating links
3144ce5574 media: rkisp1: Initialize color space on resizer sink and source pads
6b24d9c2ac s390/boot: add secure boot trailer
efc6420d65 xhci-pci: Set runtime PM as default policy on all xHC 1.2 or later devices
37bb57908d mtd: parsers: bcm47xxpart: Fix halfblock reads
85e458369c mtd: parsers: bcm47xxpart: print correct offset on read error
ec54104feb fbdev: stifb: Fall back to cfb_fillrect() on 32-bit HCRX cards
f8c86d7829 video/fbdev/stifb: Implement the stifb_fillrect() function
e975d7aeca mmc: sdhci-pci-core: Disable ES for ASUS BIOS on Jasper Lake
afeae13b8a mmc: sdhci-pci: Avoid comma separated statements
a06721767c mmc: sdhci-esdhc-imx: Propagate ESDHC_FLAG_HS400* only on 8bit bus
59400c9b0d drm/msm/hdmi: fix IRQ lifetime
8225bdaec5 drm/msm/hdmi: Remove spurious IRQF_ONESHOT flag
5dbb47ee89 ipv6: fix WARNING in ip6_route_net_exit_late()
1c89642e7f net, neigh: Fix null-ptr-deref in neigh_table_clear()
634f066d02 net: mdio: fix undefined behavior in bit shift for __mdiobus_register
d9ec6e2fbd Bluetooth: L2CAP: fix use-after-free in l2cap_conn_del()
cb1c012099 Bluetooth: L2CAP: Fix use-after-free caused by l2cap_reassemble_sdu
0a0dead4ad btrfs: fix ulist leaks in error paths of qgroup self tests
61e0612811 btrfs: fix inode list leak during backref walking at find_parent_nodes()
a52e24c7fc btrfs: fix inode list leak during backref walking at resolve_indirect_refs()
81204283ea isdn: mISDN: netjet: fix wrong check of device registration
e77d213843 mISDN: fix possible memory leak in mISDN_register_device()
f06186e527 rose: Fix NULL pointer dereference in rose_send_frame()
2c8d81bdb2 ipvs: fix WARNING in ip_vs_app_net_cleanup()
931f56d59c ipvs: fix WARNING in __ip_vs_cleanup_batch()
d69328cdb9 ipvs: use explicitly signed chars
b2d7a92aff netfilter: nf_tables: release flow rule object from commit path
3583826b44 net: tun: fix bugs for oversize packet when napi frags enabled
5960b9081b net: sched: Fix use after free in red_enqueue()
24f9c41435 ata: pata_legacy: fix pdc20230_set_piomode()
c85ee1c3cb net: fec: fix improper use of NETDEV_TX_BUSY
52438e734c nfc: nfcmrvl: Fix potential memory leak in nfcmrvl_i2c_nci_send()
0acfcd2aed nfc: s3fwrn5: Fix potential memory leak in s3fwrn5_nci_send()
9ae2c9a91f nfc: nxp-nci: Fix potential memory leak in nxp_nci_send()
eecea068bf NFC: nxp-nci: remove unnecessary labels
e8c11ee2d0 nfc: fdp: Fix potential memory leak in fdp_nci_send()
31b83d6990 nfc: fdp: drop ftrace-like debugging messages
4e1e4485b2 RDMA/qedr: clean up work queue on failure in qedr_alloc_resources()
d360e875c0 RDMA/core: Fix null-ptr-deref in ib_core_cleanup()
37a098fc9b net: dsa: Fix possible memory leaks in dsa_loop_init()
45aea4fbf6 nfs4: Fix kmemleak when allocate slot failed
f0f1c74fa6 NFSv4.1: We must always send RECLAIM_COMPLETE after a reboot
10c554d722 NFSv4.1: Handle RECLAIM_COMPLETE trunking errors
4813dd737d NFSv4: Fix a potential state reclaim deadlock
7c4260f8f1 IB/hfi1: Correctly move list in sc_disable()
87ac93c8dd RDMA/cma: Use output interface for net_dev check
4dbb739eb2 KVM: x86: Add compat handler for KVM_X86_SET_MSR_FILTER
bb584caee8 KVM: x86: Copy filter arg outside kvm_vm_ioctl_set_msr_filter()
9faacf442d KVM: x86: Protect the unused bits in MSR exiting flags
5bdbccc79c x86/topology: Fix duplicated core ID within a package
6c31fc028a x86/topology: Fix multiple packages shown on a single-package system
f5ad52da14 x86/topology: Set cpu_die_id only if DIE_TYPE found
570fa3bcd2 KVM: x86: Treat #DBs from the emulator as fault-like (code and DR7.GD=1)
e5d7c6786b KVM: x86: Trace re-injected exceptions
8364786152 KVM: nVMX: Don't propagate vmcs12's PERF_GLOBAL_CTRL settings to vmcs02
523e1dd9f8 KVM: nVMX: Pull KVM L0's desired controls directly from vmcs01
028fcabd8a serial: ar933x: Deassert Transmit Enable on ->rs485_config()
e6da7808c9 serial: 8250: Let drivers request full 16550A feature probing
95aa34f721 Linux 5.10.153
26a2b9c468 serial: Deassert Transmit Enable on probe in driver-specific way
4a230f65d6 serial: core: move RS485 configuration tasks from drivers into core
eb69c07eca can: rcar_canfd: rcar_canfd_handle_global_receive(): fix IRQ storm on global FIFO receive
d5924531dd arm64/kexec: Test page size support with new TGRAN range values
c911f03f8d arm64/mm: Fix __enable_mmu() for new TGRAN range values
d523384766 scsi: sd: Revert "scsi: sd: Remove a local variable"
52a43b8200 arm64: Add AMPERE1 to the Spectre-BHB affected list
9889ca7efa net: enetc: survive memory pressure without crashing
fdba224ab0 net/mlx5: Fix crash during sync firmware reset
bbcc06933f net/mlx5: Fix possible use-after-free in async command interface
16376ba5cf net/mlx5e: Do not increment ESN when updating IPsec ESN state
0d88359092 nh: fix scope used to find saddr when adding non gw nh
3519b5ddac net: ehea: fix possible memory leak in ehea_register_port()
79631daa5a openvswitch: switch from WARN to pr_warn
00d6f33f67 ALSA: aoa: Fix I2S device accounting
ce6fd1c382 ALSA: aoa: i2sbus: fix possible memory leak in i2sbus_add_dev()
97262705c0 net: fec: limit register access on i.MX6UL
df67a8e625 PM: domains: Fix handling of unavailable/disabled idle states
1f262d8088 net: ksz884x: fix missing pci_disable_device() on error in pcidev_init()
6170b4579f i40e: Fix flow-type by setting GL_HASH_INSET registers
9abae363af i40e: Fix VF hang when reset is triggered on another VF
23d5599058 i40e: Fix ethtool rx-flow-hash setting for X722
44affe7ede ipv6: ensure sane device mtu in tunnels
905f05c0ab media: vivid: set num_in/outputs to 0 if not supported
b6c7446d0a media: videodev2.h: V4L2_DV_BT_BLANKING_HEIGHT should check 'interlaced'
683015ae16 media: v4l2-dv-timings: add sanity checks for blanking values
147b8f1892 media: vivid: dev->bitmap_cap wasn't freed in all cases
1cf51d5158 media: vivid: s_fbuf: add more sanity checks
3221c2701d PM: hibernate: Allow hybrid sleep to work with s2idle
0eb19ecbd0 can: mcp251x: mcp251x_can_probe(): add missing unregister_candev() in error path
6b2d07fc0b can: mscan: mpc5xxx: mpc5xxx_can_probe(): add missing put_clock() in error path
1634d5d39c tcp: fix indefinite deferral of RTO with SACK reneging
4f23cb2be5 tcp: fix a signed-integer-overflow bug in tcp_add_backlog()
49713d7c38 tcp: minor optimization in tcp_add_backlog()
aab883bd60 net: lantiq_etop: don't free skb when returning NETDEV_TX_BUSY
c3edc6e808 net: fix UAF issue in nfqnl_nf_hook_drop() when ops_init() failed
e2a28807b1 kcm: annotate data-races around kcm->rx_wait
c325f92d8d kcm: annotate data-races around kcm->rx_psock
af7879529e atlantic: fix deadlock at aq_nic_stop
d7ccd49c4d amd-xgbe: add the bit rate quirk for Molex cables
17350734fd amd-xgbe: fix the SFP compliance codes check for DAC cables
b55d6ea965 x86/unwind/orc: Fix unreliable stack dump with gcov
0ce1ef3353 net: hinic: fix the issue of double release MBOX callback of VF
6603843c80 net: hinic: fix the issue of CMDQ memory leaks
bb01910763 net: hinic: fix memory leak when reading function table
ce605b68db net: hinic: fix incorrect assignment issue in hinic_set_interrupt_cfg()
62f0a08e82 net: netsec: fix error handling in netsec_register_mdio()
32a3d4660b tipc: fix a null-ptr-deref in tipc_topsrv_accept
fb94152aae perf/x86/intel/lbr: Use setup_clear_cpu_cap() instead of clear_cpu_cap()
bfce730886 ALSA: ac97: fix possible memory leak in snd_ac97_dev_register()
2663b16c76 ASoC: qcom: lpass-cpu: Mark HDMI TX parity register as volatile
a527557299 arc: iounmap() arg is volatile
648ac633e7 ASoC: qcom: lpass-cpu: mark HDMI TX registers as volatile
6571f6ca8a drm/msm: Fix return type of mdp4_lvds_connector_mode_valid
4953a989b7 media: v4l2: Fix v4l2_i2c_subdev_set_name function documentation
9d00384270 net: ieee802154: fix error return code in dgram_bind()
568e3812b1 mm,hugetlb: take hugetlb_lock before decrementing h->resv_huge_pages
935a8b6202 mm/memory: add non-anonymous page check in the copy_present_page()
49db6cb814 xen/gntdev: Prevent leaking grants
a3f2cc11d6 Xen/gntdev: don't ignore kernel unmapping error
467230b9ef s390/pci: add missing EX_TABLE entries to __pcistg_mio_inuser()/__pcilg_mio_inuser()
fe187c801a s390/futex: add missing EX_TABLE entry to __futex_atomic_op()
449070996c perf auxtrace: Fix address filter symbol name match for modules
6f72a3977b kernfs: fix use-after-free in __kernfs_remove
0bcd1ab3e8 counter: microchip-tcb-capture: Handle Signal1 read and Synapse
8bf037279b mmc: core: Fix kernel panic when remove non-standard SDIO card
5684808b26 mmc: sdhci_am654: 'select', not 'depends' REGMAP_MMIO
b686ffc0ac drm/msm/dp: fix IRQ lifetime
08c7375fa2 drm/msm/hdmi: fix memory corruption with too many bridges
21c4679af0 drm/msm/dsi: fix memory corruption with too many bridges
44a86d96fa scsi: qla2xxx: Use transport-defined speed mask for supported_speeds
c368f751da mac802154: Fix LQI recording
9ba2990f4e exec: Copy oldsighand->action under spin-lock
7062153004 fs/binfmt_elf: Fix memory leak in load_elf_binary()
d9ddfeb01f fbdev: smscufx: Fix several use-after-free bugs
f19f1a75d3 iio: temperature: ltc2983: allocate iio channels once
af236da855 iio: light: tsl2583: Fix module unloading
90ff5bef2b tools: iio: iio_utils: fix digit calculation
678d2cc204 xhci: Remove device endpoints from bandwidth list when freeing the device
3b250824b6 xhci: Add quirk to reset host back to default state at shutdown
63c7df3c81 mtd: rawnand: marvell: Use correct logic for nand-keep-config
228101fc83 usb: xhci: add XHCI_SPURIOUS_SUCCESS to ASM1042 despite being a V0.96 controller
2bc4f99ee2 usb: bdc: change state when port disconnected
e440957f9c usb: dwc3: gadget: Don't set IMI for no_interrupt
fb074d622c usb: dwc3: gadget: Stop processing more requests on IMI
c29fcef579 USB: add RESET_RESUME quirk for NVIDIA Jetson devices in RCM
4cc7a360ec ALSA: rme9652: use explicitly signed char
8959092300 ALSA: au88x0: use explicitly signed char
2bf5b16315 ALSA: Use del_timer_sync() before freeing timer
ca1034bff8 can: kvaser_usb: Fix possible completions during init_completion
370be31cde can: j1939: transport: j1939_session_skb_drop_old(): spin_unlock_irqrestore() before kfree_skb()
7d51b4c67c Linux 5.10.152
43d5109296 udp: Update reuse->has_conns under reuseport_lock.
a50ed2d287 mm: /proc/pid/smaps_rollup: fix no vma's null-deref
31b1570677 blk-wbt: fix that 'rwb->wc' is always set to 1 in wbt_init()
e2f9b62ead mmc: core: Add SD card quirk for broken discard
3a260e9844 Makefile.debug: re-enable debug info for .S files
6ab2287b26 x86/Kconfig: Drop check for -mabi=ms for CONFIG_EFI_STUB
67dafece56 ACPI: video: Force backlight native for more TongFang devices
dcaf631320 hv_netvsc: Fix race between VF offering and VF association message from host
da54c5f4b5 perf/x86/intel/pt: Relax address filter validation
79c3482fbe riscv: topology: fix default topology reporting
a6e770733d arm64: topology: move store_cpu_topology() to shared code
cb1024d8a4 arm64: dts: qcom: sc7180-trogdor: Fixup modem memory region
f687e2111b fcntl: fix potential deadlocks for &fown_struct.lock
b1efc19644 fcntl: make F_GETOWN(EX) return 0 on dead owner task
ca4c498382 perf: Skip and warn on unknown format 'configN' attrs
dea47fefa6 perf pmu: Validate raw event with sysfs exported format bits
86e995f964 riscv: always honor the CONFIG_CMDLINE_FORCE when parsing dtb
0e4c06ae7c riscv: Add machine name to kernel boot log and stack dump output
7fba4a389d mmc: sdhci-tegra: Use actual clock rate for SW tuning correction
3c6a888e35 xen/gntdev: Accommodate VMA splitting
5232411f37 xen: assume XENFEAT_gnttab_map_avail_bits being set for pv guests
ea82edad0a tracing: Do not free snapshot if tracer is on cmdline
bd6af07e79 tracing: Simplify conditional compilation code in tracing_set_tracer()
4e3a15ca24 dmaengine: mxs: use platform_driver_register
1da5d24970 dmaengine: mxs-dma: Remove the unused .id_table
1414e9bf3c drm/virtio: Use appropriate atomic state in virtio_gpu_plane_cleanup_fb()
d74196bb27 iommu/vt-d: Clean up si_domain in the init_dmars() error path
ef11e8ec00 iommu/vt-d: Allow NVS regions in arch_rmrr_sanity_check()
35c92435be net: phy: dp83822: disable MDI crossover status change interrupt
7aa3d623c1 net: sched: fix race condition in qdisc_graft()
2974f3b330 net: hns: fix possible memory leak in hnae_ae_register()
3032e316e0 sfc: include vport_id in filter spec hash and equal()
ded86c4191 net: sched: sfb: fix null pointer access issue when sfb_init() fails
305aa36b62 net: sched: delete duplicate cleanup of backlog and qlen
ae48bee283 net: sched: cake: fix null pointer access issue when cake_init() fails
2008ad08a2 nvme-hwmon: kmalloc the NVME SMART log buffer
770b7e3a2c nvme-hwmon: consistently ignore errors from nvme_hwmon_init
67106ac272 nvme-hwmon: Return error code when registration fails
bc17f727b0 nvme-hwmon: rework to avoid devm allocation
191d71c635 ionic: catch NULL pointer issue on reconfig
ff7ba76675 net: hsr: avoid possible NULL deref in skb_clone()
7286f87551 cifs: Fix xid leak in cifs_ses_add_channel()
2d08311aa3 cifs: Fix xid leak in cifs_flock()
bf49d4fe4a cifs: Fix xid leak in cifs_copy_file_range()
05cc22c008 net: phy: dp83867: Extend RX strap quirk for SGMII mode
118f412bed net/atm: fix proc_mpc_write incorrect return value
c8310a99e7 sfc: Change VF mac via PF as first preference if available.
39d10f0dfb HID: magicmouse: Do not set BTN_MOUSE on double report
ed5baf3d0a i40e: Fix DMA mappings leak
e558e14893 tipc: fix an information leak in tipc_topsrv_kern_subscr
1f4ed95ce6 tipc: Fix recognition of trial period
fc8c6b8bb2 ACPI: extlog: Handle multiple records
57e157749a btrfs: fix processing of delayed tree block refs during backref walking
590929ef69 btrfs: fix processing of delayed data refs during backref walking
cc841a8a70 r8152: add PID for the Lenovo OneLink+ Dock
51b96ecaed arm64: errata: Remove AES hwcap for COMPAT tasks
910ba49b33 blk-wbt: call rq_qos_add() after wb_normal is initialized
392536023d block: wbt: Remove unnecessary invoking of wbt_update_limits in wbt_init
ab6aaa8210 media: venus: dec: Handle the case where find_format fails
bce5808fc9 media: mceusb: set timeout to at least timeout provided
6d725672ce KVM: arm64: vgic: Fix exit condition in scan_its_table()
34db701dc6 kvm: Add support for arch compat vm ioctls
e55feb31df cpufreq: qcom: fix memory leak in error path
303d0f7614 ata: ahci: Match EM_MAX_SLOTS with SATA_PMP_MAX_PORTS
6a2aadcb01 ata: ahci-imx: Fix MODULE_ALIAS
d9f0159da0 hwmon/coretemp: Handle large core ID value
0fb04676c4 x86/microcode/AMD: Apply the patch early on every logical thread
6dcf1f0802 i2c: qcom-cci: Fix ordering of pm_runtime_xx and i2c_add_adapter
794ded0bc4 cpufreq: qcom: fix writes in read-only memory region
2723875e9d selinux: enable use of both GFP_KERNEL and GFP_ATOMIC in convert_context()
0d65f040fd ocfs2: fix BUG when iput after ocfs2_mknod fails
b838dcfda1 ocfs2: clear dinode links count in case of error
c34d1b22fe Linux 5.10.151
ecad331211 kbuild: Add skip_encoding_btf_enum64 option to pahole
c5006abb80 kbuild: Unify options for BTF generation for vmlinux and modules
f5f413cb3e kbuild: skip per-CPU BTF generation for pahole v1.18-v1.21
06481cd9f7 kbuild: Quote OBJCOPY var to avoid a pahole call break the build
bbaea0f1cd bpf: Generate BTF_KIND_FLOAT when linking vmlinux
a10a57a224 Linux 5.10.150
243c8f42ba Revert "drm/amdgpu: make sure to init common IP before gmc"
8026d58b49 gcov: support GCC 12.1 and newer compilers
cbf2c43b36 f2fs: fix wrong condition to trigger background checkpoint correctly
7b19858803 thermal: intel_powerclamp: Use first online CPU as control_cpu
f039b43cba inet: fully convert sk->sk_rx_dst to RCU rules
67de22cb0b ext4: continue to expand file system when the target size doesn't reach
357db159e9 Revert "drm/amdgpu: use dirty framebuffer helper"
98ab15bfdc Revert "drm/amdgpu: move nbio sdma_doorbell_range() into sdma code for vega"
791489a5c5 net/ieee802154: don't warn zero-sized raw_sendmsg()
a96336a5f2 Revert "net/ieee802154: reject zero-sized raw_sendmsg()"
dc54ff9fc4 net: ieee802154: return -EINVAL for unknown addr type
45c3396675 mm: hugetlb: fix UAF in hugetlb_handle_userfault
c378c479c5 io_uring/af_unix: defer registered files gc to io_uring release
67cbc8865a io_uring: correct pinned_vm accounting
904f881b57 arm64: topology: fix possible overflow in amu_fie_setup()
b5dc2f2578 perf intel-pt: Fix segfault in intel_pt_print_info() with uClibc
9b4e849777 clk: bcm2835: Make peripheral PLLC critical
b8bbae3236 usb: idmouse: fix an uninit-value in idmouse_open
d5bb45f47b nvmet-tcp: add bounds check on Transfer Tag
b79da0080d nvme: copy firmware_rev on each init
e6cc39db24 staging: rtl8723bs: fix a potential memory leak in rtw_init_cmd_priv()
3a5a34ed9d Revert "usb: storage: Add quirk for Samsung Fit flash"
acf0006f2b usb: musb: Fix musb_gadget.c rxstate overflow bug
91271a3e77 usb: host: xhci: Fix potential memory leak in xhci_alloc_stream_info()
782b3e71c9 md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d
dbcca76435 HID: roccat: Fix use-after-free in roccat_read()
f00c049ede soundwire: intel: fix error handling on dai registration issues
f04a673d4a soundwire: cadence: Don't overwrite msg->buf during write commands
c263516c2c bcache: fix set_at_max_writeback_rate() for multiple attached devices
fcad2ac863 ata: libahci_platform: Sanity check the DT child nodes number
19c010ae44 blk-throttle: prevent overflow while calculating wait time
1b3cebeca9 staging: vt6655: fix potential memory leak
89f305a714 power: supply: adp5061: fix out-of-bounds read in adp5061_get_chg_type()
b2700f98b3 nbd: Fix hung when signal interrupts nbd_start_device_ioctl()
5942e5c63d scsi: 3w-9xxx: Avoid disabling device if failing to enable it
48727117bd usb: host: xhci-plat: suspend/resume clks for brcm
c13d0d2f5a usb: host: xhci-plat: suspend and resume clocks
12d31182de clk: zynqmp: pll: rectify rate rounding in zynqmp_pll_round_rate
c2257c8a50 media: cx88: Fix a null-ptr-deref bug in buffer_prepare()
d9e2585c3b clk: zynqmp: Fix stack-out-of-bounds in strncpy`
70f8b48d0b btrfs: scrub: try to fix super block errors
8f554dd23c arm64: dts: imx8mq-librem5: Add bq25895 as max17055's power supply
451ce2521c kselftest/arm64: Fix validatation termination record after EXTRA_CONTEXT
017cabfb3f ARM: dts: imx6sx: add missing properties for sram
9d3ca48722 ARM: dts: imx6sll: add missing properties for sram
9735f2b62b ARM: dts: imx6sl: add missing properties for sram
2829b6ad30 ARM: dts: imx6qp: add missing properties for sram
0c3a0b3d5e ARM: dts: imx6dl: add missing properties for sram
2763a3b43a ARM: dts: imx6q: add missing properties for sram
82e0d91484 ARM: dts: imx7d-sdb: config the max pressure for tsc2046
166feb964f drm/amd/display: Remove interface for periodic interrupt 1
1bb6f4a8db drm/dp: Don't rewrite link config when setting phy test pattern
bb91c06b0b mmc: sdhci-msm: add compatible string check for sdm670
8a427a2283 drm/meson: explicitly remove aggregate driver at module unload time
1c7d957c5d drm/amdgpu: fix initial connector audio value
69130888b2 ASoC: SOF: pci: Change DMI match info to support all Chrome platforms
54f2585e2d platform/x86: msi-laptop: Change DMI match / alias strings to fix module autoloading
a9d6a7c9b6 platform/chrome: cros_ec: Notify the PM of wake events during resume
e29d20deaf drm: panel-orientation-quirks: Add quirk for Anbernic Win600
bfdb391d57 drm/vc4: vec: Fix timings for VEC modes
b70f8abc1a drm: bridge: dw_hdmi: only trigger hotplug event on link change
bbe2f6f903 udmabuf: Set ubuf->sg = NULL if the creation of sg table fails
0a4fddc95c drm/amd/display: fix overflow on MIN_I64 definition
3959e8faf8 gpu: lontium-lt9611: Fix NULL pointer dereference in lt9611_connector_init()
c28a8082b2 drm: Prevent drm_copy_field() to attempt copying a NULL pointer
e7d7018003 drm: Use size_t type for len variable in drm_copy_field()
3339a51bcd drm/nouveau/nouveau_bo: fix potential memory leak in nouveau_bo_alloc()
484400d433 r8152: Rate limit overflow messages
0c108cf3ad Bluetooth: L2CAP: Fix user-after-free
65029aaedd net: If sock is dead don't access sock's sk_wq in sk_stream_wait_memory
4851303c85 wifi: rt2x00: correctly set BBP register 86 for MT7620
a016144479 wifi: rt2x00: set SoC wmac clock register
5aa0461d11 wifi: rt2x00: set VGC gain for both chains of MT7620
8d9c00979a wifi: rt2x00: set correct TX_SW_CFG1 MAC register for MT7620
27ed98e8a9 wifi: rt2x00: don't run Rt5592 IQ calibration on MT7620
3d67986e72 can: bcm: check the result of can_send() in bcm_can_tx()
7b674dce41 Bluetooth: hci_sysfs: Fix attempting to call device_add multiple times
e25ca9af8a Bluetooth: L2CAP: initialize delayed works at l2cap_chan_create()
b051d9bf98 regulator: core: Prevent integer underflow
e01d96494a wifi: brcmfmac: fix use-after-free bug in brcmf_netdev_start_xmit()
be81c44242 xfrm: Update ipcomp_scratches with NULL when freed
9661724f62 wifi: ath9k: avoid uninit memory read in ath9k_htc_rx_msg()
0958e487e8 tcp: annotate data-race around tcp_md5sig_pool_populated
129ca0db95 openvswitch: Fix overreporting of drops in dropwatch
4398e8a7fd openvswitch: Fix double reporting of drops in dropwatch
e3c9b94734 bpftool: Clear errno after libcap's checks
50e45034c5 wifi: brcmfmac: fix invalid address access when enabling SCAN log level
bbacfcde5f NFSD: fix use-after-free on source server when doing inter-server copy
3de402a524 NFSD: Return nfserr_serverfault if splice_ok but buf->pages have data
1f730d4ae6 x86/entry: Work around Clang __bdos() bug
513943bf87 thermal: intel_powerclamp: Use get_cpu() instead of smp_processor_id() to avoid crash
708b9abe1b powercap: intel_rapl: fix UBSAN shift-out-of-bounds issue
b434edb0e9 MIPS: BCM47XX: Cast memcmp() of function to (void *)
6c61a37ea7 ACPI: video: Add Toshiba Satellite/Portege Z830 quirk
0dd025483f rcu-tasks: Convert RCU_LOCKDEP_WARN() to WARN_ONCE()
36d4ffbedf rcu: Back off upon fill_page_cache_func() allocation failure
278d8ba2b2 selftest: tpm2: Add Client.__del__() to close /dev/tpm* handle
b60aa21e2f f2fs: fix to account FS_CP_DATA_IO correctly
0b8230d44c f2fs: fix to avoid REQ_TIME and CP_TIME collision
ecbd95958c f2fs: fix race condition on setting FI_NO_EXTENT flag
110146ce8f ACPI: APEI: do not add task_work to kernel thread to avoid memory leak
dce07e87ee thermal/drivers/qcom/tsens-v0_1: Fix MSM8939 fourth sensor hw_id
3a720eb890 crypto: cavium - prevent integer overflow loading firmware
7bfa7d6773 crypto: marvell/octeontx - prevent integer overflows
cdd42eb468 kbuild: rpm-pkg: fix breakage when V=1 is used
6d1aef17e7 kbuild: remove the target in signal traps when interrupted
8d76dd5080 tracing: kprobe: Make gen test module work in arm and riscv
c6512a6f0c tracing: kprobe: Fix kprobe event gen test module on exit
9e6ba62d41 iommu/iova: Fix module config properly
426d5bc089 crypto: qat - fix DMA transfer direction
a43babc059 crypto: qat - use pre-allocated buffers in datapath
a91af50850 crypto: qat - fix use of 'dma_map_single'
8a4ed09ed8 crypto: inside-secure - Change swab to swab32
d33935e666 crypto: ccp - Release dma channels before dmaengine unrgister
a1354bdd19 crypto: akcipher - default implementation for setting a private key
2fee0dbfae iommu/omap: Fix buffer overflow in debugfs
cfde58a8e4 cgroup/cpuset: Enable update_tasks_cpumask() on top_cpuset
ab2485eb5d hwrng: imx-rngc - Moving IRQ handler registering after imx_rngc_irq_mask_clear()
d88b88514e crypto: hisilicon/zip - fix mismatch in get/set sgl_sge_nr
25f1342473 crypto: sahara - don't sleep when in softirq
2d285164fb powerpc: Fix SPE Power ISA properties for e500v1 platforms
2bde4e1e4f powerpc/64s: Fix GENERIC_CPU build flags for PPC970 / G5
7ae8bed908 x86/hyperv: Fix 'struct hv_enlightened_vmcs' definition
6315998170 powerpc/powernv: add missing of_node_put() in opal_export_attrs()
434db6d17b powerpc/pci_dn: Add missing of_node_put()
718e2d8023 powerpc/sysdev/fsl_msi: Add missing of_node_put()
592d283a65 powerpc/math_emu/efp: Include module.h
44c26ceffa mailbox: bcm-ferxrm-mailbox: Fix error check for dma_map_sg
b1616599c9 clk: ast2600: BCLK comes from EPLL
6d01017247 clk: ti: dra7-atl: Fix reference leak in of_dra7_atl_clk_probe
9b65fd6513 clk: bcm2835: fix bcm2835_clock_rate_from_divisor declaration
9a6087a438 clk: baikal-t1: Add SATA internal ref clock buffer
5f143f3bc2 clk: baikal-t1: Add shared xGMAC ref/ptp clocks internal parent
823fd52391 clk: baikal-t1: Fix invalid xGMAC PTP clock divider
2f19a1050e clk: vc5: Fix 5P49V6901 outputs disabling when enabling FOD
92f52770a7 spmi: pmic-arb: correct duplicate APID to PPID mapping logic
a01c0c1600 dmaengine: ioat: stop mod_timer from resurrecting deleted timer in __cleanup()
1dd5148445 clk: mediatek: mt8183: mfgcfg: Propagate rate changes to parent
6e58f2469e mfd: sm501: Add check for platform_driver_register()
3469dd8e22 mfd: fsl-imx25: Fix check for platform_get_irq() errors
b425e03c96 mfd: lp8788: Fix an error handling path in lp8788_irq_init() and lp8788_irq_init()
f7b4388636 mfd: lp8788: Fix an error handling path in lp8788_probe()
08d4051803 mfd: fsl-imx25: Fix an error handling path in mx25_tsadc_setup_irq()
28868b940b mfd: intel_soc_pmic: Fix an error handling path in intel_soc_pmic_i2c_probe()
382a5fc49e fsi: core: Check error number after calling ida_simple_get
ed8e6011b9 clk: qcom: apss-ipq6018: mark apcs_alias0_core_clk as critical
884a788f06 scsi: iscsi: iscsi_tcp: Fix null-ptr-deref while calling getpeername()
a9e5176ead scsi: libsas: Fix use-after-free bug in smp_execute_task_sg()
8f740c11d8 serial: 8250: Fix restoring termios speed after suspend
ab5a3e7144 firmware: google: Test spinlock on panic path to avoid lockups
95ac62e854 staging: vt6655: fix some erroneous memory clean-up loops
878f987166 phy: qualcomm: call clk_disable_unprepare in the error handling
9a56ade124 tty: serial: fsl_lpuart: disable dma rx/tx use flags in lpuart_dma_shutdown
572fb97fce serial: 8250: Toggle IER bits on only after irq has been set up
3fbfa5e3cc serial: 8250: Add an empty line and remove some useless {}
71ffe5111f drivers: serial: jsm: fix some leaks in probe
7efdd91d54 usb: gadget: function: fix dangling pnp_string in f_printer.c
cc952e3bf6 xhci: Don't show warning for reinit on known broken suspend
dac769dd7d IB: Set IOVA/LENGTH on IB_MR in core/uverbs layers
360386e11c RDMA/cm: Use SLID in the work completion as the DLID in responder side
a1263294b5 md/raid5: Ensure stripe_fill happens on non-read IO with journal
76694e9ce0 md: Replace snprintf with scnprintf
7bd5f3b4a8 mtd: rawnand: meson: fix bit map use in meson_nfc_ecc_correct()
f5325f3202 ata: fix ata_id_has_dipm()
f5a6fa1877 ata: fix ata_id_has_ncq_autosense()
3c34a91c8a ata: fix ata_id_has_devslp()
fc61a0c820 ata: fix ata_id_sense_reporting_enabled() and ata_id_has_sense_reporting()
e3917c85f4 RDMA/siw: Always consume all skbuf data in sk_data_ready() upcall.
3a9d7d8dcf mtd: rawnand: fsl_elbc: Fix none ECC mode
f87f720811 mtd: devices: docg3: check the return value of devm_ioremap() in the probe
d06cc0e11d dyndbg: drop EXPORTed dynamic_debug_exec_queries
1d65985589 dyndbg: let query-modname override actual module name
c0e206da44 dyndbg: fix module.dyndbg handling
5047bd3bd7 dyndbg: fix static_branch manipulation
af12e209a9 dmaengine: hisilicon: Add multi-thread support for a DMA channel
d3fd838536 dmaengine: hisilicon: Fix CQ head update
d5065ca461 dmaengine: hisilicon: Disable channels when unregister hisi_dma
f59861946f fpga: prevent integer overflow in dfl_feature_ioctl_set_irq()
7ba19a60c7 misc: ocxl: fix possible refcount leak in afu_ioctl()
cf3bb86edd RDMA/rxe: Fix the error caused by qp->sk
cdce36a88d RDMA/rxe: Fix "kernel NULL pointer dereference" error
2630cc8832 media: xilinx: vipp: Fix refcount leak in xvip_graph_dma_init
40aa0999a3 media: meson: vdec: add missing clk_disable_unprepare on error in vdec_hevc_start()
551b87976a tty: xilinx_uartps: Fix the ignore_status
28cdf6c6fb media: exynos4-is: fimc-is: Add of_node_put() when breaking out of loop
1f683bff1a HSI: omap_ssi_port: Fix dma_map_sg error check
962f22e7f7 HSI: omap_ssi: Fix refcount leak in ssi_probe
70f0a0a27d clk: tegra20: Fix refcount leak in tegra20_clock_init
c01bfd23cc clk: tegra: Fix refcount leak in tegra114_clock_init
f487137a53 clk: tegra: Fix refcount leak in tegra210_clock_init
59e90c4d98 clk: sprd: Hold reference returned by of_get_parent()
57141b1dd6 clk: berlin: Add of_node_put() for of_get_parent()
dc190b46c6 clk: qoriq: Hold reference returned by of_get_parent()
baadc6f58f clk: oxnas: Hold reference returned by of_get_parent()
b95f4f9054 clk: meson: Hold reference returned by of_get_parent()
beec2f0255 usb: common: debug: Check non-standard control requests
9d965a22f6 usb: common: move function's kerneldoc next to its definition
20b63631a3 usb: common: add function to get interval expressed in us unit
c1ef8c66a3 usb: common: Parse for USB SSP genXxY
ffffb159e1 usb: ch9: Add USB 3.2 SSP attributes
aa7aada4b7 iio: ABI: Fix wrong format of differential capacitance channel ABI.
b9a0526cd0 iio: inkern: only release the device node when done with it
44ec4b04fc iio: adc: at91-sama5d2_adc: disable/prepare buffer on suspend/resume
513c72d76d iio: adc: at91-sama5d2_adc: lock around oversampling and sample freq
d259b90f0c iio: adc: at91-sama5d2_adc: check return status for pressure and touch
bc2b97e177 iio: adc: at91-sama5d2_adc: fix AT91_SAMA5D2_MR_TRACKTIM_MAX
5b9bb0cbd9 ARM: dts: exynos: fix polarity of VBUS GPIO of Origen
657de36c72 arm64: ftrace: fix module PLTs with mcount
40e966a404 ARM: Drop CMDLINE_* dependency on ATAGS
477dbf9d1b ARM: dts: exynos: correct s5k6a3 reset polarity on Midas family
5bbd3dd7f9 soc/tegra: fuse: Drop Kconfig dependency on TEGRA20_APB_DMA
09c35f1520 ia64: export memory_add_physaddr_to_nid to fix cxl build error
e31c0e14cf ARM: dts: kirkwood: lsxl: remove first ethernet port
df4f05b356 ARM: dts: kirkwood: lsxl: fix serial line
43faaedf3a ARM: dts: turris-omnia: Fix mpp26 pin name and comment
d5c2051898 soc: qcom: smem_state: Add refcounting for the 'state->of_node'
39781c98ad soc: qcom: smsm: Fix refcount leak bugs in qcom_smsm_probe()
1d312c12c9 memory: of: Fix refcount leak bug in of_lpddr3_get_ddr_timings()
daaec4b3fe memory: of: Fix refcount leak bug in of_get_ddr_timings()
fde46754d5 memory: pl353-smc: Fix refcount leak bug in pl353_smc_probe()
2c442b0c06 ALSA: hda/hdmi: Don't skip notification handling during PM operation
f182de42d7 ASoC: mt6660: Fix PM disable depth imbalance in mt6660_i2c_probe
37e3e01c9a ASoC: wm5102: Fix PM disable depth imbalance in wm5102_probe
fb23569699 ASoC: wm5110: Fix PM disable depth imbalance in wm5110_probe
c1b269dda1 ASoC: wm8997: Fix PM disable depth imbalance in wm8997_probe
71704c2e1b mmc: wmt-sdmmc: Fix an error handling path in wmt_mci_probe()
c940636d9c ALSA: dmaengine: increment buffer pointer atomically
4993c1511d ASoC: da7219: Fix an error handling path in da7219_register_dai_clks()
ef59819976 drm/msm/dp: correct 1.62G link rate at dp_catalog_ctrl_config_msa()
598d8f7d86 drm/msm/dpu: index dpu_kms->hw_vbif using vbif_idx
a9a60d6405 ASoC: eureka-tlv320: Hold reference returned from of_find_xxx API
ad0b8ed172 mmc: au1xmmc: Fix an error handling path in au1xmmc_probe()
1f340e1c1c drm/omap: dss: Fix refcount leak bugs
cbe37857dd ALSA: hda: beep: Simplify keep-power-at-enable behavior
f0fb0817eb ASoC: rsnd: Add check for rsnd_mod_power_on
877e92e9b1 drm/bridge: megachips: Fix a null pointer dereference bug
c577b4e972 drm: fix drm_mipi_dbi build errors
804d8e59f3 platform/x86: msi-laptop: Fix resource cleanup
c21c08fab7 platform/x86: msi-laptop: Fix old-ec check for backlight registering
b77755f58e ASoC: tas2764: Fix mute/unmute
2e6b64df54 ASoC: tas2764: Drop conflicting set_bias_level power setting
c2c6022e10 ASoC: tas2764: Allow mono streams
868fc93b61 platform/chrome: fix memory corruption in ioctl
84da5cdf43 platform/chrome: fix double-free in chromeos_laptop_prepare()
5e25bfcd12 drm:pl111: Add of_node_put() when breaking out of for_each_available_child_of_node()
ad06d6bed5 drm/dp_mst: fix drm_dp_dpcd_read return value checks
3f5889fd65 drm/bridge: parade-ps8640: Fix regulator supply order
45120fa5e5 drm/mipi-dsi: Detach devices when removing the host
050b650507 drm/bridge: Avoid uninitialized variable warning
7839f2b349 drm: bridge: adv7511: fix CEC power down control register offset
29f50bcf0f net: mvpp2: fix mvpp2 debugfs leak
6cb54f2162 once: add DO_ONCE_SLOW() for sleepable contexts
67cb80a9d2 net/ieee802154: reject zero-sized raw_sendmsg()
6cc0e2afc6 bnx2x: fix potential memory leak in bnx2x_tpa_stop()
da349221c4 net: rds: don't hold sock lock when cancelling work from rds_tcp_reset_callbacks()
d9e25dc053 spi: Ensure that sg_table won't be used after being freed
96a3ddb870 tcp: fix tcp_cwnd_validate() to not forget is_cwnd_limited
f65955340e sctp: handle the error returned from sctp_auth_asoc_init_active_key
2a1d036320 mISDN: fix use-after-free bugs in l1oip timer handlers
b4a5905fd2 vhost/vsock: Use kvmalloc/kvfree for larger packets.
d2b5dc3a53 wifi: rtl8xxxu: Fix AIFS written to REG_EDCA_*_PARAM
17196f2f98 spi: s3c64xx: Fix large transfers with DMA
b284e1fe15 netfilter: nft_fib: Fix for rpath check with VRF devices
b384e8fb16 Bluetooth: hci_core: Fix not handling link timeouts propertly
129f01116b i2c: mlxbf: support lock mechanism
534909fe3c spi/omap100k:Fix PM disable depth imbalance in omap1_spi100k_probe
9da61e7b59 spi: dw: Fix PM disable depth imbalance in dw_spi_bt1_probe
1ef5798638 x86/cpu: Include the header of init_ia32_feat_ctl()'s prototype
6ed7b05a35 x86/microcode/AMD: Track patch allocation size explicitly
07299e52e5 wifi: ath11k: fix number of VHT beamformee spatial streams
d7cc0d51ff Bluetooth: hci_{ldisc,serdev}: check percpu_init_rwsem() failure
ed403bcd97 bpf: Ensure correct locking around vulnerable function find_vpid()
2a1c29dc9b net: fs_enet: Fix wrong check in do_pd_setup
795954d751 wifi: rtl8xxxu: Remove copy-paste leftover in gen2_update_rate_mask
226e6f2412 wifi: rtl8xxxu: gen2: Fix mistake in path B IQ calibration
0a60ac7a0d bpf: btf: fix truncated last_member_type_id in btf_struct_resolve
8398a45d3d spi: meson-spicc: do not rely on busy flag in pow2 clk ops
351cf55595 wifi: rtl8xxxu: Fix skb misuse in TX queue selection
1e91179057 spi: qup: add missing clk_disable_unprepare on error in spi_qup_pm_resume_runtime()
7b83d11d48 spi: qup: add missing clk_disable_unprepare on error in spi_qup_resume()
5576008305 selftests/xsk: Avoid use-after-free on ctx
c823df0679 wifi: rtl8xxxu: tighten bounds checking in rtl8xxxu_read_efuse()
ea1b6b5409 Bluetooth: btusb: mediatek: fix WMT failure during runtime suspend
07194ccbb1 Bluetooth: btusb: fix excessive stack usage
cdadf95435 Bluetooth: btusb: Fine-tune mt7663 mechanism.
294395caac x86/resctrl: Fix to restore to original value when re-enabling hardware prefetch register
029a1de92c spi: mt7621: Fix an error message in mt7621_spi_probe()
2afb93e4e4 bpftool: Fix a wrong type cast in btf_dumper_int
61905bbb61 wifi: mac80211: allow bw change during channel switch in mesh
7565207066 leds: lm3601x: Don't use mutex after it was destroyed
08faf07717 wifi: ath10k: add peer map clean up for peer delete in ath10k_sta_state()
e060c4b9f3 nfsd: Fix a memory leak in an error handling path
730191a098 objtool: Preserve special st_shndx indexes in elf_update_symbol
84837738d4 ARM: 9247/1: mm: set readonly for MT_MEMORY_RO with ARM_LPAE
f1d6edeaa8 ARM: 9244/1: dump: Fix wrong pg_level in walk_pmd()
da2aecef86 MIPS: SGI-IP27: Fix platform-device leak in bridge_platform_create()
0c667858c0 MIPS: SGI-IP27: Free some unused memory
3598445698 sh: machvec: Use char[] for section boundaries
6e4be747f1 userfaultfd: open userfaultfds with O_RDONLY
28d9b39733 selinux: use "grep -E" instead of "egrep"
d11e09953c smb3: must initialize two ACL struct fields to zero
abd13b2100 drm/i915: Fix watermark calculations for gen12+ MC CCS modifier
fd37286f39 drm/i915: Fix watermark calculations for gen12+ RC CCS modifier
5d6093c49c drm/nouveau: fix a use-after-free in nouveau_gem_prime_import_sg_table()
57f1a89a8e drm/nouveau/kms/nv140-: Disable interlacing
d0febad83e staging: greybus: audio_helper: remove unused and wrong debugfs usage
ceeb8d4a43 KVM: VMX: Drop bits 31:16 when shoving exception error code into VMCS
83fe0b009b KVM: nVMX: Unconditionally purge queued/injected events on nested "exit"
085ca1d33b KVM: x86/emulator: Fix handing of POP SS to correctly set interruptibility
bda8120e5b media: cedrus: Set the platform driver data earlier
dbdd3b1448 efi: libstub: drop pointless get_memory_map() call
68158654b5 thunderbolt: Explicitly enable lane adapter hotplug events at startup
fc08f84381 tracing: Disable interrupt or preemption before acquiring arch_spinlock_t
0cf6c09daf ring-buffer: Fix race between reset page and reading page
588f02f8b9 ring-buffer: Add ring_buffer_wake_waiters()
586f02c500 ring-buffer: Check pending waiters when doing wake ups as well
6617e5132c ring-buffer: Have the shortest_full queue be the shortest not longest
4a3bbd40e4 ring-buffer: Allow splice to read previous partially read pages
f2ca4609d0 ftrace: Properly unset FTRACE_HASH_FL_MOD
846f041203 livepatch: fix race between fork and KLP transition
2189756eab ext4: update 'state->fc_regions_size' after successful memory allocation
2cfb769d60 ext4: fix potential memory leak in ext4_fc_record_regions()
c9ce7766dc ext4: fix potential memory leak in ext4_fc_record_modified_inode()
d575fb52c4 ext4: fix miss release buffer head in ext4_fc_write_inode
74d2a398d2 ext4: place buffer head allocation before handle start
fbb0e601bd ext4: ext4_read_bh_lock() should submit IO if the buffer isn't uptodate
0e1764ad71 ext4: don't increase iversion counter for ea_inodes
483831ad04 ext4: fix check for block being out of directory size
ac66db1a43 ext4: make ext4_lazyinit_thread freezable
f34ab95162 ext4: fix null-ptr-deref in ext4_write_info
fb98cb61ef ext4: avoid crash when inline data creation follows DIO write
e65506ff18 jbd2: add miss release buffer head in fc_do_one_pass()
1d4d16daec jbd2: fix potential use-after-free in jbd2_fc_wait_bufs
7a33dde572 jbd2: fix potential buffer head reference count leak
eea3e455a3 jbd2: wake up journal waiters in FIFO order, not LIFO
ba52e685d2 hardening: Remove Clang's enable flag for -ftrivial-auto-var-init=zero
bdcb1d7cf2 hardening: Avoid harmless Clang option under CONFIG_INIT_STACK_ALL_ZERO
d621a87064 hardening: Clarify Kconfig text for auto-var-init
4a8e8bf280 f2fs: fix to do sanity check on summary info
73fb4bd2c0 f2fs: fix to do sanity check on destination blkaddr during recovery
12014eaf1b f2fs: increase the limit for reserve_root
47b5ffe863 btrfs: fix race between quota enable and quota rescan ioctl
e504729496 fbdev: smscufx: Fix use-after-free in ufx_ops_open()
9931bd05bb scsi: qedf: Populate sysfs attributes for vport
102c4b6e8c powerpc/boot: Explicitly disable usage of SPE instructions
7db60fd46e powercap: intel_rapl: Use standard Energy Unit for SPR Dram RAPL domain
9119a92ad9 PCI: Sanitise firmware BAR assignments behind a PCI-PCI bridge
a3c08c0217 mm/mmap: undo ->mmap() when arch_validate_flags() fails
7d551b7d61 block: fix inflight statistics of part0
0a12979089 drm/udl: Restore display mode on resume
f134f261d7 drm/virtio: Check whether transferred 2D BO is shmem
303436e301 nvme-pci: set min_align_mask before calculating max_hw_sectors
6a73e6edcb UM: cpuinfo: Fix a warning for CONFIG_CPUMASK_OFFSTACK
1a053f597f riscv: Pass -mno-relax only on lld < 15.0.0
d15dca1d46 riscv: Make VM_WRITE imply VM_READ
d8c6f9b2e1 riscv: Allow PROT_WRITE-only mmap()
a6dcc6cfa2 parisc: fbdev/stifb: Align graphics memory size to 4MB
2ce9fab94b RISC-V: Make port I/O string accessors actually work
ffb571e123 regulator: qcom_rpm: Fix circular deferral regression
85909424a1 hwmon: (gsc-hwmon) Call of_node_get() before of_find_xxx API
8ef0e1c0ae ASoC: wcd934x: fix order of Slimbus unprepare/disable
9b2c82af65 ASoC: wcd9335: fix order of Slimbus unprepare/disable
1c20d672e3 platform/chrome: cros_ec_proto: Update version on GET_NEXT_EVENT failure
6b7ae4a904 quota: Check next/prev free block number after reading from quota file
5b1a56beb6 HID: multitouch: Add memory barriers
bfe60d7641 fs: dlm: handle -EBUSY first in lock arg validation
0b2d8e4db4 fs: dlm: fix race between test_bit() and queue_work()
057d5838c7 mmc: sdhci-sprd: Fix minimum clock limit
448fffc1ae can: kvaser_usb_leaf: Fix CAN state after restart
a3776e09b3 can: kvaser_usb_leaf: Fix TX queue out of sync after restart
0f8c88978d can: kvaser_usb_leaf: Fix overread with an invalid command
5d1cb7bfad can: kvaser_usb: Fix use of uninitialized completion
b239a0993a usb: add quirks for Lenovo OneLink+ Dock
afbbf305db iio: pressure: dps310: Reset chip after timeout
9daadd1d10 iio: pressure: dps310: Refactor startup procedure
ae49d80400 iio: adc: ad7923: fix channel readings for some variants
ea4dcd3d6a iio: ltc2497: Fix reading conversion results
30e1bd0d3e iio: dac: ad5593r: Fix i2c read protocol requirements
9312e04b6c cifs: Fix the error length of VALIDATE_NEGOTIATE_INFO message
64f23e5430 cifs: destage dirty pages before re-reading them for cache=none
50d3d89537 mtd: rawnand: atmel: Unmap streaming DMA mappings
e8eb44eeee ALSA: hda/realtek: Add Intel Reference SSID to support headset keys
4491fbd0a7 ALSA: hda/realtek: Add quirk for ASUS GV601R laptop
4285d06d12 ALSA: hda/realtek: Correct pin configs for ASUS G533Z
768cd2cd1a ALSA: hda/realtek: remove ALC289_FIXUP_DUAL_SPK for Dell 5530
3e29645fba ALSA: usb-audio: Fix NULL dererence at error path
bc1d16d282 ALSA: usb-audio: Fix potential memory leaks
ef1658bc48 ALSA: rawmidi: Drop register_mutex in snd_rawmidi_free()
026fcb6336 ALSA: oss: Fix potential deadlock at unregistration

Also update the .xml file to handle the few ABI changes in this merge
that required an update: private pointers changed types, and ABI
padding structures were used to preserve the ABI (a minimal sketch of
the padding idiom follows the report below):

Leaf changes summary: 4 artifacts changed (1 filtered out)
Changed leaf types summary: 4 (1 filtered out) leaf types changed
Removed/Changed/Added functions summary: 0 Removed, 0 Changed, 0 Added function
Removed/Changed/Added variables summary: 0 Removed, 0 Changed, 0 Added variable

'struct fscrypt_info at fscrypt_private.h:195:1' changed:
  type size hasn't changed
  there are data member changes:
    type 'key*' of 'fscrypt_info::ci_master_key' changed:
      pointer type changed from: 'key*' to: 'fscrypt_master_key*'
  5197 impacted interfaces

'struct sk_buff at skbuff.h:717:1' changed:
  type size hasn't changed
  there are data member changes:
    data member u64 android_kabi_reserved1 at offset 1472 (in bits) became anonymous data member 'union {struct {__u8 scm_io_uring; __u8 android_kabi_reserved1_padding1; __u16 android_kabi_reserved1_padding2; __u32 android_kabi_reserved1_padding3;}; struct {u64 android_kabi_reserved1;}; union {};}'
  5197 impacted interfaces

'struct super_block at fs.h:1450:1' changed:
  type size hasn't changed
  there are data member changes:
    type 'key*' of 'super_block::s_master_keys' changed:
      pointer type changed from: 'key*' to: 'fscrypt_keyring*'
  5197 impacted interfaces

'struct tcp_sock at tcp.h:146:1' changed:
  type size hasn't changed
  one impacted interface
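
For readers unfamiliar with the padding scheme, here is a minimal
stand-alone sketch of the idiom behind the sk_buff change above. The
struct names are illustrative, not from the kernel (in-tree this is
generated by the android_kabi.h helpers); the field names are taken
from the abidiff output above. A reserved u64 shipped with the
original ABI is overlaid by a union, so a new field can take over the
reserved space without changing the structure's size or the offsets
of any other member:

#include <assert.h>
#include <stdint.h>

/* Original ABI: one reserved slot, as shipped. */
struct before {
	uint64_t android_kabi_reserved1;
};

/* After the backport: new fields overlay the reserved slot. */
struct after {
	union {
		struct {
			uint8_t  scm_io_uring;	/* the new field */
			uint8_t  padding1;
			uint16_t padding2;
			uint32_t padding3;
		};
		uint64_t android_kabi_reserved1;	/* original slot */
	};
};

int main(void)
{
	/* Same size, same offsets for everything else: the ABI holds. */
	assert(sizeof(struct before) == sizeof(struct after));
	return 0;
}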

Change-Id: I6f2a7b91e1df96bede8aafa944a04b3e08ed33a1
Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
2023-01-21 12:06:54 +00:00

// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/memory.c
*
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*/
/*
* demand-loading started 01.12.91 - seems it is high on the list of
* things wanted, and it should be easy to implement. - Linus
*/
/*
* Ok, demand-loading was easy, shared pages a little bit trickier. Shared
* pages started 02.12.91, seems to work. - Linus.
*
* Tested sharing by executing about 30 /bin/sh: under the old kernel it
* would have taken more than the 6M I have free, but it worked well as
* far as I could see.
*
* Also corrected some "invalidate()"s - I wasn't doing enough of them.
*/
/*
* Real VM (paging to/from disk) started 18.12.91. Much more work and
* thought has to go into this. Oh, well..
* 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
* Found it. Everything seems to work now.
* 20.12.91 - Ok, making the swap-device changeable like the root.
*/
/*
* 05.04.94 - Multi-page memory management added for v1.1.
* Idea by Alex Bligh (alex@cconcepts.co.uk)
*
* 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
* (Gerhard.Wichert@pdb.siemens.de)
*
* Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
*/
#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <trace/hooks/mm.h>
#include <trace/events/kmem.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include "pgalloc-track.h"
#include "internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/pagefault.h>
#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif
#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);
struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif
/*
* A number of key systems in x86 including ioremap() rely on the assumption
* that high_memory defines the upper bound on direct map memory, the end
* of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
* highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
* and ZONE_HIGHMEM.
*/
void *high_memory;
EXPORT_SYMBOL(high_memory);
/*
* Randomize the address space (stacks, mmaps, brk, etc.).
*
* ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
* as ancient (libc5 based) binaries can segfault. )
*/
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
1;
#else
2;
#endif
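/*
 * The values: 0 disables randomization entirely (also reachable via the
 * "norandmaps" boot option handled below), 1 randomizes everything
 * except brk, for the ancient binaries noted above, and 2 additionally
 * randomizes brk.
 */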
#ifndef arch_faults_on_old_pte
static inline bool arch_faults_on_old_pte(void)
{
/*
* Those arches which don't have the hw access flag feature need to
* implement their own helper. By default, "true" means a pagefault
* will be hit on an old pte.
*/
return true;
}
#endif
#ifndef arch_wants_old_prefaulted_pte
static inline bool arch_wants_old_prefaulted_pte(void)
{
/*
* Transitioning a PTE from 'old' to 'young' can be expensive on
* some architectures, even if it's performed in hardware. By
* default, "false" means prefaulted entries will be 'young'.
*/
return false;
}
#endif
static int __init disable_randmaps(char *s)
{
randomize_va_space = 0;
return 1;
}
__setup("norandmaps", disable_randmaps);
unsigned long zero_pfn __read_mostly;
EXPORT_SYMBOL(zero_pfn);
unsigned long highest_memmap_pfn __read_mostly;
/*
* CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
*/
static int __init init_zero_pfn(void)
{
zero_pfn = page_to_pfn(ZERO_PAGE(0));
return 0;
}
early_initcall(init_zero_pfn);
/*
* Only trace rss_stat when there is a 512KB crossover.
* Smaller changes may be lost unless every small change
* crosses into or returns to a 512KB boundary.
*/
#define TRACE_MM_COUNTER_THRESHOLD 128
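/*
 * With the usual 4KB pages, 128 pages is 512KB: thresh_mask below keeps
 * only multiples of 128, so trace_rss_stat() fires exactly when "count"
 * and "count - value" fall into different 512KB-aligned windows.
 */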
void mm_trace_rss_stat(struct mm_struct *mm, int member, long count,
long value)
{
long thresh_mask = ~(TRACE_MM_COUNTER_THRESHOLD - 1);
/* Threshold roll-over, trace it */
if ((count & thresh_mask) != ((count - value) & thresh_mask))
trace_rss_stat(mm, member, count);
}
EXPORT_SYMBOL_GPL(mm_trace_rss_stat);
#if defined(SPLIT_RSS_COUNTING)
void sync_mm_rss(struct mm_struct *mm)
{
int i;
for (i = 0; i < NR_MM_COUNTERS; i++) {
if (current->rss_stat.count[i]) {
add_mm_counter(mm, i, current->rss_stat.count[i]);
current->rss_stat.count[i] = 0;
}
}
current->rss_stat.events = 0;
}
static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
struct task_struct *task = current;
if (likely(task->mm == mm))
task->rss_stat.count[member] += val;
else
add_mm_counter(mm, member, val);
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
/* sync counter once per 64 page faults */
#define TASK_RSS_EVENTS_THRESH (64)
static void check_sync_rss_stat(struct task_struct *task)
{
if (unlikely(task != current))
return;
if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
sync_mm_rss(task->mm);
}
#else /* SPLIT_RSS_COUNTING */
#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
static void check_sync_rss_stat(struct task_struct *task)
{
}
#endif /* SPLIT_RSS_COUNTING */
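/*
 * In short: with SPLIT_RSS_COUNTING, rss updates are cached per task in
 * rss_stat and folded back into the mm by sync_mm_rss(), at the latest
 * after TASK_RSS_EVENTS_THRESH page faults; without it, every update
 * goes straight to the mm-wide counters.
 */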
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
*/
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
unsigned long addr)
{
pgtable_t token = pmd_pgtable(*pmd);
pmd_clear(pmd);
pte_free_tlb(tlb, token, addr);
mm_dec_nr_ptes(tlb->mm);
}
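/*
 * The pmd/pud/p4d helpers below share one pattern: clear the entries
 * covering [addr, end), then free the page-table page itself only when
 * the aligned span it maps lies entirely within [floor, ceiling). The
 * masking and "- 1" comparisons are explained above free_pgd_range().
 */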
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
unsigned long addr, unsigned long end,
unsigned long floor, unsigned long ceiling)
{
pmd_t *pmd;
unsigned long next;
unsigned long start;
start = addr;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
free_pte_range(tlb, pmd, addr);
} while (pmd++, addr = next, addr != end);
start &= PUD_MASK;
if (start < floor)
return;
if (ceiling) {
ceiling &= PUD_MASK;
if (!ceiling)
return;
}
if (end - 1 > ceiling - 1)
return;
pmd = pmd_offset(pud, start);
pud_clear(pud);
pmd_free_tlb(tlb, pmd, start);
mm_dec_nr_pmds(tlb->mm);
}
static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
unsigned long addr, unsigned long end,
unsigned long floor, unsigned long ceiling)
{
pud_t *pud;
unsigned long next;
unsigned long start;
start = addr;
pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
free_pmd_range(tlb, pud, addr, next, floor, ceiling);
} while (pud++, addr = next, addr != end);
start &= P4D_MASK;
if (start < floor)
return;
if (ceiling) {
ceiling &= P4D_MASK;
if (!ceiling)
return;
}
if (end - 1 > ceiling - 1)
return;
pud = pud_offset(p4d, start);
p4d_clear(p4d);
pud_free_tlb(tlb, pud, start);
mm_dec_nr_puds(tlb->mm);
}
static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
unsigned long addr, unsigned long end,
unsigned long floor, unsigned long ceiling)
{
p4d_t *p4d;
unsigned long next;
unsigned long start;
start = addr;
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
if (p4d_none_or_clear_bad(p4d))
continue;
free_pud_range(tlb, p4d, addr, next, floor, ceiling);
} while (p4d++, addr = next, addr != end);
start &= PGDIR_MASK;
if (start < floor)
return;
if (ceiling) {
ceiling &= PGDIR_MASK;
if (!ceiling)
return;
}
if (end - 1 > ceiling - 1)
return;
p4d = p4d_offset(pgd, start);
pgd_clear(pgd);
p4d_free_tlb(tlb, p4d, start);
}
/*
* This function frees user-level page tables of a process.
*/
void free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
unsigned long floor, unsigned long ceiling)
{
pgd_t *pgd;
unsigned long next;
/*
* The next few lines have given us lots of grief...
*
* Why are we testing PMD* at this top level? Because often
* there will be no work to do at all, and we'd prefer not to
* go all the way down to the bottom just to discover that.
*
* Why all these "- 1"s? Because 0 represents both the bottom
* of the address space and the top of it (using -1 for the
* top wouldn't help much: the masks would do the wrong thing).
* The rule is that addr 0 and floor 0 refer to the bottom of
* the address space, but end 0 and ceiling 0 refer to the top.
* Comparisons need to use "end - 1" and "ceiling - 1" (though
* that end 0 case should be mythical).
*
* Wherever addr is brought up or ceiling brought down, we must
* be careful to reject "the opposite 0" before it confuses the
* subsequent tests. But what about where end is brought down
* by PMD_SIZE below? no, end can't go down to 0 there.
*
* Whereas we round start (addr) and ceiling down, by different
* masks at different levels, in order to test whether a table
* now has no other vmas using it, so can be freed, we don't
* bother to round floor or end up - the tests don't need that.
*/
addr &= PMD_MASK;
if (addr < floor) {
addr += PMD_SIZE;
if (!addr)
return;
}
if (ceiling) {
ceiling &= PMD_MASK;
if (!ceiling)
return;
}
if (end - 1 > ceiling - 1)
end -= PMD_SIZE;
if (addr > end - 1)
return;
/*
* We add page table cache pages with PAGE_SIZE
* (see pte_free_tlb()); flush the tlb if we need to.
*/
tlb_change_page_size(tlb, PAGE_SIZE);
pgd = pgd_offset(tlb->mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
} while (pgd++, addr = next, addr != end);
}
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long floor, unsigned long ceiling)
{
while (vma) {
struct vm_area_struct *next = vma->vm_next;
unsigned long addr = vma->vm_start;
/*
* Hide vma from rmap and truncate_pagecache before freeing
* pgtables
*/
vm_write_begin(vma);
unlink_anon_vmas(vma);
vm_write_end(vma);
unlink_file_vma(vma);
if (is_vm_hugetlb_page(vma)) {
hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
floor, next ? next->vm_start : ceiling);
} else {
/*
* Optimization: gather nearby vmas into one call down
*/
while (next && next->vm_start <= vma->vm_end + PMD_SIZE
&& !is_vm_hugetlb_page(next)) {
vma = next;
next = vma->vm_next;
vm_write_begin(vma);
unlink_anon_vmas(vma);
vm_write_end(vma);
unlink_file_vma(vma);
}
free_pgd_range(tlb, addr, vma->vm_end,
floor, next ? next->vm_start : ceiling);
}
vma = next;
}
}
int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
{
spinlock_t *ptl;
pgtable_t new = pte_alloc_one(mm);
if (!new)
return -ENOMEM;
/*
* Ensure all pte setup (eg. pte page lock and page clearing) are
* visible before the pte is made visible to other CPUs by being
* put into page tables.
*
* The other side of the story is the pointer chasing in the page
* table walking code (when walking the page table without locking;
* ie. most of the time). Fortunately, these data accesses consist
* of a chain of data-dependent loads, meaning most CPUs (alpha
* being the notable exception) will already guarantee loads are
* seen in-order. See the alpha page table accessors for the
* smp_rmb() barriers in page table walking code.
*/
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
ptl = pmd_lock(mm, pmd);
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
mm_inc_nr_ptes(mm);
pmd_populate(mm, pmd, new);
new = NULL;
}
spin_unlock(ptl);
if (new)
pte_free(mm, new);
return 0;
}
int __pte_alloc_kernel(pmd_t *pmd)
{
pte_t *new = pte_alloc_one_kernel(&init_mm);
if (!new)
return -ENOMEM;
smp_wmb(); /* See comment in __pte_alloc */
spin_lock(&init_mm.page_table_lock);
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
pmd_populate_kernel(&init_mm, pmd, new);
new = NULL;
}
spin_unlock(&init_mm.page_table_lock);
if (new)
pte_free_kernel(&init_mm, new);
return 0;
}
static inline void init_rss_vec(int *rss)
{
memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
}
static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
int i;
if (current->mm == mm)
sync_mm_rss(mm);
for (i = 0; i < NR_MM_COUNTERS; i++)
if (rss[i])
add_mm_counter(mm, i, rss[i]);
}
/*
* This function is called to print an error when a bad pte
* is found. For example, we might have a PFN-mapped pte in
* a region that doesn't allow it.
*
* The calling function must still handle the error.
*/
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
pte_t pte, struct page *page)
{
pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
p4d_t *p4d = p4d_offset(pgd, addr);
pud_t *pud = pud_offset(p4d, addr);
pmd_t *pmd = pmd_offset(pud, addr);
struct address_space *mapping;
pgoff_t index;
static unsigned long resume;
static unsigned long nr_shown;
static unsigned long nr_unshown;
/*
* Allow a burst of 60 reports, then keep quiet for that minute;
* or allow a steady drip of one report per second.
*/
if (nr_shown == 60) {
if (time_before(jiffies, resume)) {
nr_unshown++;
return;
}
if (nr_unshown) {
pr_alert("BUG: Bad page map: %lu messages suppressed\n",
nr_unshown);
nr_unshown = 0;
}
nr_shown = 0;
}
if (nr_shown++ == 0)
resume = jiffies + 60 * HZ;
mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
index = linear_page_index(vma, addr);
pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
current->comm,
(long long)pte_val(pte), (long long)pmd_val(*pmd));
if (page)
dump_page(page, "bad pte");
pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
(void *)addr, READ_ONCE(vma->vm_flags), vma->anon_vma, mapping, index);
pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
vma->vm_file,
vma->vm_ops ? vma->vm_ops->fault : NULL,
vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
mapping ? mapping->a_ops->readpage : NULL);
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
/*
* _vm_normal_page -- This function gets the "struct page" associated with
* a pte.
*
* "Special" mappings do not wish to be associated with a "struct page" (either
* it doesn't exist, or it exists but they don't want to touch it). In this
* case, NULL is returned here. "Normal" mappings do have a struct page.
*
* There are 2 broad cases. Firstly, an architecture may define a pte_special()
* pte bit, in which case this function is trivial. Secondly, an architecture
* may not have a spare pte bit, which requires a more complicated scheme,
* described below.
*
* A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
* special mapping (even if there are underlying and valid "struct pages").
* COWed pages of a VM_PFNMAP are always normal.
*
* The way we recognize COWed pages within VM_PFNMAP mappings is through the
* rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
* set, and the vm_pgoff will point to the first PFN mapped: thus every special
* mapping will always honor the rule
*
* pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
*
* And for normal mappings this is false.
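*
* (For instance, if remap_pfn_range() installed PFN 0x1000 at
* vma->vm_start, then vm_pgoff == 0x1000 and the pte mapping
* vma->vm_start + 2 * PAGE_SIZE holds pfn 0x1002, so the rule holds;
* a page later replaced via COW would no longer satisfy it and is
* therefore treated as normal.)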
*
* This restricts such mappings to be a linear translation from virtual address
* to pfn. To get around this restriction, we allow arbitrary mappings so long
* as the vma is not a COW mapping; in that case, we know that all ptes are
* special (because none can have been COWed).
*
*
* In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
*
* VM_MIXEDMAP mappings can likewise contain memory with or without "struct
* page" backing, however the difference is that _all_ pages with a struct
* page (that is, those where pfn_valid is true) are refcounted and considered
* normal pages by the VM. The disadvantage is that pages are refcounted
* (which can be slower and simply not an option for some PFNMAP users). The
* advantage is that we don't have to follow the strict linearity rule of
* PFNMAP mappings in order to support COWable mappings.
*
*/
struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte, unsigned long vma_flags)
{
unsigned long pfn = pte_pfn(pte);
if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
if (likely(!pte_special(pte)))
goto check_pfn;
if (vma->vm_ops && vma->vm_ops->find_special_page)
return vma->vm_ops->find_special_page(vma, addr);
if (vma_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
if (is_zero_pfn(pfn))
return NULL;
if (pte_devmap(pte))
return NULL;
print_bad_pte(vma, addr, pte, NULL);
return NULL;
}
/* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */
/*
* This part should never get called when CONFIG_SPECULATIVE_PAGE_FAULT
* is set. This is mainly because we can't rely on vm_start.
*/
if (unlikely(vma_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma_flags & VM_MIXEDMAP) {
if (!pfn_valid(pfn))
return NULL;
goto out;
} else {
unsigned long off;
off = (addr - vma->vm_start) >> PAGE_SHIFT;
if (pfn == vma->vm_pgoff + off)
return NULL;
if (!is_cow_mapping(vma_flags))
return NULL;
}
}
if (is_zero_pfn(pfn))
return NULL;
check_pfn:
if (unlikely(pfn > highest_memmap_pfn)) {
print_bad_pte(vma, addr, pte, NULL);
return NULL;
}
/*
* NOTE! We still have PageReserved() pages in the page tables.
* eg. VDSO mappings can cause them to exist.
*/
out:
return pfn_to_page(pfn);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd)
{
unsigned long pfn = pmd_pfn(pmd);
/*
* There is no pmd_special() but there may be special pmds, e.g.
* in a direct-access (dax) mapping, so let's just replicate the
* !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
*/
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
if (!pfn_valid(pfn))
return NULL;
goto out;
} else {
unsigned long off;
off = (addr - vma->vm_start) >> PAGE_SHIFT;
if (pfn == vma->vm_pgoff + off)
return NULL;
if (!is_cow_mapping(vma->vm_flags))
return NULL;
}
}
if (pmd_devmap(pmd))
return NULL;
if (is_huge_zero_pmd(pmd))
return NULL;
if (unlikely(pfn > highest_memmap_pfn))
return NULL;
/*
* NOTE! We still have PageReserved() pages in the page tables.
* eg. VDSO mappings can cause them to exist.
*/
out:
return pfn_to_page(pfn);
}
#endif
/*
* Copy one vm_area from one task to the other. Assumes the page tables
* already present in the new task have been cleared in the whole range
* covered by this vma.
*/
static unsigned long
copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
struct vm_area_struct *src_vma, unsigned long addr, int *rss)
{
unsigned long vm_flags = dst_vma->vm_flags;
pte_t pte = *src_pte;
struct page *page;
swp_entry_t entry = pte_to_swp_entry(pte);
if (likely(!non_swap_entry(entry))) {
if (swap_duplicate(entry) < 0)
return entry.val;
/* make sure dst_mm is on swapoff's mmlist. */
if (unlikely(list_empty(&dst_mm->mmlist))) {
spin_lock(&mmlist_lock);
if (list_empty(&dst_mm->mmlist))
list_add(&dst_mm->mmlist,
&src_mm->mmlist);
spin_unlock(&mmlist_lock);
}
rss[MM_SWAPENTS]++;
} else if (is_migration_entry(entry)) {
page = migration_entry_to_page(entry);
rss[mm_counter(page)]++;
if (is_write_migration_entry(entry) &&
is_cow_mapping(vm_flags)) {
/*
* COW mappings require pages in both
* parent and child to be set to read.
*/
make_migration_entry_read(&entry);
pte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(*src_pte))
pte = pte_swp_mksoft_dirty(pte);
if (pte_swp_uffd_wp(*src_pte))
pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
} else if (is_device_private_entry(entry)) {
page = device_private_entry_to_page(entry);
/*
* Update rss count even for unaddressable pages, as
* they should be treated just like normal pages in this
* respect.
*
* We will likely want to have some new rss counters
* for unaddressable pages, at some point. But for now
* keep things as they are.
*/
get_page(page);
rss[mm_counter(page)]++;
page_dup_rmap(page, false);
/*
* We do not preserve soft-dirty information, because so
* far, checkpoint/restore is the only feature that
* requires that. And checkpoint/restore does not work
* when a device driver is involved (you cannot easily
* save and restore device driver state).
*/
if (is_write_device_private_entry(entry) &&
is_cow_mapping(vm_flags)) {
make_device_private_entry_read(&entry);
pte = swp_entry_to_pte(entry);
if (pte_swp_uffd_wp(*src_pte))
pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
}
if (!userfaultfd_wp(dst_vma))
pte = pte_swp_clear_uffd_wp(pte);
set_pte_at(dst_mm, addr, dst_pte, pte);
return 0;
}
/*
* Copy a present and normal page if necessary.
*
* NOTE! The usual case is that this doesn't need to do
* anything, and can just return a positive value. That
* will let the caller know that it can just increase
* the page refcount and re-use the pte the traditional
* way.
*
* But _if_ we need to copy it because it needs to be
* pinned in the parent (and the child should get its own
* copy rather than just a reference to the same page),
* we'll do that here and return zero to let the caller
* know we're done.
*
* And if we need a pre-allocated page but don't yet have
* one, return a negative error to let the preallocation
* code know so that it can do so outside the page table
* lock.
*/
static inline int
copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
struct page **prealloc, pte_t pte, struct page *page)
{
struct mm_struct *src_mm = src_vma->vm_mm;
struct page *new_page;
if (!is_cow_mapping(src_vma->vm_flags))
return 1;
/*
* What we want to do is to check whether this page may
* have been pinned by the parent process. If so,
* instead of wrprotect the pte on both sides, we copy
* the page immediately so that we'll always guarantee
* the pinned page won't be randomly replaced in the
* future.
*
* The page pinning checks are just "has this mm ever
* seen pinning", along with the (inexact) check of
* the page count. That might give false positives for
* pinning, but it will work correctly.
*/
if (likely(!atomic_read(&src_mm->has_pinned)))
return 1;
if (likely(!page_maybe_dma_pinned(page)))
return 1;
/*
* The vma->anon_vma of the child process may be NULL
* because the entire vma does not contain anonymous pages.
* A BUG would occur if copy_present_page() passed
* a copy of a non-anonymous page of that vma to
* page_add_new_anon_rmap() to set up a new anonymous rmap.
* Return 1 if the page is not an anonymous page.
*/
if (!PageAnon(page))
return 1;
new_page = *prealloc;
if (!new_page)
return -EAGAIN;
/*
* We have a prealloc page, all good! Take it
* over and copy the page & arm it.
*/
*prealloc = NULL;
copy_user_highpage(new_page, page, addr, src_vma);
__SetPageUptodate(new_page);
page_add_new_anon_rmap(new_page, dst_vma, addr, false);
lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
rss[mm_counter(new_page)]++;
/* All done, just insert the new page copy in the child */
pte = mk_pte(new_page, dst_vma->vm_page_prot);
pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma->vm_flags);
if (userfaultfd_pte_wp(dst_vma, *src_pte))
/* Uffd-wp needs to be delivered to dest pte as well */
pte = pte_wrprotect(pte_mkuffd_wp(pte));
set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
return 0;
}
/*
* Copy one pte. Returns 0 on success, or -EAGAIN if a preallocated page
* is required to copy this pte.
*/
static inline int
copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
struct page **prealloc)
{
struct mm_struct *src_mm = src_vma->vm_mm;
unsigned long vm_flags = src_vma->vm_flags;
pte_t pte = *src_pte;
struct page *page;
page = vm_normal_page(src_vma, addr, pte);
if (page) {
int retval;
retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
addr, rss, prealloc, pte, page);
if (retval <= 0)
return retval;
get_page(page);
page_dup_rmap(page, false);
rss[mm_counter(page)]++;
}
/*
* If it's a COW mapping, write protect it both
* in the parent and the child
*/
if (is_cow_mapping(vm_flags) && pte_write(pte)) {
ptep_set_wrprotect(src_mm, addr, src_pte);
pte = pte_wrprotect(pte);
}
/*
* If it's a shared mapping, mark it clean in
* the child
*/
if (vm_flags & VM_SHARED)
pte = pte_mkclean(pte);
pte = pte_mkold(pte);
if (!userfaultfd_wp(dst_vma))
pte = pte_clear_uffd_wp(pte);
set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
return 0;
}
static inline struct page *
page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
unsigned long addr)
{
struct page *new_page;
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
if (!new_page)
return NULL;
if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
put_page(new_page);
return NULL;
}
cgroup_throttle_swaprate(new_page, GFP_KERNEL);
return new_page;
}
static int
copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
unsigned long end)
{
struct mm_struct *dst_mm = dst_vma->vm_mm;
struct mm_struct *src_mm = src_vma->vm_mm;
pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
spinlock_t *src_ptl, *dst_ptl;
int progress, ret = 0;
int rss[NR_MM_COUNTERS];
swp_entry_t entry = (swp_entry_t){0};
struct page *prealloc = NULL;
again:
progress = 0;
init_rss_vec(rss);
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte) {
ret = -ENOMEM;
goto out;
}
src_pte = pte_offset_map(src_pmd, addr);
src_ptl = pte_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
orig_src_pte = src_pte;
orig_dst_pte = dst_pte;
arch_enter_lazy_mmu_mode();
do {
/*
* We are holding two locks at this point - either of them
* could generate latencies in another task on another CPU.
*/
if (progress >= 32) {
progress = 0;
if (need_resched() ||
spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
break;
}
if (pte_none(*src_pte)) {
progress++;
continue;
}
if (unlikely(!pte_present(*src_pte))) {
entry.val = copy_nonpresent_pte(dst_mm, src_mm,
dst_pte, src_pte,
dst_vma, src_vma,
addr, rss);
if (entry.val)
break;
progress += 8;
continue;
}
/* copy_present_pte() will clear `*prealloc' if consumed */
ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
addr, rss, &prealloc);
/*
* If we need a pre-allocated page for this pte, drop the
* locks, allocate, and try again.
*/
if (unlikely(ret == -EAGAIN))
break;
if (unlikely(prealloc)) {
/*
* The preallocated page cannot be reused for the next
* pte, because we must strictly follow the mempolicy
* (e.g., alloc_page_vma() allocates the page according
* to the address). This can only happen if a pinned
* pte changed under us.
*/
put_page(prealloc);
prealloc = NULL;
}
progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
spin_unlock(src_ptl);
pte_unmap(orig_src_pte);
add_mm_rss_vec(dst_mm, rss);
pte_unmap_unlock(orig_dst_pte, dst_ptl);
cond_resched();
if (entry.val) {
if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
ret = -ENOMEM;
goto out;
}
entry.val = 0;
} else if (ret) {
WARN_ON_ONCE(ret != -EAGAIN);
prealloc = page_copy_prealloc(src_mm, src_vma, addr);
if (!prealloc)
return -ENOMEM;
/* We've captured and resolved the error. Reset, try again. */
ret = 0;
}
if (addr != end)
goto again;
out:
if (unlikely(prealloc))
put_page(prealloc);
return ret;
}
static inline int
copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
unsigned long end)
{
struct mm_struct *dst_mm = dst_vma->vm_mm;
struct mm_struct *src_mm = src_vma->vm_mm;
pmd_t *src_pmd, *dst_pmd;
unsigned long next;
dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
if (!dst_pmd)
return -ENOMEM;
src_pmd = pmd_offset(src_pud, addr);
do {
next = pmd_addr_end(addr, end);
if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
|| pmd_devmap(*src_pmd)) {
int err;
VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
addr, dst_vma, src_vma);
if (err == -ENOMEM)
return -ENOMEM;
if (!err)
continue;
/* fall through */
}
if (pmd_none_or_clear_bad(src_pmd))
continue;
if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
addr, next))
return -ENOMEM;
} while (dst_pmd++, src_pmd++, addr = next, addr != end);
return 0;
}
static inline int
copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
unsigned long end)
{
struct mm_struct *dst_mm = dst_vma->vm_mm;
struct mm_struct *src_mm = src_vma->vm_mm;
pud_t *src_pud, *dst_pud;
unsigned long next;
dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
if (!dst_pud)
return -ENOMEM;
src_pud = pud_offset(src_p4d, addr);
do {
next = pud_addr_end(addr, end);
if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
int err;
VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
err = copy_huge_pud(dst_mm, src_mm,
dst_pud, src_pud, addr, src_vma);
if (err == -ENOMEM)
return -ENOMEM;
if (!err)
continue;
/* fall through */
}
if (pud_none_or_clear_bad(src_pud))
continue;
if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
addr, next))
return -ENOMEM;
} while (dst_pud++, src_pud++, addr = next, addr != end);
return 0;
}
static inline int
copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
unsigned long end)
{
struct mm_struct *dst_mm = dst_vma->vm_mm;
p4d_t *src_p4d, *dst_p4d;
unsigned long next;
dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
if (!dst_p4d)
return -ENOMEM;
src_p4d = p4d_offset(src_pgd, addr);
do {
next = p4d_addr_end(addr, end);
if (p4d_none_or_clear_bad(src_p4d))
continue;
if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
addr, next))
return -ENOMEM;
} while (dst_p4d++, src_p4d++, addr = next, addr != end);
return 0;
}
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
pgd_t *src_pgd, *dst_pgd;
unsigned long next;
unsigned long addr = src_vma->vm_start;
unsigned long end = src_vma->vm_end;
struct mm_struct *dst_mm = dst_vma->vm_mm;
struct mm_struct *src_mm = src_vma->vm_mm;
struct mmu_notifier_range range;
bool is_cow;
int ret;
/*
* Don't copy ptes where a page fault will fill them correctly.
* Fork becomes much lighter when there are big shared or private
* readonly mappings. The tradeoff is that copy_page_range is more
* efficient than faulting.
*/
if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
!src_vma->anon_vma)
return 0;
if (is_vm_hugetlb_page(src_vma))
return copy_hugetlb_page_range(dst_mm, src_mm, src_vma);
if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
/*
* We do not free on error cases below as remove_vma
* gets called on error from a higher-level routine.
*/
ret = track_pfn_copy(src_vma);
if (ret)
return ret;
}
/*
* We need to invalidate the secondary MMU mappings only when
* there could be a permission downgrade on the ptes of the
* parent mm. And a permission downgrade will only happen if
* is_cow_mapping() returns true.
*/
is_cow = is_cow_mapping(src_vma->vm_flags);
if (is_cow) {
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
0, src_vma, src_mm, addr, end);
mmu_notifier_invalidate_range_start(&range);
/*
* Disabling preemption is not needed for the write side, as
* the read side doesn't spin, but goes to the mmap_lock.
*
* Use the raw variant of the seqcount_t write API to avoid
* lockdep complaining about preemptibility.
*/
mmap_assert_write_locked(src_mm);
raw_write_seqcount_begin(&src_mm->write_protect_seq);
}
ret = 0;
dst_pgd = pgd_offset(dst_mm, addr);
src_pgd = pgd_offset(src_mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(src_pgd))
continue;
if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
addr, next))) {
ret = -ENOMEM;
break;
}
} while (dst_pgd++, src_pgd++, addr = next, addr != end);
if (is_cow) {
raw_write_seqcount_end(&src_mm->write_protect_seq);
mmu_notifier_invalidate_range_end(&range);
}
return ret;
}
/* Whether we should zap all COWed (private) pages too */
static inline bool should_zap_cows(struct zap_details *details)
{
/* By default, zap all pages */
if (!details)
return true;
/* Or, we zap COWed pages only if the caller wants to */
return !details->check_mapping;
}
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
struct mm_struct *mm = tlb->mm;
int force_flush = 0;
int rss[NR_MM_COUNTERS];
spinlock_t *ptl;
pte_t *start_pte;
pte_t *pte;
swp_entry_t entry;
tlb_change_page_size(tlb, PAGE_SIZE);
again:
init_rss_vec(rss);
start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
pte = start_pte;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
do {
pte_t ptent = *pte;
if (pte_none(ptent))
continue;
if (need_resched())
break;
if (pte_present(ptent)) {
struct page *page;
page = vm_normal_page(vma, addr, ptent);
if (unlikely(details) && page) {
/*
* unmap_shared_mapping_pages() wants to
* invalidate cache without truncating:
* unmap shared but keep private pages.
*/
if (details->check_mapping &&
details->check_mapping != page_rmapping(page))
continue;
}
ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb->fullmm);
tlb_remove_tlb_entry(tlb, pte, addr);
if (unlikely(!page))
continue;
if (!PageAnon(page)) {
if (pte_dirty(ptent)) {
force_flush = 1;
set_page_dirty(page);
}
if (pte_young(ptent) &&
likely(!(vma->vm_flags & VM_SEQ_READ)))
mark_page_accessed(page);
}
rss[mm_counter(page)]--;
page_remove_rmap(page, false);
if (unlikely(page_mapcount(page) < 0))
print_bad_pte(vma, addr, ptent, page);
if (unlikely(__tlb_remove_page(tlb, page)) ||
lru_cache_disabled()) {
force_flush = 1;
addr += PAGE_SIZE;
break;
}
continue;
}
entry = pte_to_swp_entry(ptent);
if (is_device_private_entry(entry)) {
struct page *page = device_private_entry_to_page(entry);
if (unlikely(details && details->check_mapping)) {
/*
* unmap_shared_mapping_pages() wants to
* invalidate cache without truncating:
* unmap shared but keep private pages.
*/
if (details->check_mapping !=
page_rmapping(page))
continue;
}
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
rss[mm_counter(page)]--;
page_remove_rmap(page, false);
put_page(page);
continue;
}
if (!non_swap_entry(entry)) {
/* Genuine swap entry, hence a private anon page */
if (!should_zap_cows(details))
continue;
rss[MM_SWAPENTS]--;
} else if (is_migration_entry(entry)) {
struct page *page;
page = migration_entry_to_page(entry);
if (details && details->check_mapping &&
details->check_mapping != page_rmapping(page))
continue;
rss[mm_counter(page)]--;
}
if (unlikely(!free_swap_and_cache(entry)))
print_bad_pte(vma, addr, ptent, NULL);
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, addr != end);
add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
/* Do the actual TLB flush before dropping ptl */
if (force_flush)
tlb_flush_mmu_tlbonly(tlb);
pte_unmap_unlock(start_pte, ptl);
/*
* If we forced a TLB flush (either due to running out of
* batch buffers or because we needed to flush dirty TLB
* entries before releasing the ptl), free the batched
* memory too. Restart if we didn't do everything.
*/
if (force_flush) {
force_flush = 0;
tlb_flush_mmu(tlb);
}
if (addr != end) {
cond_resched();
goto again;
}
return addr;
}
static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
pmd_t *pmd;
unsigned long next;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
__split_huge_pmd(vma, pmd, addr, false, NULL);
else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
} else if (details && details->single_page &&
PageTransCompound(details->single_page) &&
next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
/*
* Take and drop THP pmd lock so that we cannot return
* prematurely, while zap_huge_pmd() has cleared *pmd,
* but not yet decremented compound_mapcount().
*/
spin_unlock(ptl);
}
/*
* Here there can be other concurrent MADV_DONTNEED or
* trans huge page faults running, and if the pmd is
* none or trans huge it can change under us. This is
* because MADV_DONTNEED holds the mmap_lock in read
* mode.
*/
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
goto next;
next = zap_pte_range(tlb, vma, pmd, addr, next, details);
next:
cond_resched();
} while (pmd++, addr = next, addr != end);
return addr;
}
static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, p4d_t *p4d,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
pud_t *pud;
unsigned long next;
pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
if (next - addr != HPAGE_PUD_SIZE) {
mmap_assert_locked(tlb->mm);
split_huge_pud(vma, pud, addr);
} else if (zap_huge_pud(tlb, vma, pud, addr))
goto next;
/* fall through */
}
if (pud_none_or_clear_bad(pud))
continue;
next = zap_pmd_range(tlb, vma, pud, addr, next, details);
next:
cond_resched();
} while (pud++, addr = next, addr != end);
return addr;
}
static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
p4d_t *p4d;
unsigned long next;
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
if (p4d_none_or_clear_bad(p4d))
continue;
next = zap_pud_range(tlb, vma, p4d, addr, next, details);
} while (p4d++, addr = next, addr != end);
return addr;
}
void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
pgd_t *pgd;
unsigned long next;
BUG_ON(addr >= end);
tlb_start_vma(tlb, vma);
pgd = pgd_offset(vma->vm_mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
} while (pgd++, addr = next, addr != end);
tlb_end_vma(tlb, vma);
}
static void unmap_single_vma(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr,
struct zap_details *details)
{
unsigned long start = max(vma->vm_start, start_addr);
unsigned long end;
if (start >= vma->vm_end)
return;
end = min(vma->vm_end, end_addr);
if (end <= vma->vm_start)
return;
if (vma->vm_file)
uprobe_munmap(vma, start, end);
if (unlikely(vma->vm_flags & VM_PFNMAP))
untrack_pfn(vma, 0, 0);
if (start != end) {
if (unlikely(is_vm_hugetlb_page(vma))) {
/*
* It is undesirable to test vma->vm_file as it
* should be non-null for valid hugetlb area.
* However, vm_file will be NULL in the error
* cleanup path of mmap_region. When
* hugetlbfs ->mmap method fails,
* mmap_region() nullifies vma->vm_file
* before calling this function to clean up.
* Since no pte has actually been set up, it is
* safe to do nothing in this case.
*/
if (vma->vm_file) {
i_mmap_lock_write(vma->vm_file->f_mapping);
__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
i_mmap_unlock_write(vma->vm_file->f_mapping);
}
} else
unmap_page_range(tlb, vma, start, end, details);
}
}
/**
* unmap_vmas - unmap a range of memory covered by a list of vma's
* @tlb: address of the caller's struct mmu_gather
* @vma: the starting vma
* @start_addr: virtual address at which to start unmapping
* @end_addr: virtual address at which to end unmapping
*
* Unmap all pages in the vma list.
*
* Only addresses between @start_addr and @end_addr will be unmapped.
*
* The VMA list must be sorted in ascending virtual address order.
*
* unmap_vmas() assumes that the caller will flush the whole unmapped address
* range after unmap_vmas() returns. So the only responsibility here is to
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
void unmap_vmas(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr)
{
struct mmu_notifier_range range;
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
start_addr, end_addr);
mmu_notifier_invalidate_range_start(&range);
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
mmu_notifier_invalidate_range_end(&range);
}
/**
* zap_page_range - remove user pages in a given range
* @vma: vm_area_struct holding the applicable pages
* @start: starting address of pages to zap
* @size: number of bytes to zap
*
* Caller must protect the VMA list
*/
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
unsigned long size)
{
struct mmu_notifier_range range;
struct mmu_gather tlb;
lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
start, start + size);
tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
unmap_single_vma(&tlb, vma, start, range.end, NULL);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb, start, range.end);
}
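/*
 * Illustrative sketch (not part of this file): madvise(MADV_DONTNEED) is a
 * typical caller of zap_page_range(). Assuming `vma' spans the range and
 * the mmap lock is held for reading:
 *
 *	mmap_read_lock(vma->vm_mm);
 *	zap_page_range(vma, start, size);
 *	mmap_read_unlock(vma->vm_mm);
 */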
/**
* zap_page_range_single - remove user pages in a given range
* @vma: vm_area_struct holding the applicable pages
* @address: starting address of pages to zap
* @size: number of bytes to zap
* @details: details of shared cache invalidation
*
* The range must fit into one VMA.
*/
static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
struct mmu_notifier_range range;
struct mmu_gather tlb;
lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address, address + size);
tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
unmap_single_vma(&tlb, vma, address, range.end, details);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb, address, range.end);
}
/**
* zap_vma_ptes - remove ptes mapping the vma
* @vma: vm_area_struct holding ptes to be zapped
* @address: starting address of pages to zap
* @size: number of bytes to zap
*
* This function only unmaps ptes assigned to VM_PFNMAP vmas.
*
* The entire address range must be fully contained within the vma.
*
*/
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
unsigned long size)
{
if (address < vma->vm_start || address + size > vma->vm_end ||
!(vma->vm_flags & VM_PFNMAP))
return;
zap_page_range_single(vma, address, size, NULL);
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);
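/*
 * Usage sketch (illustrative only): a driver tearing down the ptes of a
 * VM_PFNMAP vma it previously populated; the range must lie entirely
 * within the vma:
 *
 *	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
 */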
static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pgd = pgd_offset(mm, addr);
p4d = p4d_alloc(mm, pgd, addr);
if (!p4d)
return NULL;
pud = pud_alloc(mm, p4d, addr);
if (!pud)
return NULL;
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
return NULL;
VM_BUG_ON(pmd_trans_huge(*pmd));
return pmd;
}
pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
spinlock_t **ptl)
{
pmd_t *pmd = walk_to_pmd(mm, addr);
if (!pmd)
return NULL;
return pte_alloc_map_lock(mm, pmd, addr, ptl);
}
static int validate_page_before_insert(struct page *page)
{
if (PageAnon(page) || PageSlab(page) || page_has_type(page))
return -EINVAL;
flush_dcache_page(page);
return 0;
}
static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
unsigned long addr, struct page *page, pgprot_t prot)
{
if (!pte_none(*pte))
return -EBUSY;
/* Ok, finally just insert the thing.. */
get_page(page);
inc_mm_counter_fast(mm, mm_counter_file(page));
page_add_file_rmap(page, false);
set_pte_at(mm, addr, pte, mk_pte(page, prot));
return 0;
}
/*
* This is the old fallback for page remapping.
*
* For historical reasons, it only allows reserved pages. Only
* old drivers should use this, and they needed to mark their
* pages reserved for the old functions anyway.
*/
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page, pgprot_t prot)
{
struct mm_struct *mm = vma->vm_mm;
int retval;
pte_t *pte;
spinlock_t *ptl;
retval = validate_page_before_insert(page);
if (retval)
goto out;
retval = -ENOMEM;
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
goto out;
retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
pte_unmap_unlock(pte, ptl);
out:
return retval;
}
#ifdef pte_index
static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
unsigned long addr, struct page *page, pgprot_t prot)
{
int err;
if (!page_count(page))
return -EINVAL;
err = validate_page_before_insert(page);
if (err)
return err;
return insert_page_into_pte_locked(mm, pte, addr, page, prot);
}
/* insert_pages() amortizes the cost of spinlock operations
* when inserting pages in a loop. Arch *must* define pte_index.
*/
static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
struct page **pages, unsigned long *num, pgprot_t prot)
{
pmd_t *pmd = NULL;
pte_t *start_pte, *pte;
spinlock_t *pte_lock;
struct mm_struct *const mm = vma->vm_mm;
unsigned long curr_page_idx = 0;
unsigned long remaining_pages_total = *num;
unsigned long pages_to_write_in_pmd;
int ret;
more:
ret = -EFAULT;
pmd = walk_to_pmd(mm, addr);
if (!pmd)
goto out;
pages_to_write_in_pmd = min_t(unsigned long,
remaining_pages_total, PTRS_PER_PTE - pte_index(addr));
/* Allocate the PTE if necessary; takes PMD lock once only. */
ret = -ENOMEM;
if (pte_alloc(mm, pmd))
goto out;
while (pages_to_write_in_pmd) {
int pte_idx = 0;
const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
int err = insert_page_in_batch_locked(mm, pte,
addr, pages[curr_page_idx], prot);
if (unlikely(err)) {
pte_unmap_unlock(start_pte, pte_lock);
ret = err;
remaining_pages_total -= pte_idx;
goto out;
}
addr += PAGE_SIZE;
++curr_page_idx;
}
pte_unmap_unlock(start_pte, pte_lock);
pages_to_write_in_pmd -= batch_size;
remaining_pages_total -= batch_size;
}
if (remaining_pages_total)
goto more;
ret = 0;
out:
*num = remaining_pages_total;
return ret;
}
#endif /* ifdef pte_index */
/**
* vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
* @vma: user vma to map to
* @addr: target start user address of these pages
* @pages: source kernel pages
* @num: in: number of pages to map. out: number of pages that were *not*
* mapped. (0 means all pages were successfully mapped).
*
* Preferred over vm_insert_page() when inserting multiple pages.
*
* In case of error, we may have mapped a subset of the provided
* pages. It is the caller's responsibility to account for this case.
*
* The same restrictions apply as in vm_insert_page().
*/
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
struct page **pages, unsigned long *num)
{
#ifdef pte_index
const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;
if (addr < vma->vm_start || end_addr >= vma->vm_end)
return -EFAULT;
if (!(vma->vm_flags & VM_MIXEDMAP)) {
BUG_ON(mmap_read_trylock(vma->vm_mm));
BUG_ON(vma->vm_flags & VM_PFNMAP);
vma->vm_flags |= VM_MIXEDMAP;
}
/* Defer page refcount checking till we're about to map that page. */
return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
#else
unsigned long idx = 0, pgcount = *num;
int err = -EINVAL;
for (; idx < pgcount; ++idx) {
err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
if (err)
break;
}
*num = pgcount - idx;
return err;
#endif /* ifdef pte_index */
}
EXPORT_SYMBOL(vm_insert_pages);
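/*
 * Usage sketch (illustrative only): batch-mapping driver pages into a vma;
 * `pages' and `nr' are hypothetical. On return, `num' holds how many pages
 * were *not* mapped:
 *
 *	unsigned long num = nr;
 *	int err = vm_insert_pages(vma, vma->vm_start, pages, &num);
 *	if (err)
 *		pr_warn("failed to map %lu pages\n", num);
 */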
/**
* vm_insert_page - insert single page into user vma
* @vma: user vma to map to
* @addr: target user address of this page
* @page: source kernel page
*
* This allows drivers to insert individual pages they've allocated
* into a user vma.
*
* The page has to be a nice clean _individual_ kernel allocation.
* If you allocate a compound page, you need to have marked it as
* such (__GFP_COMP), or manually just split the page up yourself
* (see split_page()).
*
* NOTE! Traditionally this was done with "remap_pfn_range()" which
* took an arbitrary page protection parameter. This doesn't allow
* that. Your vma protection will have to be set up correctly, which
* means that if you want a shared writable mapping, you'd better
* ask for a shared writable mapping!
*
* The page does not need to be reserved.
*
* Usually this function is called from f_op->mmap() handler
* under mm->mmap_lock write-lock, so it can change vma->vm_flags.
* Caller must set VM_MIXEDMAP on vma if it wants to call this
* function from other places, for example from page-fault handler.
*
* Return: %0 on success, negative error code otherwise.
*/
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page)
{
if (addr < vma->vm_start || addr >= vma->vm_end)
return -EFAULT;
if (!page_count(page))
return -EINVAL;
if (!(vma->vm_flags & VM_MIXEDMAP)) {
BUG_ON(mmap_read_trylock(vma->vm_mm));
BUG_ON(vma->vm_flags & VM_PFNMAP);
vma->vm_flags |= VM_MIXEDMAP;
}
return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);
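/*
 * Usage sketch (illustrative only): an f_op->mmap() handler exposing one
 * driver-allocated page; `my_page' is hypothetical:
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		return vm_insert_page(vma, vma->vm_start, my_page);
 *	}
 */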
/*
* __vm_map_pages - maps range of kernel pages into user vma
* @vma: user vma to map to
* @pages: pointer to array of source kernel pages
* @num: number of pages in page array
* @offset: user's requested vm_pgoff
*
* This allows drivers to map range of kernel pages into a user vma.
*
* Return: 0 on success and error code otherwise.
*/
static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
unsigned long num, unsigned long offset)
{
unsigned long count = vma_pages(vma);
unsigned long uaddr = vma->vm_start;
int ret, i;
/* Fail if the user requested offset is beyond the end of the object */
if (offset >= num)
return -ENXIO;
/* Fail if the user requested size exceeds available object size */
if (count > num - offset)
return -ENXIO;
for (i = 0; i < count; i++) {
ret = vm_insert_page(vma, uaddr, pages[offset + i]);
if (ret < 0)
return ret;
uaddr += PAGE_SIZE;
}
return 0;
}
/**
* vm_map_pages - map a range of kernel pages starting at a non-zero offset
* @vma: user vma to map to
* @pages: pointer to array of source kernel pages
* @num: number of pages in page array
*
* Maps an object consisting of @num pages, catering for the user's
* requested vm_pgoff
*
* If we fail to insert any page into the vma, the function will return
* immediately leaving any previously inserted pages present. Callers
* from the mmap handler may immediately return the error as their caller
* will destroy the vma, removing any successfully inserted pages. Other
* callers should make their own arrangements for calling unmap_region().
*
* Context: Process context. Called by mmap handlers.
* Return: 0 on success and error code otherwise.
*/
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
unsigned long num)
{
return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
}
EXPORT_SYMBOL(vm_map_pages);
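/*
 * Usage sketch (illustrative only): exposing a page array from an mmap
 * handler while honouring the user's vm_pgoff; `buf_pages' and `nr_pages'
 * are hypothetical:
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		return vm_map_pages(vma, buf_pages, nr_pages);
 *	}
 */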
/**
* vm_map_pages_zero - map a range of kernel pages starting at offset zero
* @vma: user vma to map to
* @pages: pointer to array of source kernel pages
* @num: number of pages in page array
*
* Similar to vm_map_pages(), except that it explicitly sets the offset
* to 0. This function is intended for drivers that do not consider
* vm_pgoff.
*
* Context: Process context. Called by mmap handlers.
* Return: 0 on success and error code otherwise.
*/
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
unsigned long num)
{
return __vm_map_pages(vma, pages, num, 0);
}
EXPORT_SYMBOL(vm_map_pages_zero);
static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn, pgprot_t prot, bool mkwrite)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *pte, entry;
spinlock_t *ptl;
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
return VM_FAULT_OOM;
if (!pte_none(*pte)) {
if (mkwrite) {
/*
* For read faults on private mappings the PFN passed
* in may not match the PFN we have mapped if the
* mapped PFN is a writeable COW page. In the mkwrite
* case we are creating a writable PTE for a shared
* mapping and we expect the PFNs to match. If they
* don't match, we are likely racing with block
* allocation and mapping invalidation so just skip the
* update.
*/
if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
goto out_unlock;
}
entry = pte_mkyoung(*pte);
entry = maybe_mkwrite(pte_mkdirty(entry),
vma->vm_flags);
if (ptep_set_access_flags(vma, addr, pte, entry, 1))
update_mmu_cache(vma, addr, pte);
}
goto out_unlock;
}
/* Ok, finally just insert the thing.. */
if (pfn_t_devmap(pfn))
entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
else
entry = pte_mkspecial(pfn_t_pte(pfn, prot));
if (mkwrite) {
entry = pte_mkyoung(entry);
entry = maybe_mkwrite(pte_mkdirty(entry), vma->vm_flags);
}
set_pte_at(mm, addr, pte, entry);
update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
out_unlock:
pte_unmap_unlock(pte, ptl);
return VM_FAULT_NOPAGE;
}
/**
* vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
* @vma: user vma to map to
* @addr: target user address of this page
* @pfn: source kernel pfn
* @pgprot: pgprot flags for the inserted page
*
* This is exactly like vmf_insert_pfn(), except that it allows drivers
* to override pgprot on a per-page basis.
*
* This only makes sense for IO mappings, and it makes no sense for
* COW mappings. In general, using multiple vmas is preferable;
* vmf_insert_pfn_prot should only be used if using multiple VMAs is
* impractical.
*
* See vmf_insert_mixed_prot() for a discussion of the implication of using
* a value of @pgprot different from that of @vma->vm_page_prot.
*
* Context: Process context. May allocate using %GFP_KERNEL.
* Return: vm_fault_t value.
*/
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, pgprot_t pgprot)
{
/*
* Technically, architectures with pte_special can avoid all these
* restrictions (same for remap_pfn_range). However we would like
* consistency in testing and feature parity among all, so we should
* try to keep these invariants in place for everybody.
*/
BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
if (!pfn_modify_allowed(pfn, pgprot))
return VM_FAULT_SIGBUS;
track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
false);
}
EXPORT_SYMBOL(vmf_insert_pfn_prot);
/**
* vmf_insert_pfn - insert single pfn into user vma
* @vma: user vma to map to
* @addr: target user address of this page
* @pfn: source kernel pfn
*
* Similar to vm_insert_page, this allows drivers to insert individual pages
* they've allocated into a user vma. Same comments apply.
*
* This function should only be called from a vm_ops->fault handler, and
* in that case the handler should return the result of this function.
*
* vma cannot be a COW mapping.
*
* As this is called only for pages that do not currently exist, we
* do not need to flush old virtual caches or the TLB.
*
* Context: Process context. May allocate using %GFP_KERNEL.
* Return: vm_fault_t value.
*/
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn)
{
return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
}
EXPORT_SYMBOL(vmf_insert_pfn);
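/*
 * Usage sketch (illustrative only): a vm_ops->fault handler for a
 * VM_PFNMAP vma; `my_base_pfn' is hypothetical:
 *
 *	static vm_fault_t my_fault(struct vm_fault *vmf)
 *	{
 *		return vmf_insert_pfn(vmf->vma, vmf->address,
 *				      my_base_pfn + vmf->pgoff);
 *	}
 */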
static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
{
/* these checks mirror the abort conditions in vm_normal_page */
if (vma->vm_flags & VM_MIXEDMAP)
return true;
if (pfn_t_devmap(pfn))
return true;
if (pfn_t_special(pfn))
return true;
if (is_zero_pfn(pfn_t_to_pfn(pfn)))
return true;
return false;
}
static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
unsigned long addr, pfn_t pfn, pgprot_t pgprot,
bool mkwrite)
{
int err;
BUG_ON(!vm_mixed_ok(vma, pfn));
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
track_pfn_insert(vma, &pgprot, pfn);
if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
return VM_FAULT_SIGBUS;
/*
* If we don't have pte special, then we have to use the pfn_valid()
* based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
* refcount the page if pfn_valid is true (hence insert_page rather
* than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
* without pte special, it would then be refcounted as a normal page.
*/
if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
!pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
struct page *page;
/*
* At this point we are committed to insert_page()
* regardless of whether the caller specified flags that
* result in pfn_t_has_page() == false.
*/
page = pfn_to_page(pfn_t_to_pfn(pfn));
err = insert_page(vma, addr, page, pgprot);
} else {
return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
}
if (err == -ENOMEM)
return VM_FAULT_OOM;
if (err < 0 && err != -EBUSY)
return VM_FAULT_SIGBUS;
return VM_FAULT_NOPAGE;
}
/**
* vmf_insert_mixed_prot - insert single pfn into user vma with specified pgprot
* @vma: user vma to map to
* @addr: target user address of this page
* @pfn: source kernel pfn
* @pgprot: pgprot flags for the inserted page
*
* This is exactly like vmf_insert_mixed(), except that it allows drivers
* to override pgprot on a per-page basis.
*
* Typically this function should be used by drivers to set caching- and
* encryption bits different than those of @vma->vm_page_prot, because
* the caching- or encryption mode may not be known at mmap() time.
* This is ok as long as @vma->vm_page_prot is not used by the core vm
* to set caching and encryption bits for those vmas (except for COW pages).
* This is ensured by core vm only modifying these page table entries using
* functions that don't touch caching- or encryption bits, using pte_modify()
* if needed. (See for example mprotect()).
* Also when new page-table entries are created, this is only done using the
* fault() callback, and never using the value of vma->vm_page_prot,
* except for page-table entries that point to anonymous pages as the result
* of COW.
*
* Context: Process context. May allocate using %GFP_KERNEL.
* Return: vm_fault_t value.
*/
vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn, pgprot_t pgprot)
{
return __vm_insert_mixed(vma, addr, pfn, pgprot, false);
}
EXPORT_SYMBOL(vmf_insert_mixed_prot);
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn)
{
return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false);
}
EXPORT_SYMBOL(vmf_insert_mixed);
/*
* If the insertion of the PTE failed because someone else already added a
* different entry in the meantime, we treat that as success, as we assume
* the same entry was actually inserted.
*/
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
unsigned long addr, pfn_t pfn)
{
return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, true);
}
EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
/*
* Maps a range of physical memory into the requested pages. The old
* mappings are removed. Any references to nonexistent pages result
* in null mappings (currently treated as "copy-on-access").
*/
static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned long pfn, pgprot_t prot)
{
pte_t *pte, *mapped_pte;
spinlock_t *ptl;
int err = 0;
mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
arch_enter_lazy_mmu_mode();
do {
BUG_ON(!pte_none(*pte));
if (!pfn_modify_allowed(pfn, prot)) {
err = -EACCES;
break;
}
set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(mapped_pte, ptl);
return err;
}
static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
unsigned long addr, unsigned long end,
unsigned long pfn, pgprot_t prot)
{
pmd_t *pmd;
unsigned long next;
int err;
pfn -= addr >> PAGE_SHIFT;
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
return -ENOMEM;
VM_BUG_ON(pmd_trans_huge(*pmd));
do {
next = pmd_addr_end(addr, end);
err = remap_pte_range(mm, pmd, addr, next,
pfn + (addr >> PAGE_SHIFT), prot);
if (err)
return err;
} while (pmd++, addr = next, addr != end);
return 0;
}
static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
unsigned long addr, unsigned long end,
unsigned long pfn, pgprot_t prot)
{
pud_t *pud;
unsigned long next;
int err;
pfn -= addr >> PAGE_SHIFT;
pud = pud_alloc(mm, p4d, addr);
if (!pud)
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
err = remap_pmd_range(mm, pud, addr, next,
pfn + (addr >> PAGE_SHIFT), prot);
if (err)
return err;
} while (pud++, addr = next, addr != end);
return 0;
}
static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
unsigned long addr, unsigned long end,
unsigned long pfn, pgprot_t prot)
{
p4d_t *p4d;
unsigned long next;
int err;
pfn -= addr >> PAGE_SHIFT;
p4d = p4d_alloc(mm, pgd, addr);
if (!p4d)
return -ENOMEM;
do {
next = p4d_addr_end(addr, end);
err = remap_pud_range(mm, p4d, addr, next,
pfn + (addr >> PAGE_SHIFT), prot);
if (err)
return err;
} while (p4d++, addr = next, addr != end);
return 0;
}
/**
* remap_pfn_range - remap kernel memory to userspace
* @vma: user vma to map to
* @addr: target page aligned user address to start at
* @pfn: page frame number of kernel physical memory address
* @size: size of mapping area
* @prot: page protection flags for this mapping
*
* Note: this is only safe if the mm semaphore is held when called.
*
* Return: %0 on success, negative error code otherwise.
*/
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
{
pgd_t *pgd;
unsigned long next;
unsigned long end = addr + PAGE_ALIGN(size);
struct mm_struct *mm = vma->vm_mm;
unsigned long remap_pfn = pfn;
int err;
if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
return -EINVAL;
/*
* Physically remapped pages are special. Tell the
* rest of the world about it:
* VM_IO tells people not to look at these pages
* (accesses can have side effects).
* VM_PFNMAP tells the core MM that the base pages are just
* raw PFN mappings, and do not have a "struct page" associated
* with them.
* VM_DONTEXPAND
* Disable vma merging and expanding with mremap().
* VM_DONTDUMP
* Omit vma from core dump, even when VM_IO turned off.
*
* There's a horrible special case to handle copy-on-write
* behaviour that some programs depend on. We mark the "original"
* un-COW'ed pages by matching them up with "vma->vm_pgoff".
* See vm_normal_page() for details.
*/
if (is_cow_mapping(vma->vm_flags)) {
if (addr != vma->vm_start || end != vma->vm_end)
return -EINVAL;
vma->vm_pgoff = pfn;
}
err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
if (err)
return -EINVAL;
vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
BUG_ON(addr >= end);
pfn -= addr >> PAGE_SHIFT;
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
do {
next = pgd_addr_end(addr, end);
err = remap_p4d_range(mm, pgd, addr, next,
pfn + (addr >> PAGE_SHIFT), prot);
if (err)
break;
} while (pgd++, addr = next, addr != end);
if (err)
untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));
return err;
}
EXPORT_SYMBOL(remap_pfn_range);
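/*
 * Usage sketch (illustrative only): the classic mmap handler for device
 * memory; `my_phys' is a hypothetical physical base address:
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		unsigned long size = vma->vm_end - vma->vm_start;
 *
 *		return remap_pfn_range(vma, vma->vm_start,
 *				       my_phys >> PAGE_SHIFT,
 *				       size, vma->vm_page_prot);
 *	}
 */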
/**
* vm_iomap_memory - remap memory to userspace
* @vma: user vma to map to
* @start: start of the physical memory to be mapped
* @len: size of area
*
* This is a simplified io_remap_pfn_range() for common driver use. The
* driver just needs to give us the physical memory range to be mapped,
* we'll figure out the rest from the vma information.
*
* NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
* write-combining or similar behaviour.
*
* Return: %0 on success, negative error code otherwise.
*/
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
unsigned long vm_len, pfn, pages;
/* Check that the physical memory area passed in looks valid */
if (start + len < start)
return -EINVAL;
/*
* You *really* shouldn't map things that aren't page-aligned,
* but we've historically allowed it because IO memory might
* just have smaller alignment.
*/
len += start & ~PAGE_MASK;
pfn = start >> PAGE_SHIFT;
pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
if (pfn + pages < pfn)
return -EINVAL;
/* We start the mapping 'vm_pgoff' pages into the area */
if (vma->vm_pgoff > pages)
return -EINVAL;
pfn += vma->vm_pgoff;
pages -= vma->vm_pgoff;
/* Can we fit all of the mapping? */
vm_len = vma->vm_end - vma->vm_start;
if (vm_len >> PAGE_SHIFT > pages)
return -EINVAL;
/* Ok, let it rip */
return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_iomap_memory);
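/*
 * Usage sketch (illustrative only): unlike remap_pfn_range(), the caller
 * only supplies the physical range; `my_phys' and `my_len' are
 * hypothetical:
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		return vm_iomap_memory(vma, my_phys, my_len);
 *	}
 */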
static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end,
pte_fn_t fn, void *data, bool create,
pgtbl_mod_mask *mask)
{
pte_t *pte;
int err = 0;
spinlock_t *ptl;
if (create) {
pte = (mm == &init_mm) ?
pte_alloc_kernel_track(pmd, addr, mask) :
pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
} else {
pte = (mm == &init_mm) ?
pte_offset_kernel(pmd, addr) :
pte_offset_map_lock(mm, pmd, addr, &ptl);
}
BUG_ON(pmd_huge(*pmd));
arch_enter_lazy_mmu_mode();
if (fn) {
do {
if (create || !pte_none(*pte)) {
err = fn(pte++, addr, data);
if (err)
break;
}
} while (addr += PAGE_SIZE, addr != end);
}
*mask |= PGTBL_PTE_MODIFIED;
arch_leave_lazy_mmu_mode();
if (mm != &init_mm)
pte_unmap_unlock(pte-1, ptl);
return err;
}
static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
unsigned long addr, unsigned long end,
pte_fn_t fn, void *data, bool create,
pgtbl_mod_mask *mask)
{
pmd_t *pmd;
unsigned long next;
int err = 0;
BUG_ON(pud_huge(*pud));
if (create) {
pmd = pmd_alloc_track(mm, pud, addr, mask);
if (!pmd)
return -ENOMEM;
} else {
pmd = pmd_offset(pud, addr);
}
do {
next = pmd_addr_end(addr, end);
if (create || !pmd_none_or_clear_bad(pmd)) {
err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
create, mask);
if (err)
break;
}
} while (pmd++, addr = next, addr != end);
return err;
}
static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
unsigned long addr, unsigned long end,
pte_fn_t fn, void *data, bool create,
pgtbl_mod_mask *mask)
{
pud_t *pud;
unsigned long next;
int err = 0;
if (create) {
pud = pud_alloc_track(mm, p4d, addr, mask);
if (!pud)
return -ENOMEM;
} else {
pud = pud_offset(p4d, addr);
}
do {
next = pud_addr_end(addr, end);
if (create || !pud_none_or_clear_bad(pud)) {
err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
create, mask);
if (err)
break;
}
} while (pud++, addr = next, addr != end);
return err;
}
static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
unsigned long addr, unsigned long end,
pte_fn_t fn, void *data, bool create,
pgtbl_mod_mask *mask)
{
p4d_t *p4d;
unsigned long next;
int err = 0;
if (create) {
p4d = p4d_alloc_track(mm, pgd, addr, mask);
if (!p4d)
return -ENOMEM;
} else {
p4d = p4d_offset(pgd, addr);
}
do {
next = p4d_addr_end(addr, end);
if (create || !p4d_none_or_clear_bad(p4d)) {
err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
create, mask);
if (err)
break;
}
} while (p4d++, addr = next, addr != end);
return err;
}
static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
unsigned long size, pte_fn_t fn,
void *data, bool create)
{
pgd_t *pgd;
unsigned long start = addr, next;
unsigned long end = addr + size;
pgtbl_mod_mask mask = 0;
int err = 0;
if (WARN_ON(addr >= end))
return -EINVAL;
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
if (!create && pgd_none_or_clear_bad(pgd))
continue;
err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask);
if (err)
break;
} while (pgd++, addr = next, addr != end);
if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
arch_sync_kernel_mappings(start, start + size);
return err;
}
/*
* Scan a region of virtual memory, filling in page tables as necessary
* and calling a provided function on each leaf page table.
*/
int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
unsigned long size, pte_fn_t fn, void *data)
{
return __apply_to_page_range(mm, addr, size, fn, data, true);
}
EXPORT_SYMBOL_GPL(apply_to_page_range);
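/*
 * Usage sketch (illustrative only): counting populated ptes in a kernel
 * virtual range; `vaddr' and `size' are hypothetical. Note that this
 * variant allocates missing page tables; apply_to_existing_page_range()
 * below does not.
 *
 *	static int count_pte(pte_t *pte, unsigned long addr, void *data)
 *	{
 *		unsigned long *count = data;
 *
 *		if (!pte_none(*pte))
 *			(*count)++;
 *		return 0;
 *	}
 *
 *	unsigned long count = 0;
 *	apply_to_page_range(&init_mm, vaddr, size, count_pte, &count);
 */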
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
static bool pte_spinlock(struct vm_fault *vmf)
{
bool ret = false;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
pmd_t pmdval;
#endif
/* Check if vma is still valid */
if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);
return true;
}
local_irq_disable();
if (vma_has_changed(vmf)) {
trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
goto out;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
* We check if the pmd value is still the same to ensure that there
* is not a huge collapse operation in progress behind our back.
*/
pmdval = READ_ONCE(*vmf->pmd);
if (!pmd_same(pmdval, vmf->orig_pmd)) {
trace_spf_pmd_changed(_RET_IP_, vmf->vma, vmf->address);
goto out;
}
#endif
vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
if (unlikely(!spin_trylock(vmf->ptl))) {
trace_spf_pte_lock(_RET_IP_, vmf->vma, vmf->address);
goto out;
}
if (vma_has_changed(vmf)) {
spin_unlock(vmf->ptl);
trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
goto out;
}
ret = true;
out:
local_irq_enable();
return ret;
}
static bool __pte_map_lock_speculative(struct vm_fault *vmf, unsigned long addr)
{
bool ret = false;
pte_t *pte;
spinlock_t *ptl;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
pmd_t pmdval;
#endif
/*
* The first vma_has_changed() guarantees the page-tables are still
* valid, having IRQs disabled ensures they stay around, hence the
* second vma_has_changed() to make sure they are still valid once
* we've got the lock. After that a concurrent zap_pte_range() will
* block on the PTL and thus we're safe.
*/
local_irq_disable();
if (vma_has_changed(vmf)) {
trace_spf_vma_changed(_RET_IP_, vmf->vma, addr);
goto out;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
* We check if the pmd value is still the same to ensure that there
* is not a huge collapse operation in progress behind our back.
*/
pmdval = READ_ONCE(*vmf->pmd);
if (!pmd_same(pmdval, vmf->orig_pmd)) {
trace_spf_pmd_changed(_RET_IP_, vmf->vma, addr);
goto out;
}
#endif
/*
* Same as pte_offset_map_lock() except that we call
* spin_trylock() in place of spin_lock() to avoid a race with
* the unmap path, which may hold the lock and wait for this CPU
* to invalidate the TLB while this CPU has irqs disabled.
* Since we are on a speculative path, accept that it could fail.
*/
ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
pte = pte_offset_map(vmf->pmd, addr);
if (unlikely(!spin_trylock(ptl))) {
pte_unmap(pte);
trace_spf_pte_lock(_RET_IP_, vmf->vma, addr);
goto out;
}
if (vma_has_changed(vmf)) {
pte_unmap_unlock(pte, ptl);
trace_spf_vma_changed(_RET_IP_, vmf->vma, addr);
goto out;
}
vmf->pte = pte;
vmf->ptl = ptl;
ret = true;
out:
local_irq_enable();
return ret;
}
static bool pte_map_lock(struct vm_fault *vmf)
{
if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
return true;
}
return __pte_map_lock_speculative(vmf, vmf->address);
}
bool pte_map_lock_addr(struct vm_fault *vmf, unsigned long addr)
{
if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
addr, &vmf->ptl);
return true;
}
return __pte_map_lock_speculative(vmf, addr);
}
static bool __read_mostly allow_file_spec_access;
static int __init allow_file_spec_access_setup(char *str)
{
allow_file_spec_access = true;
return 1;
}
__setup("allow_file_spec_access", allow_file_spec_access_setup);
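/*
 * Note: `allow_file_spec_access' is a kernel command-line parameter.
 * Passing it at boot lets the speculative page fault path handle
 * file-backed vmas as well; see vmf_allows_speculation() below.
 */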
static bool vmf_allows_speculation(struct vm_fault *vmf)
{
if (vma_is_anonymous(vmf->vma)) {
/*
* __anon_vma_prepare() requires the mmap_sem to be held
* because vm_next and vm_prev must be safe. This can't be
* guaranteed in the speculative path.
*/
if (!vmf->vma->anon_vma) {
trace_spf_vma_notsup(_RET_IP_, vmf->vma, vmf->address);
return false;
}
return true;
}
if (!allow_file_spec_access) {
/*
* Can't call vm_ops services, as we don't know what they would
* do with the VMA.
* This includes huge pages from hugetlbfs.
*/
trace_spf_vma_notsup(_RET_IP_, vmf->vma, vmf->address);
return false;
}
if (!(vmf->vma->vm_flags & VM_SHARED) &&
(vmf->flags & FAULT_FLAG_WRITE) &&
!vmf->vma->anon_vma) {
/*
* non-anonymous private COW without anon_vma.
* See above.
*/
trace_spf_vma_notsup(_RET_IP_, vmf->vma, vmf->address);
return false;
}
if (vmf->vma->vm_ops->allow_speculation &&
vmf->vma->vm_ops->allow_speculation()) {
return true;
}
trace_spf_vma_notsup(_RET_IP_, vmf->vma, vmf->address);
return false;
}
#else
static inline bool pte_spinlock(struct vm_fault *vmf)
{
vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);
return true;
}
static inline bool pte_map_lock(struct vm_fault *vmf)
{
vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
return true;
}
inline bool pte_map_lock_addr(struct vm_fault *vmf, unsigned long addr)
{
vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
addr, &vmf->ptl);
return true;
}
static inline bool vmf_allows_speculation(struct vm_fault *vmf)
{
return false;
}
#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
/*
* Scan a region of virtual memory, calling a provided function on
* each leaf page table where it exists.
*
* Unlike apply_to_page_range, this does _not_ fill in page tables
* where they are absent.
*/
int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
unsigned long size, pte_fn_t fn, void *data)
{
return __apply_to_page_range(mm, addr, size, fn, data, false);
}
EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
/*
* handle_pte_fault chooses page fault handler according to an entry which was
* read non-atomically. Before making any commitment, on those architectures
* or configurations (e.g. i386 with PAE) which might give a mix of unmatched
* parts, do_swap_page must check under lock before unmapping the pte and
* proceeding (but do_wp_page is only called after already making such a check;
* and do_anonymous_page can safely check later on).
*
* pte_unmap_same() returns:
* 0 if the PTEs are the same
* VM_FAULT_PTNOTSAME if the PTEs are different
* VM_FAULT_RETRY if the VMA has changed behind our back during
* speculative page fault handling.
*/
static inline int pte_unmap_same(struct vm_fault *vmf)
{
int ret = 0;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
if (sizeof(pte_t) > sizeof(unsigned long)) {
if (pte_spinlock(vmf)) {
if (!pte_same(*vmf->pte, vmf->orig_pte))
ret = VM_FAULT_PTNOTSAME;
spin_unlock(vmf->ptl);
} else
ret = VM_FAULT_RETRY;
}
#endif
pte_unmap(vmf->pte);
return ret;
}
static inline bool cow_user_page(struct page *dst, struct page *src,
struct vm_fault *vmf)
{
bool ret;
void *kaddr;
void __user *uaddr;
bool locked = false;
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
unsigned long addr = vmf->address;
if (likely(src)) {
copy_user_highpage(dst, src, addr, vma);
return true;
}
/*
* If the source page was a PFN mapping, we don't have
* a "struct page" for it. We do a best-effort copy by
* just copying from the original user address. If that
* fails, we just zero-fill it. Live with it.
*/
kaddr = kmap_atomic(dst);
uaddr = (void __user *)(addr & PAGE_MASK);
/*
* On architectures with software "accessed" bits, we would
* take a double page fault, so mark it accessed here.
*/
if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
pte_t entry;
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
locked = true;
if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
/*
* Another thread has already handled the fault;
* just update the local tlb.
*/
update_mmu_tlb(vma, addr, vmf->pte);
ret = false;
goto pte_unlock;
}
entry = pte_mkyoung(vmf->orig_pte);
if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
update_mmu_cache(vma, addr, vmf->pte);
}
/*
* This really shouldn't fail, because the page is there
* in the page tables. But it might just be unreadable,
* in which case we just give up and fill the result with
* zeroes.
*/
if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
if (locked)
goto warn;
/* Re-validate under PTL if the page is still mapped */
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
locked = true;
if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
/* The PTE changed under us, update local tlb */
update_mmu_tlb(vma, addr, vmf->pte);
ret = false;
goto pte_unlock;
}
/*
* The same page may have been mapped back since the last copy attempt.
* Try to copy again under PTL.
*/
if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
/*
* Warn in case there is some obscure
* use-case.
*/
warn:
WARN_ON_ONCE(1);
clear_page(kaddr);
}
}
ret = true;
pte_unlock:
if (locked)
pte_unmap_unlock(vmf->pte, vmf->ptl);
kunmap_atomic(kaddr);
flush_dcache_page(dst);
return ret;
}
static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
{
struct file *vm_file = vma->vm_file;
if (vm_file)
return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
/*
* Special mappings (e.g. VDSO) do not have any file so fake
* a default GFP_KERNEL for them.
*/
return GFP_KERNEL;
}
/*
* Notify the address space that the page is about to become writable so that
* it can prohibit this or wait for the page to get into an appropriate state.
*
* We do this without the lock held, so that it can sleep if it needs to.
*/
static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
{
vm_fault_t ret;
struct page *page = vmf->page;
unsigned int old_flags = vmf->flags;
vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
if (vmf->vma->vm_file &&
IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
return VM_FAULT_SIGBUS;
ret = vmf->vma->vm_ops->page_mkwrite(vmf);
/* Restore original flags so that caller is not surprised */
vmf->flags = old_flags;
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
return ret;
if (unlikely(!(ret & VM_FAULT_LOCKED))) {
lock_page(page);
if (!page->mapping) {
unlock_page(page);
return 0; /* retry */
}
ret |= VM_FAULT_LOCKED;
} else
VM_BUG_ON_PAGE(!PageLocked(page), page);
return ret;
}
/*
* Handle dirtying of a page in shared file mapping on a write fault.
*
* The function expects the page to be locked and unlocks it.
*/
static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping;
struct page *page = vmf->page;
bool dirtied;
bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
dirtied = set_page_dirty(page);
VM_BUG_ON_PAGE(PageAnon(page), page);
/*
* Take a local copy of the address_space - page.mapping may be zeroed
* by truncate after unlock_page(). The address_space itself remains
* pinned by vma->vm_file's reference. We rely on unlock_page()'s
* release semantics to prevent the compiler from undoing this copying.
*/
mapping = page_rmapping(page);
unlock_page(page);
if (!page_mkwrite)
file_update_time(vma->vm_file);
/*
* Throttle page dirtying rate down to writeback speed.
*
* mapping may be NULL here because some device drivers do not
* set page.mapping but still dirty their pages
*
* Drop the mmap_lock before waiting on IO, if we can. The file
* is pinning the mapping, as per above.
*/
if ((dirtied || page_mkwrite) && mapping) {
struct file *fpin;
fpin = maybe_unlock_mmap_for_io(vmf, NULL);
balance_dirty_pages_ratelimited(mapping);
if (fpin) {
fput(fpin);
return VM_FAULT_RETRY;
}
}
return 0;
}
/*
* Handle write page faults for pages that can be reused in the current vma
*
* This can happen either because the mapping has the VM_SHARED flag set,
* or because we hold the last remaining reference to the page. In either
* case, all we need to do here is to mark the page as writable and update
* any related book-keeping.
*/
static inline void wp_page_reuse(struct vm_fault *vmf)
__releases(vmf->ptl)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = vmf->page;
pte_t entry;
/*
* Clear the page's cpupid information as the existing
* information potentially belongs to a now completely
* unrelated process.
*/
if (page)
page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = pte_mkyoung(vmf->orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
update_mmu_cache(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
count_vm_event(PGREUSE);
}
/*
* Handle the case of a page which we actually need to copy to a new page.
*
* Called with mmap_lock locked and the old page referenced, but
* without the ptl held.
*
* High level logic flow:
*
* - Allocate a page, copy the content of the old page to the new one.
* - Handle bookkeeping and accounting - cgroups, mmu-notifiers, etc.
* - Take the PTL. If the pte changed, bail out and release the allocated page
* - If the pte is still the way we remember it, update the page table and all
* relevant references. This includes dropping the reference the page-table
* held to the old page, as well as updating the rmap.
* - In any case, unlock the PTL and drop the reference we took to the old page.
*/
static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
struct page *old_page = vmf->page;
struct page *new_page = NULL;
pte_t entry;
int page_copied = 0;
struct mmu_notifier_range range;
vm_fault_t ret = VM_FAULT_OOM;
if (unlikely(anon_vma_prepare(vma)))
goto out;
if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
new_page = alloc_zeroed_user_highpage_movable(vma,
vmf->address);
if (!new_page)
goto out;
} else {
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
vmf->address);
if (!new_page)
goto out;
if (!cow_user_page(new_page, old_page, vmf)) {
/*
* COW failed; if the fault was resolved by another
* thread, that's fine. If not, userspace will re-fault
* on the same address and we will handle the fault
* from the second attempt.
*/
put_page(new_page);
if (old_page)
put_page(old_page);
return 0;
}
trace_android_vh_cow_user_page(vmf, new_page);
}
if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
goto out_free_new;
cgroup_throttle_swaprate(new_page, GFP_KERNEL);
__SetPageUptodate(new_page);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
vmf->address & PAGE_MASK,
(vmf->address & PAGE_MASK) + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
/*
* Re-check the pte - we dropped the lock
*/
if (!pte_map_lock(vmf)) {
ret = VM_FAULT_RETRY;
goto out_invalidate_end;
}
if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
dec_mm_counter_fast(mm,
mm_counter_file(old_page));
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
} else {
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = mk_pte(new_page, vmf->vma_page_prot);
entry = pte_sw_mkyoung(entry);
entry = maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
/*
* Clear the pte entry and flush it first, before updating the
* pte with the new entry. This will avoid a race condition
* seen in the presence of one thread doing SMC and another
* thread doing COW.
*/
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
__page_add_new_anon_rmap(new_page, vma, vmf->address, false);
__lru_cache_add_inactive_or_unevictable(new_page, vmf->vma_flags);
/*
* We call the notify macro here because, when using secondary
* mmu page tables (such as kvm shadow page tables), we want the
* new page to be mapped directly into the secondary page table.
*/
set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
update_mmu_cache(vma, vmf->address, vmf->pte);
if (old_page) {
/*
* Only after switching the pte to the new page may
* we remove the mapcount here. Otherwise another
* process may come and find the rmap count decremented
* before the pte is switched to the new page, and
* "reuse" the old page writing into it while our pte
* here still points into it and can be read by other
* threads.
*
* The critical issue is to order this
* page_remove_rmap with the ptep_clear_flush above.
* Those stores are ordered by (if nothing else,)
* the barrier present in the atomic_add_negative
* in page_remove_rmap.
*
* Then the TLB flush in ptep_clear_flush ensures that
* no process can access the old page before the
* decremented mapcount is visible. And the old page
* cannot be reused until after the decremented
* mapcount is visible. So transitively, TLBs to
* old page will be flushed before it can be reused.
*/
page_remove_rmap(old_page, false);
}
/* Free the old page.. */
new_page = old_page;
page_copied = 1;
} else {
update_mmu_tlb(vma, vmf->address, vmf->pte);
}
if (new_page)
put_page(new_page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
/*
* No need to double call mmu_notifier->invalidate_range() callback as
* the above ptep_clear_flush_notify() did already call it.
*/
mmu_notifier_invalidate_range_only_end(&range);
if (old_page) {
/*
* Don't let another task, with possibly unlocked vma,
* keep the mlocked page.
*/
if (page_copied && (vmf->vma_flags & VM_LOCKED)) {
lock_page(old_page); /* LRU manipulation */
if (PageMlocked(old_page))
munlock_vma_page(old_page);
unlock_page(old_page);
}
put_page(old_page);
}
return page_copied ? VM_FAULT_WRITE : 0;
out_invalidate_end:
mmu_notifier_invalidate_range_only_end(&range);
out_free_new:
put_page(new_page);
out:
if (old_page)
put_page(old_page);
return ret;
}
/**
* finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
* writeable once the page is prepared
*
* @vmf: structure describing the fault
*
* This function handles all that is needed to finish a write page fault in a
* shared mapping due to PTE being read-only once the mapped page is prepared.
* It handles locking of PTE and modifying it.
*
* The function expects the page to be locked or other protection against
* concurrent faults / writeback (such as DAX radix tree locks).
*
* Return: %VM_FAULT_WRITE on success, %0 when PTE got changed before
* we acquired PTE lock.
*/
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
{
WARN_ON_ONCE(!(vmf->vma_flags & VM_SHARED));
if (!pte_map_lock(vmf))
return VM_FAULT_RETRY;
/*
* We might have raced with another page fault while we released the
* pte_offset_map_lock.
*/
if (!pte_same(*vmf->pte, vmf->orig_pte)) {
update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return VM_FAULT_NOPAGE;
}
wp_page_reuse(vmf);
return 0;
}
/*
* Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
* mapping
*/
static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
vm_fault_t ret;
pte_unmap_unlock(vmf->pte, vmf->ptl);
vmf->flags |= FAULT_FLAG_MKWRITE;
ret = vma->vm_ops->pfn_mkwrite(vmf);
if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
return ret;
return finish_mkwrite_fault(vmf);
}
wp_page_reuse(vmf);
return VM_FAULT_WRITE;
}
static vm_fault_t wp_page_shared(struct vm_fault *vmf)
__releases(vmf->ptl)
{
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret = VM_FAULT_WRITE;
get_page(vmf->page);
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
vm_fault_t tmp;
pte_unmap_unlock(vmf->pte, vmf->ptl);
tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp || (tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
put_page(vmf->page);
return tmp;
}
tmp = finish_mkwrite_fault(vmf);
if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
unlock_page(vmf->page);
put_page(vmf->page);
return tmp;
}
} else {
wp_page_reuse(vmf);
lock_page(vmf->page);
}
ret |= fault_dirty_shared_page(vmf);
put_page(vmf->page);
return ret;
}
/*
* This routine handles present pages, when users try to write
* to a shared page. It is done by copying the page to a new address
* and decrementing the shared-page counter for the old page.
*
* Note that this routine assumes that the protection checks have been
* done by the caller (the low-level page fault routine in most cases).
* Thus we can safely just mark it writable once we've done any necessary
* COW.
*
* We also mark the page dirty at this point even though the page will
* change only once the write actually happens. This avoids a few races,
* and potentially makes it more efficient.
*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), with pte both mapped and locked.
* We return with mmap_lock still held, but pte unmapped and unlocked.
*/
static vm_fault_t do_wp_page(struct vm_fault *vmf)
__releases(vmf->ptl)
{
struct vm_area_struct *vma = vmf->vma;
if (userfaultfd_pte_wp(vma, *vmf->pte)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
if (vmf->flags & FAULT_FLAG_SPECULATIVE)
return VM_FAULT_RETRY;
return handle_userfault(vmf, VM_UFFD_WP);
}
/*
* Userfaultfd write-protect can defer flushes. Ensure the TLB
* is flushed in this case before copying.
*/
if (unlikely(userfaultfd_wp(vmf->vma) &&
mm_tlb_flush_pending(vmf->vma->vm_mm)))
flush_tlb_page(vmf->vma, vmf->address);
vmf->page = _vm_normal_page(vma, vmf->address, vmf->orig_pte,
vmf->vma_flags);
if (!vmf->page) {
/*
* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
* VM_PFNMAP VMA.
*
* We should not cow pages in a shared writeable mapping.
* Just mark the pages writable and/or call ops->pfn_mkwrite.
*/
if ((vmf->vma_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))
return wp_pfn_shared(vmf);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return wp_page_copy(vmf);
}
/*
* Take out anonymous pages first: anonymous shared vmas are
* not dirty accountable.
*/
if (PageAnon(vmf->page)) {
struct page *page = vmf->page;
/* PageKsm() doesn't necessarily raise the page refcount */
if (PageKsm(page) || page_count(page) != 1)
goto copy;
if (!trylock_page(page))
goto copy;
if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) {
unlock_page(page);
goto copy;
}
/*
* Ok, we've got the only map reference, and the only
* page count reference, and the page is locked,
* it's dark out, and we're wearing sunglasses. Hit it.
*/
unlock_page(page);
wp_page_reuse(vmf);
return VM_FAULT_WRITE;
} else if (unlikely((vmf->vma_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
return wp_page_shared(vmf);
}
copy:
/*
* Ok, we need to copy. Oh, well..
*/
get_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return wp_page_copy(vmf);
}
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
unsigned long start_addr, unsigned long end_addr,
struct zap_details *details)
{
zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}
static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
struct zap_details *details)
{
struct vm_area_struct *vma;
pgoff_t vba, vea, zba, zea;
vma_interval_tree_foreach(vma, root,
details->first_index, details->last_index) {
vba = vma->vm_pgoff;
vea = vba + vma_pages(vma) - 1;
zba = details->first_index;
if (zba < vba)
zba = vba;
zea = details->last_index;
if (zea > vea)
zea = vea;
unmap_mapping_range_vma(vma,
((zba - vba) << PAGE_SHIFT) + vma->vm_start,
((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
details);
}
}
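/*
* Worked example of the clamping above (illustrative numbers): a VMA with
* vm_pgoff == 10 spanning 5 pages covers file pages 10..14, so vba = 10 and
* vea = 14. Zapping file pages 12..100 clamps to zba = 12 and zea = 14,
* i.e. the last three pages of the VMA, starting at vm_start + 2 pages.
*/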
/**
* unmap_mapping_page() - Unmap single page from processes.
* @page: The locked page to be unmapped.
*
* Unmap this page from any userspace process which still has it mmaped.
* Typically, for efficiency, the range of nearby pages has already been
* unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once
* truncation or invalidation holds the lock on a page, it may find that
* the page has been remapped again, and then uses unmap_mapping_page()
* to finally unmap it.
*/
void unmap_mapping_page(struct page *page)
{
struct address_space *mapping = page->mapping;
struct zap_details details = { };
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(PageTail(page));
details.check_mapping = mapping;
details.first_index = page->index;
details.last_index = page->index + thp_nr_pages(page) - 1;
details.single_page = page;
i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
i_mmap_unlock_write(mapping);
}
/**
* unmap_mapping_pages() - Unmap pages from processes.
* @mapping: The address space containing pages to be unmapped.
* @start: Index of first page to be unmapped.
* @nr: Number of pages to be unmapped. 0 to unmap to end of file.
* @even_cows: Whether to unmap even private COWed pages.
*
* Unmap the pages in this address space from any userspace process which
* has them mmaped. Generally, you want to remove COWed pages as well when
* a file is being truncated, but not when invalidating pages from the page
* cache.
*/
void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
pgoff_t nr, bool even_cows)
{
struct zap_details details = { };
details.check_mapping = even_cows ? NULL : mapping;
details.first_index = start;
details.last_index = start + nr - 1;
if (details.last_index < details.first_index)
details.last_index = ULONG_MAX;
i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
i_mmap_unlock_write(mapping);
}
/**
* unmap_mapping_range - unmap the portion of all mmaps in the specified
* address_space corresponding to the specified byte range in the underlying
* file.
*
* @mapping: the address space containing mmaps to be unmapped.
* @holebegin: byte in first page to unmap, relative to the start of
* the underlying file. This will be rounded down to a PAGE_SIZE
* boundary. Note that this is different from truncate_pagecache(), which
* must keep the partial page. In contrast, we must get rid of
* partial pages.
* @holelen: size of prospective hole in bytes. This will be rounded
* up to a PAGE_SIZE boundary. A holelen of zero truncates to the
* end of the file.
* @even_cows: 1 when truncating a file, unmap even private COWed pages;
* but 0 when invalidating pagecache, don't throw away private data.
*/
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows)
{
pgoff_t hba = holebegin >> PAGE_SHIFT;
pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
/* Check for overflow. */
if (sizeof(holelen) > sizeof(hlen)) {
long long holeend =
(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (holeend & ~(long long)ULONG_MAX)
hlen = ULONG_MAX - hba + 1;
}
unmap_mapping_pages(mapping, hba, hlen, even_cows);
}
EXPORT_SYMBOL(unmap_mapping_range);
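/*
* Typical use (a sketch mirroring truncate_pagecache() in mm/truncate.c):
* when shrinking an inode to newsize, the dying range is unmapped both
* before and after dropping the page cache, with even_cows == 1 so that
* private COWed copies are discarded as well:
*
*   loff_t holebegin = round_up(newsize, PAGE_SIZE);
*
*   unmap_mapping_range(mapping, holebegin, 0, 1);
*   truncate_inode_pages(mapping, newsize);
*   unmap_mapping_range(mapping, holebegin, 0, 1);
*
* The second call catches pages that were faulted back in while the
* truncation was running.
*/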
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with pte unmapped and unlocked.
*
* We return with the mmap_lock locked or unlocked in the same cases
* as does filemap_fault().
*/
vm_fault_t do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL, *swapcache;
swp_entry_t entry;
pte_t pte;
int locked;
int exclusive = 0;
vm_fault_t ret;
void *shadow = NULL;
if (vmf->flags & FAULT_FLAG_SPECULATIVE) {
pte_unmap(vmf->pte);
return VM_FAULT_RETRY;
}
ret = pte_unmap_same(vmf);
if (ret) {
/*
* If pte != orig_pte, this means another thread did the
* swap operation behind our back.
* So nothing else to do.
*/
if (ret == VM_FAULT_PTNOTSAME)
ret = 0;
goto out;
}
entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
migration_entry_wait(vma->vm_mm, vmf->pmd,
vmf->address);
} else if (is_device_private_entry(entry)) {
vmf->page = device_private_entry_to_page(entry);
ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else {
print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
ret = VM_FAULT_SIGBUS;
}
goto out;
}
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry, vma, vmf->address);
swapcache = page;
if (!page) {
struct swap_info_struct *si = swp_swap_info(entry);
if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
__swap_count(entry) == 1) {
/* skip swapcache */
gfp_t flags = GFP_HIGHUSER_MOVABLE;
trace_android_rvh_set_skip_swapcache_flags(&flags);
page = alloc_page_vma(flags, vma, vmf->address);
if (page) {
int err;
__SetPageLocked(page);
__SetPageSwapBacked(page);
set_page_private(page, entry.val);
/* Tell memcg to use swap ownership records */
SetPageSwapCache(page);
err = mem_cgroup_charge(page, vma->vm_mm,
GFP_KERNEL);
ClearPageSwapCache(page);
if (err) {
ret = VM_FAULT_OOM;
goto out_page;
}
shadow = get_shadow_from_swap_cache(entry);
if (shadow)
workingset_refault(page, shadow);
lru_cache_add(page);
swap_readpage(page, true);
}
} else if (vmf->flags & FAULT_FLAG_SPECULATIVE) {
/*
* Don't try readahead during a speculative page fault
* as the VMA's boundaries may change behind our back.
* If the page is not in the swap cache and synchronous
* read is disabled, fall back to the regular page fault
* mechanism.
*/
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
ret = VM_FAULT_RETRY;
goto out;
} else {
page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
vmf);
swapcache = page;
}
if (!page) {
/*
* Back out if the VMA has changed behind our back during
* a speculative page fault or if somebody else
* faulted in this pte while we released the pte lock.
*/
if (!pte_map_lock(vmf)) {
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
ret = VM_FAULT_RETRY;
goto out;
}
if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
ret = VM_FAULT_OOM;
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
goto unlock;
}
/* Had to read the page from swap area: Major fault */
ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
} else if (PageHWPoison(page)) {
/*
* hwpoisoned dirty swapcache pages are kept for killing
* owner processes (which may be unknown at hwpoison time)
*/
ret = VM_FAULT_HWPOISON;
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
goto out_release;
}
locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
if (!locked) {
ret |= VM_FAULT_RETRY;
goto out_release;
}
/*
* Make sure try_to_free_swap or reuse_swap_page or swapoff did not
* release the swapcache from under us. The page pin, and pte_same
* test below, are not enough to exclude that. Even if it is still
* swapcache, we need to check that the page's swap has not changed.
*/
if (unlikely((!PageSwapCache(page) ||
page_private(page) != entry.val)) && swapcache)
goto out_page;
page = ksm_might_need_to_copy(page, vma, vmf->address);
if (unlikely(!page)) {
ret = VM_FAULT_OOM;
page = swapcache;
goto out_page;
}
cgroup_throttle_swaprate(page, GFP_KERNEL);
/*
* Back out if the VMA has changed behind our back during a speculative
* page fault or if somebody else already faulted in this pte.
*/
if (!pte_map_lock(vmf)) {
ret = VM_FAULT_RETRY;
goto out_page;
}
if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
goto out_nomap;
if (unlikely(!PageUptodate(page))) {
ret = VM_FAULT_SIGBUS;
goto out_nomap;
}
/*
* The page isn't present yet, go ahead with the fault.
*
* Be careful about the sequence of operations here.
* To get its accounting right, reuse_swap_page() must be called
* while the page is counted on swap but not yet in mapcount i.e.
* before page_add_anon_rmap() and swap_free(); try_to_free_swap()
* must be called after the swap_free(), or it will never succeed.
*/
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
pte = mk_pte(page, vmf->vma_page_prot);
if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vmf->vma_flags);
vmf->flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
exclusive = RMAP_EXCLUSIVE;
}
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(vmf->orig_pte))
pte = pte_mksoft_dirty(pte);
if (pte_swp_uffd_wp(vmf->orig_pte)) {
pte = pte_mkuffd_wp(pte);
pte = pte_wrprotect(pte);
}
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
vmf->orig_pte = pte;
/* ksm created a completely new copy */
if (unlikely(page != swapcache && swapcache)) {
__page_add_new_anon_rmap(page, vma, vmf->address, false);
__lru_cache_add_inactive_or_unevictable(page, vmf->vma_flags);
} else {
do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
}
trace_android_vh_swapin_add_anon_rmap(vmf, page);
swap_free(entry);
if (mem_cgroup_swap_full(page) ||
(vmf->vma_flags & VM_LOCKED) || PageMlocked(page))
try_to_free_swap(page);
unlock_page(page);
if (page != swapcache && swapcache) {
/*
* Hold the lock to prevent the swap entry from being reused
* until we take the PT lock for the pte_same() check
* (to avoid false positives from pte_same). For
* further safety, release the lock after the swap_free
* so that the swap count won't change under a
* parallel locked swapcache.
*/
unlock_page(swapcache);
put_page(swapcache);
}
if (vmf->flags & FAULT_FLAG_WRITE) {
ret |= do_wp_page(vmf);
if (ret & VM_FAULT_ERROR)
ret &= VM_FAULT_ERROR;
goto out;
}
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
return ret;
out_nomap:
pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
unlock_page(page);
out_release:
put_page(page);
if (page != swapcache && swapcache) {
unlock_page(swapcache);
put_page(swapcache);
}
return ret;
}
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_lock still held, but pte unmapped and unlocked.
*/
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page;
vm_fault_t ret = 0;
pte_t entry;
/* File mapping without ->vm_ops ? */
if (vmf->vma_flags & VM_SHARED)
return VM_FAULT_SIGBUS;
/* Do not check unstable pmd; if it has changed we will retry later */
if (vmf->flags & FAULT_FLAG_SPECULATIVE)
goto skip_pmd_checks;
/*
* Use pte_alloc() instead of pte_alloc_map(). We can't run
* pte_offset_map() on pmds where a huge pmd might be created
* from a different thread.
*
* pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
* parallel threads are excluded by other means.
*
* Here we only have mmap_read_lock(mm).
*/
if (pte_alloc(vma->vm_mm, vmf->pmd))
return VM_FAULT_OOM;
/* See comment in handle_pte_fault() */
if (unlikely(pmd_trans_unstable(vmf->pmd)))
return 0;
skip_pmd_checks:
/* Use the zero-page for reads */
if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm)) {
entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
vmf->vma_page_prot));
if (!pte_map_lock(vmf))
return VM_FAULT_RETRY;
if (!pte_none(*vmf->pte)) {
update_mmu_tlb(vma, vmf->address, vmf->pte);
goto unlock;
}
ret = check_stable_address_space(vma->vm_mm);
if (ret)
goto unlock;
/*
* Don't call userfaultfd during the speculative path.
* We already checked that the VMA is not managed through
* userfaultfd, but that may change behind our back once we
* have locked the pte. In such a case we can ignore it this time.
*/
if (vmf->flags & FAULT_FLAG_SPECULATIVE)
goto setpte;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return handle_userfault(vmf, VM_UFFD_MISSING);
}
goto setpte;
}
/* Allocate our own private page. */
if (unlikely(anon_vma_prepare(vma)))
goto oom;
page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
if (!page)
goto oom;
if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
goto oom_free_page;
cgroup_throttle_swaprate(page, GFP_KERNEL);
/*
* The memory barrier inside __SetPageUptodate makes sure that
* preceding stores to the page contents become visible before
* the set_pte_at() write.
*/
__SetPageUptodate(page);
entry = mk_pte(page, vmf->vma_page_prot);
entry = pte_sw_mkyoung(entry);
if (vmf->vma_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
if (!pte_map_lock(vmf)) {
ret = VM_FAULT_RETRY;
goto release;
}
if (!pte_none(*vmf->pte)) {
update_mmu_cache(vma, vmf->address, vmf->pte);
goto unlock_and_release;
}
ret = check_stable_address_space(vma->vm_mm);
if (ret)
goto unlock_and_release;
/* Deliver the page fault to userland, check inside PT lock */
if (!(vmf->flags & FAULT_FLAG_SPECULATIVE) &&
userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
put_page(page);
return handle_userfault(vmf, VM_UFFD_MISSING);
}
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
__page_add_new_anon_rmap(page, vma, vmf->address, false);
__lru_cache_add_inactive_or_unevictable(page, vmf->vma_flags);
setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
unlock_and_release:
pte_unmap_unlock(vmf->pte, vmf->ptl);
release:
put_page(page);
return ret;
oom_free_page:
put_page(page);
oom:
return VM_FAULT_OOM;
}
/*
* The mmap_lock must have been held on entry, and may have been
* released depending on flags and vma->vm_ops->fault() return value.
* See filemap_fault() and __lock_page_or_retry().
*/
static vm_fault_t __do_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret;
/* Do not check unstable pmd; if it has changed we will retry later */
if (vmf->flags & FAULT_FLAG_SPECULATIVE)
goto skip_pmd_checks;
/*
* Preallocate pte before we take page_lock because this might lead to
* deadlocks for memcg reclaim which waits for pages under writeback:
*
*  fault path:                    writeback path:
*                                 lock_page(A)
*                                 SetPageWriteback(A)
*                                 unlock_page(A)
*  lock_page(B)
*                                 lock_page(B)
*  pte_alloc_one
*    shrink_page_list
*      wait_on_page_writeback(A)
*                                 SetPageWriteback(B)
*                                 unlock_page(B)
*                                 # flush A, B to clear the writeback
*/
if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
}
skip_pmd_checks:
ret = vma->vm_ops->fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
VM_FAULT_DONE_COW)))
return ret;
if (unlikely(PageHWPoison(vmf->page))) {
struct page *page = vmf->page;
vm_fault_t poisonret = VM_FAULT_HWPOISON;
if (ret & VM_FAULT_LOCKED) {
if (page_mapped(page))
unmap_mapping_pages(page_mapping(page),
page->index, 1, false);
/* Retry if a clean page was removed from the cache. */
if (invalidate_inode_page(page))
poisonret = VM_FAULT_NOPAGE;
unlock_page(page);
}
put_page(page);
vmf->page = NULL;
return poisonret;
}
if (unlikely(!(ret & VM_FAULT_LOCKED)))
lock_page(vmf->page);
else
VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
return ret;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void deposit_prealloc_pte(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
/*
* We are going to consume the prealloc table,
* count that as nr_ptes.
*/
mm_inc_nr_ptes(vma->vm_mm);
vmf->prealloc_pte = NULL;
}
vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
pmd_t entry;
int i;
vm_fault_t ret = VM_FAULT_FALLBACK;
if (!transhuge_vma_suitable(vma, haddr))
return ret;
page = compound_head(page);
if (compound_order(page) != HPAGE_PMD_ORDER)
return ret;
/*
* Archs like ppc64 need additional space to store information
* related to the pte entry. Use the preallocated table for that.
*/
if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
}
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_none(*vmf->pmd)))
goto out;
for (i = 0; i < HPAGE_PMD_NR; i++)
flush_icache_page(vma, page + i);
entry = mk_huge_pmd(page, vmf->vma_page_prot);
if (write)
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
page_add_file_rmap(page, true);
/*
* deposit and withdraw with pmd lock held
*/
if (arch_needs_pgtable_deposit())
deposit_prealloc_pte(vmf);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
update_mmu_cache_pmd(vma, haddr, vmf->pmd);
/* fault is handled */
ret = 0;
count_vm_event(THP_FILE_MAPPED);
out:
spin_unlock(vmf->ptl);
return ret;
}
#else
vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
return VM_FAULT_FALLBACK;
}
#endif
void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
{
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool prefault = vmf->address != addr;
pte_t entry;
flush_icache_page(vma, page);
entry = mk_pte(page, vmf->vma_page_prot);
if (prefault && arch_wants_old_prefaulted_pte())
entry = pte_mkold(entry);
else
entry = pte_sw_mkyoung(entry);
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
/* copy-on-write page */
if (write && !(vmf->vma_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
__page_add_new_anon_rmap(page, vma, addr, false);
__lru_cache_add_inactive_or_unevictable(page, vmf->vma_flags);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
}
set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
}
/**
* finish_fault - finish page fault once we have prepared the page to fault
*
* @vmf: structure describing the fault
*
* This function handles all that is needed to finish a page fault once the
* page to fault in is prepared. It handles locking of PTEs, inserts PTE for
* given page, adds reverse page mapping, handles memcg charges and LRU
* addition.
*
* The function expects the page to be locked and on success it consumes a
* reference of a page being mapped (for the PTE which maps it).
*
* Return: %0 on success, %VM_FAULT_ code in case of error.
*/
vm_fault_t finish_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page;
vm_fault_t ret;
/* Did we COW the page? */
if ((vmf->flags & FAULT_FLAG_WRITE) &&
!(vmf->vma_flags & VM_SHARED))
page = vmf->cow_page;
else
page = vmf->page;
/*
* check even for read faults because we might have lost our CoWed
* page
*/
if (!(vma->vm_flags & VM_SHARED)) {
ret = check_stable_address_space(vma->vm_mm);
if (ret)
return ret;
}
/* Do not check unstable pmd; if it has changed we will retry later */
if (vmf->flags & FAULT_FLAG_SPECULATIVE)
goto skip_pmd_checks;
if (pmd_none(*vmf->pmd)) {
if (PageTransCompound(page)) {
ret = do_set_pmd(vmf, page);
if (ret != VM_FAULT_FALLBACK)
return ret;
}
if (vmf->prealloc_pte) {
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
if (likely(pmd_none(*vmf->pmd))) {
mm_inc_nr_ptes(vma->vm_mm);
pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
vmf->prealloc_pte = NULL;
}
spin_unlock(vmf->ptl);
} else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
return VM_FAULT_OOM;
}
}
/*
* See comment in handle_pte_fault() for how this scenario happens, we
* need to return NOPAGE so that we drop this page.
*/
if (pmd_devmap_trans_unstable(vmf->pmd))
return VM_FAULT_NOPAGE;
skip_pmd_checks:
if (!pte_map_lock(vmf))
return VM_FAULT_RETRY;
ret = 0;
/* Re-check under ptl */
if (likely(pte_none(*vmf->pte)))
do_set_pte(vmf, page, vmf->address);
else
ret = VM_FAULT_NOPAGE;
update_mmu_tlb(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
}
static unsigned long fault_around_bytes __read_mostly =
rounddown_pow_of_two(65536);
#ifdef CONFIG_DEBUG_FS
static int fault_around_bytes_get(void *data, u64 *val)
{
*val = fault_around_bytes;
return 0;
}
/*
* fault_around_bytes must be rounded down to the nearest page order as it's
* what do_fault_around() expects to see.
*/
static int fault_around_bytes_set(void *data, u64 val)
{
if (val / PAGE_SIZE > PTRS_PER_PTE)
return -EINVAL;
if (val > PAGE_SIZE)
fault_around_bytes = rounddown_pow_of_two(val);
else
fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
static int __init fault_around_debugfs(void)
{
debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
&fault_around_bytes_fops);
return 0;
}
late_initcall(fault_around_debugfs);
#endif
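/*
* Usage sketch for the debugfs knob above (assuming debugfs is mounted at
* /sys/kernel/debug):
*
*   cat /sys/kernel/debug/fault_around_bytes
*   echo 4096 > /sys/kernel/debug/fault_around_bytes
*
* Writing one page (4096 on 4KiB-page systems) effectively disables
* fault-around, since do_read_fault() only calls do_fault_around() when the
* window covers more than one page; other values are rounded down to a
* power of two by fault_around_bytes_set().
*/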
/*
* do_fault_around() tries to map a few pages around the fault address. The hope
* is that the pages will be needed soon and this will lower the number of
* faults to handle.
*
* It uses vm_ops->map_pages() to map the pages, which skips the page if it's
* not ready to be mapped: not up-to-date, locked, etc.
*
* This function is called with the page table lock taken. In the split ptlock
* case the page table lock protects only those entries which belong to
* the page table corresponding to the fault address.
*
* This function doesn't cross the VMA boundaries, in order to call map_pages()
* only once.
*
* fault_around_bytes defines how many bytes we'll try to map.
* do_fault_around() expects it to be set to a power of two no larger than
* PTRS_PER_PTE * PAGE_SIZE.
*
* The virtual address of the area that we map is naturally aligned to
* fault_around_bytes rounded down to the machine page size
* (and therefore to page order). This way it's easier to guarantee
* that we don't cross page table boundaries.
*/
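/*
* Worked example (illustrative, assuming 4KiB pages and the default
* fault_around_bytes of 65536): nr_pages = 16 and mask = ~0xffffUL, so a
* fault at 0x7f001234 tries to map the 16 pages covering
* 0x7f000000..0x7f00ffff, clamped to the VMA boundaries and to the page
* table that contains the faulting address.
*/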
static vm_fault_t do_fault_around(struct vm_fault *vmf)
{
unsigned long address = vmf->address, nr_pages, mask;
pgoff_t start_pgoff = vmf->pgoff;
pgoff_t end_pgoff;
int off;
nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
address = max(address & mask, vmf->vma->vm_start);
off = ((vmf->address - address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
start_pgoff -= off;
/*
* end_pgoff is either the end of the page table, the end of
* the vma or nr_pages from start_pgoff, depending what is nearest.
*/
end_pgoff = start_pgoff -
((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
PTRS_PER_PTE - 1;
end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
start_pgoff + nr_pages - 1);
if (!(vmf->flags & FAULT_FLAG_SPECULATIVE) &&
pmd_none(*vmf->pmd)) {
vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
}
return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
}
static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret = 0;
/*
* Let's call ->map_pages() first and use ->fault() as fallback
* if the page at the offset is not ready to be mapped (cold cache or
* something).
*/
if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
if (likely(!userfaultfd_minor(vmf->vma))) {
ret = do_fault_around(vmf);
if (ret)
return ret;
}
}
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
ret |= finish_fault(vmf);
unlock_page(vmf->page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
put_page(vmf->page);
return ret;
}
static vm_fault_t do_cow_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
if (!vmf->cow_page)
return VM_FAULT_OOM;
if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
put_page(vmf->cow_page);
return VM_FAULT_OOM;
}
cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
if (ret & VM_FAULT_DONE_COW)
return ret;
copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
__SetPageUptodate(vmf->cow_page);
ret |= finish_fault(vmf);
unlock_page(vmf->page);
put_page(vmf->page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
return ret;
uncharge_out:
put_page(vmf->cow_page);
return ret;
}
static vm_fault_t do_shared_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret, tmp;
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
/*
* Check if the backing address space wants to know that the page is
* about to become writable
*/
if (vma->vm_ops->page_mkwrite) {
unlock_page(vmf->page);
tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp ||
(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
put_page(vmf->page);
return tmp;
}
}
ret |= finish_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
VM_FAULT_RETRY))) {
unlock_page(vmf->page);
put_page(vmf->page);
return ret;
}
ret |= fault_dirty_shared_page(vmf);
return ret;
}
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults).
* The mmap_lock may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
* If mmap_lock is released, vma may become invalid (for example
* by other thread calling munmap()).
*/
static vm_fault_t do_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *vm_mm = vma->vm_mm;
vm_fault_t ret;
/*
* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
*/
if (!vma->vm_ops->fault) {
/*
* If we find a migration pmd entry or a none pmd entry, which
* should never happen, return SIGBUS
*/
if (unlikely(!pmd_present(*vmf->pmd)))
ret = VM_FAULT_SIGBUS;
else {
vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
vmf->pmd,
vmf->address,
&vmf->ptl);
/*
* Make sure this is not a temporary clearing of the pte,
* by holding the ptl and checking again. A R/M/W update
* of the pte involves taking the ptl and clearing the pte,
* so that there is no concurrent modification by hardware,
* followed by an update.
*/
if (unlikely(pte_none(*vmf->pte)))
ret = VM_FAULT_SIGBUS;
else
ret = VM_FAULT_NOPAGE;
pte_unmap_unlock(vmf->pte, vmf->ptl);
}
} else if (!(vmf->flags & FAULT_FLAG_WRITE))
ret = do_read_fault(vmf);
else if (!(vmf->vma_flags & VM_SHARED))
ret = do_cow_fault(vmf);
else
ret = do_shared_fault(vmf);
/* preallocated pagetable is unused: free it */
if (vmf->prealloc_pte) {
pte_free(vm_mm, vmf->prealloc_pte);
vmf->prealloc_pte = NULL;
}
return ret;
}
static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
unsigned long addr, int page_nid,
int *flags)
{
get_page(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
if (page_nid == numa_node_id()) {
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
*flags |= TNF_FAULT_LOCAL;
}
return mpol_misplaced(page, vma, addr);
}
static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL;
int page_nid = NUMA_NO_NODE;
int last_cpupid;
int target_nid;
bool migrated = false;
pte_t pte, old_pte;
bool was_writable = pte_savedwrite(vmf->orig_pte);
int flags = 0;
/*
* The "pte" at this point cannot be used safely without
* validation through pte_unmap_same(). It's of NUMA type but
* the pfn may be garbage if the read is non-atomic.
*/
if (!pte_spinlock(vmf))
return VM_FAULT_RETRY;
if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
/*
* Make it present again. Depending on how the arch implements
* non-accessible ptes, some can allow access by kernel mode.
*/
old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
pte = pte_modify(old_pte, vmf->vma_page_prot);
pte = pte_mkyoung(pte);
if (was_writable)
pte = pte_mkwrite(pte);
ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
update_mmu_cache(vma, vmf->address, vmf->pte);
page = _vm_normal_page(vma, vmf->address, pte, vmf->vma_flags);
if (!page) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
/* TODO: handle PTE-mapped THP */
if (PageCompound(page)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
/*
* Avoid grouping on RO pages in general. RO pages shouldn't hurt as
* much anyway since they can be in shared cache state. This misses
* the case where a mapping is writable but the process never writes
* to it but pte_write gets cleared during protection updates and
* pte_dirty has unpredictable behaviour between PTE scan updates,
* background writeback, dirty balancing and application behaviour.
*/
if (!pte_write(pte))
flags |= TNF_NO_GROUP;
/*
* Flag if the page is shared between multiple address spaces. This
* is later used when determining whether to group tasks together
*/
if (page_mapcount(page) > 1 && (vmf->vma_flags & VM_SHARED))
flags |= TNF_SHARED;
last_cpupid = page_cpupid_last(page);
page_nid = page_to_nid(page);
target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
&flags);
pte_unmap_unlock(vmf->pte, vmf->ptl);
if (target_nid == NUMA_NO_NODE) {
put_page(page);
goto out;
}
/* Migrate to the requested node */
migrated = migrate_misplaced_page(page, vmf, target_nid);
if (migrated) {
page_nid = target_nid;
flags |= TNF_MIGRATED;
} else
flags |= TNF_MIGRATE_FAIL;
out:
if (page_nid != NUMA_NO_NODE)
task_numa_fault(last_cpupid, page_nid, 1, flags);
return 0;
}
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
if (vma_is_anonymous(vmf->vma))
return do_huge_pmd_anonymous_page(vmf);
if (vmf->vma->vm_ops->huge_fault)
return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
return VM_FAULT_FALLBACK;
}
/* `inline' is required to avoid gcc 4.1.2 build error */
static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
{
if (vma_is_anonymous(vmf->vma)) {
if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd))
return handle_userfault(vmf, VM_UFFD_WP);
return do_huge_pmd_wp_page(vmf, orig_pmd);
}
if (vmf->vma->vm_ops->huge_fault) {
vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
}
/* COW or write-notify handled on pte level: split pmd. */
__split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
return VM_FAULT_FALLBACK;
}
static vm_fault_t create_huge_pud(struct vm_fault *vmf)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
/* No support for anonymous transparent PUD pages yet */
if (vma_is_anonymous(vmf->vma))
return VM_FAULT_FALLBACK;
if (vmf->vma->vm_ops->huge_fault)
return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
return VM_FAULT_FALLBACK;
}
static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
/* No support for anonymous transparent PUD pages yet */
if (vma_is_anonymous(vmf->vma))
goto split;
if (vmf->vma->vm_ops->huge_fault) {
vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
}
split:
/* COW or write-notify not handled on PUD level: split pud. */
__split_huge_pud(vmf->vma, vmf->pud, vmf->address);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
return VM_FAULT_FALLBACK;
}
/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
* RISC architectures). The early dirtying is also good on the i386.
*
* There is also a hook called "update_mmu_cache()" that architectures
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*
* We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
* concurrent faults).
*
* The mmap_lock may have been released depending on flags and our return value.
* See filemap_fault() and __lock_page_or_retry().
*/
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
pte_t entry;
vm_fault_t ret = 0;
/* Do not check unstable pmd; if it has changed we will retry later */
if (vmf->flags & FAULT_FLAG_SPECULATIVE)
goto skip_pmd_checks;
if (unlikely(pmd_none(*vmf->pmd))) {
/*
* Leave __pte_alloc() until later: because vm_ops->fault may
* want to allocate huge page, and if we expose page table
* for an instant, it will be difficult to retract from
* concurrent faults and from rmap lookups.
*/
vmf->pte = NULL;
} else {
/*
* If a huge pmd materialized under us just retry later. Use
* pmd_trans_unstable() via pmd_devmap_trans_unstable() instead
* of pmd_trans_huge() to ensure the pmd didn't become
* pmd_trans_huge under us and then back to pmd_none, as a
* result of MADV_DONTNEED running immediately after a huge pmd
* fault in a different thread of this mm, in turn leading to a
* misleading pmd_trans_huge() retval. All we have to ensure is
* that it is a regular pmd that we can walk with
* pte_offset_map() and we can do that through an atomic read
* in C, which is what pmd_trans_unstable() provides.
*/
if (pmd_devmap_trans_unstable(vmf->pmd))
return 0;
/*
* A regular pmd is established and it can't morph into a huge
* pmd from under us anymore at this point because we hold the
* mmap_lock read mode and khugepaged takes it in write mode.
* So now it's safe to run pte_offset_map().
* This is not applicable to the speculative page fault handler
* but in that case, the pte is fetched earlier in
* handle_speculative_fault().
*/
vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
vmf->orig_pte = *vmf->pte;
/*
* Some architectures can have larger ptes than wordsize,
* e.g. ppc44x-defconfig has CONFIG_PTE_64BIT=y and
* CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
* accesses. The code below just needs a consistent view
* for the ifs and we later double check anyway with the
* ptl lock held. So here a barrier will do.
*/
barrier();
if (pte_none(vmf->orig_pte)) {
pte_unmap(vmf->pte);
vmf->pte = NULL;
}
}
skip_pmd_checks:
if (!vmf->pte) {
if (vma_is_anonymous(vmf->vma))
return do_anonymous_page(vmf);
else if ((vmf->flags & FAULT_FLAG_SPECULATIVE) &&
!vmf_allows_speculation(vmf))
return VM_FAULT_RETRY;
else
return do_fault(vmf);
}
if (!pte_present(vmf->orig_pte))
return do_swap_page(vmf);
if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
return do_numa_page(vmf);
if (!pte_spinlock(vmf))
return VM_FAULT_RETRY;
entry = vmf->orig_pte;
if (unlikely(!pte_same(*vmf->pte, entry))) {
update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
goto unlock;
}
if (vmf->flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry)) {
if (!(vmf->flags & FAULT_FLAG_SPECULATIVE))
return do_wp_page(vmf);
if (!mmu_notifier_trylock(vmf->vma->vm_mm)) {
ret = VM_FAULT_RETRY;
goto unlock;
}
ret = do_wp_page(vmf);
mmu_notifier_unlock(vmf->vma->vm_mm);
return ret;
}
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
vmf->flags & FAULT_FLAG_WRITE)) {
update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
} else {
/* Skip spurious TLB flush for retried page fault */
if (vmf->flags & FAULT_FLAG_TRIED)
goto unlock;
if (vmf->flags & FAULT_FLAG_SPECULATIVE)
ret = VM_FAULT_RETRY;
/*
* This is needed only for protection faults but the arch code
* is not yet telling us if this is a protection fault or not.
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
if (vmf->flags & FAULT_FLAG_WRITE)
flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
}
trace_android_rvh_handle_pte_fault_end(vmf, highest_memmap_pfn);
trace_android_vh_handle_pte_fault_end(vmf, highest_memmap_pfn);
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
}
/*
* By the time we get here, we already hold the mm semaphore
*
* The mmap_lock may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
struct vm_fault vmf = {
.vma = vma,
.address = address & PAGE_MASK,
.flags = flags,
.pgoff = linear_page_index(vma, address),
.gfp_mask = __get_fault_gfp_mask(vma),
.vma_flags = vma->vm_flags,
.vma_page_prot = vma->vm_page_prot,
};
unsigned int dirty = flags & FAULT_FLAG_WRITE;
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
p4d_t *p4d;
vm_fault_t ret;
pgd = pgd_offset(mm, address);
p4d = p4d_alloc(mm, pgd, address);
if (!p4d)
return VM_FAULT_OOM;
vmf.pud = pud_alloc(mm, p4d, address);
if (!vmf.pud)
return VM_FAULT_OOM;
retry_pud:
if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
ret = create_huge_pud(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
pud_t orig_pud = *vmf.pud;
barrier();
if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
/* NUMA case for anonymous PUDs would go here */
if (dirty && !pud_write(orig_pud)) {
ret = wp_huge_pud(&vmf, orig_pud);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
huge_pud_set_accessed(&vmf, orig_pud);
return 0;
}
}
}
vmf.pmd = pmd_alloc(mm, vmf.pud, address);
if (!vmf.pmd)
return VM_FAULT_OOM;
/* Huge pud page fault raced with pmd_alloc? */
if (pud_trans_unstable(vmf.pud))
goto retry_pud;
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
vmf.sequence = raw_read_seqcount(&vma->vm_sequence);
#endif
if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
pmd_t orig_pmd = *vmf.pmd;
barrier();
if (unlikely(is_swap_pmd(orig_pmd))) {
VM_BUG_ON(thp_migration_supported() &&
!is_pmd_migration_entry(orig_pmd));
if (is_pmd_migration_entry(orig_pmd))
pmd_migration_entry_wait(mm, vmf.pmd);
return 0;
}
if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
return do_huge_pmd_numa_page(&vmf, orig_pmd);
if (dirty && !pmd_write(orig_pmd)) {
ret = wp_huge_pmd(&vmf, orig_pmd);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
huge_pmd_set_accessed(&vmf, orig_pmd);
return 0;
}
}
}
return handle_pte_fault(&vmf);
}
/**
* mm_account_fault - Do page fault accounting
*
* @regs: the pt_regs struct pointer. When set to NULL, will skip accounting
* of perf event counters, but we'll still do the per-task accounting to
* the task who triggered this page fault.
* @address: the faulted address.
* @flags: the fault flags.
* @ret: the fault retcode.
*
* This will take care of most of the page fault accounting. Meanwhile, it
* will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
* updates. However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
* still be in per-arch page fault handlers at the entry of page fault.
*/
static inline void mm_account_fault(struct pt_regs *regs,
unsigned long address, unsigned int flags,
vm_fault_t ret)
{
bool major;
/*
* We don't do accounting for some specific faults:
*
* - Unsuccessful faults (e.g. when the address wasn't valid). That
* includes arch_vma_access_permitted() failing before reaching here.
* So this is not a "this many hardware page faults" counter. We
* should use the hw profiling for that.
*
* - Incomplete faults (VM_FAULT_RETRY). They will only be counted
* once they're completed.
*/
if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
return;
/*
* We define the fault as a major fault when the final successful fault
* is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
* handle it immediately previously).
*/
major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);
if (major)
current->maj_flt++;
else
current->min_flt++;
/*
* If the fault is done for GUP, regs will be NULL. We only do the
* accounting for the per-thread fault counters of the task that triggered the
* fault, and we skip the perf event updates.
*/
if (!regs)
return;
if (major)
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
else
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
}
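/*
* The arch-side pairing looks roughly like this (a sketch; see the per-arch
* fault handlers, e.g. arch/x86/mm/fault.c): the plain PAGE_FAULTS event
* fires on every fault entry, while the MAJ/MIN split is accounted here
* once the outcome of the fault is known:
*
*   perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
*   ...
*   fault = handle_mm_fault(vma, address, flags, regs);
*/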
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
/* This is required by vm_normal_page() */
#error "Speculative page fault handler requires CONFIG_ARCH_HAS_PTE_SPECIAL"
#endif
/*
* vm_normal_page() adds some processing which should be done while
* holding the mmap_sem.
*/
/*
* Tries to handle the page fault in a speculative way, without grabbing the
* mmap_sem.
* When VM_FAULT_RETRY is returned, the vma pointer is valid and this vma must
* be checked later when the mmap_sem has been grabbed by calling
* can_reuse_spf_vma().
* This is needed as the returned vma is kept referenced until the call to
* can_reuse_spf_vma() is made.
*/
static vm_fault_t ___handle_speculative_fault(struct mm_struct *mm,
unsigned long address, unsigned int flags,
struct vm_area_struct *vma)
{
struct vm_fault vmf = {
.address = address,
.pgoff = linear_page_index(vma, address),
.vma = vma,
.gfp_mask = __get_fault_gfp_mask(vma),
.flags = flags,
};
#ifdef CONFIG_NUMA
struct mempolicy *pol;
#endif
pgd_t *pgd, pgdval;
p4d_t *p4d, p4dval;
pud_t pudval;
int seq;
vm_fault_t ret;
/* Clear flags that may lead to releasing the mmap_sem for a retry */
flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE);
flags |= FAULT_FLAG_SPECULATIVE;
/* rmb <-> seqlock,vma_rb_erase() */
seq = raw_read_seqcount(&vmf.vma->vm_sequence);
if (seq & 1) {
trace_spf_vma_changed(_RET_IP_, vmf.vma, address);
return VM_FAULT_RETRY;
}
if (!vmf_allows_speculation(&vmf))
return VM_FAULT_RETRY;
vmf.vma_flags = READ_ONCE(vmf.vma->vm_flags);
vmf.vma_page_prot = READ_ONCE(vmf.vma->vm_page_prot);
#ifdef CONFIG_USERFAULTFD
/* Can't call userland page fault handler in the speculative path */
if (unlikely(vmf.vma_flags & __VM_UFFD_FLAGS)) {
trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
return VM_FAULT_RETRY;
}
#endif
if (vmf.vma_flags & VM_GROWSDOWN || vmf.vma_flags & VM_GROWSUP) {
/*
* This could be detected by checking the address against the
* VMA's boundaries, but we want to trace it as not supported
* instead of changed.
*/
trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
return VM_FAULT_RETRY;
}
if (address < READ_ONCE(vmf.vma->vm_start)
|| READ_ONCE(vmf.vma->vm_end) <= address) {
trace_spf_vma_changed(_RET_IP_, vmf.vma, address);
return VM_FAULT_RETRY;
}
if (!arch_vma_access_permitted(vmf.vma, flags & FAULT_FLAG_WRITE,
flags & FAULT_FLAG_INSTRUCTION,
flags & FAULT_FLAG_REMOTE))
goto out_segv;
/* This check is required to verify that the VMA allows write access */
if (flags & FAULT_FLAG_WRITE) {
if (unlikely(!(vmf.vma_flags & VM_WRITE)))
goto out_segv;
} else if (unlikely(!(vmf.vma_flags & (VM_READ|VM_EXEC|VM_WRITE))))
goto out_segv;
#ifdef CONFIG_NUMA
/*
* MPOL_INTERLEAVE implies additional checks in
* mpol_misplaced() which are not compatible with the
* speculative page fault processing.
*/
pol = __get_vma_policy(vmf.vma, address);
if (!pol)
pol = get_task_policy(current);
if (pol && pol->mode == MPOL_INTERLEAVE) {
trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
return VM_FAULT_RETRY;
}
#endif
/*
* Do a speculative lookup of the PTE entry.
*/
local_irq_disable();
pgd = pgd_offset(mm, address);
pgdval = READ_ONCE(*pgd);
if (pgd_none(pgdval) || unlikely(pgd_bad(pgdval)))
goto out_walk;
p4d = p4d_offset(pgd, address);
if (pgd_val(READ_ONCE(*pgd)) != pgd_val(pgdval))
goto out_walk;
p4dval = READ_ONCE(*p4d);
if (p4d_none(p4dval) || unlikely(p4d_bad(p4dval)))
goto out_walk;
vmf.pud = pud_offset(p4d, address);
if (p4d_val(READ_ONCE(*p4d)) != p4d_val(p4dval))
goto out_walk;
pudval = READ_ONCE(*vmf.pud);
if (pud_none(pudval) || unlikely(pud_bad(pudval)))
goto out_walk;
/* Huge pages at PUD level are not supported. */
if (unlikely(pud_trans_huge(pudval)))
goto out_walk;
vmf.pmd = pmd_offset(vmf.pud, address);
if (pud_val(READ_ONCE(*vmf.pud)) != pud_val(pudval))
goto out_walk;
vmf.orig_pmd = READ_ONCE(*vmf.pmd);
/*
* pmd_none could mean that a hugepage collapse is in progress
* behind our back, as collapse_huge_page() marks it before
* invalidating the pte (which is done once the IPI has been
* caught by all CPUs while we have interrupts disabled).
* For this reason we cannot handle THP in a speculative way, since we
* can't safely identify an in-progress collapse operation done behind
* our back on that PMD.
* Regarding the order of the following checks, see comment in
* pmd_devmap_trans_unstable()
*/
if (unlikely(pmd_devmap(vmf.orig_pmd) ||
pmd_none(vmf.orig_pmd) || pmd_trans_huge(vmf.orig_pmd) ||
is_swap_pmd(vmf.orig_pmd)))
goto out_walk;
/*
* The above does not allocate/instantiate page-tables because doing so
* would lead to the possibility of instantiating page-tables after
* free_pgtables() -- and consequently leaking them.
*
* The result is that we take at least one !speculative fault per PMD
* in order to instantiate it.
*/
vmf.pte = pte_offset_map(vmf.pmd, address);
if (pmd_val(READ_ONCE(*vmf.pmd)) != pmd_val(vmf.orig_pmd)) {
pte_unmap(vmf.pte);
vmf.pte = NULL;
goto out_walk;
}
vmf.orig_pte = READ_ONCE(*vmf.pte);
barrier(); /* See comment in handle_pte_fault() */
if (pte_none(vmf.orig_pte)) {
pte_unmap(vmf.pte);
vmf.pte = NULL;
}
vmf.sequence = seq;
vmf.flags = flags;
local_irq_enable();
/*
* We need to re-validate the VMA after checking the bounds; otherwise
* the bounds check may have been a false positive against a VMA that
* has changed under us.
*/
if (read_seqcount_retry(&vmf.vma->vm_sequence, seq)) {
trace_spf_vma_changed(_RET_IP_, vmf.vma, address);
return VM_FAULT_RETRY;
}
mem_cgroup_enter_user_fault();
ret = handle_pte_fault(&vmf);
mem_cgroup_exit_user_fault();
if (ret != VM_FAULT_RETRY) {
if (vma_is_anonymous(vmf.vma))
count_vm_event(SPECULATIVE_PGFAULT_ANON);
else
count_vm_event(SPECULATIVE_PGFAULT_FILE);
}
/*
* The task may have entered a memcg OOM situation but
* if the allocation error was handled gracefully (no
* VM_FAULT_OOM), there is no need to kill anything.
* Just clean up the OOM state peacefully.
*/
if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
mem_cgroup_oom_synchronize(false);
return ret;
out_walk:
trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
local_irq_enable();
return VM_FAULT_RETRY;
out_segv:
trace_spf_vma_access(_RET_IP_, vmf.vma, address);
return VM_FAULT_SIGSEGV;
}
vm_fault_t __handle_speculative_fault(struct mm_struct *mm,
unsigned long address, unsigned int flags,
struct vm_area_struct **vma,
struct pt_regs *regs)
{
vm_fault_t ret;
check_sync_rss_stat(current);
*vma = get_vma(mm, address);
if (!*vma)
return VM_FAULT_RETRY;
ret = ___handle_speculative_fault(mm, address, flags, *vma);
/*
* If there is no need to retry, don't return the vma to the caller.
*/
if (ret != VM_FAULT_RETRY) {
put_vma(*vma);
*vma = NULL;
mm_account_fault(regs, address, flags, ret);
}
return ret;
}
/*
* This is used to know whether the VMA fetched in the speculative page
* fault handler is still valid when the regular fault path is tried
* while holding the mmap_sem.
* The call to put_vma(vma) must be made after checking the vma's fields, as
* the vma may be freed by put_vma(). In such a case it is expected that false
* is returned.
*/
bool can_reuse_spf_vma(struct vm_area_struct *vma, unsigned long address)
{
bool ret;
ret = !RB_EMPTY_NODE(&vma->vm_rb) &&
vma->vm_start <= address && address < vma->vm_end;
put_vma(vma);
return ret;
}
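/*
 * Editor's sketch (not copied from any arch tree): how an architecture
 * fault handler is expected to combine __handle_speculative_fault()
 * with can_reuse_spf_vma(). Everything except those two helpers,
 * find_vma() and mmap_read_lock() is illustrative.
 *
 *	vma = NULL;
 *	fault = __handle_speculative_fault(mm, address, flags, &vma, regs);
 *	if (fault != VM_FAULT_RETRY)
 *		return fault;		// handled speculatively
 *	mmap_read_lock(mm);
 *	if (!(vma && can_reuse_spf_vma(vma, address)))
 *		vma = find_vma(mm, address);
 *	// ... continue on the classic handle_mm_fault() path ...
 */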
#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
/*
* By the time we get here, we already hold the mm semaphore
*
* The mmap_lock may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags, struct pt_regs *regs)
{
vm_fault_t ret;
__set_current_state(TASK_RUNNING);
count_vm_event(PGFAULT);
count_memcg_event_mm(vma->vm_mm, PGFAULT);
/* do counter updates before entering the really critical section. */
check_sync_rss_stat(current);
if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
flags & FAULT_FLAG_INSTRUCTION,
flags & FAULT_FLAG_REMOTE))
return VM_FAULT_SIGSEGV;
/*
* Enable the memcg OOM handling for faults triggered in user
* space. Kernel faults are handled more gracefully.
*/
if (flags & FAULT_FLAG_USER)
mem_cgroup_enter_user_fault();
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
else
ret = __handle_mm_fault(vma, address, flags);
if (flags & FAULT_FLAG_USER) {
mem_cgroup_exit_user_fault();
/*
* The task may have entered a memcg OOM situation but
* if the allocation error was handled gracefully (no
* VM_FAULT_OOM), there is no need to kill anything.
* Just clean up the OOM state peacefully.
*/
if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
mem_cgroup_oom_synchronize(false);
}
mm_account_fault(regs, address, flags, ret);
return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
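/*
 * Editor's note: a hedged sketch of the canonical caller. Each arch's
 * do_page_fault() looks roughly like this; names and the error paths
 * vary per architecture, so treat it as illustrative only:
 *
 *	mmap_read_lock(mm);
 *	vma = find_vma(mm, address);
 *	if (likely(vma && vma->vm_start <= address))
 *		fault = handle_mm_fault(vma, address, flags, regs);
 *	if (fault & VM_FAULT_RETRY) {
 *		// mmap_lock was dropped by the fault path; retry once
 *	} else {
 *		mmap_read_unlock(mm);
 *	}
 */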
#ifndef __PAGETABLE_P4D_FOLDED
/*
* Allocate p4d page table.
* We've already handled the fast-path in-line.
*/
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
p4d_t *new = p4d_alloc_one(mm, address);
if (!new)
return -ENOMEM;
smp_wmb(); /* See comment in __pte_alloc */
spin_lock(&mm->page_table_lock);
if (pgd_present(*pgd)) /* Another has populated it */
p4d_free(mm, new);
else
pgd_populate(mm, pgd, new);
spin_unlock(&mm->page_table_lock);
return 0;
}
#endif /* __PAGETABLE_P4D_FOLDED */
#ifndef __PAGETABLE_PUD_FOLDED
/*
* Allocate page upper directory.
* We've already handled the fast-path in-line.
*/
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
{
pud_t *new = pud_alloc_one(mm, address);
if (!new)
return -ENOMEM;
smp_wmb(); /* See comment in __pte_alloc */
spin_lock(&mm->page_table_lock);
if (!p4d_present(*p4d)) {
mm_inc_nr_puds(mm);
p4d_populate(mm, p4d, new);
} else /* Another has populated it */
pud_free(mm, new);
spin_unlock(&mm->page_table_lock);
return 0;
}
#endif /* __PAGETABLE_PUD_FOLDED */
#ifndef __PAGETABLE_PMD_FOLDED
/*
* Allocate page middle directory.
* We've already handled the fast-path in-line.
*/
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
spinlock_t *ptl;
pmd_t *new = pmd_alloc_one(mm, address);
if (!new)
return -ENOMEM;
smp_wmb(); /* See comment in __pte_alloc */
ptl = pud_lock(mm, pud);
if (!pud_present(*pud)) {
mm_inc_nr_pmds(mm);
pud_populate(mm, pud, new);
} else /* Another has populated it */
pmd_free(mm, new);
spin_unlock(ptl);
return 0;
}
#endif /* __PAGETABLE_PMD_FOLDED */
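/*
 * Editor's note: the three helpers above share one race-tolerant shape:
 * allocate the new table with no locks held, then either publish it
 * under the lock or free it because another thread won the race. The
 * inline fast path that pairs with __pmd_alloc() lives in
 * include/linux/mm.h and looks roughly like this:
 *
 *	static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud,
 *				       unsigned long address)
 *	{
 *		return (unlikely(pud_none(*pud)) &&
 *			__pmd_alloc(mm, pud, address)) ?
 *				NULL : pmd_offset(pud, address);
 *	}
 */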
int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
struct mmu_notifier_range *range, pte_t **ptepp,
pmd_t **pmdpp, spinlock_t **ptlp)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *ptep;
pgd = pgd_offset(mm, address);
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
goto out;
p4d = p4d_offset(pgd, address);
if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
goto out;
pud = pud_offset(p4d, address);
if (pud_none(*pud) || unlikely(pud_bad(*pud)))
goto out;
pmd = pmd_offset(pud, address);
VM_BUG_ON(pmd_trans_huge(*pmd));
if (pmd_huge(*pmd)) {
if (!pmdpp)
goto out;
if (range) {
mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
NULL, mm, address & PMD_MASK,
(address & PMD_MASK) + PMD_SIZE);
mmu_notifier_invalidate_range_start(range);
}
*ptlp = pmd_lock(mm, pmd);
if (pmd_huge(*pmd)) {
*pmdpp = pmd;
return 0;
}
spin_unlock(*ptlp);
if (range)
mmu_notifier_invalidate_range_end(range);
}
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
if (range) {
mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
address & PAGE_MASK,
(address & PAGE_MASK) + PAGE_SIZE);
mmu_notifier_invalidate_range_start(range);
}
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!pte_present(*ptep))
goto unlock;
*ptepp = ptep;
return 0;
unlock:
pte_unmap_unlock(ptep, *ptlp);
if (range)
mmu_notifier_invalidate_range_end(range);
out:
return -EINVAL;
}
/**
* follow_pte - look up PTE at a user virtual address
* @mm: the mm_struct of the target address space
* @address: user virtual address
* @ptepp: location to store found PTE
* @ptlp: location to store the lock for the PTE
*
* On a successful return, the pointer to the PTE is stored in @ptepp;
* the corresponding lock is taken and its location is stored in @ptlp.
* The contents of the PTE are only stable until @ptlp is released;
* any further use, if any, must be protected against invalidation
* with MMU notifiers.
*
* Only IO mappings and raw PFN mappings are allowed. The mmap semaphore
* should be taken for read.
*
* KVM uses this function. While it is arguably less bad than ``follow_pfn``,
* it is not a good general-purpose API.
*
* Return: zero on success, -ve otherwise.
*/
int follow_pte(struct mm_struct *mm, unsigned long address,
pte_t **ptepp, spinlock_t **ptlp)
{
return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);
}
EXPORT_SYMBOL_GPL(follow_pte);
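/*
 * Editor's sketch of the expected calling pattern (illustrative, not an
 * in-tree caller): the PTE may only be examined while the returned lock
 * is held, exactly as the kernel-doc above states.
 *
 *	pte_t *ptep;
 *	spinlock_t *ptl;
 *	unsigned long pfn;
 *
 *	if (follow_pte(mm, address, &ptep, &ptl))
 *		return -EINVAL;
 *	pfn = pte_pfn(*ptep);		// stable only while ptl is held
 *	pte_unmap_unlock(ptep, ptl);
 */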
/**
* follow_pfn - look up PFN at a user virtual address
* @vma: memory mapping
* @address: user virtual address
* @pfn: location to store found PFN
*
* Only IO mappings and raw PFN mappings are allowed.
*
* This function does not allow the caller to read the permissions
* of the PTE. Do not use it.
*
* Return: zero and the pfn at @pfn on success, -ve otherwise.
*/
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
unsigned long *pfn)
{
int ret = -EINVAL;
spinlock_t *ptl;
pte_t *ptep;
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
return ret;
ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
if (ret)
return ret;
*pfn = pte_pfn(*ptep);
pte_unmap_unlock(ptep, ptl);
return 0;
}
EXPORT_SYMBOL(follow_pfn);
#ifdef CONFIG_HAVE_IOREMAP_PROT
int follow_phys(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
unsigned long *prot, resource_size_t *phys)
{
int ret = -EINVAL;
pte_t *ptep, pte;
spinlock_t *ptl;
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
goto out;
if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
goto out;
pte = *ptep;
if ((flags & FOLL_WRITE) && !pte_write(pte))
goto unlock;
*prot = pgprot_val(pte_pgprot(pte));
*phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
ret = 0;
unlock:
pte_unmap_unlock(ptep, ptl);
out:
return ret;
}
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write)
{
resource_size_t phys_addr;
unsigned long prot = 0;
void __iomem *maddr;
int offset = addr & (PAGE_SIZE-1);
if (follow_phys(vma, addr, write, &prot, &phys_addr))
return -EINVAL;
maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
if (!maddr)
return -ENOMEM;
if (write)
memcpy_toio(maddr + offset, buf, len);
else
memcpy_fromio(buf, maddr + offset, len);
iounmap(maddr);
return len;
}
EXPORT_SYMBOL_GPL(generic_access_phys);
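/*
 * Editor's note: a driver exposing a VM_IO/VM_PFNMAP mapping typically
 * wires this helper into its vm_operations_struct so access paths such
 * as ptrace and /proc/<pid>/mem can reach the mapping. "my_vm_ops" is a
 * hypothetical name:
 *
 *	static const struct vm_operations_struct my_vm_ops = {
 *		.access = generic_access_phys,
 *	};
 *	// in the driver's ->mmap(): vma->vm_ops = &my_vm_ops;
 */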
#endif
/*
* Access another process' address space as given in mm. If @tsk is
* non-NULL, it is used for page fault accounting.
*/
int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
unsigned long addr, void *buf, int len, unsigned int gup_flags)
{
struct vm_area_struct *vma;
void *old_buf = buf;
int write = gup_flags & FOLL_WRITE;
if (mmap_read_lock_killable(mm))
return 0;
/* ignore errors, just check how much was successfully transferred */
while (len) {
int bytes, ret, offset;
void *maddr;
struct page *page = NULL;
ret = get_user_pages_remote(mm, addr, 1,
gup_flags, &page, &vma, NULL);
if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
break;
#else
/*
* Check if this is a VM_IO | VM_PFNMAP VMA, which
* we can access using slightly different code.
*/
vma = find_vma(mm, addr);
if (!vma || vma->vm_start > addr)
break;
if (vma->vm_ops && vma->vm_ops->access)
ret = vma->vm_ops->access(vma, addr, buf,
len, write);
if (ret <= 0)
break;
bytes = ret;
#endif
} else {
bytes = len;
offset = addr & (PAGE_SIZE-1);
if (bytes > PAGE_SIZE-offset)
bytes = PAGE_SIZE-offset;
maddr = kmap(page);
if (write) {
copy_to_user_page(vma, page, addr,
maddr + offset, buf, bytes);
set_page_dirty_lock(page);
} else {
copy_from_user_page(vma, page, addr,
buf, maddr + offset, bytes);
}
kunmap(page);
put_user_page(page);
}
len -= bytes;
buf += bytes;
addr += bytes;
}
mmap_read_unlock(mm);
return buf - old_buf;
}
/**
* access_remote_vm - access another process' address space
* @mm: the mm_struct of the target address space
* @addr: start address to access
* @buf: source or destination buffer
* @len: number of bytes to transfer
* @gup_flags: flags modifying lookup behaviour
*
* The caller must hold a reference on @mm.
*
* Return: number of bytes copied from source to destination.
*/
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, unsigned int gup_flags)
{
return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
}
/*
* Access another process' address space.
* Source/target buffer must be in kernel space.
* Do not walk the page table directly; use get_user_pages().
*/
int access_process_vm(struct task_struct *tsk, unsigned long addr,
void *buf, int len, unsigned int gup_flags)
{
struct mm_struct *mm;
int ret;
mm = get_task_mm(tsk);
if (!mm)
return 0;
ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
mmput(mm);
return ret;
}
EXPORT_SYMBOL_GPL(access_process_vm);
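/*
 * Editor's sketch of a typical use, modelled on ptrace peek/poke
 * ("child" is a hypothetical task pointer, not from this file):
 *
 *	u32 val;
 *
 *	if (access_process_vm(child, addr, &val, sizeof(val),
 *			      FOLL_FORCE) != sizeof(val))
 *		return -EIO;	// partial or failed copy
 */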
/*
* Print the name of a VMA.
*/
void print_vma_addr(char *prefix, unsigned long ip)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
/*
* We might be running from an atomic context, so we cannot sleep.
*/
if (!mmap_read_trylock(mm))
return;
vma = find_vma(mm, ip);
if (vma && vma->vm_file) {
struct file *f = vma->vm_file;
char *buf = (char *)__get_free_page(GFP_NOWAIT);
if (buf) {
char *p;
p = file_path(f, buf, PAGE_SIZE);
if (IS_ERR(p))
p = "?";
printk("%s%s[%lx+%lx]", prefix, kbasename(p),
vma->vm_start,
vma->vm_end - vma->vm_start);
free_page((unsigned long)buf);
}
}
mmap_read_unlock(mm);
}
#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
void __might_fault(const char *file, int line)
{
/*
* Some code (nfs/sunrpc) uses socket ops on kernel memory while
* holding the mmap_lock, this is safe because kernel memory doesn't
* get paged out, therefore we'll never actually fault, and the
* below annotations will generate false positives.
*/
if (uaccess_kernel())
return;
if (pagefault_disabled())
return;
__might_sleep(file, line, 0);
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
if (current->mm)
might_lock_read(&current->mm->mmap_lock);
#endif
}
EXPORT_SYMBOL(__might_fault);
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
/*
* Process all subpages of the specified huge page with the specified
* operation. The target subpage will be processed last to keep its
* cache lines hot.
*/
static inline void process_huge_page(
unsigned long addr_hint, unsigned int pages_per_huge_page,
void (*process_subpage)(unsigned long addr, int idx, void *arg),
void *arg)
{
int i, n, base, l;
unsigned long addr = addr_hint &
~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
/* Process target subpage last to keep its cache lines hot */
might_sleep();
n = (addr_hint - addr) / PAGE_SIZE;
if (2 * n <= pages_per_huge_page) {
/* If target subpage in first half of huge page */
base = 0;
l = n;
/* Process subpages at the end of huge page */
for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
cond_resched();
process_subpage(addr + i * PAGE_SIZE, i, arg);
}
} else {
/* If target subpage in second half of huge page */
base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
l = pages_per_huge_page - n;
/* Process subpages at the beginning of the huge page */
for (i = 0; i < base; i++) {
cond_resched();
process_subpage(addr + i * PAGE_SIZE, i, arg);
}
}
/*
* Process remaining subpages in left-right-left-right pattern
* towards the target subpage
*/
for (i = 0; i < l; i++) {
int left_idx = base + i;
int right_idx = base + 2 * l - 1 - i;
cond_resched();
process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
cond_resched();
process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
}
}
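/*
 * Editor's worked example: with pages_per_huge_page == 8 and the fault
 * address in subpage 2 (n == 2, so base == 0 and l == 2), the subpages
 * are processed in the order 7, 6, 5, 4, then 0, 3, 1, 2 -- ending on
 * the target subpage so its cache lines are the hottest when the
 * faulting task resumes.
 */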
static void clear_gigantic_page(struct page *page,
unsigned long addr,
unsigned int pages_per_huge_page)
{
int i;
struct page *p = page;
might_sleep();
for (i = 0; i < pages_per_huge_page;
i++, p = mem_map_next(p, page, i)) {
cond_resched();
clear_user_highpage(p, addr + i * PAGE_SIZE);
}
}
static void clear_subpage(unsigned long addr, int idx, void *arg)
{
struct page *page = arg;
clear_user_highpage(page + idx, addr);
}
void clear_huge_page(struct page *page,
unsigned long addr_hint, unsigned int pages_per_huge_page)
{
unsigned long addr = addr_hint &
~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
clear_gigantic_page(page, addr, pages_per_huge_page);
return;
}
process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
}
static void copy_user_gigantic_page(struct page *dst, struct page *src,
unsigned long addr,
struct vm_area_struct *vma,
unsigned int pages_per_huge_page)
{
int i;
struct page *dst_base = dst;
struct page *src_base = src;
for (i = 0; i < pages_per_huge_page; ) {
cond_resched();
copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
i++;
dst = mem_map_next(dst, dst_base, i);
src = mem_map_next(src, src_base, i);
}
}
struct copy_subpage_arg {
struct page *dst;
struct page *src;
struct vm_area_struct *vma;
};
static void copy_subpage(unsigned long addr, int idx, void *arg)
{
struct copy_subpage_arg *copy_arg = arg;
copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
addr, copy_arg->vma);
}
void copy_user_huge_page(struct page *dst, struct page *src,
unsigned long addr_hint, struct vm_area_struct *vma,
unsigned int pages_per_huge_page)
{
unsigned long addr = addr_hint &
~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
struct copy_subpage_arg arg = {
.dst = dst,
.src = src,
.vma = vma,
};
if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
copy_user_gigantic_page(dst, src, addr, vma,
pages_per_huge_page);
return;
}
process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
}
long copy_huge_page_from_user(struct page *dst_page,
const void __user *usr_src,
unsigned int pages_per_huge_page,
bool allow_pagefault)
{
void *src = (void *)usr_src;
void *page_kaddr;
unsigned long i, rc = 0;
unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
struct page *subpage = dst_page;
for (i = 0; i < pages_per_huge_page;
i++, subpage = mem_map_next(subpage, dst_page, i)) {
if (allow_pagefault)
page_kaddr = kmap(subpage);
else
page_kaddr = kmap_atomic(subpage);
rc = copy_from_user(page_kaddr,
(const void __user *)(src + i * PAGE_SIZE),
PAGE_SIZE);
if (allow_pagefault)
kunmap(subpage);
else
kunmap_atomic(page_kaddr);
ret_val -= (PAGE_SIZE - rc);
if (rc)
break;
flush_dcache_page(subpage);
cond_resched();
}
return ret_val;
}
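/*
 * Editor's note on the return value, derived from the arithmetic above:
 * ret_val starts at the full huge page size and each iteration subtracts
 * the bytes actually copied (PAGE_SIZE - rc), so 0 means every byte was
 * copied and a non-zero result is the number of bytes left uncopied.
 */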
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
static struct kmem_cache *page_ptl_cachep;
void __init ptlock_cache_init(void)
{
page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
SLAB_PANIC, NULL);
}
bool ptlock_alloc(struct page *page)
{
spinlock_t *ptl;
ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
if (!ptl)
return false;
page->ptl = ptl;
return true;
}
void ptlock_free(struct page *page)
{
kmem_cache_free(page_ptl_cachep, page->ptl);
}
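/*
 * Editor's note: the consumer side lives in include/linux/mm.h.
 * Roughly, ptlock_init() calls ptlock_alloc() when a page-table page is
 * constructed and initializes the lock it installed (simplified; the
 * real helper also VM_BUG_ONs on a non-zero ptl field):
 *
 *	static inline bool ptlock_init(struct page *page)
 *	{
 *		if (!ptlock_alloc(page))
 *			return false;
 *		spin_lock_init(ptlock_ptr(page));
 *		return true;
 *	}
 */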
#endif