Files
android_kernel_xiaomi_sm8450/kernel/sched/fair.c
Greg Kroah-Hartman 66379c1ee5 Merge tag 'android12-5.10.66_r00' into android12-5.10
This is the merge of the upstream LTS release of 5.10.66 into the
android12-5.10 branch.

There are 2 new symbols that are now being tracked:

Leaf changes summary: 2 artifacts changed (1 filtered out)
Changed leaf types summary: 0 (1 filtered out) leaf type changed
Removed/Changed/Added functions summary: 0 Removed, 0 Changed, 2 Added functions
Removed/Changed/Added variables summary: 0 Removed, 0 Changed, 0 Added variable

2 Added functions:

  [A] 'function void __sdhci_set_timeout(sdhci_host*, mmc_command*)'
  [A] 'function void virtio_break_device(virtio_device*)'

It contains the following new commits:

a7b45c4c60c5 Merge tag 'android12-5.10.66_r00' into android12-5.10
d3c86f460d Merge 5.10.66 into android12-5.10-lts
e1ad6bbfcc ANDROID: GKI: update virtual device symbol list for led audio driver.
44a32dcb2f Linux 5.10.66
1de280adb2 Revert "time: Handle negative seconds correctly in timespec64_to_ns()"
f49fd9882f Revert "posix-cpu-timers: Force next expiration recalc after itimer reset"
0daa75bf75 Revert "block: nbd: add sanity check for first_minor"
d3c3f4e078 Revert "Bluetooth: Move shutdown callback before flushing tx and rx queue"
2300418cc6 Merge 5.10.65 into android12-5.10-lts
c31c2cca22 Linux 5.10.65
b216a075a9 clk: kirkwood: Fix a clocking boot regression
5866b1175d backlight: pwm_bl: Improve bootloader/kernel device handover
4c00435cb8 fbmem: don't allow too huge resolutions
34d099a330 IMA: remove the dependency on CRYPTO_MD5
5cc1ee3135 IMA: remove -Wmissing-prototypes warning
1319689981 fuse: flush extending writes
8018100c54 fuse: truncate pagecache on atomic_o_trunc
a8ca1fba54 ARM: dts: at91: add pinctrl-{names, 0} for all gpios
c2c7eefc93 KVM: nVMX: Unconditionally clear nested.pi_pending on nested VM-Enter
bf36224463 KVM: VMX: avoid running vmx_handle_exit_irqoff in case of emulation
c06e6ff2fc KVM: x86: Update vCPU's hv_clock before back to guest when tsc_offset is adjusted
1db337b10d KVM: s390: index kvm->arch.idle_mask by vcpu_idx
dc9db2a2aa Revert "KVM: x86: mmu: Add guest physical address check in translate_gpa()"
c6b42ec1c9 x86/resctrl: Fix a maybe-uninitialized build warning treated as error
bafece6cd1 perf/x86/amd/ibs: Extend PERF_PMU_CAP_NO_EXCLUDE to IBS Op
ae95c3a147 tty: Fix data race between tiocsti() and flush_to_ldisc()
4d0e6d6fe4 bio: fix page leak bio_add_hw_page failure
24fbd77d5a io_uring: IORING_OP_WRITE needs hash_reg_file set
656f343d72 time: Handle negative seconds correctly in timespec64_to_ns()
611b7f9dc9 f2fs: guarantee to write dirty data when enabling checkpoint back
75ffcd85df iwlwifi Add support for ax201 in Samsung Galaxy Book Flex2 Alpha
3853c0c070 ASoC: rt5682: Remove unused variable in rt5682_i2c_remove()
c4f1ad3930 ipv4: fix endianness issue in inet_rtm_getroute_build_skb()
dc4ff31506 octeontx2-af: Set proper errorcode for IPv4 checksum errors
bf2991f8e7 octeontx2-af: Fix static code analyzer reported issues
ee485124b7 octeontx2-af: Fix loop in free and unmap counter
a67c66c1bb net: qualcomm: fix QCA7000 checksum handling
f96bc82e03 net: sched: Fix qdisc_rate_table refcount leak when get tcf_block failed
5867e20e18 ipv4: make exception cache less predictible
8692f0bb29 ipv6: make exception cache less predictible
4663aaef24 brcmfmac: pcie: fix oops on failure to resume and reprobe
e68128e078 bcma: Fix memory leak for internally-handled cores
26fae720c1 atlantic: Fix driver resume flow.
cb996dc9f9 ath6kl: wmi: fix an error code in ath6kl_wmi_sync_point()
baecab8c46 ice: Only lock to update netdev dev_addr
bd6d9c83f4 iwlwifi: skip first element in the WTAS ACPI table
4c4f868082 iwlwifi: follow the new inclusive terminology
5c305b90d8 ASoC: wcd9335: Disable irq on slave ports in the remove function
729a459efd ASoC: wcd9335: Fix a memory leak in the error handling path of the probe function
9c640a2bb5 ASoC: wcd9335: Fix a double irq free in the remove function
8446bb0ff1 tty: serial: fsl_lpuart: fix the wrong mapbase value
9ee4ff8cbe usb: bdc: Fix a resource leak in the error handling path of 'bdc_probe()'
4d2823abd1 usb: bdc: Fix an error handling path in 'bdc_probe()' when no suitable DMA config is available
86b79054d7 usb: ehci-orion: Handle errors of clk_prepare_enable() in probe
f0bb631273 i2c: xlp9xx: fix main IRQ check
7ac3090e01 i2c: mt65xx: fix IRQ check
6c4857203f CIFS: Fix a potencially linear read overflow
b0491ab7d4 bpf: Fix possible out of bound write in narrow load handling
cfaefbcc6b mmc: moxart: Fix issue with uninitialized dma_slave_config
ced0bc7481 mmc: dw_mmc: Fix issue with uninitialized dma_slave_config
8a9f9b9755 mmc: sdhci: Fix issue with uninitialized dma_slave_config
dd903083cb ASoC: Intel: Skylake: Fix module resource and format selection
b0159dbd1d ASoC: Intel: Skylake: Leave data as is when invoking TLV IPCs
7934c79fb0 ASoC: Intel: kbl_da7219_max98927: Fix format selection for max98373
56d976f450 rsi: fix an error code in rsi_probe()
110ce7d256 rsi: fix error code in rsi_load_9116_firmware()
b4bbb77d88 gfs2: init system threads before freeze lock
ee029e3aa1 i2c: hix5hd2: fix IRQ check
d36ab9b3ee i2c: fix platform_get_irq.cocci warnings
187705a4b1 i2c: s3c2410: fix IRQ check
3913fa307a i2c: iop3xx: fix deferred probing
50e6f34499 Bluetooth: add timeout sanity check to hci_inquiry
cc59ad70cf lkdtm: replace SCSI_DISPATCH_CMD with SCSI_QUEUE_RQ
9295566a13 mm/swap: consider max pages in iomap_swapfile_add_extent
a9c29bc2a5 usb: gadget: mv_u3d: request_irq() after initializing UDC
b2f4dd13b2 firmware: raspberrypi: Fix a leak in 'rpi_firmware_get()'
60831f5ae6 firmware: raspberrypi: Keep count of all consumers
5c68b7795b i2c: synquacer: fix deferred probing
f577e9f58f clk: staging: correct reference to config IOMEM to config HAS_IOMEM
5ae5f087c9 arm64: dts: marvell: armada-37xx: Extend PCIe MEM space
cb788d698a nfsd4: Fix forced-expiry locking
c9773f42c1 lockd: Fix invalid lockowner cast after vfs_test_lock
2600861b90 locking/local_lock: Add missing owner initialization
d5462a630f locking/lockdep: Mark local_lock_t
22b106df73 mac80211: Fix insufficient headroom issue for AMSDU
0ad4ddb27e libbpf: Re-build libbpf.so when libbpf.map changes
494629ba62 usb: phy: tahvo: add IRQ check
46638d6941 usb: host: ohci-tmio: add IRQ check
4b7874a32e PM: cpu: Make notifier chain use a raw_spinlock_t
4711284768 Bluetooth: Move shutdown callback before flushing tx and rx queue
d993a6f137 samples: pktgen: add missing IPv6 option to pktgen scripts
2c0b826f4a devlink: Clear whole devlink_flash_notify struct
2aa3d5c9e1 selftests/bpf: Fix test_core_autosize on big-endian machines
c03bf1bc84 usb: gadget: udc: renesas_usb3: Fix soc_device_match() abuse
eabbb2e8cc usb: phy: twl6030: add IRQ checks
fa5dbfd539 usb: phy: fsl-usb: add IRQ check
99ad1be3e9 usb: gadget: udc: s3c2410: add IRQ check
0a77314589 usb: gadget: udc: at91: add IRQ check
27f102bcee usb: dwc3: qcom: add IRQ check
c4e0f54a56 usb: dwc3: meson-g12a: add IRQ check
96ba1e20e2 ASoC: rt5682: Properly turn off regulators if wrong device ID
1a2feb2304 ASoC: rt5682: Implement remove callback
628acf6ee2 net/mlx5: Fix unpublish devlink parameters
fe6322774c net/mlx5: Register to devlink ingress VLAN filter trap
dbeb4574dd drm/msm/dsi: Fix some reference counted resource leaks
059c2c09f4 Bluetooth: fix repeated calls to sco_sock_kill
6df58421da ASoC: Intel: Fix platform ID matching
10dfcfda5c cgroup/cpuset: Fix violation of cpuset locking rule
cbc9766143 cgroup/cpuset: Miscellaneous code cleanup
974ab0a04f counter: 104-quad-8: Return error when invalid mode during ceiling_write
c158f9b232 arm64: dts: exynos: correct GIC CPU interfaces address range on Exynos7
7125705623 drm/msm/dpu: make dpu_hw_ctl_clear_all_blendstages clear necessary LMs
a6e980b110 drm/msm/mdp4: move HW revision detection to earlier phase
90363618b5 drm/msm/mdp4: refactor HW revision detection into read_mdp_hw_revision
416929eaf4 selftests/bpf: Fix bpf-iter-tcp4 test to print correctly the dest IP
d6337dfd1e PM: EM: Increase energy calculation precision
5014a8453f Bluetooth: increase BTNAMSIZ to 21 chars to fix potential buffer overflow
afffa7b4c6 debugfs: Return error during {full/open}_proxy_open() on rmmod
17830b0415 soc: qcom: smsm: Fix missed interrupts if state changes while masked
b8361513ac bpf, samples: Add missing mprog-disable to xdp_redirect_cpu's optstring
cd6008e31a PCI: PM: Enable PME if it can be signaled from D3cold
3890c6e1da PCI: PM: Avoid forcing PCI_D0 for wakeup reasons inconsistently
eda4ccca90 media: venus: venc: Fix potential null pointer dereference on pointer fmt
519ad41a09 media: em28xx-input: fix refcount bug in em28xx_usb_disconnect
a7dd8b778a leds: trigger: audio: Add an activate callback to ensure the initial brightness is set
917191d582 leds: lt3593: Put fwnode in any case during ->probe()
eef8496579 i2c: highlander: add IRQ check
11dd40c189 net/mlx5: Fix missing return value in mlx5_devlink_eswitch_inline_mode_set()
b376ae5597 devlink: Break parameter notification sequence to be before/after unload/load driver
9fa9ff1040 arm64: dts: renesas: hihope-rzg2-ex: Add EtherAVB internal rx delay
e4da0e0006 arm64: dts: renesas: rzg2: Convert EtherAVB to explicit delay handling
61b1db2358 Bluetooth: mgmt: Fix wrong opcode in the response for add_adv cmd
bca46d2283 net: cipso: fix warnings in netlbl_cipsov4_add_std
b6b5dc12bd drm: mxsfb: Clear FIFO_CLEAR bit
1a0014c1c6 drm: mxsfb: Increase number of outstanding requests on V4 and newer HW
46f5463940 drm: mxsfb: Enable recovery on underflow
e0f3de1573 cgroup/cpuset: Fix a partition bug with hotplug
7a0b297480 net/mlx5e: Block LRO if firmware asks for tunneled LRO
c40ed983b8 net/mlx5e: Prohibit inner indir TIRs in IPoIB
a11fc1cd8a ARM: dts: meson8b: ec100: Fix the pwm regulator supply properties
2e68547e99 ARM: dts: meson8b: mxq: Fix the pwm regulator supply properties
0d40e59c03 ARM: dts: meson8b: odroidc1: Fix the pwm regulator supply properties
eda87dd473 ARM: dts: meson8: Use a higher default GPU clock frequency
a7d0a59e21 tcp: seq_file: Avoid skipping sk during tcp_seek_last_pos
1f60072320 drm/amdgpu/acp: Make PM domain really work
c7ebd3622b 6lowpan: iphc: Fix an off-by-one check of array index
def6efdf91 Bluetooth: sco: prevent information leak in sco_conn_defer_accept()
e9a6274087 media: atomisp: fix the uninitialized use and rename "retvalue"
b0e87701b8 media: coda: fix frame_mem_ctrl for YUV420 and YVU420 formats
c062253748 media: rockchip/rga: fix error handling in probe
dc49537334 media: rockchip/rga: use pm_runtime_resume_and_get()
94d6aa2b87 media: go7007: remove redundant initialization
ffd9c8cecb media: go7007: fix memory leak in go7007_usb_probe
fb22665c37 media: dvb-usb: Fix error handling in dvb_usb_i2c_init
6b0fe69534 media: dvb-usb: fix uninit-value in vp702x_read_mac_addr
372890e0b4 media: dvb-usb: fix uninit-value in dvb_usb_adapter_dvb_init
83f7297a4a ionic: cleanly release devlink instance
203537ff35 driver core: Fix error return code in really_probe()
4225d357bc firmware: fix theoretical UAF race with firmware cache and resume
c4aaad8a33 gfs2: Fix memory leak of object lsi on error return path
8c3b5028ec libbpf: Fix removal of inner map in bpf_object__create_map
ffb887c15f soc: qcom: rpmhpd: Use corner in power_off
f32b433d8e i40e: improve locking of mac_filter_hash
5ac21a4e6e arm64: dts: renesas: r8a77995: draak: Remove bogus adv7511w properties
a8c1eaed23 ARM: dts: aspeed-g6: Fix HVI3C function-group in pinctrl dtsi
6ca0b40891 libbpf: Fix the possible memory leak on error
f1673e8525 gve: fix the wrong AdminQ buffer overflow check
1568dbe889 drm/of: free the iterator object on failure
389dfd1147 bpf: Fix potential memleak and UAF in the verifier.
d4213b7093 bpf: Fix a typo of reuseport map in bpf.h.
56e5c527cc drm/of: free the right object
38235f195d media: cxd2880-spi: Fix an error handling path
25fbfc31ce soc: rockchip: ROCKCHIP_GRF should not default to y, unconditionally
c391728c9b leds: is31fl32xx: Fix missing error code in is31fl32xx_parse_dt()
d4abb6e141 media: TDA1997x: enable EDID support
8ce22f8538 ASoC: mediatek: mt8183: Fix Unbalanced pm_runtime_enable in mt8183_afe_pcm_dev_probe
3d58f5e83f drm/gma500: Fix end of loop tests for list_for_each_entry
54912723f1 drm/panfrost: Fix missing clk_disable_unprepare() on error in panfrost_clk_init()
1e1423449d EDAC/i10nm: Fix NVDIMM detection
a20e6868cb spi: spi-zynq-qspi: use wait_for_completion_timeout to make zynq_qspi_exec_mem_op not interruptible
e2cb04c61b spi: sprd: Fix the wrong WDG_LOAD_VAL
cd8cca7268 regulator: vctrl: Avoid lockdep warning in enable/disable ops
8665e30317 regulator: vctrl: Use locked regulator_get_voltage in probe path
80b1a70b04 blk-crypto: fix check for too-large dun_bytes
ba6e5af621 spi: davinci: invoke chipselect callback
c0aec70a25 x86/mce: Defer processing of early errors
6627be8b36 tpm: ibmvtpm: Avoid error message when process gets signal while waiting
bd2028e9e2 certs: Trigger creation of RSA module signing key if it's not an RSA key
fddf3a72ab crypto: qat - use proper type for vf_mask
e7273d57d2 irqchip/gic-v3: Fix priority comparison when non-secure priorities are used
f1f6d3d2ad spi: coldfire-qspi: Use clk_disable_unprepare in the remove function
4b21d4e820 block: nbd: add sanity check for first_minor
31fc50cd93 clocksource/drivers/sh_cmt: Fix wrong setting if don't request IRQ for clock source channel
dde7ff1c19 lib/mpi: use kcalloc in mpi_resize
20d84fc59e irqchip/loongson-pch-pic: Improve edge triggered interrupt support
e9a902f882 genirq/timings: Fix error return code in irq_timings_test_irqs()
10d3bdd2d5 spi: spi-pic32: Fix issue with uninitialized dma_slave_config
d4ec971bfa spi: spi-fsl-dspi: Fix issue with uninitialized dma_slave_config
87aa69aa10 block: return ELEVATOR_DISCARD_MERGE if possible
3868507181 m68k: Fix invalid RMW_INSNS on CPUs that lack CAS
497f3d9c3f rcu: Fix stall-warning deadlock due to non-release of rcu_node ->lock
ea5e5bc881 rcu: Add lockdep_assert_irqs_disabled() to rcu_sched_clock_irq() and callees
527b56d785 rcu: Fix to include first blocked task in stall warning
e6778e1b22 sched: Fix UCLAMP_FLAG_IDLE setting
718180c246 sched/numa: Fix is_core_idle()
bf4b0fa3a2 m68k: emu: Fix invalid free in nfeth_cleanup()
246c771b85 power: supply: cw2015: use dev_err_probe to allow deferred probe
a758b1d4ca s390/ap: fix state machine hang after failure to enable irq
86f9980909 s390/debug: fix debug area life cycle
0980d2b21f s390/debug: keep debug data on resize
0404bf4a66 s390/pci: fix misleading rc in clp_set_pci_fn()
8b471e72b5 s390/kasan: fix large PMD pages address alignment check
9d999957cb udf_get_extendedattr() had no boundary checks.
db2f238d8d fcntl: fix potential deadlock for &fasync_struct.fa_lock
349633ed31 crypto: qat - do not export adf_iov_putmsg()
205cfad5c0 crypto: qat - fix naming for init/shutdown VF to PF notifications
c29cc43e30 crypto: qat - fix reuse of completion variable
e53575ea28 crypto: qat - handle both source of interrupt in VF ISR
9819975c63 crypto: qat - do not ignore errors from enable_vf2pf_comms()
6f3c58bd62 crypto: omap - Fix inconsistent locking of device lists
fc4073df29 libata: fix ata_host_start()
cf619a528e s390/zcrypt: fix wrong offset index for APKA master key valid state
b4aa00bf8a s390/cio: add dev_busid sysfs entry for each subchannel
d0831db736 power: supply: max17042_battery: fix typo in MAx17042_TOFF
5d59f38c6b power: supply: smb347-charger: Add missing pin control activation
10e759e350 nvmet: pass back cntlid on successful completion
ea4a353c0e nvme-rdma: don't update queue count when failing to set io queues
5d0f0c3bbe nvme-tcp: don't update queue count when failing to set io queues
591f69d7c4 blk-throtl: optimize IOPS throttle for large IO scenarios
cf13537be5 bcache: add proper error unwinding in bcache_device_init
48aa6e4e28 isofs: joliet: Fix iocharset=utf8 mount option
940ac46132 udf: Fix iocharset=utf8 mount option
4cf1551af3 udf: Check LVID earlier
3d12ccecfa hrtimer: Ensure timerfd notification for HIGHRES=n
aadfa1d6ca hrtimer: Avoid double reprogramming in __hrtimer_start_range_ns()
13ccaef77e posix-cpu-timers: Force next expiration recalc after itimer reset
8a6c5eec81 EDAC/mce_amd: Do not load edac_mce_amd module on guests
4b680b3fc6 rcu/tree: Handle VM stoppage in stall detection
1cc05d71f0 sched/deadline: Fix missing clock update in migrate_task_rq_dl()
104adbffbe crypto: omap-sham - clear dma flags only after omap_sham_update_dma_stop()
ce7f2b516c power: supply: axp288_fuel_gauge: Report register-address on readb / writeb errors
3ebd7b3841 sched/deadline: Fix reset_on_fork reporting of DL tasks
8c4d94db5a crypto: mxs-dcp - Check for DMA mapping errors
7bb6302e9d regulator: tps65910: Silence deferred probe error
a859850996 regmap: fix the offset of register error log
97bc540bfb locking/mutex: Fix HANDOFF condition
5df7cc992d ANDROID: GKI: update .xml after android12-5.10 merge
639159686b Merge branch 'android12-5.10' into `android12-5.10-lts`
8a365a2340 Revert "tty: drop termiox user definitions"
c8de3a470a Merge 5.10.64 into android12-5.10-lts
cb83afdc0b Linux 5.10.64
f72fce5507 PCI: Call Max Payload Size-related fixup quirks early
8c04a16d20 x86/reboot: Limit Dell Optiplex 990 quirk to early BIOS versions
1234849353 xhci: fix unsafe memory usage in xhci tracing
3f7f1baf70 xhci: fix even more unsafe memory usage in xhci tracing
30e6e9f8bf usb: mtu3: fix the wrong HS mult value
8a4439aaf4 usb: mtu3: use @mult for HS isoc or intr
147819723c usb: mtu3: restore HS function when set SS/SSP
c75e2fd0d3 usb: gadget: tegra-xudc: fix the wrong mult value for HS isoc or intr
d544c9a219 usb: host: xhci-rcar: Don't reload firmware after the completion
c3fd7b0b9a ALSA: usb-audio: Add registration quirk for JBL Quantum 800
798679af79 blk-mq: clearing flush request reference in tags->rqs[]
e51ff3ffc3 netfilter: nftables: clone set element expression template
36983fc2f8 netfilter: nf_tables: initialize set before expression setup
3fda454f90 netfilter: nftables: avoid potential overflows on 32bit arches
cad6239f50 blk-mq: fix is_flush_rq
ceffaa61b5 blk-mq: fix kernel panic during iterating over flush request
bc1b5c5f3e x86/events/amd/iommu: Fix invalid Perf result due to IOMMU PMC power-gating
554efc9a61 Revert "r8169: avoid link-up interrupt issue on RTL8106e if user enables ASPM"
d24347e2ff tty: drop termiox user definitions
0757a883b9 net: linux/skbuff.h: combine SKB_EXTENSIONS + KCOV handling
0b62660c6a serial: 8250: 8250_omap: Fix unused variable warning
7ff0b71b68 net: kcov: don't select SKB_EXTENSIONS when there is no NET
c422599206 mm/page_alloc: speed up the iteration of max_order
50e56c68e1 net: ll_temac: Remove left-over debug message
6e2c4e6656 USB: serial: mos7720: improve OOM-handling in read_mos_reg()
ddd7e8b7b8 igmp: Add ip_mc_list lock in ip_check_mc_rcu
06e0ef2a71 ANDROID: GKI: fix up spi structure change
77b971b479 Merge 5.10.63 into android12-5.10-lts
e07f317d5a Linux 5.10.63
4405ea221d media: stkwebcam: fix memory leak in stk_camera_probe
ad5e13f15d fuse: fix illegal access to inode with reused nodeid
40ba433a85 new helper: inode_wrong_type()
ded9137fcf spi: Switch to signed types for *_native_cs SPI controller fields
55bb5193ce serial: 8250: 8250_omap: Fix possible array out of bounds access
8e41134a92 ALSA: pcm: fix divide error in snd_pcm_lib_ioctl
4ffde17862 ALSA: hda/realtek: Workaround for conflicting SSID on ASUS ROG Strix G17
4ee2686b37 ALSA: hda/realtek: Quirk for HP Spectre x360 14 amp setup
2808d59fb2 cryptoloop: add a deprecation warning
61a038f80c perf/x86/amd/power: Assign pmu.module
ec9a82e034 perf/x86/amd/ibs: Work around erratum #1197
23c29490b8 ceph: fix possible null-pointer dereference in ceph_mdsmap_decode()
d2064a1444 perf/x86/intel/pt: Fix mask of num_address_ranges
0e74bba604 qede: Fix memset corruption
35f223cb21 net: macb: Add a NULL check on desc_ptp
cf50d02e47 qed: Fix the VF msix vectors flow
2177c4943e reset: reset-zynqmp: Fixed the argument data type
9872349b08 gpu: ipu-v3: Fix i.MX IPU-v3 offset calculations for (semi)planar U/V formats
b983d60292 ARM: OMAP1: ams-delta: remove unused function ams_delta_camera_power
bc860c3f09 xtensa: fix kconfig unmet dependency warning for HAVE_FUTEX_CMPXCHG
b1075d2a70 static_call: Fix unused variable warn w/o MODULE
ae16b7c668 Revert "Add a reference to ucounts for each cred"
1aa3f27e59 Revert "cred: add missing return error code when set_cred_ucounts() failed"
0c1443874e Revert "ucounts: Increase ucounts reference counter before the security hook"
0479b2bd29 ubifs: report correct st_size for encrypted symlinks
3ac01789f6 f2fs: report correct st_size for encrypted symlinks
894a02236d ext4: report correct st_size for encrypted symlinks
b8c298cf57 fscrypt: add fscrypt_symlink_getattr() for computing st_size
09a3795496 ext4: fix race writing to an inline_data file while its xattrs are changing
0115d3d9f9 ANDROID: GKI: add virtio_break_device to the symbol list
59911be7e5 Revert "once: Fix panic when module unload"
4b20d2de0b Revert "pipe: avoid unnecessary EPOLLET wakeups under normal loads"
b6e7497caf Revert "pipe: do FASYNC notifications for every pipe IO, not just state changes"
674d2ac211 Merge 5.10.62 into android12-5.10-lts
f6dd002450 Linux 5.10.62
0c9a876f28 bpf: Fix potentially incorrect results with bpf_get_local_storage()
38c1915d3e audit: move put_tree() to avoid trim_trees refcount underflow and UAF
1890ee7ff8 net: don't unconditionally copy_from_user a struct ifreq for socket ioctls
0085646e02 Revert "parisc: Add assembly implementations for memset, strlen, strcpy, strncpy and strcat"
17982c664f Revert "floppy: reintroduce O_NDELAY fix"
709c162ddc kthread: Fix PF_KTHREAD vs to_kthread() race
c43add24df btrfs: fix NULL pointer dereference when deleting device by invalid id
1604c42a1c arm64: dts: qcom: msm8994-angler: Fix gpio-reserved-ranges 85-88
f760c1101f lkdtm: Enable DOUBLE_FAULT on all architectures
b6c657abb8 net: dsa: mt7530: fix VLAN traffic leaks again
f8242f554c usb: typec: ucsi: Clear pending after acking connector change
e15e32d519 usb: typec: ucsi: Work around PPM losing change information
08953884aa usb: typec: ucsi: acpi: Always decode connector change information
9a4f1dc8a1 tracepoint: Use rcu get state and cond sync for static call updates
b6ae385407 srcu: Provide polling interfaces for Tiny SRCU grace periods
450948b06c srcu: Make Tiny SRCU use multi-bit grace-period counter
641e1d8840 srcu: Provide internal interface to start a Tiny SRCU grace period
f789de3be8 srcu: Provide polling interfaces for Tree SRCU grace periods
fdf66e5a7f srcu: Provide internal interface to start a Tree SRCU grace period
d3c38d8549 powerpc/perf: Invoke per-CPU variable access with disabled interrupts
77b77d45a4 perf annotate: Fix jump parsing for C++ code.
9f9e40ddfc perf tools: Fix arm64 build error with gcc-11
94687c49b6 perf record: Fix memory leak in vDSO found using ASAN
e0ca67030f perf symbol-elf: Fix memory leak by freeing sdt_note.args
0d8e39bb94 perf env: Fix memory leak of bpf_prog_info_linear member
133d7f93ee riscv: Fixup patch_text panic in ftrace
7e2087249e riscv: Fixup wrong ftrace remove cflag
b42fde92cd Bluetooth: btusb: check conditions before enabling USB ALT 3 for WBS
60d69cb4e6 vt_kdsetmode: extend console locking
0a178a0151 tipc: call tipc_wait_for_connect only when dlen is not 0
ded6da217c mtd: spinand: Fix incorrect parameters for on-die ECC
3b2018f9c9 pipe: do FASYNC notifications for every pipe IO, not just state changes
e91da23c1b pipe: avoid unnecessary EPOLLET wakeups under normal loads
d845f89d59 btrfs: fix race between marking inode needs to be logged and log syncing
6f38d95f33 net/rds: dma_map_sg is entitled to merge entries
b882dda2bf drm/nouveau/kms/nv50: workaround EFI GOP window channel format differences
7f422cda03 drm/nouveau/disp: power down unused DP links during init
6fd6e20520 drm: Copy drm_wait_vblank to user before returning
26ee94ba34 blk-mq: don't grab rq's refcount in blk_mq_check_expired()
b00ca56757 drm/amd/pm: change the workload type for some cards
3c37ec4350 Revert "drm/amd/pm: fix workload mismatch on vega10"
cc126b400b qed: Fix null-pointer dereference in qed_rdma_create_qp()
18a65ba069 qed: qed ll2 race condition fixes
4ac9c81e8a tools/virtio: fix build
c7ee4d2261 vringh: Use wiov->used to check for read/write desc order
6c074eaaf7 virtio_vdpa: reject invalid vq indices
0698278e8e virtio_pci: Support surprise removal of virtio pci device
065a13c299 virtio: Improve vq->broken access to avoid any compiler optimization
f41c7462d8 cpufreq: blocklist Qualcomm sm8150 in cpufreq-dt-platdev
3dea931590 opp: remove WARN when no valid OPPs remain
be37f7dbcd iwlwifi: pnvm: accept multiple HW-type TLVs
9a6a5602c2 clk: renesas: rcar-usb2-clock-sel: Fix kernel NULL pointer dereference
bdc5049c36 perf/x86/intel/uncore: Fix integer overflow on 23 bit left shift of a u32
c5600b9146 dt-bindings: sifive-l2-cache: Fix 'select' matching
ad5329a533 usb: gadget: u_audio: fix race condition on endpoint stop
257ea8a5ed drm/i915: Fix syncmap memory leak
e49b8d9c5e net: stmmac: fix kernel panic due to NULL pointer dereference of plat->est
b2091d47a1 net: stmmac: add mutex lock to protect est parameters
ac874290e7 Revert "mmc: sdhci-iproc: Set SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN on BCM2711"
411680a07c net: hns3: fix get wrong pfc_en when query PFC configuration
e834ca7c79 net: hns3: fix duplicate node in VLAN list
5931ec35e9 net: hns3: add waiting time before cmdq memory is released
9820af16a8 net: hns3: clear hardware resource when loading driver
ad0db83855 rtnetlink: Return correct error on changing device netns
51bc5c6660 cxgb4: dont touch blocked freelist bitmap after free
beefd5f0c6 ipv4: use siphash instead of Jenkins in fnhe_hashfun()
dced8347a7 ipv6: use siphash in rt6_exception_hash()
f517335a61 net/sched: ets: fix crash when flipping from 'strict' to 'quantum'
b493af3a66 ucounts: Increase ucounts reference counter before the security hook
8e0881f6f5 net: marvell: fix MVNETA_TX_IN_PRGRS bit number
850401a23a xgene-v2: Fix a resource leak in the error handling path of 'xge_probe()'
fb45459d9d ip_gre: add validation for csum_start
e78006b59a RDMA/efa: Free IRQ vectors on error flow
8f1e3ad945 e1000e: Do not take care about recovery NVM checksum
87285ac51e e1000e: Fix the max snoop/no-snoop latency for 10M
58b3dbf10c igc: Use num_tx_queues when iterating over tx_ring queue
ae6480ba06 igc: fix page fault when thunderbolt is unplugged
384dea502e net: usb: pegasus: fixes of set_register(s) return value evaluation;
3217c9d460 ice: do not abort devlink info if board identifier can't be found
3a2c5fbb1c RDMA/bnxt_re: Remove unpaired rtnl unlock in bnxt_re_dev_init()
56ac7463a1 IB/hfi1: Fix possible null-pointer dereference in _extend_sdma_tx_descs()
3e949aaa8b RDMA/bnxt_re: Add missing spin lock initialization
22c18102ec scsi: core: Fix hang of freezing queue between blocking and running device
01da7c1dc4 usb: dwc3: gadget: Stop EP0 transfers during pullup disable
87b2016493 usb: dwc3: gadget: Fix dwc3_calc_trbs_left()
56c92b8ddc usb: renesas-xhci: Prefer firmware loading on unknown ROM state
b0bcc80388 USB: serial: option: add new VID/PID to support Fibocom FG150
8437e07c37 Revert "USB: serial: ch341: fix character loss at high transfer rates"
da3067eadc drm/amdgpu: Cancel delayed work when GFXOFF is disabled
3134292a8e Revert "btrfs: compression: don't try to compress if we don't have enough pages"
921c2533aa riscv: Ensure the value of FP registers in the core dump file is up to date
e55a8b4615 ceph: correctly handle releasing an embedded cap flush
7008b9981b can: usb: esd_usb2: esd_usb2_rx_event(): fix the interchange of the CAN RX and TX error counters
45b7b20971 net: mscc: Fix non-GPL export of regmap APIs
ef2d68ef9a ovl: fix uninitialized pointer read in ovl_lookup_real_one()
c94d50979f blk-iocost: fix lockdep warning on blkcg->lock
6815e21fe2 once: Fix panic when module unload
f68ad168e2 netfilter: conntrack: collect all entries in one cycle
a13a2df0b1 ARC: Fix CONFIG_STACKDEPOT
0af6a9f82c ASoC: component: Remove misplaced prefix handling in pin control functions
34cc80ec12 ASoC: rt5682: Adjust headset volume button threshold
d81ddadabd bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper
9dd6f6d896 bpf: Fix ringbuf helper function compatibility
ad41706c77 net: qrtr: fix another OOB Read in qrtr_endpoint_post
45cad77f78 ANDROID: GKI: update the android12-5.10-lts abi .xml file
af06413d4c Merge branch 'android12-5.10' into `android12-5.10-lts`
383ea08168 ANDROID: GKI: db845c: Update symbols list and ABI for lts v5.10.61
e0382dd2e5 Revert "virtio: Protect vqs list access"
a6777a7cee Merge 5.10.61 into android12-5.10-lts
83da0c0fca Revert "net: igmp: fix data-race in igmp_ifc_timer_expire()"
a75a648f84 Revert "net: igmp: increase size of mr_ifc_count"
63aa0473df Revert "PCI/MSI: Protect msi_desc::masked for multi-MSI"
b558262fdc Merge 5.10.60 into android12-5.10-lts
56f751a409 Merge branch 'android12-5.10' into `android12-5.10-lts`
452ea6a15e Linux 5.10.61
f15e642673 io_uring: only assign io_uring_enter() SQPOLL error in actual error case
695ab28a7f io_uring: fix xa_alloc_cycle() error return value check
0d5fcfc640 fs: warn about impending deprecation of mandatory locks
8132fc2bf4 mm: memcontrol: fix occasional OOMs due to proportional memory.low reclaim
53e81668e1 ASoC: intel: atom: Fix breakage for PCM buffer address setup
88f65f57a0 ALSA: hda/realtek: Limit mic boost on HP ProBook 445 G8
b6672f67ec PCI: Increase D3 delay for AMD Renoir/Cezanne XHCI
a69326e134 s390/pci: fix use after free of zpci_dev
05b56e0554 ALSA: hda/via: Apply runtime PM workaround for ASUS B23E
67fece6289 btrfs: prevent rename2 from exchanging a subvol with a directory from different parents
16cfa72766 mmc: sdhci-iproc: Set SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN on BCM2711
2566c1d823 mmc: sdhci-iproc: Cap min clock frequency on BCM2711
110b7f72f6 ALSA: hda/realtek: Enable 4-speaker output for Dell XPS 15 9510 laptop
258782b937 ipack: tpci200: fix memory leak in the tpci200_register
3ee1b08097 ipack: tpci200: fix many double free issues in tpci200_pci_probe
0775bc462a slimbus: ngd: reset dma setup during runtime pm
45d6fc21cd slimbus: messaging: check for valid transaction id
b700b523dd slimbus: messaging: start transaction ids from 1 instead of zero
bd0c2f83d0 tracing / histogram: Fix NULL pointer dereference on strcmp() on NULL event name
f7c125493c ALSA: hda - fix the 'Capture Switch' value change notifications
7451c309c7 clk: qcom: gdsc: Ensure regulator init state matches GDSC state
7203b4986d clk: imx6q: fix uart earlycon unwork
3f8920c570 mmc: sdhci-msm: Update the software timeout value for sdhc
8f499a90e7 mmc: mmci: stm32: Check when the voltage switch procedure should be done
f8dac276a9 mmc: dw_mmc: Fix hang on data CRC error
645fd92c3e Revert "flow_offload: action should not be NULL when it is referenced"
2f6c42806e iavf: Fix ping is lost after untrusted VF had tried to change MAC
7873c29832 i40e: Fix ATR queue selection
e003a89219 r8152: fix writing USB_BP2_EN
21ca0b18ad iommu/vt-d: Fix incomplete cache flush in intel_pasid_tear_down_entry()
81578e587c iommu/vt-d: Consolidate duplicate cache invaliation code
eee84eafc3 ovs: clear skb->tstamp in forwarding path
47a1161dac net: mdio-mux: Handle -EPROBE_DEFER correctly
13af9c81e6 net: mdio-mux: Don't ignore memory allocation errors
df61235881 sch_cake: fix srchost/dsthost hashing mode
e1ec5858ba ixgbe, xsk: clean up the resources in ixgbe_xsk_pool_enable error path
3b7397b203 net: qlcnic: add missed unlock in qlcnic_83xx_flash_read32
9bc2d1a5a8 virtio-net: use NETIF_F_GRO_HW instead of NETIF_F_LRO
b7adfde949 virtio-net: support XDP when not more queues
1ce62fe6e4 vrf: Reset skb conntrack connection on VRF rcv
8ae539a361 bnxt_en: Add missing DMA memory barriers
9751aa4436 bnxt_en: Disable aRFS if running on 212 firmware
efd9b79b92 ptp_pch: Restore dependency on PCI
85e0518f18 net: 6pack: fix slab-out-of-bounds in decode_data
e0ae168360 bnxt: count Tx drops
296fe765dd bnxt: make sure xmit_more + errors does not miss doorbells
d913d5cc3b bnxt: disable napi before canceling DIM
5b24ae8f05 bnxt: don't lock the tx queue from napi poll
585ff7344e bpf: Clear zext_dst of dead insns
8dfdeeb1e9 drm/mediatek: Add AAL output size configuration
a8b8d61bab drm/mediatek: Fix aal size config
6715cefa72 soc / drm: mediatek: Move DDP component defines into mtk-mmsys.h
1b6fc6f739 vdpa/mlx5: Avoid destroying MR on empty iotlb
ecdd7c4880 vhost: Fix the calculation in vhost_overflow()
e0b603c89a bus: ti-sysc: Fix error handling for sysc_check_active_timer()
1af7ccbd92 vhost-vdpa: Fix integer overflow in vhost_vdpa_process_iotlb_update()
293180f593 virtio: Protect vqs list access
9108120fbe dccp: add do-while-0 stubs for dccp_pr_debug macros
e352531ed0 cpufreq: armada-37xx: forbid cpufreq for 1.2 GHz variant
d2ab5491de iommu: Check if group is NULL before remove device
44f454a146 arm64: dts: qcom: msm8992-bullhead: Remove PSCI
95ed753d91 arm64: dts: qcom: c630: fix correct powerdown pin for WSA881x
d7d04c6749 Bluetooth: hidp: use correct wait queue when removing ctrl_wait
2e6cc93e1b drm/amd/display: workaround for hard hang on HPD on native DP
dcc8c5fb8d drm/amd/display: Fix Dynamic bpp issue with 8K30 with Navi 1X
8849a8c705 net: usb: lan78xx: don't modify phy_device state concurrently
735e613fa5 net: usb: pegasus: Check the return value of get_geristers() and friends;
6b368411bc ARM: dts: nomadik: Fix up interrupt controller node names
410d1ea4ff qede: fix crash in rmmod qede while automatic debug collection
7525f2e4de drm/amdgpu: fix the doorbell missing when in CGPG issue for renoir.
711459514e scsi: core: Fix capacity set to zero after offlinining device
8071dbe1bd scsi: core: Avoid printing an error if target_alloc() returns -ENXIO
4f78db7df6 scsi: scsi_dh_rdac: Avoid crash during rdac_bus_attach()
cc312fa7e6 scsi: megaraid_mm: Fix end of loop tests for list_for_each_entry()
968ee9176a scsi: pm80xx: Fix TMF task completion race condition
b353028aed dmaengine: of-dma: router_xlate to return -EPROBE_DEFER if controller is not yet available
35f4162236 ARM: dts: am43x-epos-evm: Reduce i2c0 bus speed for tps65218
505884a0c7 net: xfrm: Fix end of loop tests for list_for_each_entry
f1c0533fae spi: spi-mux: Add module info needed for autoloading
b618a32142 dmaengine: usb-dmac: Fix PM reference leak in usb_dmac_probe()
c160df90b0 dmaengine: xilinx_dma: Fix read-after-free bug when terminating transfers
d4930271a4 USB: core: Fix incorrect pipe calculation in do_proc_control()
ba6c1b004a USB: core: Avoid WARNings for 0-length descriptor requests
8e100c72b6 KVM: X86: Fix warning caused by stale emulation context
47d4c79997 KVM: x86: Factor out x86 instruction emulation with decoding
ff2fc9e4aa media: drivers/media/usb: fix memory leak in zr364xx_probe
56320b1ad4 media: zr364xx: fix memory leaks in probe()
b5c7ec6d15 media: zr364xx: propagate errors from zr364xx_start_readpipe()
779a0f4347 mtd: cfi_cmdset_0002: fix crash when erasing/writing AMD cards
e2036bc3fc ath9k: Postpone key cache entry deletion for TXQ frames reference it
609c0cfd07 ath: Modify ath_key_delete() to not need full key entry
2925a8385e ath: Export ath_hw_keysetmac()
6566c207e5 ath9k: Clear key cache explicitly on disabling hardware
8f05076983 ath: Use safer key clearing with key cache entries
2c5bd949b1 Linux 5.10.60
3a24e12130 net: dsa: microchip: ksz8795: Use software untagging on CPU port
1e78179d75 net: dsa: microchip: ksz8795: Fix VLAN untagged flag change on deletion
5033d5e231 net: dsa: microchip: ksz8795: Reject unsupported VLAN configuration
60c007b527 net: dsa: microchip: ksz8795: Fix PVID tag insertion
f365d53c86 net: dsa: microchip: Fix probing KSZ87xx switch with DT node for host port
3dc5666baf KVM: nSVM: always intercept VMLOAD/VMSAVE when nested (CVE-2021-3656)
c0883f6931 KVM: nSVM: avoid picking up unsupported bits from L2 in int_ctl (CVE-2021-3653)
b5f05bdfda vmlinux.lds.h: Handle clang's module.{c,d}tor sections
2fe07584a6 ceph: take snap_empty_lock atomically with snaprealm refcount change
a23aced54c ceph: clean up locking annotation for ceph_get_snap_realm and __lookup_snap_realm
b0efc93271 ceph: add some lockdep assertions around snaprealm handling
dcdb587ac4 vboxsf: Add support for the atomic_open directory-inode op
7cd14c1a7f vboxsf: Add vboxsf_[create|release]_sf_handle() helpers
433f0b31eb KVM: nVMX: Use vmx_need_pf_intercept() when deciding if L0 wants a #PF
0ab67e3dfc KVM: VMX: Use current VMCS to query WAITPKG support for MSR emulation
4a948c579e efi/libstub: arm64: Double check image alignment at entry
fc7da433fa powerpc/smp: Fix OOPS in topology_init()
312730cd15 PCI/MSI: Protect msi_desc::masked for multi-MSI
724d0a9850 PCI/MSI: Use msi_mask_irq() in pci_msi_shutdown()
9233687518 PCI/MSI: Correct misleading comments
e42fb8e616 PCI/MSI: Do not set invalid bits in MSI mask
042e03c9cd PCI/MSI: Enforce MSI[X] entry updates to be visible
0b2509d7a9 PCI/MSI: Enforce that MSI-X table entry is masked for update
aa8092c1d1 PCI/MSI: Mask all unused MSI-X entries
7e90e81a4b PCI/MSI: Enable and mask MSI-X early
2d2c668480 genirq/timings: Prevent potential array overflow in __irq_timings_store()
355754194b genirq/msi: Ensure deactivation on teardown
f0736bed18 x86/resctrl: Fix default monitoring groups reporting
25216ed97d x86/ioapic: Force affinity setup before startup
19fb5dabed x86/msi: Force affinity setup before startup
4e52a4fe6f genirq: Provide IRQCHIP_AFFINITY_PRE_STARTUP
2a28b52306 x86/tools: Fix objdump version check again
4acc0d9871 powerpc/kprobes: Fix kprobe Oops happens in booke
015e2c900b efi/libstub: arm64: Relax 2M alignment again for relocatable kernels
feb4a01d3e efi/libstub: arm64: Force Image reallocation if BSS was not reserved
afcb84e6cf arm64: efi: kaslr: Fix occasional random alloc (and boot) failure
e0ee8d9c31 nbd: Aovid double completion of a request
f5cefe9a52 vsock/virtio: avoid potential deadlock when vsock device remove
dff830e5e7 xen/events: Fix race in set_evtchn_to_irq
65395b053d drm/i915: Only access SFC_DONE when media domain is not fused off
4344440d91 net: igmp: increase size of mr_ifc_count
696afe28dc tcp_bbr: fix u32 wrap bug in round logic if bbr_init() called after 2B packets
8976606ca3 net: linkwatch: fix failure to restore device state across suspend/resume
4c2af90119 net: bridge: fix memleak in br_add_if()
f333a5ca71 net: bridge: fix flags interpretation for extern learn fdb entries
e3b949b86d net: bridge: validate the NUD_PERMANENT bit when adding an extern_learn FDB entry
1cad01aca1 net: dsa: sja1105: fix broken backpressure in .port_fdb_dump
56cc3408ff net: dsa: lantiq: fix broken backpressure in .port_fdb_dump
f7720b35cd net: dsa: lan9303: fix broken backpressure in .port_fdb_dump
24e1b7dbb1 net: igmp: fix data-race in igmp_ifc_timer_expire()
69b13167a6 net: Fix memory leak in ieee802154_raw_deliver
dbfaf7a6a2 net: dsa: microchip: ksz8795: Fix VLAN filtering
ccc1fe82c8 net: dsa: microchip: Fix ksz_read64()
558092b8ed drm/meson: fix colour distortion from HDR set during vendor u-boot
6e1886465d net/mlx5: Fix return value from tracer initialization
303ba011f5 net/mlx5: Synchronize correct IRQ when destroying CQ
00a0c11ddd bareudp: Fix invalid read beyond skb's linear data
30b1fc47f7 psample: Add a fwd declaration for skbuff
b3f0b17084 iavf: Set RSS LUT and key in reset handle path
a6192bae12 ice: don't remove netdev->dev_addr from uc sync list
bae5b521fe ice: Prevent probing virtual functions
059238c52c net: sched: act_mirred: Reset ct info when mirror/redirect skb
f15f7716b0 net/smc: fix wait on already cleared link
51f4965d77 ppp: Fix generating ifname when empty IFLA_IFNAME is specified
046579c9fc net: phy: micrel: Fix link detection on ksz87xx switch"
e95620c3bd bpf: Fix integer overflow involving bucket_size
1960c3ac52 libbpf: Fix probe for BPF_PROG_TYPE_CGROUP_SOCKOPT
a3e9a3e228 platform/x86: pcengines-apuv2: Add missing terminating entries to gpio-lookup tables
53ebbfdd0e net: mvvp2: fix short frame size on s390
784320edb6 net: dsa: mt7530: add the missing RxUnicast MIB counter
20a8031902 ASoC: cs42l42: Fix LRCLK frame start edge
750503aecf pinctrl: tigerlake: Fix GPIO mapping for newer version of software
be49d5437d netfilter: nf_conntrack_bridge: Fix memory leak when error
aa6b17bfef ASoC: cs42l42: Remove duplicate control for WNF filter frequency
b268f9f6b7 ASoC: cs42l42: Fix inversion of ADC Notch Switch control
2386a8cde1 ASoC: SOF: Intel: hda-ipc: fix reply size checking
0e47f99e86 ASoC: cs42l42: Don't allow SND_SOC_DAIFMT_LEFT_J
576939671f ASoC: cs42l42: Correct definition of ADC Volume control
a21963c35f pinctrl: mediatek: Fix fallback behavior for bias_set_combo
27188a9382 ieee802154: hwsim: fix GPF in hwsim_new_edge_nl
528f17c02d ieee802154: hwsim: fix GPF in hwsim_set_edge_lqi
95de3592f8 drm/amdgpu: don't enable baco on boco platforms in runpm
bd80d11a51 drm/amd/display: use GFP_ATOMIC in amdgpu_dm_irq_schedule_work
ae311a7418 drm/amd/display: Remove invalid assert for ODM + MPC case
c2351e5faa libnvdimm/region: Fix label activation vs errors
366de90ccf ACPI: NFIT: Fix support for virtual SPA ranges
f3fcf9d1b7 ceph: reduce contention in ceph_check_delayed_caps()
ca6dea44bd ARC: fp: set FPU_STATUS.FWE to enable FPU_STATUS update on context switch
4716a2145b net: ethernet: ti: cpsw: fix min eth packet size for non-switch use-cases
561d13128b seccomp: Fix setting loaded filter count during TSYNC
54916988a0 scsi: lpfc: Move initialization of phba->poll_list earlier to avoid crash
3db5cb9228 cifs: create sd context must be a multiple of 8
d1398e3715 i2c: dev: zero out array used for i2c reads from userspace
b8bceace43 ASoC: intel: atom: Fix reference to PCM buffer address
261613ef34 ASoC: tlv320aic31xx: Fix jack detection after suspend
7e5a7fa68b ASoC: uniphier: Fix reference to PCM buffer address
209eb62b45 ASoC: xilinx: Fix reference to PCM buffer address
c419c4c91b ASoC: amd: Fix reference to PCM buffer address
0c9adae117 iio: adc: Fix incorrect exit of for-loop
632279e505 iio: humidity: hdc100x: Add margin to the conversion time
45de224b13 iio: adis: set GPIO reset pin direction
d0532ed064 iio: adc: ti-ads7950: Ensure CS is deasserted after reading channels
46f161a1b2 Merge 5.10.59 into android12-5.10-lts
5805e5eec9 Linux 5.10.59
25cff25ec6 net: xilinx_emaclite: Do not print real IOMEM pointer
6a002d48a6 ovl: prevent private clone if bind mount is not allowed
bffead8d36 ppp: Fix generating ppp unit id when ifname is not specified
5df85901fe ALSA: hda: Add quirk for ASUS Flow x13
8930f2c60a ALSA: hda/realtek: fix mute/micmute LEDs for HP ProBook 650 G8 Notebook PC
98c3fa3a9d ALSA: pcm: Fix mmap breakage without explicit buffer setup
cc1a4dff23 USB:ehci:fix Kunpeng920 ehci hardware problem
bd909fd387 vboxsf: Make vboxsf_dir_create() return the handle for the created file
971703fc19 vboxsf: Honor excl flag to the dir-inode create op
96b2232cb7 arm64: dts: renesas: beacon: Fix USB ref clock references
e0dd4a0ab0 arm64: dts: renesas: beacon: Fix USB extal reference
0f47027d1b arm64: dts: renesas: rzg2: Add usb2_clksel to RZ/G2 M/N/H
eaa7feecd3 mm: make zone_to_nid() and zone_set_nid() available for DISCONTIGMEM
0e70939037 Revert "selftests/resctrl: Use resctrl/info for feature detection"
d8c3859870 bpf: Add lockdown check for probe_write_user helper
5b5064ea9a firmware: tee_bnxt: Release TEE shm, session, and context during kexec
c5a625c6a4 tee: Correct inappropriate usage of TEE_SHM_DMA_BUF flag
9f105d2d4f KVM: SVM: Fix off-by-one indexing when nullifying last used SEV VMCB
c80c82c899 Revert "xfrm: Fix RCU vs hash_resize_mutex lock inversion"
af3bdb4304 Merge 5.10.58 into android12-5.10-lts
bd3afc373f Merge branch 'android12-5.10' into `android12-5.10-lts`
132a8267ad Linux 5.10.58
3d7d1b0f5f arm64: fix compat syscall return truncation
bb65051dcd drm/amdgpu/display: only enable aux backlight control for OLED panels
c8b7cfa674 smb3: rc uninitialized in one fallocate path
8cfdd039ca net/qla3xxx: fix schedule while atomic in ql_wait_for_drvr_lock and ql_adapter_reset
fbbb209268 alpha: Send stop IPI to send to online CPUs
13d0a9b3b9 net: qede: Fix end of loop tests for list_for_each_entry
1478e902bc virt_wifi: fix error on connect
ecd8614809 reiserfs: check directory items on read from disk
dbe4f82fed reiserfs: add check for root_inode in reiserfs_fill_super
0f05e0ffa2 libata: fix ata_pio_sector for CONFIG_HIGHMEM
11891adab2 drm/i915: avoid uninitialised var in eb_parse()
a3e6bd0c71 sched/rt: Fix double enqueue caused by rt_effective_prio
c797b8872b perf/x86/amd: Don't touch the AMD64_EVENTSEL_HOSTONLY bit inside the guest
2d94cffc94 soc: ixp4xx/qmgr: fix invalid __iomem access
7397034905 drm/i915: Correct SFC_DONE register offset
16aecf1e36 interconnect: qcom: icc-rpmh: Ensure floor BW is enforced for all nodes
22b4917c85 interconnect: Always call pre_aggregate before aggregate
ccfe4f62ff interconnect: Zero initial BW after sync-state
05565b4693 spi: meson-spicc: fix memory leak in meson_spicc_remove
1a084e7821 interconnect: Fix undersized devress_alloc allocation
dcc23e5851 soc: ixp4xx: fix printing resources
37cbd27ef4 arm64: vdso: Avoid ISB after reading from cntvct_el0
7a2b5bb00f KVM: x86/mmu: Fix per-cpu counter corruption on 32-bit builds
32f55c25ee KVM: Do not leak memory for duplicate debugfs directories
309a31127b KVM: x86: accept userspace interrupt only if no event is injected
a786282b55 md/raid10: properly indicate failure when ending a failed write request
3d7d2d2b06 ARM: omap2+: hwmod: fix potential NULL pointer access
9851ad2f71 Revert "gpio: mpc8xxx: change the gpio interrupt flags."
57c44e7ac7 bus: ti-sysc: AM3: RNG is GP only
f4984f60ac selinux: correct the return value when loads initial sids
100f8396d1 pcmcia: i82092: fix a null pointer dereference bug
afcd5a0e01 net/xfrm/compat: Copy xfrm_spdattr_type_t atributes
f08b2d078c xfrm: Fix RCU vs hash_resize_mutex lock inversion
23e36a8610 timers: Move clearing of base::timer_running under base:: Lock
9a69d0d24d fpga: dfl: fme: Fix cpu hotplug issue in performance reporting
bfb5f1a123 serial: 8250_pci: Avoid irq sharing for MSI(-X) interrupts.
0f30fedced serial: 8250_pci: Enumerate Elkhart Lake UARTs via dedicated driver
17f3c64f70 MIPS: Malta: Do not byte-swap accesses to the CBUS UART
8a1624f4a8 serial: 8250: Mask out floating 16/32-bit bus bits
c03cef6715 serial: 8250_mtk: fix uart corruption issue when rx power off
a4f8bfc919 serial: tegra: Only print FIFO error message when an error occurs
cc73007768 ext4: fix potential htree corruption when growing large_dir directories
6b5a3d2c2b pipe: increase minimum default pipe size to 2 pages
556e7f204d media: rtl28xxu: fix zero-length control request
551e0c5d6b drivers core: Fix oops when driver probe fails
faec2c68ea staging: rtl8712: error handling refactoring
e468a357af staging: rtl8712: get rid of flush_scheduled_work
369101e399 staging: rtl8723bs: Fix a resource leak in sd_int_dpc
1628b64efb tpm_ftpm_tee: Free and unregister TEE shared memory during kexec
2a879ff971 optee: fix tee out of memory failure seen during kexec reboot
ad80c25987 optee: Refuse to load the driver under the kdump kernel
1340dc3fb7 optee: Fix memory leak when failing to register shm pages
6b2ded93d3 tee: add tee_shm_alloc_kernel_buf()
5e9d820214 optee: Clear stale cache entries during initialization
e5d8fd8709 arm64: stacktrace: avoid tracing arch_stack_walk()
7799ad4d18 tracepoint: Fix static call function vs data state mismatch
14673e1929 tracepoint: static call: Compare data on transition from 2->1 callees
046e12323a tracing: Fix NULL pointer dereference in start_creating
b2aca8daa5 tracing: Reject string operand in the histogram expression
b10ccc2c58 tracing / histogram: Give calculation hist_fields a size
f972745280 scripts/tracing: fix the bug that can't parse raw_trace_func
fd3afb81f4 clk: fix leak on devm_clk_bulk_get_all() unwind
948ff2f214 usb: otg-fsm: Fix hrtimer list corruption
8f8645de09 usb: typec: tcpm: Keep other events when receiving FRS and Sourcing_vbus events
5b4318885a usb: host: ohci-at91: suspend/resume ports after/before OHCI accesses
1f2015506d usb: gadget: f_hid: idle uses the highest byte for duration
825ac3f0bc usb: gadget: f_hid: fixed NULL pointer dereference
683702dff7 usb: gadget: f_hid: added GET_IDLE and SET_IDLE handlers
051518d9cf usb: cdns3: Fixed incorrect gadget state
822bec5cbb usb: gadget: remove leaked entry from udc driver list
98c83d7261 usb: dwc3: gadget: Avoid runtime resume if disabling pullup
79e9389038 ALSA: usb-audio: Add registration quirk for JBL Quantum 600
b7532db2d4 ALSA: usb-audio: Fix superfluous autosuspend recovery
80b7aa2651 ALSA: hda/realtek: Fix headset mic for Acer SWIFT SF314-56 (ALC256)
de30786fb2 ALSA: hda/realtek: add mic quirk for Acer SF314-42
c0b626f0a2 ALSA: pcm - fix mmap capability check for the snd-dummy driver
dd3f7c5c89 drm/amdgpu/display: fix DMUB firmware version info
ecb739cf15 firmware_loader: fix use-after-free in firmware_fallback_sysfs
5019f5812b firmware_loader: use -ETIMEDOUT instead of -EAGAIN in fw_load_sysfs_fallback
aa3b8bc17e USB: serial: ftdi_sio: add device ID for Auto-M3 OP-COM v2
d245a76719 USB: serial: ch341: fix character loss at high transfer rates
0470385e63 USB: serial: option: add Telit FD980 composition 0x1056
ba4a395668 USB: usbtmc: Fix RCU stall warning
f2f856b65a Bluetooth: defer cleanup of resources in hci_unregister_dev()
821e6a6133 blk-iolatency: error out if blk_get_queue() failed in iolatency_set_limit()
c5a499b860 net: vxge: fix use-after-free in vxge_device_unregister
fb49d67262 net: fec: fix use-after-free in fec_drv_remove
f12b6b6bc1 net: pegasus: fix uninit-value in get_interrupt_interval
c66d273b70 bnx2x: fix an error code in bnx2x_nic_load()
f76f9caccb mips: Fix non-POSIX regexp
f93b7b0000 MIPS: check return value of pgtable_pmd_page_ctor
9b2b2f0771 net: sched: fix lockdep_set_class() typo error for sch->seqlock
d1f2abe57b net: dsa: qca: ar9331: reorder MDIO write sequence
a45ee8ed0c net: ipv6: fix returned variable type in ip6_skb_dst_mtu
f87be69b7f nfp: update ethtool reporting of pauseframe control
44f2e360e7 sctp: move the active_key update after sh_keys is added
e74551ba93 RDMA/mlx5: Delay emptying a cache entry when a new MR is added to it recently
1242ca9369 gpio: tqmx86: really make IRQ optional
4ef549dc9c net: natsemi: Fix missing pci_disable_device() in probe and remove
1dc3eef381 net: phy: micrel: Fix detection of ksz87xx switch
e09dba75ca net: dsa: sja1105: match FDB entries regardless of inner/outer VLAN tag
c0b14a0e61 net: dsa: sja1105: be stateless with FDB entries on SJA1105P/Q/R/S/SJA1110 too
00bf923dce net: dsa: sja1105: invalidate dynamic FDB entries learned concurrently with statically added ones
de425f1c3a net: dsa: sja1105: overwrite dynamic FDB entries with static ones in .port_fdb_add
74bcf85ff1 net, gro: Set inner transport header offset in tcp/udp GRO hook
80fd533ac3 dmaengine: imx-dma: configure the generic DMA type to make it work
163e6d8721 ARM: dts: stm32: Fix touchscreen IRQ line assignment on DHCOM
442f7e04d5 ARM: dts: stm32: Disable LAN8710 EDPD on DHCOM
449991df08 media: videobuf2-core: dequeue if start_streaming fails
3e8bba6012 scsi: sr: Return correct event when media event code is 3
aaaf6e6e41 spi: imx: mx51-ecspi: Fix low-speed CONFIGREG delay calculation
cd989e1192 spi: imx: mx51-ecspi: Reinstate low-speed CONFIGREG delay
281514da66 dmaengine: stm32-dmamux: Fix PM usage counter unbalance in stm32 dmamux ops
bbce3c99f6 dmaengine: stm32-dma: Fix PM usage counter imbalance in stm32 dma ops
84656b4c27 clk: tegra: Implement disable_unused() of tegra_clk_sdmmc_mux_ops
edf1b7911a dmaengine: uniphier-xdmac: Use readl_poll_timeout_atomic() in atomic state
4ebd11d1c7 omap5-board-common: remove not physically existing vdds_1v8_main fixed-regulator
9bf056b99f ARM: dts: am437x-l4: fix typo in can@0 node
e79a30f71d clk: stm32f4: fix post divisor setup for I2S/SAI PLLs
71f39badc8 ALSA: usb-audio: fix incorrect clock source setting
c4fcda1287 arm64: dts: armada-3720-turris-mox: remove mrvl,i2c-fast-mode
8d13f6a0a6 arm64: dts: armada-3720-turris-mox: fixed indices for the SDHC controllers
f239369f37 ARM: dts: imx: Swap M53Menlo pinctrl_power_button/pinctrl_power_out pins
ee6f708432 ARM: imx: fix missing 3rd argument in macro imx_mmdc_perf_init
e1011b9c59 ARM: dts: colibri-imx6ull: limit SDIO clock to 25MHz
c0f61abbef arm64: dts: ls1028: sl28: fix networking for variant 2
54555c3996 ARM: dts: imx6qdl-sr-som: Increase the PHY reset duration to 10ms
3790f94098 ARM: imx: add missing clk_disable_unprepare()
a28569b510 ARM: imx: add missing iounmap()
9189d77f0e arm64: dts: ls1028a: fix node name for the sysclk
d61dc8c634 net: xfrm: fix memory leak in xfrm_user_rcv_msg
8efe3a635f bus: ti-sysc: Fix gpt12 system timer issue with reserved status
e32a291736 ALSA: seq: Fix racy deletion of subscriber
b917f123b5 Revert "ACPICA: Fix memory leak caused by _CID repair function"
a15695131a Merge 5.10.57 into android12-5.10-lts
afeb953f87 Merge branch 'android12-5.10' into `android12-5.10-lts`
1cd6e30b83 Linux 5.10.57
9c645a020b spi: mediatek: Fix fifo transfer
7254e2d9eb selftest/bpf: Verifier tests for var-off access
30ea1c5352 bpf, selftests: Adjust few selftest outcomes wrt unreachable code
98bf2906d3 bpf: Update selftests to reflect new error states
360e5b7af6 bpf, selftests: Adjust few selftest result_unpriv outcomes
5abcd138cb selftest/bpf: Adjust expected verifier errors
83bbf953f6 selftests/bpf: Add a test for ptr_to_map_value on stack for helper access
e2b7a4ccbf Revert "watchdog: iTCO_wdt: Account for rebooting on second timeout"
1b1a00b13c firmware: arm_scmi: Add delayed response status check
93ef561406 firmware: arm_scmi: Ensure drivers provide a probe function
1812895f17 Revert "Bluetooth: Shutdown controller after workqueues are flushed or cancelled"
cae3fa3d81 ACPI: fix NULL pointer dereference
98b070694f drm/amd/display: Fix max vstartup calculation for modes with borders
f9d875c8c9 drm/amd/display: Fix comparison error in dcn21 DML
91865b458e nvme: fix nvme_setup_command metadata trace event
06a9092f66 efi/mokvar: Reserve the table only if it is in boot services data
27ff30c8b3 ASoC: ti: j721e-evm: Check for not initialized parent_clk_id
a00bcc5298 ASoC: ti: j721e-evm: Fix unbalanced domain activity tracking during startup
e8b287e783 net: Fix zero-copy head len calculation.
c6bdf7d97d ASoC: rt5682: Fix the issue of garbled recording after powerd_dbus_suspend
74b53ee4b8 qed: fix possible unpaired spin_{un}lock_bh in _qed_mcp_cmd_and_union()
f6a2ff040b r8152: Fix potential PM refcount imbalance
c98a7916cd ASoC: tlv320aic31xx: fix reversed bclk/wclk master bits
03258515c9 spi: stm32h7: fix full duplex irq handler handling
cfb8173a23 regulator: rt5033: Fix n_voltages settings for BUCK and LDO
81dc9a4868 regulator: rtmv20: Fix wrong mask for strobe-polarity-high
9e55b9278c btrfs: fix lost inode on log replay after mix of fsync, rename and inode eviction
e2419c5709 btrfs: fix race causing unnecessary inode logging during link and rename
118b070bf4 Revert "drm/i915: Propagate errors on awaiting already signaled fences"
6976f3cf34 drm/i915: Revert "drm/i915/gem: Asynchronous cmdparser"
2da9d8f1db Merge branch 'android12-5.10' into `android12-5.10-lts`
8b444656fa Merge 5.10.56 into android12-5.10-lts
75ca4a8efe Merge branch 'android12-5.10' into `android12-5.10-lts`
9746c25334 Linux 5.10.56
55dd22c5d0 can: j1939: j1939_session_deactivate(): clarify lifetime of session object
75ebe1d355 i40e: Add additional info to PHY type error
2ca5ec188b Revert "perf map: Fix dso->nsinfo refcounting"
c14cee5bc4 powerpc/pseries: Fix regression while building external modules
bfc8e67c60 SMB3: fix readpage for large swap cache
be561c0154 bpf: Fix pointer arithmetic mask tightening under state pruning
ffb9d5c48b bpf: verifier: Allocate idmap scratch in verifier env
a11ca29c65 bpf: Remove superfluous aux sanitation on subprog rejection
0e9280654a bpf: Fix leakage due to insufficient speculative store bypass mitigation
bea9e2fd18 bpf: Introduce BPF nospec instruction for mitigating Spectre v4
cd61e665a1 can: hi311x: fix a signedness bug in hi3110_cmd()
65dfa6cb22 sis900: Fix missing pci_disable_device() in probe and remove
93e5bf4b29 tulip: windbond-840: Fix missing pci_disable_device() in probe and remove
58b8c812c7 sctp: fix return value check in __sctp_rcv_asconf_lookup
362e9d23cf net/mlx5e: Fix nullptr in mlx5e_hairpin_get_mdev()
bd744f2a27 net/mlx5: Fix flow table chaining
1b148bd72e skmsg: Make sk_psock_destroy() static
645a1d3bef drm/msm/dp: Initialize the INTF_CONFIG register
4a6841921c drm/msm/dpu: Fix sm8250_mdp register length
e6097071a4 net: llc: fix skb_over_panic
01f3581d44 KVM: x86: Check the right feature bit for MSR_KVM_ASYNC_PF_ACK access
f5f78ae5f1 mlx4: Fix missing error code in mlx4_load_one()
51b751fc06 octeontx2-pf: Fix interface down flag on error
4951ffa3fa tipc: do not write skb_shinfo frags when doing decrytion
7eefa0b74f ionic: count csum_none when offload enabled
60decbe01d ionic: fix up dim accounting for tx and rx
a7c85a516c ionic: remove intr coalesce update from napi
6961323eed net: qrtr: fix memory leaks
91350564ea net: Set true network header for ECN decapsulation
a41282e82a tipc: fix sleeping in tipc accept routine
10f585740c tipc: fix implicit-connect for SYN+
bb60616162 i40e: Fix log TC creation failure when max num of queues is exceeded
c1cc6bce1a i40e: Fix queue-to-TC mapping on Tx
4382cca179 i40e: Fix firmware LLDP agent related warning
e090ffdf05 i40e: Fix logic of disabling queues
cbc8012902 netfilter: nft_nat: allow to specify layer 4 protocol NAT only
3dbda8483f netfilter: conntrack: adjust stop timestamp to real expiry value
ac038f4152 mac80211: fix enabling 4-address mode on a sta vif after assoc
076bc6ebce bpf: Fix OOB read when printing XDP link fdinfo
e6a06a13ec RDMA/bnxt_re: Fix stats counters
c8667cb406 cfg80211: Fix possible memory leak in function cfg80211_bss_update
9ab284bc35 nfc: nfcsim: fix use after free during module unload
ea04a3b572 blk-iocost: fix operation ordering in iocg_wake_fn()
fc2756cce0 drm/amdgpu: Fix resource leak on probe error path
ccc7a1bb32 drm/amdgpu: Avoid printing of stack contents on firmware load error
63570e5780 drm/amd/display: ensure dentist display clock update finished in DCN20
2eab387507 NIU: fix incorrect error return, missed in previous revert
cb71730a63 HID: wacom: Re-enable touch by default for Cintiq 24HDT / 27QHDT
7bca5da005 alpha: register early reserved memory in memblock
30e19d072e can: esd_usb2: fix memory leak
88b4025816 can: ems_usb: fix memory leak
f58ac91ff8 can: usb_8dev: fix memory leak
a6ebfbdaca can: mcba_usb_start(): add missing urb->transfer_dma initialization
2fc2c2816c can: peak_usb: pcan_usb_handle_bus_evt(): fix reading rxerr/txerr values
afe2ffd920 can: raw: raw_setsockopt(): fix raw_rcv panic for sock UAF
a9c02d0e15 can: j1939: j1939_xtp_rx_dat_one(): fix rxtimer value between consecutive TP.DT to 750ms
da4f4916da ocfs2: issue zeroout to EOF blocks
9430145930 ocfs2: fix zero out valid data
52acb6c147 KVM: add missing compat KVM_CLEAR_DIRTY_LOG
7d67d4ab28 x86/kvm: fix vcpu-id indexed array sizes
2388c7674f ACPI: DPTF: Fix reading of attributes
0d6afa2597 Revert "ACPI: resources: Add checks for ACPI IRQ override"
0a421a2fc5 btrfs: mark compressed range uptodate only if all bio succeed
4e1a57d752 btrfs: fix rw device counting in __btrfs_free_extra_devids
27aa7171fe pipe: make pipe writes always wake up readers
02210a5e18 x86/asm: Ensure asm/proto.h can be included stand-alone
65b2658634 io_uring: fix null-ptr-deref in io_sq_offload_start()
e44d22fdf7 selftest: fix build error in tools/testing/selftests/vm/userfaultfd.c
1afedcdcf8 Merge 5.10.55 into android12-5.10-lts
11fe69a171 Linux 5.10.55
984e93b8e2 ipv6: ip6_finish_output2: set sk into newly allocated nskb
a74054ca75 ARM: dts: versatile: Fix up interrupt controller node names
3510b9b41c iomap: remove the length variable in iomap_seek_hole
8659186e72 iomap: remove the length variable in iomap_seek_data
6503940748 cifs: fix the out of range assignment to bit fields in parse_server_interfaces
fe5fe0b1c8 firmware: arm_scmi: Fix range check for the maximum number of pending messages
8f8e5475a3 firmware: arm_scmi: Fix possible scmi_linux_errmap buffer overflow
d01328fef6 hfs: add lock nesting notation to hfs_find_init
06b3d9923f hfs: fix high memory mapping in hfs_bnode_read
680b2917e6 hfs: add missing clean-up in hfs_fill_super
5c3d753b87 drm/ttm: add a check against null pointer dereference
2323690eb0 ipv6: allocate enough headroom in ip6_finish_output2()
86cb49e731 rcu-tasks: Don't delete holdouts within trc_wait_for_one_reader()
55ddab2bfd rcu-tasks: Don't delete holdouts within trc_inspect_reader()
4d972881f8 sctp: move 198 addresses from unusable to private scope
915226f31f net: annotate data race around sk_ll_usec
92289f58f0 net/802/garp: fix memleak in garp_request_join()
5d93810761 net/802/mrp: fix memleak in mrp_request_join()
df34f88862 cgroup1: fix leaked context root causing sporadic NULL deref in LTP
dcd00801f3 workqueue: fix UAF in pwq_unbound_release_workfn()
93c5951e0c af_unix: fix garbage collect vs MSG_PEEK
dee8119eaa KVM: x86: determine if an exception has an error code only when injecting it.
6f5d7a45f5 io_uring: fix link timeout refs
475312897e tools: Allow proper CC/CXX/... override with LLVM=1 in Makefile.include
57e177ea01 Merge branch 'android12-5.10' into `android12-5.10-lts`
e4cac2c332 Merge 5.10.54 into android12-5.10-lts
0482d070e7 Merge branch 'android12-5.10' into `android12-5.10-lts`
08277b9dde Linux 5.10.54
c9f8e17990 skbuff: Fix build with SKB extensions disabled
ba28765d33 xhci: add xhci_get_virt_ep() helper
624290f368 sfc: ensure correct number of XDP queues
1df4fe5a88 drm/i915/gvt: Clear d3_entered on elsp cmd submission.
c938e65768 usb: ehci: Prevent missed ehci interrupts with edge-triggered MSI
25af91a806 perf inject: Close inject.output on exit
fb35426d12 Documentation: Fix intiramfs script name
570341f10e skbuff: Release nfct refcount on napi stolen or re-used skbs
31828ffdab bonding: fix build issue
c9d97b7bb8 PCI: Mark AMD Navi14 GPU ATS as broken
f7ee361182 net: dsa: mv88e6xxx: enable SerDes PCS register dump via ethtool -d on Topaz
30f1d4d036 net: dsa: mv88e6xxx: enable SerDes RX stats for Topaz
fc31b5be13 drm/amdgpu: update golden setting for sienna_cichlid
69a603aa17 drm: Return -ENOTTY for non-drm ioctls
2831eeb7bc driver core: Prevent warning when removing a device link from unregistered consumer
0e75938323 nds32: fix up stack guard gap
7497f4c91d misc: eeprom: at24: Always append device id even if label property is set.
8571daace5 rbd: always kick acquire on "acquired" and "released" notifications
2f3731de5e rbd: don't hold lock_rwsem while running_list is being drained
92291fa2d1 hugetlbfs: fix mount mode command line processing
1a25c5738d memblock: make for_each_mem_range() traverse MEMBLOCK_HOTPLUG regions
0b591c020d userfaultfd: do not untag user pointers
fca5343b48 io_uring: remove double poll entry on arm failure
9eef902915 io_uring: explicitly count entries for poll reqs
1077e2b152 selftest: use mmap instead of posix_memalign to allocate memory
6e81e2c38a posix-cpu-timers: Fix rearm racing against process tick
3efec3b4b1 bus: mhi: core: Validate channel ID when processing command completions
b3f3a58a86 ixgbe: Fix packet corruption due to missing DMA sync
e991457afd media: ngene: Fix out-of-bounds bug in ngene_command_config_free_buf()
755971dc7e btrfs: check for missing device in btrfs_trim_fs
552b053f1a tracing: Synthetic event field_pos is an index not a boolean
757bdba802 tracing: Fix bug in rb_per_cpu_empty() that might cause deadloop.
a5e1aff589 tracing/histogram: Rename "cpu" to "common_cpu"
0edad8b9f6 tracepoints: Update static_call before tp_funcs when adding a tracepoint
4ed4074c6c firmware/efi: Tell memblock about EFI iomem reservations
647e26b03e usb: typec: stusb160x: register role switch before interrupt registration
a206167bd6 usb: dwc2: gadget: Fix sending zero length packet in DDMA mode.
f2c04f6b21 usb: dwc2: gadget: Fix GOUTNAK flow for Slave mode.
7073acb51a usb: gadget: Fix Unbalanced pm_runtime_enable in tegra_xudc_probe
1bf7371b90 USB: serial: cp210x: add ID for CEL EM3588 USB ZigBee stick
45c87a9433 USB: serial: cp210x: fix comments for GE CS1000
f528521c15 USB: serial: option: add support for u-blox LARA-R6 family
311fd7f7f1 usb: renesas_usbhs: Fix superfluous irqs happen after usb_pkt_pop()
7af54a4e22 usb: max-3421: Prevent corruption of freed memory
69da81a964 USB: usb-storage: Add LaCie Rugged USB3-FW to IGNORE_UAS
e6343aab3e usb: hub: Fix link power management max exit latency (MEL) calculations
8f087b4cf1 usb: hub: Disable USB 3 device initiated lpm if exit latency is too high
709137c853 KVM: PPC: Book3S HV Nested: Sanitise H_ENTER_NESTED TM state
c1fbdf0f3c KVM: PPC: Book3S: Fix H_RTAS rets buffer overflow
e3eb672c16 xhci: Fix lost USB 2 remote wake
02e2e96ba5 usb: xhci: avoid renesas_usb_fw.mem when it's unusable
9e9cf23b77 Revert "usb: renesas-xhci: Fix handling of unknown ROM state"
ebaa67086f ALSA: pcm: Fix mmap capability check
431e311055 ALSA: pcm: Call substream ack() method upon compat mmap commit
3c9afa23f3 ALSA: hdmi: Expose all pins on MSI MS-7C94 board
253759df80 ALSA: hda/realtek: Fix pop noise and 2 Front Mic issues on a machine
2b3cdf5819 ALSA: sb: Fix potential ABBA deadlock in CSP driver
5858c8a464 ALSA: usb-audio: Add registration quirk for JBL Quantum headsets
2de518548d ALSA: usb-audio: Add missing proc text entry for BESPOKEN type
37a88b41dc s390/boot: fix use of expolines in the DMA code
d1ab962880 s390/ftrace: fix ftrace_update_ftrace_func implementation
3b4009b496 mmc: core: Don't allocate IDA for OF aliases
fc6ac92cfc proc: Avoid mixing integer types in mem_rw()
76f7eae7ec cifs: fix fallocate when trying to allocate a hole.
c26372b8a8 cifs: only write 64kb at a time when fallocating a small region of a file
b91e5b6347 drm/panel: raspberrypi-touchscreen: Prevent double-free
9e0373945e net: sched: cls_api: Fix the the wrong parameter
c8ebf135c1 net: dsa: sja1105: make VID 4095 a bridge VLAN too
164294d09c tcp: disable TFO blackhole logic by default
8eb2258732 sctp: update active_key for asoc when old key is being replaced
ef799bd8ff nvme: set the PRACT bit when using Write Zeroes with T10 PI
7850f03ed8 r8169: Avoid duplicate sysfs entry creation error
0f5dc39714 afs: Fix tracepoint string placement with built-in AFS
711057846a Revert "USB: quirks: ignore remote wake-up on Fibocom L850-GL LTE modem"
8985dc2cab nvme-pci: don't WARN_ON in nvme_reset_work if ctrl.state is not RESETTING
fb28b15920 ceph: don't WARN if we're still opening a session to an MDS
ce8fafb680 ipv6: fix another slab-out-of-bounds in fib6_nh_flush_exceptions
071729150b net/sched: act_skbmod: Skip non-Ethernet packets
ee36bb4713 spi: spi-bcm2835: Fix deadlock
432738c974 net: hns3: fix rx VLAN offload state inconsistent issue
3e903e0b57 net: hns3: fix possible mismatches resp of mailbox
f4305375f0 ALSA: hda: intel-dsp-cfg: add missing ElkhartLake PCI ID
41a839437a net/tcp_fastopen: fix data races around tfo_active_disable_stamp
ba33363976 net: hisilicon: rename CACHE_LINE_MASK to avoid redefinition
320dcbdec4 bnxt_en: Check abort error state in bnxt_half_open_nic()
134a0536f0 bnxt_en: Validate vlan protocol ID on RX packets
4f7da0f97b bnxt_en: Add missing check for BNXT_STATE_ABORT_ERR in bnxt_fw_rset_task()
927370485e bnxt_en: Refresh RoCE capabilities in bnxt_ulp_probe()
ab830c3bae bnxt_en: don't disable an already disabled PCI device
2646368944 ACPI: Kconfig: Fix table override from built-in initrd
113ce8c504 spi: cadence: Correct initialisation of runtime PM again
3ea448b62b scsi: target: Fix protect handling in WRITE SAME(32)
b82a1a26aa scsi: iscsi: Fix iface sysfs attr detection
6811744bd0 netrom: Decrease sock refcount when sock timers expire
096a8dca8c sctp: trim optlen when it's a huge value in sctp_setsockopt
8e9662fde6 net: sched: fix memory leak in tcindex_partial_destroy_work
e14ef10953 KVM: PPC: Fix kvm_arch_vcpu_ioctl vcpu_load leak
fcbad8e18d KVM: PPC: Book3S: Fix CONFIG_TRANSACTIONAL_MEM=n crash
30b8302151 net: decnet: Fix sleeping inside in af_decnet
d402c60da0 efi/tpm: Differentiate missing and invalid final event log table.
8983766903 dma-mapping: handle vmalloc addresses in dma_common_{mmap,get_sgtable}
115e4f5b64 usb: hso: fix error handling code of hso_create_net_device
1582a02fec net: fix uninit-value in caif_seqpkt_sendmsg
2fc8048265 bpftool: Check malloc return value in mount_bpffs_for_pin
3b5b0afd8d bpf, sockmap, udp: sk_prot needs inuse_idx set for proc stats
c260442431 bpf, sockmap, tcp: sk_prot needs inuse_idx set for proc stats
715f378f42 bpf, sockmap: Fix potential memory leak on unlikely error case
e3a9548ae5 s390/bpf: Perform r1 range checking before accessing jit->seen_reg[r1]
9264bebe9e liquidio: Fix unintentional sign extension issue on left shift of u16
0ff2ea9d8f timers: Fix get_next_timer_interrupt() with no timers pending
ca9ba1de8f xdp, net: Fix use-after-free in bpf_xdp_link_release
39f1735c81 bpf: Fix tail_call_reachable rejection for interpreter when jit failed
2b4046e64f bpf, test: fix NULL pointer dereference on invalid expected_attach_type
3dba72d1fc ASoC: rt5631: Fix regcache sync errors on resume
2435dcfd16 spi: mediatek: fix fifo rx mode
a9a85bfedd regulator: hi6421: Fix getting wrong drvdata
5cdc986aad regulator: hi6421: Use correct variable type for regmap api val argument
23811b75fd spi: stm32: fixes pm_runtime calls in probe/remove
844ab04b62 spi: imx: add a check for speed_hz before calculating the clock
3b6c430d12 ASoC: wm_adsp: Correct wm_coeff_tlv_get handling
57efe4f82a perf sched: Fix record failure when CONFIG_SCHEDSTATS is not set
61f2e1e795 perf data: Close all files in close_dir()
7c91e0ce26 perf probe-file: Delete namelist in del_events() on the error path
a6c32317cd perf lzma: Close lzma stream on exit
2ae8f40a8f perf script: Fix memory 'threads' and 'cpus' leaks on exit
51077d315a perf report: Free generated help strings for sort option
2bfa3c53ea perf env: Fix memory leak of cpu_pmu_caps
a2f0da3af6 perf test maps__merge_in: Fix memory leak of maps
b7bfd8aeb9 perf dso: Fix memory leak in dso__new_map()
c9c101da3e perf test event_update: Fix memory leak of evlist
b768db7f80 perf test session_topology: Delete session->evlist
b8892d16a9 perf env: Fix sibling_dies memory leak
306411a8bf perf probe: Fix dso->nsinfo refcounting
f21987d7bb perf map: Fix dso->nsinfo refcounting
7337ff2093 perf inject: Fix dso->nsinfo refcounting
a87d42ae7f KVM: x86/pmu: Clear anythread deprecated bit when 0xa leaf is unsupported on the SVM
b990585f9b nvme-pci: do not call nvme_dev_remove_admin from nvme_remove
0fa11e1a20 mptcp: fix warning in __skb_flow_dissect() when do syn cookie for subflow join
3714e0bb0d cxgb4: fix IRQ free race during driver unload
d92337bf54 pwm: sprd: Ensure configuring period and duty_cycle isn't wrongly skipped
f1edbcc47f selftests: icmp_redirect: IPv6 PMTU info should be cleared after redirect
906bbb18db selftests: icmp_redirect: remove from checking for IPv6 route get
bb737eceb9 stmmac: platform: Fix signedness bug in stmmac_probe_config_dt()
79ec7b5b2f ipv6: fix 'disable_policy' for fwd packets
35eaefb44e bonding: fix incorrect return value of bond_ipsec_offload_ok()
13626bad63 bonding: fix suspicious RCU usage in bond_ipsec_offload_ok()
56ccdf868a bonding: Add struct bond_ipesc to manage SA
b3bd1f5e50 bonding: disallow setting nested bonding + ipsec offload
43511a6a16 bonding: fix suspicious RCU usage in bond_ipsec_del_sa()
6ca0e55a13 ixgbevf: use xso.real_dev instead of xso.dev in callback functions of struct xfrmdev_ops
ba7bfcdff1 bonding: fix null dereference in bond_ipsec_add_sa()
3ae639af36 bonding: fix suspicious RCU usage in bond_ipsec_add_sa()
4a31baf55f net: add kcov handle to skb extensions
78e4baff95 gve: Fix an error handling path in 'gve_probe()'
813449fb85 igb: Fix position of assignment to *ring
44171801d3 igb: Check if num of q_vectors is smaller than max before array access
cb9292445d iavf: Fix an error handling path in 'iavf_probe()'
a6756d637b e1000e: Fix an error handling path in 'e1000_probe()'
dea695a2ee fm10k: Fix an error handling path in 'fm10k_probe()'
a099192fe7 igb: Fix an error handling path in 'igb_probe()'
db4c32c1b9 igc: Fix an error handling path in 'igc_probe()'
7bc9fb1f80 ixgbe: Fix an error handling path in 'ixgbe_probe()'
02d1af0bee igc: change default return of igc_read_phy_reg()
f153664d8e igb: Fix use-after-free error during reset
e15f629036 igc: Fix use-after-free error during reset
67e686fc73 Revert "bpf: Track subprog poke descriptors correctly and fix use-after-free"
afe9ed0e13 Merge 5.10.53 into android12-5.10-lts
71046eac2d Linux 5.10.53
6cd9bd2a2d udp: annotate data races around unix_sk(sk)->gso_size
bfdb38a426 drm/panel: nt35510: Do not fail if DSI read fails
0d90d8492f perf test bpf: Free obj_buf
a9f36bf361 bpf: Track subprog poke descriptors correctly and fix use-after-free
782d71e29b bpftool: Properly close va_list 'ap' by va_end() on error
2381b8e882 tools: bpf: Fix error in 'make -C tools/ bpf_install'
638632997c tcp: call sk_wmem_schedule before sk_mem_charge in zerocopy path
2fee3cf4c9 ipv6: tcp: drop silly ICMPv6 packet too big messages
ad4ba34049 tcp: fix tcp_init_transfer() to not reset icsk_ca_initialized
d60f07bcb7 tcp: annotate data races around tp->mtu_info
ea66fcb296 tcp: consistently disable header prediction for mptcp
c28c747e37 ARM: dts: tacoma: Add phase corrections for eMMC
e55160537d ARM: dts: aspeed: Fix AST2600 machines line names
ff4b8f35c9 kbuild: do not suppress Kconfig prompts for silent build
0d514185ae dma-buf/sync_file: Don't leak fences on merge failure
f33605908a net: fddi: fix UAF in fza_probe
66c73f187d net: dsa: properly check for the bridge_leave methods in dsa_switch_bridge_leave()
7d7d0e84ac Revert "mm/shmem: fix shmem_swapin() race with swapoff"
2179d96ec7 net: validate lwtstate->data before returning from skb_tunnel_info()
b61d8814c4 net: send SYNACK packet with accepted fwmark
f2a062fcfe net: ti: fix UAF in tlan_remove_one
b560521eca net: qcom/emac: fix UAF in emac_remove
dbbf5b957b net: moxa: fix UAF in moxart_mac_probe
88ff9ec9c6 net: ip_tunnel: fix mtu calculation for ETHER tunnel devices
846829e75d net: bcmgenet: Ensure all TX/RX queues DMAs are disabled
b9fa66072f net: netdevsim: use xso.real_dev instead of xso.dev in callback functions of struct xfrmdev_ops
59070cc43d net: bridge: sync fdb to new unicast-filtering ports
7b5a2910e7 net/sched: act_ct: remove and free nf_table callbacks
6d4476236f vmxnet3: fix cksum offload issues for tunnels with non-default udp ports
c3bc9ce7d4 net/sched: act_ct: fix err check for nf_conntrack_confirm
fc40fdefd9 netfilter: ctnetlink: suspicious RCU usage in ctnetlink_dump_helpinfo
34365de508 net: ipv6: fix return value of ip6_skb_dst_mtu
73146745ff net: dsa: mv88e6xxx: enable devlink ATU hash param for Topaz
a8c7ba3687 net: dsa: mv88e6xxx: enable .rmu_disable() on Topaz
14cd8ce80a net: dsa: mv88e6xxx: use correct .stats_set_histogram() on Topaz
c657413dcd net: dsa: mv88e6xxx: enable .port_set_policy() on Topaz
fcb970edc0 net: bcmgenet: ensure EXT_ENERGY_DET_MASK is clear
4e275a4aca usb: cdns3: Enable TDL_CHK only for OUT ep
ce6ee46e0f mm/page_alloc: fix memory map initialization for descending nodes
9e1cf2d1ed mm/userfaultfd: fix uffd-wp special cases for fork()
84ff5f66c3 mm/thp: simplify copying of huge zero page pmd when fork
a62177b357 f2fs: Show casefolding support only when supported
277b311ae1 Revert "swap: fix do_swap_page() race with swapoff"
d92aa22f24 arm64: dts: marvell: armada-37xx: move firmware node to generic dtsi file
0e67c76384 firmware: turris-mox-rwtm: add marvell,armada-3700-rwtm-firmware compatible string
e58c162789 cifs: prevent NULL deref in cifs_compose_mount_options()
faa3e7da48 s390: introduce proper type handling call_on_stack() macro
be10fff3a4 s390/traps: do not test MONITOR CALL without CONFIG_BUG
9beba14699 thermal/core/thermal_of: Stop zone device before unregistering it
7412c988fe perf/x86/intel/uncore: Clean up error handling path of iio mapping
892387e761 sched/fair: Fix CFS bandwidth hrtimer expiry type
eb859b043c scsi: qedf: Add check to synchronize abort and flush
a4a54c54af scsi: libfc: Fix array index out of bound exception
0d7596a954 scsi: libsas: Add LUN number check in .slave_alloc callback
2f8df6332e scsi: aic7xxx: Fix unintentional sign extension issue on left shift of u8
cdb995a6cb rtc: max77686: Do not enforce (incorrect) interrupt trigger type
d3ba15fb04 arch/arm64/boot/dts/marvell: fix NAND partitioning scheme
e378db1189 kbuild: mkcompile_h: consider timestamp if KBUILD_BUILD_TIMESTAMP is set
8c12a3a68d thermal/drivers/sprd: Add missing of_node_put for loop iteration
20babcd830 thermal/drivers/imx_sc: Add missing of_node_put for loop iteration
469951ce4b thermal/drivers/rcar_gen3_thermal: Do not shadow rcar_gen3_ths_tj_1
bd40e2da3a thermal/core: Correct function name thermal_zone_device_unregister()
6c099d595f arm64: dts: imx8mq: assign PCIe clocks
24c41aa9d0 arm64: dts: ls208xa: remove bus-num from dspi node
7e3f5739cc firmware: tegra: bpmp: Fix Tegra234-only builds
7c03982fa1 soc/tegra: fuse: Fix Tegra234-only builds
ffa6f08be1 ARM: OMAP2+: Block suspend for am3 and am4 if PM is not configured
f40a7c9b8e ARM: dts: stm32: fix stpmic node for stm32mp1 boards
2670d3d1ef ARM: dts: stm32: Rename spi-flash/mx66l51235l@N to flash@N on DHCOM SoM
26cd441c52 ARM: dts: stm32: Drop unused linux,wakeup from touchscreen node on DHCOM SoM
a5bc2a2d83 ARM: dts: stm32: fix the Odyssey SoM eMMC VQMMC supply
e27052f21a ARM: dts: stm32: move stmmac axi config in ethernet node on stm32mp15
6a7af63478 ARM: dts: stm32: fix i2c node name on stm32f746 to prevent warnings
160c92d728 ARM: dts: rockchip: fix supply properties in io-domains nodes
d671fae046 arm64: dts: juno: Update SCPI nodes as per the YAML schema
fc71d8df58 ARM: dts: bcm283x: Fix up GPIO LED node names
4bc03e321f ARM: dts: bcm283x: Fix up MMC node names
15d727c044 firmware: arm_scmi: Fix the build when CONFIG_MAILBOX is not selected
b4009ea92f firmware: arm_scmi: Add SMCCC discovery dependency in Kconfig
41e2bcca23 memory: tegra: Fix compilation warnings on 64bit platforms
7ad965c8a7 ARM: dts: stm32: fix timer nodes on STM32 MCU to prevent warnings
ccec32f771 ARM: dts: stm32: fix RCC node name on stm32f429 MCU
bfbc4b4820 ARM: dts: stm32: fix gpio-keys node on STM32 MCU boards
34ec6702a2 ARM: dts: stm32: fix stm32mp157c-odyssey card detect pin
44f4e344f4 ARM: dts: stm32: Fix touchscreen node on dhcom-pdk2
3534a4b2d0 ARM: dts: stm32: Remove extra size-cells on dhcom-pdk2
8da771b552 arm64: dts: qcom: sc7180: Move rmtfs memory region
a74d3bbe05 ARM: tegra: nexus7: Correct 3v3 regulator GPIO of PM269 variant
e89f4098d5 ARM: tegra: wm8903: Fix polarity of headphones-detection GPIO in device-trees
c8815d6fee arm64: dts: ti: k3-am654x/j721e/j7200-common-proc-board: Fix MCU_RGMII1_TXC direction
28d2ae9815 ARM: dts: OMAP2+: Replace underscores in sub-mailbox node names
a73a22a69f ARM: dts: am335x: fix ti,no-reset-on-init flag for gpios
ddf2d14894 ARM: dts: am437x-gp-evm: fix ti,no-reset-on-init flag for gpios
a09b4c4449 ARM: dts: am57xx-cl-som-am57x: fix ti,no-reset-on-init flag for gpios
3f09485699 kbuild: sink stdout from cmd for silent build
27582c9fa2 rtc: mxc_v2: add missing MODULE_DEVICE_TABLE
6f5891a560 ARM: dts: imx6dl-riotboard: configure PHY clock and set proper EEE value
5190a6604a ARM: dts: ux500: Fix orientation of accelerometer
61fda04276 ARM: dts: ux500: Rename gpio-controller node
985947c535 ARM: dts: ux500: Fix interrupt cells
ff9ef21bb6 arm64: dts: rockchip: fix regulator-gpio states array
31e1b8c07d ARM: imx: pm-imx5: Fix references to imx5_cpu_suspend_info
d05ebeffca ARM: dts: imx6: phyFLEX: Fix UART hardware flow control
249d8e4ea1 ARM: dts: Hurricane 2: Fix NAND nodes names
f6541401db ARM: dts: BCM63xx: Fix NAND nodes names
b5fc6b9ab4 ARM: NSP: dts: fix NAND nodes names
5f844007eb ARM: Cygnus: dts: fix NAND nodes names
9525d58c5a ARM: brcmstb: dts: fix NAND nodes names
4e8eb51ae6 reset: ti-syscon: fix to_ti_syscon_reset_data macro
cabcb576fc arm64: dts: rockchip: Fix power-controller node names for rk3399
81ea23d988 arm64: dts: rockchip: Fix power-controller node names for rk3328
e4f97b740d arm64: dts: rockchip: Fix power-controller node names for px30
95b64be2e7 ARM: dts: rockchip: Fix power-controller node names for rk3288
5881af8d69 ARM: dts: rockchip: Fix power-controller node names for rk3188
70abb82831 ARM: dts: rockchip: Fix power-controller node names for rk3066a
fc01549d7b ARM: dts: rockchip: Fix IOMMU nodes properties on rk322x
55014c38e7 ARM: dts: rockchip: Fix the timer clocks order
2e1f681b68 arm64: dts: rockchip: fix pinctrl sleep nodename for rk3399.dtsi
7d3408c723 ARM: dts: rockchip: fix pinctrl sleep nodename for rk3036-kylin and rk3288
e2d1e44161 ARM: dts: rockchip: Fix thermal sensor cells o rk322x
c8f0cef75d ARM: dts: gemini: add device_type on pci
191523dcfa ARM: dts: gemini: rename mdio to the right name
90a010f605 Merge branch 'android12-5.10' into `android12-5.10-lts`
c0dd8de281 Merge branch 'android12-5.10' into `android12-5.10-lts`
51ab149d5f Merge 5.10.52 into android12-5.10-lts
2cd5fe24a7 Linux 5.10.52
174c34d9cd seq_file: disallow extremely large seq buffer allocations
b33aa0dbd7 scsi: scsi_dh_alua: Fix signedness bug in alua_rtpg()
e09c9b5584 MIPS: vdso: Invalid GIC access through VDSO
20f79ce2b1 mips: disable branch profiling in boot/decompress.o
4e2764e96a mips: always link byteswap helpers into decompressor
53c5c2496f static_call: Fix static_call_text_reserved() vs __init
59ae35884c jump_label: Fix jump_label_text_reserved() vs __init
143a6b8ec5 sched/uclamp: Ignore max aggregation if rq is idle
43b89ef7bc scsi: be2iscsi: Fix an error handling path in beiscsi_dev_probe()
f71f13034f arm64: dts: rockchip: Re-add regulator-always-on for vcc_sdio for rk3399-roc-pc
b3231050c7 arm64: dts: rockchip: Re-add regulator-boot-on, regulator-always-on for vdd_gpu on rk3399-roc-pc
9436e9001d firmware: turris-mox-rwtm: show message about HWRNG registration
b2a5949a91 firmware: turris-mox-rwtm: fail probing when firmware does not support hwrng
ddf380b094 firmware: turris-mox-rwtm: report failures better
271c12dbeb firmware: turris-mox-rwtm: fix reply status decoding function
804aabb509 thermal/drivers/rcar_gen3_thermal: Fix coefficient calculations
dae7775232 ARM: dts: imx6q-dhcom: Add gpios pinctrl for i2c bus recovery
936446f15a ARM: dts: imx6q-dhcom: Fix ethernet plugin detection problems
f12a456f1c ARM: dts: imx6q-dhcom: Fix ethernet reset time properties
b1995806d0 thermal/drivers/sprd: Add missing MODULE_DEVICE_TABLE
4d9ea28586 ARM: dts: am437x: align ti,pindir-d0-out-d1-in property with dt-shema
6641724d68 ARM: dts: am335x: align ti,pindir-d0-out-d1-in property with dt-shema
0724764c08 ARM: dts: dra7: Fix duplicate USB4 target module node
6cd58375c5 arm64: dts: allwinner: a64-sopine-baseboard: change RGMII mode to TXID
b5789e2377 memory: fsl_ifc: fix leak of private memory on probe failure
8d071d270a memory: fsl_ifc: fix leak of IO mapping on probe failure
1479998d80 arm64: dts: ti: k3-j721e-main: Fix external refclk input to SERDES
668ca46870 arm64: dts: renesas: r8a779a0: Drop power-domains property from GIC node
884d09d1f1 reset: bail if try_module_get() fails
8c07e1a8c5 ARM: dts: BCM5301X: Fixup SPI binding
db4e87ab60 dt-bindings: i2c: at91: fix example for scl-gpios
4b4c61049e firmware: arm_scmi: Reset Rx buffer to max size during async commands
c381e695cf firmware: tegra: Fix error return code in tegra210_bpmp_init()
f58a3bc94a arm64: dts: qcom: trogdor: Add no-hpd to DSI bridge node
d99524d13d ARM: dts: stm32: Rework LAN8710Ai PHY reset on DHCOM SoM
91df7f4a04 ARM: dts: stm32: Connect PHY IRQ line on DH STM32MP1 SoM
c4218acd68 arm64: dts: renesas: r8a7796[01]: Fix OPP table entry voltages
305df11389 arm64: dts: renesas: Add missing opp-suspend properties
55fd1d3ca5 arm64: dts: ti: j7200-main: Enable USB2 PHY RX sensitivity workaround
b8d350b4ac ARM: dts: r8a7779, marzen: Fix DU clock names
b02a65061e arm64: dts: renesas: v3msk: Fix memory size
ab4d76eb77 rtc: fix snprintf() checking in is_rtc_hctosys()
e352463654 ARM: dts: sun8i: h3: orangepi-plus: Fix ethernet phy-mode
3199ff7b9f memory: pl353: Fix error return code in pl353_smc_probe()
fc7a8347ce reset: brcmstb: Add missing MODULE_DEVICE_TABLE
3f526ea670 memory: atmel-ebi: add missing of_node_put for loop iteration
84fa4a1063 memory: stm32-fmc2-ebi: add missing of_node_put for loop iteration
c385d93c3c ARM: dts: exynos: fix PWM LED max brightness on Odroid XU4
b7016870fe ARM: dts: exynos: fix PWM LED max brightness on Odroid HC1
640105e7c0 ARM: dts: exynos: fix PWM LED max brightness on Odroid XU/XU3
6870bc4267 ARM: exynos: add missing of_node_put for loop iteration
85dd41383b reset: a10sr: add missing of_match_table reference
685ec4c0f2 reset: RESET_INTEL_GW should depend on X86
2ca912471d reset: RESET_BRCMSTB_RESCAL should depend on ARCH_BRCMSTB
f75cec5c20 ARM: dts: gemini-rut1xx: remove duplicate ethernet node
f11508ecc6 hexagon: use common DISCARDS macro
f712169279 hexagon: handle {,SOFT}IRQENTRY_TEXT in linker script
4aa17d058a NFSv4/pNFS: Don't call _nfs4_pnfs_v3_ds_connect multiple times
885c0cc2ac NFSv4/pnfs: Fix layoutget behaviour after invalidation
a668a77e6a NFSv4/pnfs: Fix the layout barrier update
6ccccc03f8 vdpa/mlx5: Clear vq ready indication upon device reset
0e5f204ea5 ALSA: isa: Fix error return code in snd_cmi8330_probe()
6612c41233 nfsd: Reduce contention for the nfsd_file nf_rwsem
89047f0089 nvme-tcp: can't set sk_user_data without write_lock
4b3fd33f58 virtio_net: move tx vq operation under tx queue lock
8795692f0d vdpa/mlx5: Fix possible failure in umem size calculation
63272b1ffd vdpa/mlx5: Fix umem sizes assignments on VQ create
e22051e7c9 PCI: tegra194: Fix tegra_pcie_ep_raise_msi_irq() ill-defined shift
527bb29eb1 pwm: imx1: Don't disable clocks at device remove time
12d84de59d PCI: intel-gw: Fix INTx enable
b5859dacd2 x86/fpu: Limit xstate copy size in xstateregs_set()
07b760a791 x86/fpu: Fix copy_xstate_to_kernel() gap handling
aa7fccd383 f2fs: fix to avoid adding tab before doc section
607caa0801 PCI: iproc: Support multi-MSI only on uniprocessor kernel
54dc6fcce3 PCI: iproc: Fix multi-MSI base vector number allocation
ac2e498ab2 ubifs: Set/Clear I_LINKABLE under i_lock for whiteout inode
4b515308ab nfs: fix acl memory leak of posix_acl_create()
b8eace7d3b SUNRPC: prevent port reuse on transports which don't request it.
5577eece79 watchdog: jz4740: Fix return value check in jz4740_wdt_probe()
3b93d520ac watchdog: aspeed: fix hardware timeout calculation
412ef737be ubifs: journal: Fix error return code in ubifs_jnl_write_inode()
6bcc0590cb ubifs: Fix off-by-one error
aab881d7f0 um: fix error return code in winch_tramp()
9bb3f31b25 um: fix error return code in slip_open()
0bfb6d4949 misc: alcor_pci: fix inverted branch condition
5c7ef8a370 NFSv4: Fix an Oops in pnfs_mark_request_commit() when doing O_DIRECT
ff4023d019 NFSv4: Initialise connection to the server in nfs4_alloc_client()
36291fd627 power: supply: rt5033_battery: Fix device tree enumeration
ae56850d36 PCI/sysfs: Fix dsm_label_utf16s_to_utf8s() buffer overrun
6594d0aa1c remoteproc: k3-r5: Fix an error message
d3c150978e f2fs: compress: fix to disallow temp extension
43cefd1264 f2fs: add MODULE_SOFTDEP to ensure crc32 is included in the initramfs
74569cb9ed x86/signal: Detect and prevent an alternate signal stack overflow
f0e905df68 NFSD: Fix TP_printk() format specifier in nfsd_clid_class
2830dd2faa f2fs: atgc: fix to set default age threshold
f6ec306b93 virtio_console: Assure used length from device is limited
09a94a89d7 virtio_net: Fix error handling in virtnet_restore()
cd24da0db9 virtio-blk: Fix memory leak among suspend/resume procedure
d420b11666 PCI: rockchip: Register IRQ handlers after device and data are ready
424fc30298 ACPI: video: Add quirk for the Dell Vostro 3350
4f2b140658 ACPI: AMBA: Fix resource name in /proc/iomem
9dcc9ad343 pwm: tegra: Don't modify HW state in .remove callback
3d82361abd pwm: img: Fix PM reference leak in img_pwm_enable()
b3205768cd drm/amdkfd: fix sysfs kobj leak
687875fa9c power: supply: ab8500: add missing MODULE_DEVICE_TABLE
e88d524c66 power: supply: charger-manager: add missing MODULE_DEVICE_TABLE
89786fbc4d NFS: nfs_find_open_context() may only select open files
0fedfa72ae drm/gma500: Add the missed drm_gem_object_put() in psb_user_framebuffer_create()
59d912fe9b ceph: remove bogus checks and WARN_ONs from ceph_set_page_dirty
3c586f8255 orangefs: fix orangefs df output.
6e43cdcbb7 PCI: tegra: Add missing MODULE_DEVICE_TABLE
2df1abffc4 remoteproc: core: Fix cdev remove and rproc del
f3a56cd3ea x86/fpu: Return proper error codes from user access functions
39ed17de8c watchdog: iTCO_wdt: Account for rebooting on second timeout
9cc9f5de28 watchdog: imx_sc_wdt: fix pretimeout
66ba9cf929 watchdog: Fix possible use-after-free by calling del_timer_sync()
a173e3b62c watchdog: sc520_wdt: Fix possible use-after-free in wdt_turnoff()
a397cb4576 watchdog: Fix possible use-after-free in wdt_startup()
96c0bf0912 PCI: pciehp: Ignore Link Down/Up caused by DPC
4970647404 NFSv4: Fix delegation return in cases where we have to retry
b05c555c8d PCI/P2PDMA: Avoid pci_get_slot(), which may sleep
8e3f27bb7f ARM: 9087/1: kprobes: test-thumb: fix for LLVM_IAS=1
9d829ca43b power: reset: gpio-poweroff: add missing MODULE_DEVICE_TABLE
efc6443c1a power: supply: max17042: Do not enforce (incorrect) interrupt trigger type
7667cdc4b7 PCI: hv: Fix a race condition when removing the device
14016c1728 power: supply: ab8500: Avoid NULL pointers
0df49cdc7c PCI: ftpci100: Rename macro name collision
e133435232 pwm: spear: Don't modify HW state in .remove callback
15a19c5a92 power: supply: sc2731_charger: Add missing MODULE_DEVICE_TABLE
d7897890ba power: supply: sc27xx: Add missing MODULE_DEVICE_TABLE
6ed9f9899b kcov: add __no_sanitize_coverage to fix noinstr for all architectures
ff53dfb323 lib/decompress_unlz4.c: correctly handle zero-padding around initrds.
b85b43c3e4 phy: intel: Fix for warnings due to EMMC clock 175Mhz change in FIP
a9d986be49 i2c: core: Disable client irq on reboot/shutdown
9c6c657047 intel_th: Wait until port is in reset before programming it
45f1de1fff staging: rtl8723bs: fix macro value for 2.4Ghz only device
1f577093c8 leds: turris-omnia: add missing MODULE_DEVICE_TABLE
ff8f11860e ALSA: firewire-motu: fix detection for S/PDIF source on optical interface in v2 protocol
9ada4baae6 ALSA: usb-audio: scarlett2: Fix 6i6 Gen 2 line out descriptions
fb7c8bfa2e ALSA: hda: Add IRQ check for platform_get_irq()
63c49cfa2f backlight: lm3630a: Fix return code of .update_status() callback
719c45a41c ASoC: Intel: kbl_da7219_max98357a: shrink platform_id below 20 characters
692e16958f powerpc/boot: Fixup device-tree on little endian
b41cb0e4af usb: gadget: hid: fix error return code in hid_bind()
309b44d316 usb: gadget: f_hid: fix endianness issue with descriptors
16668cc656 ALSA: usb-audio: scarlett2: Fix scarlett2_*_ctl_put() return values
3005d48b40 ALSA: usb-audio: scarlett2: Fix data_mutex lock
8f075c61ea ALSA: usb-audio: scarlett2: Fix 18i8 Gen 2 PCM Input count
3b7bd795cb ALSA: bebob: add support for ToneWeal FW66
90cd79aa9a Input: hideep - fix the uninitialized use in hideep_nvm_unlock()
a50b56ffc0 s390/mem_detect: fix tprot() program check new psw handling
7e1e0235b3 s390/mem_detect: fix diag260() program check new psw handling
c25be19aa9 s390/ipl_parm: fix program check new psw handling
3794633dfd s390/processor: always inline stap() and __load_psw_mask()
f22649cf90 habanalabs: remove node from list before freeing the node
25ddb0a42f habanalabs/gaudi: set the correct cpu_id on MME2_QM failure
3dd2a9daa7 ASoC: soc-core: Fix the error return code in snd_soc_of_parse_audio_routing()
8e18158ea7 powerpc/mm/book3s64: Fix possible build error
ed0b4b56a9 gpio: pca953x: Add support for the On Semi pca9655
b7f4423c7d selftests/powerpc: Fix "no_handler" EBB selftest
c7f2112e7a ALSA: ppc: fix error return code in snd_pmac_probe()
1004c52e3c scsi: storvsc: Correctly handle multiple flags in srb_status
b3d3a2466e gpio: zynq: Check return value of irq_get_irq_data
3d2b0818da gpio: zynq: Check return value of pm_runtime_get_sync
71f8d7fbfe ASoC: soc-pcm: fix the return value in dpcm_apply_symmetry()
43d1aaa196 iommu/arm-smmu: Fix arm_smmu_device refcount leak in address translation
c4007596fb iommu/arm-smmu: Fix arm_smmu_device refcount leak when arm_smmu_rpm_get fails
f8763ab3fb powerpc/ps3: Add dma_mask to ps3_dma_region
0e54f8ee6b ALSA: sb: Fix potential double-free of CSP mixer elements
52d242f2bf selftests: timers: rtcpie: skip test if default RTC device does not exist
7b18f26d82 s390: disable SSP when needed
78cddc9aa6 s390/sclp_vt220: fix console name to match device
2f4e7363a9 serial: tty: uartlite: fix console setup
dc9db46292 fsi: Add missing MODULE_DEVICE_TABLE
0c67c2e203 ASoC: img: Fix PM reference leak in img_i2s_in_probe()
af8b891cd3 mfd: cpcap: Fix cpcap dmamask not set warnings
d339f6a0d1 mfd: da9052/stmpe: Add and modify MODULE_DEVICE_TABLE
5dd2955565 scsi: qedi: Fix cleanup session block/unblock use
6f36afa155 scsi: qedi: Fix TMF session block/unblock use
57fa983ea7 scsi: qedi: Fix race during abort timeouts
afa1c8ee7e scsi: qedi: Fix null ref during abort handling
fa7adae4b5 scsi: iscsi: Fix shost->max_id use
89812e7957 scsi: iscsi: Fix conn use after free during resets
21962a5dd6 scsi: iscsi: Add iscsi_cls_conn refcount helpers
5ac2428f2b scsi: megaraid_sas: Handle missing interrupts while re-enabling IRQs
422fb12054 scsi: megaraid_sas: Early detection of VD deletion through RaidMap update
0680db6f41 scsi: megaraid_sas: Fix resource leak in case of probe failure
c851de0215 fs/jfs: Fix missing error code in lmLogInit()
7207cd708e scsi: scsi_dh_alua: Check for negative result value
6bad74b2b4 scsi: core: Fixup calling convention for scsi_mode_sense()
b4fd2ab0a9 scsi: mpt3sas: Fix deadlock while cancelling the running firmware event
7a80f71601 tty: serial: 8250: serial_cs: Fix a memory leak in error handling path
75452cc776 ALSA: ac97: fix PM reference leak in ac97_bus_remove()
664695a754 scsi: core: Cap scsi_host cmd_per_lun at can_queue
21d8b90cec scsi: lpfc: Fix crash when lpfc_sli4_hba_setup() fails to initialize the SGLs
2626d5ed6b scsi: lpfc: Fix "Unexpected timeout" error in direct attach topology
bb1d1c2149 scsi: hisi_sas: Propagate errors in interrupt_init_v1_hw()
0245504090 scsi: arcmsr: Fix doorbell status being updated late on ARC-1886
20c62caf2e w1: ds2438: fixing bug that would always get page0
8e8d910e9a usb: common: usb-conn-gpio: fix NULL pointer dereference of charger
b30a115e4a Revert "ALSA: bebob/oxfw: fix Kconfig entry for Mackie d.2 Pro"
7d7f30cf18 ALSA: usx2y: Don't call free_pages_exact() with NULL address
f4997bf6c4 ALSA: usx2y: Avoid camelCase
1b1d6aa1a8 iio: magn: bmc150: Balance runtime pm + use pm_runtime_resume_and_get()
5ecb0acc45 iio: gyro: fxa21002c: Balance runtime pm + use pm_runtime_resume_and_get().
5f69841c22 partitions: msdos: fix one-byte get_unaligned()
a8c3d1a515 ASoC: intel/boards: add missing MODULE_DEVICE_TABLE
58f69684ba misc: alcor_pci: fix null-ptr-deref when there is no PCI bridge
b9c87ce3bc misc/libmasm/module: Fix two use after free in ibmasm_init_one
4f3c807739 serial: fsl_lpuart: disable DMA for console and fix sysrq
6942fbc009 tty: serial: fsl_lpuart: fix the potential risk of division or modulo by zero
35a35909ec rcu: Reject RCU_LOCKDEP_WARN() false positives
23597afbe0 srcu: Fix broken node geometry after early ssp init
4d395142d9 scsi: arcmsr: Fix the wrong CDB payload report to IOP
22d22fef9c dmaengine: fsl-qdma: check dma_set_mask return value
3206433070 ASoC: Intel: sof_sdw: add mutual exclusion between PCH DMIC and RT715
164a3880a7 leds: tlc591xx: fix return value check in tlc591xx_probe()
9ebcc60565 net: bridge: multicast: fix MRD advertisement router port marking race
664cc645bd net: bridge: multicast: fix PIM hello router port marking race
b3aea76efe Revert "drm/ast: Remove reference to struct drm_device.pdev"
b3f8120039 drm/ingenic: Switch IPU plane to type OVERLAY
8f6dcc4dd7 drm/ingenic: Fix non-OSD mode
cae871baa4 drm/dp_mst: Add missing drm parameters to recently added call to drm_dbg_kms()
16fb4e9c39 drm/dp_mst: Avoid to mess up payload table by ports in stale topology
3462bc8b1a drm/dp_mst: Do not set proposed vcpi directly
087bff9acd fbmem: Do not delete the mode that is still in use
811763e3be cgroup: verify that source is a string
0728df8048 drm/i915/gt: Fix -EDEADLK handling regression
81dd2d60f6 drm/i915/gtt: drop the page table optimisation
905169794d tracing: Do not reference char * as a string in histograms
e1261c7a84 scsi: zfcp: Report port fc_security as unknown early during remote cable pull
ea518b70ed scsi: core: Fix bad pointer dereference when ehandler kthread is invalid
8b2ae2de53 KVM: X86: Disable hardware breakpoints unconditionally before kvm_x86->run()
bedc5d0911 KVM: nSVM: Check the value written to MSR_VM_HSAVE_PA
5b779e597c KVM: x86/mmu: Do not apply HPA (memory encryption) mask to GPAs
1a1a5e4409 KVM: x86: Use guest MAXPHYADDR from CPUID.0x8000_0008 iff TDP is enabled
679837dc0a KVM: mmio: Fix use-after-free Read in kvm_vm_ioctl_unregister_coalesced_mmio
72797ffca1 cifs: handle reconnect of tcon when there is no cached dfs referral
b93f949942 certs: add 'x509_revocation_list' to gitignore
ff7ea0d0e9 Revert "media: subdev: disallow ioctl for saa6588/davinci"
8db62be3c3 Merge 5.10.51 into android12-5.10-lts
f682613465 Linux 5.10.51
8678660301 f2fs: fix to avoid racing on fsync_entry_slab by multi filesystem instances
5e4f5138bd ext4: fix memory leak in ext4_fill_super
3780348c1a smackfs: restrict bytes count in smk_set_cipso()
8018936950 jfs: fix GPF in diFree
fcb041ca5c drm/ast: Remove reference to struct drm_device.pdev
3785f3c1e3 pinctrl: mcp23s08: Fix missing unlock on error in mcp23s08_irq()
b716ccffbc dm writecache: write at least 4k when committing
090588059c io_uring: fix clear IORING_SETUP_R_DISABLED in wrong function
aa57b2d6b3 media: uvcvideo: Fix pixel format change for Elgato Cam Link 4K
31874b6b63 media: gspca/sunplus: fix zero-length control requests
de95c0bd79 media: gspca/sq905: fix control-request direction
c57bfd8000 media: zr364xx: fix memory leak in zr364xx_start_readpipe
dbd58d3978 media: dtv5100: fix control-request directions
db317a3722 media: subdev: disallow ioctl for saa6588/davinci
e2c1218ddc PCI: aardvark: Implement workaround for the readback value of VEND_ID
1309197089 PCI: aardvark: Fix checking for PIO Non-posted Request
f147115018 PCI: Leave Apple Thunderbolt controllers on for s2idle or standby
ba47e65a5d dm btree remove: assign new_root only when removal succeeds
1b5918b087 dm writecache: flush origin device when writing and cache is full
cbc03ffec2 dm zoned: check zone capacity
35c1c4bd2d coresight: tmc-etf: Fix global-out-of-bounds in tmc_update_etf_buffer()
048624ad56 coresight: Propagate symlink failure
0c2bc14891 ipack/carriers/tpci200: Fix a double free in tpci200_pci_probe
eb81b5a37d tracing: Resize tgid_map to pid_max, not PID_MAX_DEFAULT
3cda5b7f4e tracing: Simplify & fix saved_tgids logic
8cc58a6e2c rq-qos: fix missed wake-ups in rq_qos_throttle try two
f9fb4986f4 seq_buf: Fix overflow in seq_buf_putmem_hex()
418b333afb extcon: intel-mrfld: Sync hardware and software state on init
af092ec16e selftests/lkdtm: Fix expected text for CR4 pinning
0af643fa7e lkdtm/bugs: XFAIL UNALIGNED_LOAD_STORE_WRITE
baedb1f5a0 nvmem: core: add a missing of_node_put
f0a079c0ba mfd: syscon: Free the allocated name field of struct regmap_config
a8a2e506ea power: supply: ab8500: Fix an old bug
38dde03eb2 ubifs: Fix races between xattr_{set|get} and listxattr operations
690a11fb4e thermal/drivers/int340x/processor_thermal: Fix tcc setting
ef5066f95c ipmi/watchdog: Stop watchdog timer when the current action is 'none'
7ade84f8df qemu_fw_cfg: Make fw_cfg_rev_attr a proper kobj_attribute
02671eda9a i40e: fix PTP on 5Gb links
ab9d7c5fc9 ASoC: tegra: Set driver_name=tegra for all machine drivers
e0d9beb44a fpga: stratix10-soc: Add missing fpga_mgr_free() call
5a5ebf5d48 clocksource/arm_arch_timer: Improve Allwinner A64 timer workaround
b5e26be407 cpu/hotplug: Cure the cpusets trainwreck
a11a457820 arm64: tlb: fix the TTL value of tlb_get_level
0afa6ad0c4 ata: ahci_sunxi: Disable DIPM
5543f61e2e mmc: core: Allow UHS-I voltage switch for SDSC cards if supported
b53b0ca4a4 mmc: core: clear flags before allowing to retune
658f58189a mmc: sdhci: Fix warning message when accessing RPMB in HS400 mode
5ced01c0e8 mmc: sdhci-acpi: Disable write protect detection on Toshiba Encore 2 WT8-B
3f9c2a058e drm/i915/display: Do not zero past infoframes.vsc
8abf5eec0e drm/nouveau: Don't set allow_fb_modifiers explicitly
42a333ea4b drm/arm/malidp: Always list modifiers
0bcc074f90 drm/msm/mdp4: Fix modifier support enabling
4d61ddd740 drm/tegra: Don't set allow_fb_modifiers explicitly
c601693617 drm/amd/display: Reject non-zero src_y and src_x for video planes
7d30538894 pinctrl/amd: Add device HID for new AMD GPIO controller
b13574fa83 drm/amd/display: fix incorrrect valid irq check
3c8216b350 drm/rockchip: dsi: remove extra component_del() call
2998599fb1 drm/dp: Handle zeroed port counts in drm_dp_read_downstream_info()
98bd09d928 drm/vc4: hdmi: Prevent clock unbalance
a2b8835cb4 drm/vc4: crtc: Skip the TXP
293e520d20 drm/vc4: txp: Properly set the possible_crtcs mask
0d50d93d05 drm/radeon: Call radeon_suspend_kms() in radeon_pci_shutdown() for Loongson64
7aa28f2f67 drm/radeon: Add the missed drm_gem_object_put() in radeon_user_framebuffer_create()
2674ffcad0 drm/amdgpu: enable sdma0 tmz for Raven/Renoir(V2)
8f933b27cb drm/amdgpu: Update NV SIMD-per-CU to 2
97ebbfe445 powerpc/powernv/vas: Release reference to tgid during window close
a024e88f8a powerpc/barrier: Avoid collision with clang's __lwsync macro
d2e52d4664 powerpc/mm: Fix lockup on kernel exec fault
4ad382bc4a scsi: iscsi: Fix iSCSI cls conn state
221b7e1e76 scsi: iscsi: Fix race condition between login and sync thread
9073188835 io_uring: convert io_buffer_idr to XArray
c5a50a220a io_uring: Convert personality_idr to XArray
cb2985feb1 io_uring: simplify io_remove_personalities()
7d4f961588 mm,hwpoison: return -EBUSY when migration fails
fd6625a1ec loop: fix I/O error on fsync() in detached loop devices
88f0bc830c arm64: dts: rockchip: Enable USB3 for rk3328 Rock64
421aff50af arm64: dts: rockchip: add rk3328 dwc3 usb controller node
8eb12fa96b ath11k: unlock on error path in ath11k_mac_op_add_interface()
9706c53433 MIPS: MT extensions are not available on MIPS32r1
6cf2e905b1 selftests/resctrl: Fix incorrect parsing of option "-t"
10f8fca676 MIPS: set mips32r5 for virt extensions
ff4762bcb9 MIPS: loongsoon64: Reserve memory below starting pfn to prevent Oops
6ef81a5c0e sctp: add size validation when walking chunks
d4dbef7046 sctp: validate from_addr_param return
e83f312114 flow_offload: action should not be NULL when it is referenced
a61af01141 bpf: Fix false positive kmemleak report in bpf_ringbuf_area_alloc()
20285dc271 sched/fair: Ensure _sum and _avg values stay consistent
e2296a4365 Bluetooth: btusb: fix bt fiwmare downloading failure issue for qca btsoc.
8d7a3989c1 Bluetooth: mgmt: Fix the command returns garbage parameter value
05298f1733 Bluetooth: btusb: Add support USB ALT 3 for WBS
cc49ab24ec Bluetooth: L2CAP: Fix invalid access on ECRED Connection response
79a3130864 Bluetooth: L2CAP: Fix invalid access if ECRED Reconfigure fails
c4a9967e4d Bluetooth: btusb: Add a new QCA_ROME device (0cf3:e500)
60789afc02 Bluetooth: Shutdown controller after workqueues are flushed or cancelled
5147d86c4a Bluetooth: Fix alt settings for incoming SCO with transparent coding format
8f939b4c25 Bluetooth: Fix the HCI to MGMT status conversion table
5f5f8022c1 Bluetooth: btusb: Fixed too many in-token issue for Mediatek Chip.
3d08b59179 RDMA/cma: Fix rdma_resolve_route() memory leak
a8585fdf42 net: ip: avoid OOM kills with large UDP sends over loopback
04177aa99a media, bpf: Do not copy more entries than user space requested
d8bb134d80 IB/isert: Align target max I/O size to initiator size
d330f5f8df mac80211_hwsim: add concurrent channels scanning support over virtio
97f0677226 mac80211: consider per-CPU statistics if present
1b728869a1 cfg80211: fix default HE tx bitrate mask in 2G band
0a7ba5d373 wireless: wext-spy: Fix out-of-bounds warning
c1ad55b6a1 sfc: error code if SRIOV cannot be disabled
1013dc896d sfc: avoid double pci_remove of VFs
7cd6986f2d iwlwifi: pcie: fix context info freeing
b98ec6d8b3 iwlwifi: pcie: free IML DMA memory allocation
78eadadff3 iwlwifi: mvm: fix error print when session protection ends
1e1bb1efd6 iwlwifi: mvm: don't change band on bound PHY contexts
1df3603039 RDMA/rxe: Don't overwrite errno from ib_umem_get()
ee33c042f4 vsock: notify server to shutdown when client has pending signal
38bc2ebf34 atm: nicstar: register the interrupt handler in the right place
90efb7f100 atm: nicstar: use 'dma_free_coherent' instead of 'kfree'
1d304c7ddd net: fec: add ndo_select_queue to fix TX bandwidth fluctuations
c7a31ae63e MIPS: add PMD table accounting into MIPS'pmd_alloc_one
50ce920fe1 rtl8xxxu: Fix device info for RTL8192EU devices
a10e871b73 mt76: mt7915: fix IEEE80211_HE_PHY_CAP7_MAX_NC for station mode
4cd713e48c drm/amdkfd: Walk through list with dqm lock hold
a2122e0792 drm/amdgpu: fix bad address translation for sienna_cichlid
932be4cf2b io_uring: fix false WARN_ONCE
92a9fb51e5 net: sched: fix error return code in tcf_del_walker()
d2801d1118 net: ipa: Add missing of_node_put() in ipa_firmware_load()
5cc0cf735f net: fix mistake path for netdev_features_strings
891db094a0 mt76: mt7615: fix fixed-rate tx status reporting
090b06b25a ice: mark PTYPE 2 as reserved
b88a907830 ice: fix incorrect payload indicator on PTYPE
2e66c36f13 bpf: Fix up register-based shifts in interpreter to silence KUBSAN
0e72b151e3 drm/amdkfd: Fix circular lock in nocpsch path
cd29db48bb drm/amdkfd: fix circular locking on get_wave_state
9d21abc8fd cw1200: add missing MODULE_DEVICE_TABLE
c5e4a10d7b wl1251: Fix possible buffer overflow in wl1251_cmd_scan
5a3d373c4a wlcore/wl12xx: Fix wl12xx get_mac error if device is in ELP
ad7083a95d dm writecache: commit just one block, not a full page
57f7ed25bd xfrm: Fix error reporting in xfrm_state_construct.
a5f8862967 drm/amd/display: Verify Gamma & Degamma LUT sizes in amdgpu_dm_atomic_check
db3c3643d5 r8169: avoid link-up interrupt issue on RTL8106e if user enables ASPM
f38371821c selinux: use __GFP_NOWARN with GFP_NOWAIT in the AVC
0a244be95b fjes: check return value after calling platform_get_resource()
378c156f9d drm/amdkfd: use allowed domain for vmbo validation
fb3b4bcdd3 net: sgi: ioc3-eth: check return value after calling platform_get_resource()
e613f67f1b selftests: Clean forgotten resources as part of cleanup()
8a4318c14a net: phy: realtek: add delay to fix RXC generation issue
c71de31b2e drm/amd/display: Fix off-by-one error in DML
afa06442d2 drm/amd/display: Set DISPCLK_MAX_ERRDET_CYCLES to 7
02f444321b drm/amd/display: Release MST resources on switch from MST to SST
01d6a69319 drm/amd/display: Update scaling settings on modeset
57c63b47d6 drm/amd/display: Fix DCN 3.01 DSCCLK validation
8e4da40142 net: moxa: Use devm_platform_get_and_ioremap_resource()
278dc34b71 net: micrel: check return value after calling platform_get_resource()
ce1307ec62 net: mvpp2: check return value after calling platform_get_resource()
49b3a7f38a net: bcmgenet: check return value after calling platform_get_resource()
92820a1282 net: mscc: ocelot: check return value after calling platform_get_resource()
f3b96f4b6b virtio_net: Remove BUG() to avoid machine dead
87c39048ec ice: fix clang warning regarding deadcode.DeadStores
e352556ace ice: set the value of global config lock timeout longer
b5f2982e06 pinctrl: mcp23s08: fix race condition in irq handler
a4a86400c6 net: bridge: mrp: Update ring transitions.
cc4f0a9d5a dm: Fix dm_accept_partial_bio() relative to zone management commands
939f750215 dm writecache: don't split bios when overwriting contiguous cache content
65e780667c dm space maps: don't reset space map allocation cursor when committing
313d9f2580 RDMA/cxgb4: Fix missing error code in create_qp()
f9c67c179e net: tcp better handling of reordering then loss cases
8fa6473a61 drm/amdgpu: remove unsafe optimization to drop preamble ib
c5b518f4b9 drm/amd/display: Avoid HDCP over-read and corruption
3c172f6e44 MIPS: ingenic: Select CPU_SUPPORTS_CPUFREQ && MIPS_EXTERNAL_TIMER
0903ac8f09 MIPS: cpu-probe: Fix FPU detection on Ingenic JZ4760(B)
8f939b7957 ipv6: use prandom_u32() for ID generation
c92298d228 virtio-net: Add validation for used length
5e039a80a7 drm: bridge: cdns-mhdp8546: Fix PM reference leak in
d1eaf4cb44 clk: tegra: Ensure that PLLU configuration is applied properly
dc5bacea94 clk: tegra: Fix refcounting of gate clocks
315988817a RDMA/rtrs: Change MAX_SESS_QUEUE_DEPTH
4f6a0f31c6 net: stmmac: the XPCS obscures a potential "PHY not found" error
a7d608bb78 drm: rockchip: add missing registers for RK3066
d89ea206e9 drm: rockchip: add missing registers for RK3188
e54b4a5348 net/mlx5: Fix lag port remapping logic
62137d1ae5 net/mlx5e: IPsec/rep_tc: Fix rep_tc_update_skb drops IPsec packet
219150485d clk: renesas: r8a77995: Add ZA2 clock
0680344d71 drm/bridge: cdns: Fix PM reference leak in cdns_dsi_transfer()
95f8ce9f18 igb: fix assignment on big endian machines
66d593aa3a igb: handle vlan types with checker enabled
ffb865715a e100: handle eeprom as little endian
f06ea024c1 drm/vc4: hdmi: Fix PM reference leak in vc4_hdmi_encoder_pre_crtc_co()
48c96d5bac drm/vc4: Fix clock source for VEC PixelValve on BCM2711
21bf141458 udf: Fix NULL pointer dereference in udf_symlink function
0687411e2a drm/sched: Avoid data corruptions
5ed8c298b2 drm/scheduler: Fix hang when sched_entity released
73ac001f06 pinctrl: equilibrium: Add missing MODULE_DEVICE_TABLE
1b832bd777 net/sched: cls_api: increase max_reclassify_loop
6ceb0182b0 net: mdio: provide shim implementation of devm_of_mdiobus_register
d2d17ca924 drm/virtio: Fix double free on probe failure
69a71b59b1 reiserfs: add check for invalid 1st journal block
c5073100dc drm/bridge: lt9611: Add missing MODULE_DEVICE_TABLE
b5713dac19 net: mdio: ipq8064: add regmap config to disable REGCACHE
c0dd36bcb6 drm/mediatek: Fix PM reference leak in mtk_crtc_ddp_hw_init()
3393405257 net: Treat __napi_schedule_irqoff() as __napi_schedule() on PREEMPT_RT
a7f7c42e31 atm: nicstar: Fix possible use-after-free in nicstar_cleanup()
b7ee9ae1e0 mISDN: fix possible use-after-free in HFC_cleanup()
e759ff76eb atm: iphase: fix possible use-after-free in ia_module_exit()
2292d9691c hugetlb: clear huge pte during flush function on mips platform
a74872106e clk: renesas: rcar-usb2-clock-sel: Fix error handling in .probe()
3ca86d44b9 drm/amd/display: fix use_max_lb flag for 420 pixel formats
5953b984c3 net: pch_gbe: Use proper accessors to BE data in pch_ptp_match()
fb960728f8 drm/bridge: nwl-dsi: Force a full modeset when crtc_state->active is changed to be true
796554d3d6 drm/vc4: fix argument ordering in vc4_crtc_get_margins()
b025bc07c9 drm/amd/amdgpu/sriov disable all ip hw status by default
fb7479d64d drm/amd/display: fix HDCP reset sequence on reinitialize
d055669e66 drm/ast: Fixed CVE for DP501
95c3133bc8 drm/zte: Don't select DRM_KMS_FB_HELPER
b60ae0fab5 drm/mxsfb: Don't select DRM_KMS_FB_HELPER
1328352dcd Merge branch 'android12-5.10' into `android12-5.10-lts`
a3cd27f5c4 ANDROID: GKI: fix up crc change in ip.h
11b396dfd9 Revert "Add a reference to ucounts for each cred"
049c7d395d Revert "cred: add missing return error code when set_cred_ucounts() failed"
cf08d2746d Revert "Bluetooth: Fix Set Extended (Scan Response) Data"
2df0fb4a4b Merge 5.10.50 into android12-5.10-lts
43b0742ef4 Linux 5.10.50
bdf4d33e83 powerpc/preempt: Don't touch the idle task's preempt_count during hotplug
9b07d817f7 iommu/dma: Fix compile warning in 32-bit builds
0855952ed4 cred: add missing return error code when set_cred_ucounts() failed
ce04375e2d s390: preempt: Fix preempt_count initialization
e4a577d617 crypto: qce - fix error return code in qce_skcipher_async_req_handle()
fb0c0a04e4 scsi: core: Retry I/O for Notify (Enable Spinup) Required error
2b541b6c74 media: exynos4-is: remove a now unused integer
2b58f5154a mmc: vub3000: fix control-request direction
39ac3e1945 mmc: block: Disable CMDQ on the ioctl path
a75457f630 io_uring: fix blocking inline submission
c98d9318dc block: return the correct bvec when checking for gaps
51c19f4a62 erofs: fix error return code in erofs_read_superblock()
97cbddc8a2 tpm: Replace WARN_ONCE() with dev_err_once() in tpm_tis_status()
b5a2b5b642 fscrypt: fix derivation of SipHash keys on big endian CPUs
089057af71 fscrypt: don't ignore minor_hash when hash is 0
b9c3b48559 mailbox: qcom-ipcc: Fix IPCC mbox channel exhaustion
2a7c96c2e2 scsi: target: cxgbit: Unmap DMA buffer before calling target_execute_cmd()
e2e615e631 scsi: fc: Correct RHBA attributes length
5c6956e664 exfat: handle wrong stream entry size in exfat_readdir()
b6a41435c8 csky: syscache: Fixup duplicate cache flush
3483e1a41c csky: fix syscache.c fallthrough warning
8ff266de89 perf llvm: Return -ENOMEM when asprintf() fails
58fa4b36ab selftests/vm/pkeys: refill shadow register after implicit kernel write
1dd18fda3e selftests/vm/pkeys: handle negative sys_pkey_alloc() return code
92125cb883 selftests/vm/pkeys: fix alloc_random_pkey() to make it really, really random
456554040e lib/math/rational.c: fix divide by zero
787f4e7a7d mm/z3fold: use release_z3fold_page_locked() to release locked z3fold page
0fe11b79c2 mm/z3fold: fix potential memory leak in z3fold_destroy_pool()
555dffa484 include/linux/huge_mm.h: remove extern keyword
ebd6a295b5 hugetlb: remove prep_compound_huge_page cleanup
2e16ad5611 mm/hugetlb: remove redundant check in preparing and destroying gigantic page
0da83a815d mm/hugetlb: use helper huge_page_order and pages_per_huge_page
31be4ea35c mm/huge_memory.c: don't discard hugepage if other processes are mapping it
b65597377b mm/huge_memory.c: add missing read-only THP checking in transparent_hugepage_enabled()
aa41f7a2a6 mm/huge_memory.c: remove dedicated macro HPAGE_CACHE_INDEX_MASK
9b0b9edea1 mm/pmem: avoid inserting hugepage PTE entry with fsdax if hugepage support is disabled
0885ea1d47 vfio/pci: Handle concurrent vma faults
363d85bfae arm64: dts: marvell: armada-37xx: Fix reg for standard variant of UART
347af865b6 serial: mvebu-uart: correctly calculate minimal possible baudrate
9ad82f0412 serial: mvebu-uart: do not allow changing baudrate when uartclk is not available
dd6d4e92e7 ALSA: firewire-lib: Fix 'amdtp_domain_start()' when no AMDTP_OUT_STREAM stream is found
53fa3ba808 powerpc/papr_scm: Make 'perf_stats' invisible if perf-stats unavailable
04db493fc7 powerpc/64s: Fix copy-paste data exposure into newly created tasks
ac08ba518c powerpc/papr_scm: Properly handle UUID types and API
d3358c66ee powerpc: Offline CPU in stop_this_cpu()
9443acbd25 serial: 8250: 8250_omap: Fix possible interrupt storm on K3 SoCs
e0e3e0b7da serial: 8250: 8250_omap: Disable RX interrupt after DMA enable
786461739a selftests/ftrace: fix event-no-pid on 1-core machine
57e49a0bc4 leds: ktd2692: Fix an error handling path
53cb671592 leds: as3645a: Fix error return code in as3645a_parse_node()
f3bf888507 ASoC: fsl_spdif: Fix unexpected interrupt after suspend
2938ffd568 ASoC: Intel: sof_sdw: add SOF_RT715_DAI_ID_FIX for AlderLake
5f2dfce8d8 configfs: fix memleak in configfs_release_bin_file
e30e636447 ASoC: atmel-i2s: Fix usage of capture and playback at the same time
af497961ab powerpc/powernv: Fix machine check reporting of async store errors
f8d223f80a extcon: max8997: Add missing modalias string
e16fcc8374 extcon: sm5502: Drop invalid register write in sm5502_reg_data
25c7efb387 phy: ti: dm816x: Fix the error handling path in 'dm816x_usb_phy_probe()
6398fc0e57 phy: uniphier-pcie: Fix updating phy parameters
a4b7c0af61 soundwire: stream: Fix test for DP prepare complete
b0be06493e scsi: mpt3sas: Fix error return value in _scsih_expander_add()
f51088868b habanalabs: Fix an error handling path in 'hl_pci_probe()'
c183b55ed7 mtd: rawnand: marvell: add missing clk_disable_unprepare() on error in marvell_nfc_resume()
f929d21af7 of: Fix truncation of memory sizes on 32-bit platforms
db45ea8767 ASoC: cs42l42: Correct definition of CS42L42_ADC_PDN_MASK
55bb225c08 iio: prox: isl29501: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
4973967504 iio: light: vcnl4035: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
c850b52e47 serial: 8250: Actually allow UPF_MAGIC_MULTIPLIER baud rates
5db39ad3fa staging: mt7621-dts: fix pci address for PCI memory range
c5cd4b74fd coresight: core: Fix use of uninitialized pointer
58c0621c44 staging: rtl8712: fix memory leak in rtl871x_load_fw_cb
bf5d6f6979 staging: rtl8712: fix error handling in r871xu_drv_init
7bc3fa5db4 staging: gdm724x: check for overflow in gdm_lte_netif_rx()
f937370610 staging: gdm724x: check for buffer overflow in gdm_lte_multi_sdu_pkt()
14106b90e1 ASoC: fsl_spdif: Fix error handler with pm_runtime_enable
9cf11dca57 iio: light: vcnl4000: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
8c85c0f8cb iio: magn: rm3100: Fix alignment of buffer in iio_push_to_buffers_with_timestamp()
c923e9effe iio: adc: ti-ads8688: Fix alignment of buffer in iio_push_to_buffers_with_timestamp()
15634d6dce iio: adc: mxs-lradc: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
a6a1e347c7 iio: adc: hx711: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
2abfdd6132 iio: adc: at91-sama5d2: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
2abfa52947 thunderbolt: Bond lanes only when dual_link_port != NULL in alloc_dev_default()
36f60700f9 eeprom: idt_89hpesx: Restore printing the unsupported fwnode name
fc8ab06001 eeprom: idt_89hpesx: Put fwnode in matching case during ->probe()
c7188d1998 usb: dwc2: Don't reset the core after setting turnaround time
8aa1cb46b7 usb: gadget: f_fs: Fix setting of device and driver data cross-references
d4d3cd4c76 ASoC: mediatek: mtk-btcvsd: Fix an error handling path in 'mtk_btcvsd_snd_probe()'
92a30a90d6 ASoC: rt5682-sdw: set regcache_cache_only false before reading RT5682_DEVICE_ID
696cfb2567 ASoC: rt5682: fix getting the wrong device id when the suspend_stress_test
8ef111db5e ASoC: rt715-sdw: use first_hw_init flag on resume
36dc6957f7 ASoC: rt711-sdw: use first_hw_init flag on resume
de77f9d92c ASoC: rt700-sdw: use first_hw_init flag on resume
e1456cba8e ASoC: rt5682-sdw: use first_hw_init flag on resume
16674ae3b2 ASoC: rt1308-sdw: use first_hw_init flag on resume
dc15216412 ASoC: max98373-sdw: use first_hw_init flag on resume
45a3d00eaf iommu/dma: Fix IOVA reserve dma ranges
ad73683815 selftests: splice: Adjust for handler fallback removal
045c29902f s390: appldata depends on PROC_SYSCTL
485b1c02b5 s390: enable HAVE_IOREMAP_PROT
d65f69deac s390/irq: select HAVE_IRQ_EXIT_ON_IRQ_STACK
d8fe62cb91 iommu/amd: Fix extended features logging
bd95a3e159 visorbus: fix error return code in visorchipset_init()
e5a3a3108f fsi/sbefifo: Fix reset timeout
4a95eb0c80 fsi/sbefifo: Clean up correct FIFO when receiving reset request from SBE
719c4db394 fsi: occ: Don't accept response from un-initialized OCC
af3d7f9e26 fsi: scom: Reset the FSI2PIB engine for any error
446eed9c85 fsi: core: Fix return of error values on failures
d22bef4101 mfd: rn5t618: Fix IRQ trigger by changing it to level mode
9b8bfdbc7e mfd: mp2629: Select MFD_CORE to fix build error
68f2f83f6f scsi: iscsi: Flush block work before unblock
b5371faa06 scsi: FlashPoint: Rename si_flags field
0b2f741113 leds: lp50xx: Put fwnode in error case during ->probe()
5f7bda9ba8 leds: lm3697: Don't spam logs when probe is deferred
8fc7d4a3f0 leds: lm3692x: Put fwnode in any case during ->probe()
c54ad49e2f leds: lm36274: Put fwnode in error case during ->probe()
1ed9133171 leds: lm3532: select regmap I2C API
b504e279e5 leds: class: The -ENOTSUPP should never be seen by user space
58279b341b tty: nozomi: Fix the error handling path of 'nozomi_card_init()'
28c947b072 firmware: stratix10-svc: Fix a resource leak in an error handling path
1e1b9cd400 char: pcmcia: error out if 'num_bytes_read' is greater than 4 in set_protocol()
5a766253e3 staging: mmal-vchiq: Fix incorrect static vchiq_instance.
cf05986cc4 mtd: rawnand: arasan: Ensure proper configuration for the asserted target
2f8824f556 mtd: partitions: redboot: seek fis-index-block in the right node
a16eae11f0 perf scripting python: Fix tuple_set_u64()
201b975c16 Input: hil_kbd - fix error return code in hil_dev_connect()
000c70680d ASoC: rsnd: tidyup loop on rsnd_adg_clk_query()
2e1d76c3b9 backlight: lm3630a_bl: Put fwnode in error case during ->probe()
d4ebf352a7 ASoC: hisilicon: fix missing clk_disable_unprepare() on error in hi6210_i2s_startup()
2541d78f78 ASoC: rk3328: fix missing clk_disable_unprepare() on error in rk3328_platform_probe()
246b4f1e20 iio: potentiostat: lmp91000: Fix alignment of buffer in iio_push_to_buffers_with_timestamp()
71dbba0b95 iio: cros_ec_sensors: Fix alignment of buffer in iio_push_to_buffers_with_timestamp()
1fa3107759 iio: chemical: atlas: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
93a5538d50 iio: light: tcs3472: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
daecb8c0a1 iio: light: tcs3414: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
293b8246a0 iio: light: isl29125: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
f960139a3c iio: magn: bmc150: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
e5e102f4b5 iio: magn: hmc5843: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
4613232e0b iio: prox: as3935: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
ab16be53b2 iio: prox: pulsed-light: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
8e23dd6236 iio: prox: srf08: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
c61ac1f83b iio: humidity: am2315: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
8ea878287c iio: gyro: bmg160: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
718a67a909 iio: adc: vf610: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
df5343bd59 iio: adc: ti-ads1015: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
f0bc78df4a iio: accel: stk8ba50: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
cd62282a51 iio: accel: stk8312: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
638ba5aa15 iio: accel: mxc4005: Fix overread of data and alignment issue.
0d220d40b3 iio: accel: kxcjk-1013: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
9eb5fb66b6 iio: accel: hid: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
4b362443dc iio: accel: bma220: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
3cca4db5f7 iio: accel: bma180: Fix buffer alignment in iio_push_to_buffers_with_timestamp()
2edfba8a55 iio: adis16475: do not return ints in irq handlers
92efd6396e iio: adis16400: do not return ints in irq handlers
2e41116e6e iio: adis_buffer: do not return ints in irq handlers
67d88b7bf6 mwifiex: re-fix for unaligned accesses
460bee9009 tty: nozomi: Fix a resource leak in an error handling function
f5186bd17f serial: 8250_omap: fix a timeout loop condition
5dcff72fe4 serial: fsl_lpuart: remove RTSCTS handling from get_mctrl()
685d53abc9 serial: fsl_lpuart: don't modify arbitrary data on lpuart32
728f23e53c rcu: Invoke rcu_spawn_core_kthreads() from rcu_spawn_gp_kthread()
66111dfe22 ASoC: rt5682: Disable irq on shutdown
8b195380cd staging: fbtft: Don't spam logs when probe is deferred
7a42f3c30d staging: fbtft: Rectify GPIO handling
d8c1504cf1 MIPS: Fix PKMAP with 32-bit MIPS huge page support
a23ba98e91 RDMA/core: Always release restrack object
a938d4e8c6 RDMA/mlx5: Don't access NULL-cleared mpi pointer
c6965316d6 net: tipc: fix FB_MTU eat two pages
1148952dc6 net: sched: fix warning in tcindex_alloc_perfect_hash
4476568069 net: lwtunnel: handle MTU calculation in forwading
6939c39a41 writeback: fix obtain a reference to a freeing memcg css
4c3e839bfd clk: si5341: Update initialization magic
55aaba36d7 clk: si5341: Check for input clock presence and PLL lock on startup
42ac32d834 clk: si5341: Avoid divide errors due to bogus register contents
043637617d clk: si5341: Wait for DEVICE_READY on startup
29746bd0f7 clk: qcom: clk-alpha-pll: fix CAL_L write in alpha_pll_fabia_prepare
94221679ee clk: actions: Fix AHPPREDIV-H-AHB clock chain on Owl S500 SoC
f3b6df5dfd clk: actions: Fix bisp_factor_table based clocks on Owl S500 SoC
ced193bc08 clk: actions: Fix SD clocks factor table on Owl S500 SoC
12d2d6fd11 clk: actions: Fix UART clock dividers on Owl S500 SoC
7d97522e6e Bluetooth: Fix handling of HCI_LE_Advertising_Set_Terminated event
4f5fc3be2c Bluetooth: Fix Set Extended (Scan Response) Data
c5fedfcc20 Bluetooth: Fix not sending Set Extended Scan Response
a2dcad039e Bluetooth: mgmt: Fix slab-out-of-bounds in tlv_data_is_valid
d28e780431 Revert "be2net: disable bh with spin_lock in be_process_mcc"
342b06b600 gve: Fix swapped vars when fetching max queues
11044f8c2c RDMA/cma: Fix incorrect Packet Lifetime calculation
fc2ea819b9 bpfilter: Specify the log level for the kmsg message
4228c00e14 net: dsa: sja1105: fix NULL pointer dereference in sja1105_reload_cbs()
393d48b3de e1000e: Check the PCIm state
cebff3d9f7 ipv6: fix out-of-bound access in ip6_parse_tlv()
9e753c47b9 net: atlantic: fix the macsec key length
1c95d4d432 net: phy: mscc: fix macsec key length
711a28d24d net: macsec: fix the length used to copy the key for offloading
c764f2d899 RDMA/cma: Protect RMW with qp_mutex
d52ceed845 ibmvnic: free tx_pool if tso_pool alloc fails
f25accc4fd ibmvnic: set ltb->buff to NULL after freeing
3f85d2ca32 Revert "ibmvnic: remove duplicate napi_schedule call in open function"
cad22e48ca i40e: Fix missing rtnl locking when setting up pf switch
d9a5d19706 i40e: Fix autoneg disabling for non-10GBaseT links
88819239e9 i40e: Fix error handling in i40e_vsi_open
aefa927744 bpf: Do not change gso_size during bpf_skb_change_proto()
0dac8b0ad0 can: j1939: j1939_sk_setsockopt(): prevent allocation of j1939 filter for optlen == 0
2d58a38275 ipv6: exthdrs: do not blindly use init_net
b559d003f0 net: bcmgenet: Fix attaching to PYH failed on RPi 4B
514c96bf65 mac80211: remove iwlwifi specific workaround NDPs of null_response
1b3985aa53 drm/msm/dpu: Fix error return code in dpu_mdss_init()
134a561aee drm/msm: Fix error return code in msm_drm_init()
f97b9c4c07 bpf: Fix null ptr deref with mixed tail calls and subprogs
56c31bc9aa ieee802154: hwsim: avoid possible crash in hwsim_del_edge_nl()
71a345ede5 ieee802154: hwsim: Fix memory leak in hwsim_add_one
4b44486b8b tc-testing: fix list handling
997285646a net: ti: am65-cpsw-nuss: Fix crash when changing number of TX queues
6610d5a73b net/ipv4: swap flow ports when validating source
c3fcfc4e36 ip6_tunnel: fix GRE6 segmentation
162e75687e vxlan: add missing rcu_read_lock() in neigh_reduce()
6cd23b5f40 rtw88: 8822c: fix lc calibration timing
db2386fa43 iwlwifi: increase PNVM load timeout
78e6587585 xfrm: Fix xfrm offload fallback fail case
5c8e5fecea pkt_sched: sch_qfq: fix qfq_change_class() error path
b2ce4ebdd9 netfilter: nf_tables_offload: check FLOW_DISSECTOR_KEY_BASIC in VLAN transfer logic
581e37ad5c tls: prevent oversized sendfile() hangs by ignoring MSG_MORE
e7c3ae4797 net: sched: add barrier to ensure correct ordering for lockless qdisc
ca9b5ab791 vrf: do not push non-ND strict packets with a source LLA through packet taps again
e72d9e4b98 net: ethernet: ezchip: fix error handling
161f8b73da net: ethernet: ezchip: fix UAF in nps_enet_remove
f026d82211 net: ethernet: aeroflex: fix UAF in greth_of_remove
6a8c7c5c07 mt76: mt7615: fix NULL pointer dereference in tx_prepare_skb()
6987ee9bf0 mt76: fix possible NULL pointer dereference in mt76_tx
e717f974ce samples/bpf: Fix the error return code of xdp_redirect's main()
79bf8f04ce samples/bpf: Fix Segmentation fault for xdp_redirect command
fcd8d6371a RDMA/rtrs-srv: Set minimal max_send_wr and max_recv_wr
a9355b201d bpf: Fix libelf endian handling in resolv_btfids
607706027c xsk: Fix broken Tx ring validation
cd7877a39f xsk: Fix missing validation for skb and unaligned mode
89621945b6 selftests/bpf: Whitelist test_progs.h from .gitignore
49c25a1a8d RDMA/rxe: Fix qp reference counting for atomic ops
58da10487a netfilter: nft_tproxy: restrict support to TCP and UDP transport protocols
ed3d498834 netfilter: nft_osf: check for TCP packet before further processing
cf28cb51f0 netfilter: nft_exthdr: check for IPv6 packet before further processing
8f6714f3c1 RDMA/mlx5: Don't add slave port to unaffiliated list
a158ee32d4 netlabel: Fix memory leak in netlbl_mgmt_add_common
28e8df0c65 ath11k: send beacon template after vdev_start/restart during csa
48b69f31de ath10k: Fix an error code in ath10k_add_interface()
e0727a61b0 ath11k: Fix an error handling path in ath11k_core_fetch_board_data_api_n()
a54e9166e7 cw1200: Revert unnecessary patches that fix unreal use-after-free bugs
30efdcaca3 brcmsmac: mac80211_if: Fix a resource leak in an error handling path
5d452eafbd brcmfmac: Fix a double-free in brcmf_sdio_bus_reset
5b8d0b0727 brcmfmac: correctly report average RSSI in station info
db4de88d43 brcmfmac: fix setting of station info chains bitmask
80ad538a87 ssb: Fix error return code in ssb_bus_scan()
0147e6ccb8 wcn36xx: Move hal_buf allocation to devm_kmalloc in probe
581098969c clk: imx8mq: remove SYS PLL 1/2 clock gates
da8904c465 ieee802154: hwsim: Fix possible memory leak in hwsim_subscribe_all_others
7142f92412 wireless: carl9170: fix LEDS build errors & warnings
ecb6797501 ath10k: add missing error return code in ath10k_pci_probe()
668c0663d6 ath10k: go to path err_unsupported when chip id is not supported
4654f1fc30 tools/bpftool: Fix error return code in do_batch()
1ccbb552e3 drm: qxl: ensure surf.data is ininitialized
bdc16fe9df clk: vc5: fix output disabling when enabling a FOD
43b7f1dec6 drm/vc4: hdmi: Fix error path of hpd-gpios
756679a123 drm/pl111: Actually fix CONFIG_VEXPRESS_CONFIG depends
87890e1113 RDMA/rxe: Fix failure during driver load
c5db39c4df drm/pl111: depend on CONFIG_VEXPRESS_CONFIG
42800fcff3 RDMA/core: Sanitize WQ state received from the userspace
c470dd34c6 net/sched: act_vlan: Fix modify to allow 0
6a56913355 xfrm: remove the fragment check for ipv6 beet mode
9fddbe9495 clk: tegra30: Use 300MHz for video decoder by default
48bcd756af ehea: fix error return code in ehea_restart_qps()
6cbc167bc1 RDMA/rtrs-clt: Fix memory leak of not-freed sess->stats and stats->pcpu_stats
6569ae1deb RDMA/rtrs-clt: Check if the queue_depth has changed during a reconnection
8651ad0e29 RDMA/rtrs-srv: Fix memory leak when having multiple sessions
e7df730884 RDMA/rtrs-srv: Fix memory leak of unfreed rtrs_srv_stats object
f03d4c1296 RDMA/rtrs: Do not reset hb_missed_max after re-connection
bd4df557ae RDMA/rtrs-clt: Check state of the rtrs_clt_sess before reading its stats
067b663131 RDMA/srp: Fix a recently introduced memory leak
116d5cdfac mptcp: generate subflow hmac after mptcp_finish_join()
284e741c4e mptcp: fix pr_debug in mptcp_token_new_connect
eee0f7d399 drm/rockchip: cdn-dp: fix sign extension on an int multiply for a u64 result
75db503c9c drm/rockchip: lvds: Fix an error handling path
59eb7193be drm/rockchip: dsi: move all lane config except LCDC mux to bind()
40492ebd29 drm/rockchip: cdn-dp-core: add missing clk_disable_unprepare() on error in cdn_dp_grf_write()
ef61b0826c drm: rockchip: set alpha_en to 0 if it is not used
7902ee2fae net: ftgmac100: add missing error return code in ftgmac100_probe()
28b3837b40 clk: meson: g12a: fix gp0 and hifi ranges
27e9e0c468 net: qrtr: ns: Fix error return code in qrtr_ns_init()
40b701707e drm/vmwgfx: Fix cpu updates of coherent multisample surfaces
a8e85ed088 drm/vmwgfx: Mark a surface gpu-dirty after the SVGA3dCmdDXGenMips command
8f2b15ec3b pinctrl: renesas: r8a77990: JTAG pins do not have pull-down capabilities
6f4718c134 pinctrl: renesas: r8a7796: Add missing bias for PRESET# pin
2d487941ee net: pch_gbe: Propagate error from devm_gpio_request_one()
2f9f23c43a net: mvpp2: Put fwnode in error case during ->probe()
b3fecbf60e video: fbdev: imxfb: Fix an error message
1655266d91 drm/ast: Fix missing conversions to managed API
5885fce7b4 drm/amd/dc: Fix a missing check bug in dm_dp_mst_detect()
0cd39c9657 drm/bridge: Fix the stop condition of drm_bridge_chain_pre_enable()
45415d1f99 drm/bridge/sii8620: fix dependency on extcon
1de9425286 xfrm: xfrm_state_mtu should return at least 1280 for ipv6
10f32b8c9e mm/page_alloc: fix counting of managed_pages
d7deea31ed mm: page_alloc: refactor setup_per_zone_lowmem_reserve()
5458985533 mm: memcg/slab: properly set up gfp flags for objcg pointer array
8e4af3917b mm/shmem: fix shmem_swapin() race with swapoff
a5dcdfe4cb swap: fix do_swap_page() race with swapoff
29ae2c9c9c mm/debug_vm_pgtable: ensure THP availability via has_transparent_hugepage()
7abf6e5763 mm/debug_vm_pgtable/basic: iterate over entire protection_map[]
27634d63ca mm/debug_vm_pgtable/basic: add validation for dirtiness after write protect
c872674da7 dax: fix ENOMEM handling in grab_mapping_entry()
c015295b28 ocfs2: fix snprintf() checking
512106ae23 blk-mq: update hctx->dispatch_busy in case of real scheduler
3e33b1329c cpufreq: Make cpufreq_online() call driver->offline() on errors
cc0b1776fd ACPI: bgrt: Fix CFI violation
3cbe01ac28 ACPI: Use DEVICE_ATTR_<RW|RO|WO> macros
d3dd2fe274 blk-wbt: make sure throttle is enabled properly
1c2f21a8a0 blk-wbt: introduce a new disable state to prevent false positive by rwb_enabled()
e0afab5181 ACPI: APEI: fix synchronous external aborts in user-mode
f626452df8 extcon: extcon-max8997: Fix IRQ freeing at error path
45b399e309 clocksource/drivers/timer-ti-dm: Save and restore timer TIOCP_CFG
0317b728d8 mark pstore-blk as broken
296fbe2608 ACPI: sysfs: Fix a buffer overrun problem with description_show()
ce47ae8961 nvme-pci: look for StorageD3Enable on companion ACPI device instead
3ffe41f25f block: avoid double io accounting for flush request
17e77feadd ACPI: PM / fan: Put fan device IDs into separate header file
4dcb59d6a2 PM / devfreq: Add missing error code in devfreq_add_device()
a61f8a2e45 media: video-mux: Skip dangling endpoints
62c666805a media: v4l2-async: Clean v4l2_async_notifier_add_fwnode_remote_subdev
6bfcb61789 psi: Fix race between psi_trigger_create/destroy
8d7debe744 crypto: nx - Fix RCU warning in nx842_OF_upd_status
c43082d284 spi: spi-sun6i: Fix chipselect/clock bug
f18f7a2276 lockdep/selftests: Fix selftests vs PROVE_RAW_LOCK_NESTING
fca9e784a3 lockdep: Fix wait-type for empty stack
ca47a4fa89 sched/uclamp: Fix uclamp_tg_restrict()
aea030cefc sched/rt: Fix Deadline utilization tracking during policy change
c576472a05 sched/rt: Fix RT utilization tracking during policy change
67f66d48bd x86/sev: Split up runtime #VC handler for correct state tracking
2e1003f3ee x86/sev: Make sure IRQs are disabled while GHCB is active
eefebcda89 btrfs: clear log tree recovering status if starting transaction fails
aec3a574c6 regulator: hi655x: Fix pass wrong pointer to config.driver_data
96275c8f6c KVM: arm64: Don't zero the cycle count register when PMCR_EL0.P is set
e5154bf217 perf/arm-cmn: Fix invalid pointer when access dtc object sharing the same IRQ number
31dcfec19d KVM: x86/mmu: Fix return value in tdp_mmu_map_handle_target_level()
64d31137b1 KVM: nVMX: Don't clobber nested MMU's A/D status on EPTP switch
bac38bd7c4 KVM: nVMX: Ensure 64-bit shift when checking VMFUNC bitmap
b2c5af71ce KVM: nVMX: Sync all PGDs on nested transition with shadow paging
5ac406b81c hwmon: (max31790) Fix fan speed reporting for fan7..12
e02d52b7e9 hwmon: (max31722) Remove non-standard ACPI device IDs
5c00e99497 hwmon: (lm70) Revert "hwmon: (lm70) Add support for ACPI"
5cfc66b454 hwmon: (lm70) Use device_get_match_data()
c9f8416e43 media: s5p-g2d: Fix a memory leak on ctx->fh.m2m_ctx
921d2518db media: subdev: remove VIDIOC_DQEVENT_TIME32 handling
bb5e089df7 arm64/mm: Fix ttbr0 values stored in struct thread_info for software-pan
8d6acfe80d arm64: consistently use reserved_pg_dir
f1f30b3373 mmc: usdhi6rol0: fix error return code in usdhi6_probe()
cd909ebb73 crypto: sm2 - fix a memory leak in sm2
d598b8b77b crypto: sm2 - remove unnecessary reset operations
deef40c47e crypto: x86/curve25519 - fix cpu feature checking logic in mod_exit
bc50c40385 crypto: omap-sham - Fix PM reference leak in omap sham ops
615f2f5e7e crypto: nitrox - fix unchecked variable in nitrox_register_interrupts
5d4cb7c394 regulator: fan53880: Fix vsel_mask setting for FAN53880_BUCK
082d977b46 media: siano: Fix out-of-bounds warnings in smscore_load_firmware_family2()
52734fb99e m68k: atari: Fix ATARI_KBD_CORE kconfig unmet dependency warning
b54a0f7926 media: gspca/gl860: fix zero-length control requests
0109910cbd media: tc358743: Fix error return code in tc358743_probe_of()
5091f2738d media: au0828: fix a NULL vs IS_ERR() check
31157148a5 media: exynos4-is: Fix a use after free in isp_video_release
2a91d7cc42 media: rkvdec: Fix .buf_prepare
5a3ac10611 locking/lockdep: Reduce LOCKDEP dependency list
1328decacd pata_ep93xx: fix deferred probing
bab207d352 media: rc: i2c: Fix an error message
a9d02976e9 crypto: ccp - Fix a resource leak in an error handling path
c3285441b4 crypto: sa2ul - Fix pm_runtime enable in sa_ul_probe()
8ac033d9c4 crypto: sa2ul - Fix leaks on failure paths with sa_dma_init()
c0ec4ac436 x86/elf: Use _BITUL() macro in UAPI headers
912d16a2d7 evm: fix writing <securityfs>/evm overflow
403577f75d pata_octeon_cf: avoid WARN_ON() in ata_host_activate()
5f9aaaaac8 kbuild: Fix objtool dependency for 'OBJECT_FILES_NON_STANDARD_<obj> := n'
37481ad72d sched/uclamp: Fix locking around cpu_util_update_eff()
6c2b3d565f sched/uclamp: Fix wrong implementation of cpu.uclamp.min
b49d231c67 media: I2C: change 'RST' to "RSET" to fix multiple build errors
e7a376edb4 pata_rb532_cf: fix deferred probing
9df79fd17b sata_highbank: fix deferred probing
45d2d67833 crypto: ux500 - Fix error return code in hash_hw_final()
8c8c11b4df crypto: ixp4xx - update IV after requests
f00454ac40 crypto: ixp4xx - dma_unmap the correct address
2c3164f31a media: hantro: do a PM resume earlier
6efd8921eb media: s5p_cec: decrement usage count if disabled
e23dc4a3e8 media: venus: Rework error fail recover logic
08d0aa16a1 spi: Avoid undefined behaviour when counting unused native CSs
db5a7e22c9 spi: Allow to have all native CSs in use along with GPIOs
0c1d1517d6 writeback, cgroup: increment isw_nr_in_flight before grabbing an inode
3bf8076a7b ia64: mca_drv: fix incorrect array size calculation
fc12d8fbcf kthread_worker: fix return value when kthread_mod_delayed_work() races with kthread_cancel_delayed_work_sync()
1208f10b4b block: fix discard request merge
9d0634f6cb mailbox: qcom: Use PLATFORM_DEVID_AUTO to register platform device
c35b484130 cifs: fix missing spinlock around update to ses->status
a72d660c0d HID: wacom: Correct base usage for capacitive ExpressKey status bits
6bac00744b ACPI: tables: Add custom DSDT file as makefile prerequisite
5c93dd7c59 tpm_tis_spi: add missing SPI device ID entries
d9b40ebd44 clocksource: Check per-CPU clock synchronization when marked unstable
03a65c14ab clocksource: Retry clock read if long delays detected
8ab9714fd8 ACPI: EC: trust DSDT GPE for certain HP laptop
c406bb9ece cifs: improve fallocate emulation
998d9fefdd PCI: hv: Add check for hyperv_initialized in init_hv_pci_drv()
f5a90d44a1 EDAC/Intel: Do not load EDAC driver when running as a guest
950a739905 nvmet-fc: do not check for invalid target port in nvmet_fc_handle_fcp_rqst()
66e8848482 nvme-pci: fix var. type for increasing cq_head
9dc2c2b941 platform/x86: toshiba_acpi: Fix missing error code in toshiba_acpi_setup_keyboard()
e2cf3b5cb2 platform/x86: asus-nb-wmi: Revert "add support for ASUS ROG Zephyrus G14 and G15"
dff2466722 platform/x86: asus-nb-wmi: Revert "Drop duplicate DMI quirk structures"
1da08a428e block: fix race between adding/removing rq qos and normal IO
555dba7c63 ACPI: resources: Add checks for ACPI IRQ override
c79852298c ACPI: bus: Call kobject_put() in acpi_init() error path
a8c0057aee ACPICA: Fix memory leak caused by _CID repair function
2ebbe3a620 fs: dlm: fix memory leak when fenced
eda609d864 drivers: hv: Fix missing error code in vmbus_connect()
019d04f914 open: don't silently ignore unknown O-flags in openat2()
d838dddf3f random32: Fix implicit truncation warning in prandom_seed_state()
7425fe57d9 fs: dlm: cancel work sync othercon
747b654e40 blk-mq: clear stale request in tags->rq[] before freeing one request pool
a3362ff043 blk-mq: grab rq->refcount before calling ->fn in blk_mq_tagset_busy_iter
f58625bf2c block_dump: remove block_dump feature in mark_inode_dirty()
ca8541015d ACPI: EC: Make more Asus laptops use ECDT _GPE
b74b839a16 platform/x86: touchscreen_dmi: Add info for the Goodix GT912 panel of TM800A550L tablets
d4801889d6 platform/x86: touchscreen_dmi: Add an extra entry for the upside down Goodix touchscreen on Teclast X89 tablets
bb3a3a6ceb Input: goodix - platform/x86: touchscreen_dmi - Move upside down quirks to touchscreen_dmi.c
9e914f59cc lib: vsprintf: Fix handling of number field widths in vsscanf
f8c3236890 hv_utils: Fix passing zero to 'PTR_ERR' warning
0f2f529302 ACPI: processor idle: Fix up C-state latency if not ordered
ae281fbbc4 EDAC/ti: Add missing MODULE_DEVICE_TABLE
2c0285062d HID: do not use down_interruptible() when unbinding devices
51b7499cec ACPI: video: use native backlight for GA401/GA502/GA503
83653ace03 media: Fix Media Controller API config checks
f0b8f5682d regulator: da9052: Ensure enough delay time for .set_voltage_time_sel
5d9e3279f5 regulator: mt6358: Fix vdram2 .vsel_mask
b58b54ef49 KVM: s390: get rid of register asm usage
963baea02d lockding/lockdep: Avoid to find wrong lock dep path in check_irq_usage()
93cc59d8d0 locking/lockdep: Fix the dep path printing for backwards BFS
9c0835c69d btrfs: disable build on platforms having page size 256K
ad71a9ad74 btrfs: don't clear page extent mapped if we're not invalidating the full page
703b494a68 btrfs: sysfs: fix format string for some discard stats
8d05e30c97 btrfs: abort transaction if we fail to update the delayed inode
e0ffb169a3 btrfs: fix error handling in __btrfs_update_delayed_inode
9b28291237 KVM: PPC: Book3S HV: Fix TLB management on SMT8 POWER9 and POWER10 processors
00b1a9f0e8 drivers/perf: fix the missed ida_simple_remove() in ddr_perf_probe()
625ee7d267 hwmon: (max31790) Fix pwmX_enable attributes
c1eb091dbb hwmon: (max31790) Report correct current pwm duty cycles
49623e4b73 media: imx-csi: Skip first few frames from a BT.656 source
72962620ef media: siano: fix device register error path
cc4ba5a397 media: dvb_net: avoid speculation from net slot
c34f3912a7 crypto: shash - avoid comparing pointers to exported functions under CFI
0d201fee96 spi: meson-spicc: fix memory leak in meson_spicc_probe
a0bbb5d378 spi: meson-spicc: fix a wrong goto jump for avoiding memory leak.
ea7e1b581b mmc: via-sdmmc: add a check against NULL pointer dereference
237999da70 mmc: sdhci-sprd: use sdhci_sprd_writew
b6cbe1fcf8 memstick: rtsx_usb_ms: fix UAF
8c252a6303 media: dvd_usb: memory leak in cinergyt2_fe_attach
e78a588b59 Makefile: fix GDB warning with CONFIG_RELR
bce4838273 crypto: hisilicon/sec - fixup 3des minimum key size declaration
74ef2418a7 media: st-hva: Fix potential NULL pointer dereferences
617afcee2a media: bt8xx: Fix a missing check bug in bt878_probe
1f9cff025d media: v4l2-core: Avoid the dangling pointer in v4l2_fh_release
1d2838152e media: cedrus: Fix .buf_prepare
cbfb77c2f9 media: hantro: Fix .buf_prepare
6e08d3ab5c media: em28xx: Fix possible memory leak of em28xx struct
75c45a8188 media: bt878: do not schedule tasklet when it is not setup
a61d119248 media: i2c: ov2659: Use clk_{prepare_enable,disable_unprepare}() to set xvclk on/off
9fa8542a63 sched/fair: Fix ascii art by relpacing tabs
d0214b841c arm64: perf: Convert snprintf to sysfs_emit
addcb6bb58 crypto: qce: skcipher: Fix incorrect sg count for dma transfers
d000c598db crypto: qat - remove unused macro in FW loader
5daa889433 crypto: qat - check return code of qat_hal_rd_rel_reg()
9962341807 media: imx: imx7_mipi_csis: Fix logging of only error event counters
179d9c18ba media: pvrusb2: fix warning in pvr2_i2c_core_done
893c243e52 media: hevc: Fix dependent slice segment flags
a245f93ad0 media: cobalt: fix race condition in setting HPD
4626df7f65 media: cpia2: fix memory leak in cpia2_usb_probe
e717d6c291 media: sti: fix obj-$(config) targets
93f80a0bbd crypto: nx - add missing MODULE_DEVICE_TABLE
4e8c9510b7 hwrng: exynos - Fix runtime PM imbalance on error
3c51d82d0b sched/core: Initialize the idle task with preemption disabled
f8607f5ebe regulator: uniphier: Add missing MODULE_DEVICE_TABLE
66f0f478ec spi: omap-100k: Fix the length judgment problem
8692603ff1 spi: spi-topcliff-pch: Fix potential double free in pch_spi_process_messages()
cb42cf32ce spi: spi-loopback-test: Fix 'tx_buf' might be 'rx_buf'
b980385a70 media: exynos-gsc: fix pm_runtime_get_sync() usage count
a8b1889cd6 media: exynos4-is: fix pm_runtime_get_sync() usage count
29fd79b628 media: sti/bdisp: fix pm_runtime_get_sync() usage count
abdc897710 media: sunxi: fix pm_runtime_get_sync() usage count
d627fc298c media: s5p-jpeg: fix pm_runtime_get_sync() usage count
3c90c3fbdc media: mtk-vcodec: fix PM runtime get logic
8318f7bc0e media: sh_vou: fix pm_runtime_get_sync() usage count
64e291d697 media: am437x: fix pm_runtime_get_sync() usage count
adf052c779 media: s5p: fix pm_runtime_get_sync() usage count
437ca06c78 media: mdk-mdp: fix pm_runtime_get_sync() usage count
ff7e4b94db media: marvel-ccic: fix some issues when getting pm_runtime
ccf0a291f7 staging: media: rkvdec: fix pm_runtime_get_sync() usage count
b2c4d9a33c Add a reference to ucounts for each cred
61a7a634a0 spi: Make of_register_spi_device also set the fwnode
f2b2400476 thermal/cpufreq_cooling: Update offline CPUs per-cpu thermal_pressure
4eab2e2e98 fuse: reject internal errno
bb7ee90ea5 fuse: check connected before queueing on fpq->io
912e98505a fuse: ignore PG_workingset after stealing
576b44c326 fuse: Fix infinite loop in sget_fc()
ae6ab39251 fuse: Fix crash if superblock of submount gets killed early
91c2aa2c64 fuse: Fix crash in fuse_dentry_automount() error path
53124265fc evm: Refuse EVM_ALLOW_METADATA_WRITES only if an HMAC key is loaded
7b84c7d7e2 evm: Execute evm_inode_init_security() only when an HMAC key is loaded
a7e18f57ed loop: Fix missing discard support when using LOOP_CONFIGURE
75395690e5 powerpc/stacktrace: Fix spurious "stale" traces in raise_backtrace_ipi()
c556b938b3 seq_buf: Make trace_seq_putmem_hex() support data longer than 8
0531e84bc8 tracepoint: Add tracepoint_probe_register_may_exist() for BPF tracing
2aedacfaf6 tracing/histograms: Fix parsing of "sym-offset" modifier
998de999ba rsi: fix AP mode with WPA failure due to encrypted EAPOL
71808ec5b9 rsi: Assign beacon rate settings to the correct rate_info descriptor field
43189683fe ssb: sdio: Don't overwrite const buffer if block_write fails
58940e88ba ath9k: Fix kernel NULL pointer dereference during ath_reset_internal()
cc46d6d14f serial_cs: remove wrong GLOBETROTTER.cis entry
cf727d99ab serial_cs: Add Option International GSM-Ready 56K/ISDN modem
23055da561 serial: sh-sci: Stop dmaengine transfer in sci_stop_tx()
17451bd036 serial: mvebu-uart: fix calculation of clock divisor
8c90ec9965 iio: accel: bma180: Fix BMA25x bandwidth register values
9efc775c28 iio: ltr501: ltr501_read_ps(): add missing endianness conversion
c6c3ea1d9d iio: ltr501: ltr559: fix initialization of LTR501_ALS_CONTR
fbadeba72e iio: ltr501: mark register holding upper 8 bits of ALS_DATA{0,1} and PS_DATA as volatile, too
17c67f4848 iio: light: tcs3472: do not free unallocated IRQ
6534a5e0c2 iio: frequency: adf4350: disable reg and clk on error in adf4350_probe()
a6f7bf2652 rtc: stm32: Fix unbalanced clk_disable_unprepare() on probe error path
e92bd19246 clk: agilex/stratix10: fix bypass representation
3093214a6a clk: agilex/stratix10: remove noc_clk
308d01f525 clk: agilex/stratix10/n5x: fix how the bypass_reg is handled
e582a2f352 f2fs: Prevent swap file in LFS mode
36ae903607 s390: mm: Fix secure storage access exception handling
38a2ba82e2 s390/cio: dont call css_wait_for_slow_path() inside a lock
9aae145dc5 KVM: x86/mmu: Use MMU's role to detect CR4.SMEP value in nested NPT walk
a9ac58f85f KVM: x86/mmu: Treat NX as used (not reserved) for all !TDP shadow MMUs
30c44537cb KVM: PPC: Book3S HV: Workaround high stack usage with clang
39d0dfab6c KVM: nVMX: Handle split-lock #AC exceptions that happen in L2
7510c5cd0d perf/smmuv3: Don't trample existing events with global filter
9109e15709 mm/gup: fix try_grab_compound_head() race with split_huge_page()
9b0d1f4cb8 bus: mhi: Wait for M2 state during system resume
cbcbfb0488 SUNRPC: Should wake up the privileged task firstly.
30f5608498 SUNRPC: Fix the batch tasks count wraparound.
c6d864601e mac80211: remove iwlwifi specific workaround that broke sta NDP tx
507925fff0 can: peak_pciefd: pucan_handle_status(): fix a potential starvation issue in TX path
f79ea4755f can: j1939: j1939_sk_init(): set SOCK_RCU_FREE to call sk_destruct() after RCU is done
22bfa94db2 can: isotp: isotp_release(): omit unintended hrtimer restart on socket release
af94ef8f0b can: gw: synchronize rcu operations before removing gw job entry
b52e0cf0bf can: bcm: delay release of struct bcm_op after synchronize_rcu()
aa07327083 ext4: use ext4_grp_locked_error in mb_find_extent
6903f99f19 ext4: fix avefreec in find_group_orlov
98cd580211 ext4: remove check for zero nr_to_scan in ext4_es_scan()
68a40ff916 ext4: correct the cache_nr in tracepoint ext4_es_shrink_exit
f4e91a4e0d ext4: return error code when ext4_fill_flex_info() fails
b368b0375e ext4: fix overflow in ext4_iomap_alloc()
ea5466f1a7 ext4: fix kernel infoleak via ext4_extent_header
076d9b0623 ext4: cleanup in-core orphan list if ext4_truncate() failed to get a transaction handle
80d05ce58a btrfs: clear defrag status of a root if starting transaction fails
6b00b1717f btrfs: compression: don't try to compress if we don't have enough pages
34172f601a btrfs: send: fix invalid path for unlink operations after parent orphanization
2fa9298035 ARM: dts: at91: sama5d4: fix pinctrl muxing
ea45145e6c ARM: dts: ux500: Fix LED probing
b34aa5aaaa arm_pmu: Fix write counter incorrect in ARMv7 big-endian mode
123c1b05b0 crypto: ccp - Annotate SEV Firmware file names
834c47a387 crypto: nx - Fix memcpy() over-reading in nonce
b4c35e9e80 Input: joydev - prevent use of not validated data in JSIOCSBTNMAP ioctl
7b0393e6f6 iov_iter_fault_in_readable() should do nothing in xarray case
b6df9e43d5 copy_page_to_iter(): fix ITER_DISCARD case
d91638f70e selftests/lkdtm: Avoid needing explicit sub-shell
1738bcf9e6 ntfs: fix validity check for file name attribute
f794c839df gfs2: Fix error handling in init_statfs
3ae1c663bd gfs2: Fix underflow in gfs2_page_mkwrite
b242ae99fa xhci: solve a double free problem while doing s4
ff0f59d2d8 usb: typec: Add the missed altmode_id_remove() in typec_register_altmode()
63d6029e66 usb: dwc3: Fix debugfs creation flow
022d22a311 USB: cdc-acm: blacklist Heimann USB Appset device
f9d9db593d usb: renesas-xhci: Fix handling of unknown ROM state
3b54578850 usb: gadget: eem: fix echo command packet response issue
c964c4682e net: can: ems_usb: fix use-after-free in ems_usb_disconnect()
a2ad0bddd0 Input: usbtouchscreen - fix control-request directions
23e8f46884 media: dvb-usb: fix wrong definition
a6f433fd9e ALSA: hda/realtek: fix mute/micmute LEDs for HP EliteBook 830 G8 Notebook PC
ba65dd6eb8 ALSA: hda/realtek: Apply LED fixup for HP Dragonfly G1, too
13a05c7b43 ALSA: hda/realtek: Fix bass speaker DAC mapping for Asus UM431D
37e179c028 ALSA: hda/realtek: Improve fixup for HP Spectre x360 15-df0xxx
ea824a31a3 ALSA: hda/realtek: fix mute/micmute LEDs for HP EliteBook x360 830 G8
659b440a8d ALSA: hda/realtek: Add another ALC236 variant support
36bc25ec61 ALSA: hda/realtek: fix mute/micmute LEDs for HP ProBook 630 G8
0535de167b ALSA: hda/realtek: fix mute/micmute LEDs for HP ProBook 445 G8
2a13d43821 ALSA: hda/realtek: fix mute/micmute LEDs for HP ProBook 450 G8
9127b27703 ALSA: intel8x0: Fix breakage at ac97 clock measurement
19418ed317 ALSA: usb-audio: scarlett2: Fix wrong resume call
5c4d51b438 ALSA: firewire-motu: fix stream format for MOTU 8pre FireWire
313a5e869d ALSA: usb-audio: Fix OOB access at proc output
cfd3c66ca7 ALSA: usb-audio: fix rate on Ozone Z90 USB headset
ae9957b3ee Bluetooth: Remove spurious error message
f5af19889f Bluetooth: btqca: Don't modify firmware contents in-place
3cdcbd1b8c Bluetooth: hci_qca: fix potential GPF
b1a6760ddf Merge branch 'android12-5.10' into `android12-5.10-lts`
e722f1d83b Merge 5.10.49 into android12-5.10-lts
9c4e6d448c Merge 5.10.48 into android12-5.10-lts
904ad453ba Linux 5.10.49
064b57a8da xen/events: reset active flag for lateeoi events later
a245f6842d Hexagon: change jumps to must-extend in futex_atomic_*
a7f51048c5 Hexagon: add target builtins to kernel
243f325ecc Hexagon: fix build errors
8148665cb7 media: uvcvideo: Support devices that report an OT as an entity source
d5737410d2 KVM: PPC: Book3S HV: Save and restore FSCR in the P9 path
a09a522772 Linux 5.10.48
4dc9680428 Revert "KVM: x86/mmu: Drop kvm_mmu_extended_role.cr4_la57 hack"
4ab869e028 RDMA/mlx5: Block FDB rules when not in switchdev mode
348143a380 gpio: AMD8111 and TQMX86 require HAS_IOPORT_MAP
45ca6df5df drm/nouveau: fix dma_address check for CPU/GPU sync
d191c3d6ad gpio: mxc: Fix disabled interrupt wake-up support
f77f972384 scsi: sr: Return appropriate error code when disk is ejected
c37b834212 Merge branch 'android12-5.10' into `android12-5.10-lts`
3213549c5d ANDROID: ABI: update android12-5.10-lts ABI for 7/2 KMI update
2fcffe3f27 ANDROID: ABI: hikey960_gki.fragment: Add cfg80211 and mac80211 as modules
fd7a54895d ANDROID: ABI: gki_defconfig: Make cfg80211 and mac80211 modules
d43e5a796b Revert "ANDROID: GKI: Enable some necessary CFG80211 configs for WIFI"
194be71cc6 Merge 5.10.47 into android12-5.10-lts
bc9699030e Merge branch 'android12-5.10' into `android12-5.10-lts`
4357ae26d4 Linux 5.10.47
1573d595e2 integrity: Load mokx variables into the blacklist keyring
c6ae6f89fc certs: Add ability to preload revocation certs
72d6f5d982 certs: Move load_system_certificate_list to a common function
45109066f6 certs: Add EFI_CERT_X509_GUID support for dbx entries
0ba128fa68 Revert "drm: add a locked version of drm_is_current_master"
0463b49e02 netfs: fix test for whether we can skip read when writing beyond EOF
e6108147dd swiotlb: manipulate orig_addr when tlb_addr has offset
7570a8b5dd KVM: SVM: Call SEV Guest Decommission if ASID binding fails
377a796e7a mm, futex: fix shared futex pgoff on shmem huge page
ab9d178167 mm/thp: another PVMW_SYNC fix in page_vma_mapped_walk()
915c3a262c mm/thp: fix page_vma_mapped_walk() if THP mapped by ptes
90073aecc3 mm: page_vma_mapped_walk(): get vma_address_end() earlier
bf60fc2314 mm: page_vma_mapped_walk(): use goto instead of while (1)
9f85dcaf15 mm: page_vma_mapped_walk(): add a level of indentation
e56bdb3976 mm: page_vma_mapped_walk(): crossing page table boundary
8dc191ed9c mm: page_vma_mapped_walk(): prettify PVMW_MIGRATION block
7b55a4bcfc mm: page_vma_mapped_walk(): use pmde for *pvmw->pmd
1cb0b9059f mm: page_vma_mapped_walk(): settle PageHuge on entry
65febb41b4 mm: page_vma_mapped_walk(): use page for pvmw->page
825c28052b mm: thp: replace DEBUG_VM BUG with VM_WARN when unmap fails for split
0010275ca2 mm/thp: unmap_mapping_page() to fix THP truncate_cleanup_page()
38cda6b5ab mm/thp: fix page_address_in_vma() on file THP tails
37ffe9f4d7 mm/thp: fix vma_address() if virtual address below file offset
66be14a926 mm/thp: try_to_unmap() use TTU_SYNC for safe splitting
6527d8ef68 mm/thp: make is_huge_zero_pmd() safe and quicker
a8f4ea1d38 mm/thp: fix __split_huge_pmd_locked() on shmem migration entry
32f954e961 mm, thp: use head page in __migration_entry_wait()
bfd90b56d7 mm/rmap: use page_not_mapped in try_to_unmap()
ff81af8259 mm/rmap: remove unneeded semicolon in page_not_mapped()
a0ad7ea018 mm: add VM_WARN_ON_ONCE_PAGE() macro
130a1d76ee x86/fpu: Make init_fpstate correct with optimized XSAVE
51d8011782 x86/fpu: Preserve supervisor states in sanitize_restored_user_xstate()
2b35a4eaaa kthread: prevent deadlock when kthread_mod_delayed_work() races with kthread_cancel_delayed_work_sync()
bfe28af78a kthread_worker: split code for canceling the delayed work timer
02c303f3b9 ceph: must hold snap_rwsem when filling inode for async create
de0af2651d i2c: robotfuzz-osif: fix control-request directions
dd8ed6c9bc KVM: do not allow mapping valid but non-reference-counted pages
5fd0c2cf7b s390/stack: fix possible register corruption with stack switch helper
ab5bef9780 nilfs2: fix memory leak in nilfs_sysfs_delete_device_group
ace31c91fd scsi: sd: Call sd_revalidate_disk() for ioctl(BLKRRPART)
b9e6c20d4c gpiolib: cdev: zero padding during conversion to gpioline_info_changed
0221a5a4db i2c: i801: Ensure that SMBHSTSTS_INUSE_STS is cleared when leaving i801_access
018d03fcf7 pinctrl: stm32: fix the reported number of GPIO lines per bank
df654cd3d3 perf/x86: Track pmu in per-CPU cpu_hw_events
f9e73b2967 net: ll_temac: Avoid ndo_start_xmit returning NETDEV_TX_BUSY
1c9cf96f56 net: ll_temac: Add memory-barriers for TX BD access
bafb6cdd4f PCI: Add AMD RS690 quirk to enable 64-bit DMA
d91c50e6a6 recordmcount: Correct st_shndx handling
fb71d81ccd mac80211: handle various extensible elements correctly
676a7cb1a9 mac80211: reset profile_periodicity/ema_ap
ca0e1fefbb net: qed: Fix memcpy() overflow of qed_dcbx_params()
4658a8d307 KVM: selftests: Fix kvm_check_cap() assertion
e83e3c5d85 r8169: Avoid memcpy() over-reading of ETH_SS_STATS
992b105abf sh_eth: Avoid memcpy() over-reading of ETH_SS_STATS
a10856ea60 r8152: Avoid memcpy() over-reading of ETH_SS_STATS
196b22ef6c net/packet: annotate accesses to po->ifindex
da8b3aeff4 net/packet: annotate accesses to po->bind
18ed1789bb net: caif: fix memory leak in ldisc_open
edcd7594ad riscv32: Use medany C model for modules
47c07f919f net: phy: dp83867: perform soft reset and retain established link
f57132a887 net/packet: annotate data race in packet_sendmsg()
9707960ecf inet: annotate date races around sk->sk_txhash
7293f63b7b net: annotate data race in sock_error()
61b132f67c ping: Check return value of function 'ping_queue_rcv_skb'
08c389de6d inet: annotate data race in inet_send_prepare() and inet_dgram_connect()
c2311fd6de net: ethtool: clear heap allocations for ethtool function
c2813d1966 mac80211: drop multicast fragments
fedc4d4f54 net: ipv4: Remove unneed BUG() function
93c2aac13b dmaengine: mediatek: use GFP_NOWAIT instead of GFP_ATOMIC in prep_dma
0f48f92771 dmaengine: mediatek: do not issue a new desc if one is still current
63fa5b2d4b dmaengine: mediatek: free the proper desc in desc_free handler
78fa0f707d dmaengine: rcar-dmac: Fix PM reference leak in rcar_dmac_probe()
6a07cf3606 cfg80211: call cfg80211_leave_ocb when switching away from OCB
a902833300 mac80211_hwsim: drop pending frames on stop
8cfe765afd mac80211: remove warning in ieee80211_get_sband()
b671b98169 dmaengine: xilinx: dpdma: Limit descriptor IDs to 16 bits
524f70b30e dmaengine: xilinx: dpdma: Add missing dependencies to Kconfig
13b245a7bd dmaengine: stm32-mdma: fix PM reference leak in stm32_mdma_alloc_chan_resourc()
86f3e72dcb dmaengine: zynqmp_dma: Fix PM reference leak in zynqmp_dma_alloc_chan_resourc()
4df9ed0edb perf/x86/intel/lbr: Zero the xstate buffer on allocation
56bc20e5fc perf/x86/lbr: Remove cpuc->lbr_xsave allocation from atomic context
ca2acbd548 locking/lockdep: Improve noinstr vs errors
59aa5c91f8 x86/xen: Fix noinstr fail in exc_xen_unknown_trap()
cb83c99cf6 x86/entry: Fix noinstr fail in __do_fast_syscall_32()
cf59354875 drm/vc4: hdmi: Make sure the controller is powered in detect
f73aca83fd drm/vc4: hdmi: Move the HSM clock enable to runtime_pm
f11f9ff8a7 Revert "PCI: PM: Do not read power state in pci_enable_device_flags()"
4b06ebab4a spi: spi-nxp-fspi: move the register operation after the clock enable
50a1312a29 arm64: Force NO_BLOCK_MAPPINGS if crashkernel reservation is required
bd5d4df4dc arm64: Ignore any DMA offsets in the max_zone_phys() calculation
3bbdf5a6fc MIPS: generic: Update node names to avoid unit addresses
03096a4601 mmc: meson-gx: use memcpy_to/fromio for dram-access-quirk
b8fd230ae0 ARM: 9081/1: fix gcc-10 thumb2-kernel regression
3d6c4f78ec drm/amdgpu: wait for moving fence after pinning
694bb36aa7 drm/radeon: wait for moving fence after pinning
bcfea2412f drm/nouveau: wait for moving fence after pinning v2
3ef0ca0ec9 drm: add a locked version of drm_is_current_master
fea853aca3 Revert "drm/amdgpu/gfx10: enlarge CP_MEC_DOORBELL_RANGE_UPPER to cover full doorbell."
1bd81429d5 Revert "drm/amdgpu/gfx9: fix the doorbell missing when in CGPG issue."
3051f230f1 module: limit enabling module.sig_enforce
76d5608135 Revert "mm: relocate 'write_protect_seq' in struct mm_struct"
948d38f94d Merge 5.10.46 into android12-5.10-lts
78b5962cda Merge branch 'android12-5.10' into `android12-5.10-lts`
3de043c685 Linux 5.10.46
174c27583b usb: dwc3: core: fix kernel panic when do reboot
e52d43c82f usb: dwc3: debugfs: Add and remove endpoint dirs dynamically
1b5fbb6618 perf beauty: Update copy of linux/socket.h with the kernel sources
69371e0482 tools headers UAPI: Sync linux/in.h copy with the kernel sources
4f6e7098f1 net: fec_ptp: add clock rate zero check
1af3a8e91f net: stmmac: disable clocks in stmmac_remove_config_dt()
f71ca814c2 mm/slub.c: include swab.h
f6ed235754 mm/slub: actually fix freelist pointer vs redzoning
4314c8c63b mm/slub: fix redzoning for small allocations
4a36fda16b mm/slub: clarify verification reporting
12eb3c2c1a mm/swap: fix pte_same_as_swp() not removing uffd-wp bit when compare
fc7fdd8c5c net: bridge: fix vlan tunnel dst refcnt when egressing
fe0448a3fa net: bridge: fix vlan tunnel dst null pointer dereference
cfe403f209 net: ll_temac: Fix TX BD buffer overwrite
019ab7d044 net: ll_temac: Make sure to free skb when it is completely used
41984d4fbe drm/amdgpu/gfx9: fix the doorbell missing when in CGPG issue.
bc58ec307c drm/amdgpu/gfx10: enlarge CP_MEC_DOORBELL_RANGE_UPPER to cover full doorbell.
96b4126f8c cfg80211: avoid double free of PMSR request
5493b0c2a7 cfg80211: make certificate generation more robust
f74df6e086 mac80211: Fix NULL ptr deref for injected rate info
df203c1fda dmaengine: pl330: fix wrong usage of spinlock flags in dma_cyclc
b842b568a5 crash_core, vmcoreinfo: append 'SECTION_SIZE_BITS' to vmcoreinfo
63ba83563e x86/fpu: Reset state for all signal restore failures
a7748e021b x86/fpu: Invalidate FPU state after a failed XRSTOR from a user buffer
076f732b16 x86/fpu: Prevent state corruption in __fpu__restore_sig()
abc790bdbb x86/pkru: Write hardware init value to PKRU when xstate is init
208bb686e7 x86/ioremap: Map EFI-reserved memory as encrypted for SEV
75a55bc2e5 x86/process: Check PF_KTHREAD and not current->mm for kernel threads
ddaaf38e19 x86/mm: Avoid truncating memblocks for SGX memory
f6bcb1a628 ARCv2: save ABI registers across signal handling
b516daed99 s390/ap: Fix hanging ioctl caused by wrong msg counter
7c003dab43 s390/mcck: fix calculation of SIE critical section size
3a9934d6b8 KVM: X86: Fix x86_emulator slab cache leak
18eca69f88 KVM: x86/mmu: Calculate and check "full" mmu_role for nested MMU
669a8866e4 KVM: x86: Immediately reset the MMU context when the SMM flag is cleared
077cb8946f PCI: Work around Huawei Intelligent NIC VF FLR erratum
ee1a9cfed2 PCI: Add ACS quirk for Broadcom BCM57414 NIC
1a1dbc4473 PCI: aardvark: Fix kernel panic during PIO transfer
dac77a14fa PCI: Mark some NVIDIA GPUs to avoid bus reset
1e460ddf5b PCI: Mark TI C667X to avoid bus reset
c9fd0ab39f tracing: Do no increment trace_clock_global() by one
b313bd944d tracing: Do not stop recording comms if the trace file is being read
adb3849ed8 tracing: Do not stop recording cmdlines when tracing is off
1a91fafa3e usb: chipidea: imx: Fix Battery Charger 1.2 CDP detection
576996b64e usb: core: hub: Disable autosuspend for Cypress CY7C65632
6bd3d80d1f can: mcba_usb: fix memory leak in mcba_usb
509ab6bfdd can: j1939: fix Use-after-Free, hold skb ref while in use
0cf4b37790 can: bcm/raw/isotp: use per module netdevice notifier
acb755be1f can: bcm: fix infoleak in struct bcm_msg_head
8c82c52d1d bpf: Do not mark insn as seen under speculative path verification
e9d271731d bpf: Inherit expanded/patched seen count from old aux data
ed423d80bb irqchip/gic-v3: Workaround inconsistent PMR setting on NMI entry
103c4a08ba mm: relocate 'write_protect_seq' in struct mm_struct
a87abba03a hwmon: (scpi-hwmon) shows the negative temperature properly
57b21ef118 radeon: use memcpy_to/fromio for UVD fw upload
3e4b0fbb72 ASoC: qcom: lpass-cpu: Fix pop noise during audio capture begin
360609fc8b drm/sun4i: dw-hdmi: Make HDMI PHY into a platform device
5bd6bcb353 pinctrl: ralink: rt2880: avoid to error in calls is pin is already enabled
6d0dc1b34c ASoC: rt5682: Fix the fast discharge for headset unplugging in soundwire mode
ba8a26a7ce regulator: rt4801: Fix NULL pointer dereference if priv->enable_gpios is NULL
2f8f0e97ce spi: stm32-qspi: Always wait BUSY bit to be cleared in stm32_qspi_wait_cmd()
e03c8b3516 ASoC: tas2562: Fix TDM_CFG0_SAMPRATE values
813ff24f1d sched/pelt: Ensure that *_sum is always synced with *_avg
f6d28f0e36 spi: spi-zynq-qspi: Fix some wrong goto jumps & missing error code
0ea21221dd regulator: rtmv20: Fix to make regcache value first reading back from HW
3c5064cd29 ASoC: fsl-asoc-card: Set .owner attribute when registering card.
9a17907946 phy: phy-mtk-tphy: Fix some resource leaks in mtk_phy_init()
02e2455748 ASoC: rt5659: Fix the lost powers for the HDA header
3fb6c6acc1 platform/x86: thinkpad_acpi: Add X1 Carbon Gen 9 second fan support
0609c36696 regulator: bd70528: Fix off-by-one for buck123 .n_voltages setting
0ea923519a regulator: cros-ec: Fix error code in dev_err message
95deeb29d8 net: ethernet: fix potential use-after-free in ec_bhf_remove
8c0c2d97ad icmp: don't send out ICMP messages with a source address of 0.0.0.0
c5d70dbc4d bnxt_en: Call bnxt_ethtool_free() in bnxt_init_one() error path
f8774be4dc bnxt_en: Fix TQM fastpath ring backing store computation
acc9175541 bnxt_en: Rediscover PHY capabilities after firmware reset
acc3589959 cxgb4: fix wrong shift.
05b2b9f7d2 net: cdc_eem: fix tx fixup skb leak
290b0b6432 net: hamradio: fix memory leak in mkiss_close
45bf43d868 be2net: Fix an error handling path in 'be_probe()'
327e626c39 net/mlx5: Reset mkey index on creation
38aafe678c net/mlx5: E-Switch, Allow setting GUID for host PF vport
601be24dba net/mlx5: E-Switch, Read PF mac address
5f2ccc58a3 net/af_unix: fix a data-race in unix_dgram_sendmsg / unix_release_sock
ac31cc837c net: ipv4: fix memory leak in ip_mc_add1_src
d08f726cd5 net: fec_ptp: fix issue caused by refactor the fec_devtype
570a52cf3e net: usb: fix possible use-after-free in smsc75xx_bind
70c8418469 lantiq: net: fix duplicated skb in rx descriptor ring
11fac7e912 net: cdc_ncm: switch to eth%d interface naming
9a47949562 ptp: improve max_adj check against unreasonable values
5fc6ed1831 bpf: Fix leakage under speculation on mispredicted branches
960b08dd36 net: qrtr: fix OOB Read in qrtr_endpoint_post
0239c439ce ipv4: Fix device used for dst_alloc with local routes
4b5ad4b5ae cxgb4: fix wrong ethtool n-tuple rule lookup
d708e5efdd netxen_nic: Fix an error handling path in 'netxen_nic_probe()'
70513cdb93 qlcnic: Fix an error handling path in 'qlcnic_probe()'
fb3a948143 ethtool: strset: fix message length calculation
0e185a7b28 net: qualcomm: rmnet: don't over-count statistics
e3577776d6 net: qualcomm: rmnet: Update rmnet device MTU based on real device
4abfd597fe net: make get_net_ns return error if NET_NS is disabled
6a4b39944e net: stmmac: dwmac1000: Fix extended MAC address registers definition
6392ed82ad cxgb4: halt chip before flashing PHY firmware image
b38ec782d0 cxgb4: fix sleep in atomic when flashing PHY firmware
3d60457d74 cxgb4: fix endianness when flashing boot image
5bf940fe91 alx: Fix an error handling path in 'alx_probe()'
0adf32c033 selftests: mptcp: enable syncookie only in absence of reorders
eab06f7504 mptcp: do not warn on bad input from the network
222ebeda17 mptcp: try harder to borrow memory from subflow under pressure
3b491dd593 sch_cake: Fix out of bounds when parsing TCP options and header
73eeba71dc mptcp: Fix out of bounds when parsing TCP options
9cdf299ba4 netfilter: synproxy: Fix out of bounds when parsing TCP options
a336dc6fdd net/mlx5e: Block offload of outer header csum for UDP tunnels
34ff3770bf net/mlx5: DR, Don't use SW steering when RoCE is not supported
3623bfcab3 net/mlx5: DR, Allow SW steering for sw_owner_v2 devices
792f16e083 net/mlx5: Consider RoCE cap before init RDMA resources
be7f3f401d net/mlx5e: Fix page reclaim for dead peer hairpin
02c55a2570 net/mlx5e: Remove dependency in IPsec initialization flows
4733b73709 net/sched: act_ct: handle DNAT tuple collision
c8f1437c01 rtnetlink: Fix regression in bridge VLAN configuration
8729ec8a22 udp: fix race between close() and udp_abort()
7dd7b1e4d9 ice: parameterize functions responsible for Tx ring management
805ae44fc0 ice: add ndo_bpf callback for safe mode netdev ops
27e3d7da65 netfilter: nft_fib_ipv6: skip ipv6 packets from any to link-local
5cea03aef6 net: lantiq: disable interrupt before sheduling NAPI
db5f4adc93 net: dsa: felix: re-enable TX flow control in ocelot_port_flush()
5946fbf483 net: rds: fix memory leak in rds_recvmsg
aba26b3838 vrf: fix maximum MTU
deeeb65c6e net: ipv4: fix memory leak in netlbl_cipsov4_add_std
2088824ac9 libbpf: Fixes incorrect rx_ring_setup_done
195585ddb7 mlxsw: core: Set thermal zone polling delay argument to real value at init
e95848e9b5 mlxsw: reg: Spectrum-3: Enforce lowest max-shaper burst size of 11
5a1cd67a80 mac80211: fix skb length check in ieee80211_scan_rx()
282baa8104 batman-adv: Avoid WARN_ON timing related checks
ae1d3b989d fanotify: fix copy_event_to_user() fid error clean up
018685461a kvm: LAPIC: Restore guard to prevent illegal APIC register access
9e379da727 mm/memory-failure: make sure wait for page writeback in memory_failure
090b1bb928 afs: Fix an IS_ERR() vs NULL check
5efb0b3886 dmaengine: stedma40: add missing iounmap() on error in d40_probe()
ff864fa71a dmaengine: SF_PDMA depends on HAS_IOMEM
c0090b0169 dmaengine: QCOM_HIDMA_MGMT depends on HAS_IOMEM
f984fa006b dmaengine: ALTERA_MSGDMA depends on HAS_IOMEM
be4789636f dmaengine: xilinx: dpdma: initialize registers before request_irq
edd60afc3f dmaengine: fsl-dpaa2-qdma: Fix error return code in two functions
4d74c98023 dmaengine: idxd: add missing dsa driver unregister
e52dde966a Merge 5.10.45 into android12-5.10-lts
defb903783 Merge branch 'android12-5.10' into `android12-5.10-lts`
037a447b7a Linux 5.10.45
808fcc1e70 fib: Return the correct errno code
d8b2e3e17c net: Return the correct errno code
04c1556bfc net/x25: Return the correct errno code
0aa3569508 rtnetlink: Fix missing error code in rtnl_bridge_notify()
9250f97fd5 drm/amd/amdgpu:save psp ring wptr to avoid attack
9e8c2af010 drm/amd/display: Fix potential memory leak in DMUB hw_init
75fa7fbef1 drm/amdgpu: refine amdgpu_fru_get_product_info
34fe4ccb1f drm/amd/display: Allow bandwidth validation for 0 streams.
ecd26536ec net: ipconfig: Don't override command-line hostnames or domains
511a010291 nvme-loop: do not warn for deleted controllers during reset
155c2fea4b nvme-loop: check for NVME_LOOP_Q_LIVE in nvme_loop_destroy_admin_queue()
620424df29 nvme-loop: clear NVME_LOOP_Q_LIVE when nvme_loop_configure_admin_queue() fails
1c80ca596c nvme-loop: reset queue count to 1 in nvme_loop_destroy_io_queues()
b8fdea0695 scsi: scsi_devinfo: Add blacklist entry for HPE OPEN-V
f8ac1bd527 Bluetooth: Add a new USB ID for RTL8822CE
5491d97078 scsi: qedf: Do not put host in qedf_vport_create() unconditionally
609b56e979 ethernet: myri10ge: Fix missing error code in myri10ge_probe()
5d5f0d945d scsi: target: core: Fix warning on realtime kernels
a61156314b gfs2: Fix use-after-free in gfs2_glock_shrink_scan
bb73f2f789 riscv: Use -mno-relax when using lld linker
35277c1a66 HID: gt683r: add missing MODULE_DEVICE_TABLE
7a557de079 gfs2: fix a deadlock on withdraw-during-mount
c3e9ea16ad gfs2: Prevent direct-I/O write fallback errors from getting lost
864b5a8d53 ARM: OMAP2+: Fix build warning when mmc_omap is not built
247ec8ee0b ARM: OMAP1: Fix use of possibly uninitialized irq variable
3c0ad70cba drm/tegra: sor: Fully initialize SOR before registration
9c1d492baa gpu: host1x: Split up client initalization and registration
570b3e4020 drm/tegra: sor: Do not leak runtime PM reference
b1e3596416 HID: usbhid: fix info leak in hid_submit_ctrl
1dfd9f18ca HID: Add BUS_VIRTUAL to hid_connect logging
258d3fdbb1 HID: multitouch: set Stylus suffix for Stylus-application devices, too
6a142ea610 HID: quirks: Add quirk for Lenovo optical mouse
716a087adc HID: hid-sensor-hub: Return error for hid_set_field() failure
0bd8a4b46c HID: hid-input: add mapping for emoji picker key
b3c5bfc43c HID: a4tech: use A4_2WHEEL_MOUSE_HACK_B8 for A4TECH NB-95
be6c988792 HID: quirks: Set INCREMENT_USAGE_ON_DUPLICATE for Saitek X65
c7836de2ca net: ieee802154: fix null deref in parse dev addr
82658bfd88 Merge 5.10.44 into android12-5.10-lts
f2b1fc360f Linux 5.10.44
ef9a0d224b proc: only require mm_struct for writing
43c32c2225 tracing: Correct the length check which causes memory corruption
5b537408f2 scsi: core: Only put parent device if host state differs from SHOST_CREATED
0a31d1237a scsi: core: Put .shost_dev in failure path if host state changes to RUNNING
146446a43b scsi: core: Fix failure handling of scsi_add_host_with_dma()
7a696ce1d5 scsi: core: Fix error handling of scsi_host_alloc()
6e13b9bc66 NFSv4: nfs4_proc_set_acl needs to restore NFS_CAP_UIDGID_NOMAP on error.
d973bd0d6e NFSv4: Fix second deadlock in nfs4_evict_inode()
c3b6cf64df NFS: Fix use-after-free in nfs4_init_client()
9064c9d544 kvm: fix previous commit for 32-bit builds
351075bcfe perf session: Correct buffer copying when peeking events
b4651cea43 NFSv4: Fix deadlock between nfs4_evict_inode() and nfs4_opendata_get_inode()
279ad78a00 NFS: Fix a potential NULL dereference in nfs_get_client()
91f7fdc4cc IB/mlx5: Fix initializing CQ fragments buffer
d046f724bb KVM: x86: Ensure liveliness of nested VM-Enter fail tracepoint message
4921feb0e5 x86/nmi_watchdog: Fix old-style NMI watchdog regression on old Intel CPUs
190a7f9089 sched/fair: Fix util_est UTIL_AVG_UNCHANGED handling
32e22db8b2 sched/fair: Make sure to update tg contrib for blocked load
4c37b062ed sched/fair: Keep load_avg and load_sum synced
c64a3be39f perf: Fix data race between pin_count increment/decrement
e0b518a2eb gpio: wcd934x: Fix shift-out-of-bounds error
56a388a9cc phy: ti: Fix an error code in wiz_probe()
62d891861f ASoC: meson: gx-card: fix sound-dai dt schema
0e2c9aeb00 ASoC: core: Fix Null-point-dereference in fmt_single_name()
d83075c25a phy: cadence: Sierra: Fix error return code in cdns_sierra_phy_probe()
c9cb5837e9 tools/bootconfig: Fix error return code in apply_xbc()
16ccdcdfe6 vmlinux.lds.h: Avoid orphan section with !SMP
c25ec6386a ARM: cpuidle: Avoid orphan section warning
cb1aa1da04 RDMA/mlx4: Do not map the core_clock page to user space unless enabled
67cf4e447b RDMA/ipoib: Fix warning caused by destroying non-initial netns
fd681a8c7a drm/msm/a6xx: avoid shadow NULL reference in failure path
0bc79f4b7a drm/msm/a6xx: update/fix CP_PROTECT initialization
5b7dc8329d drm/msm/a6xx: fix incorrectly set uavflagprd_inv field for A650
5a61f69da3 drm/mcde: Fix off by 10^3 in calculation
d688892980 usb: typec: mux: Fix copy-paste mistake in typec_mux_match
9e0677c2e3 usb: dwc3: gadget: Disable gadget IRQ during pullup disable
cc440da4aa phy: usb: Fix misuse of IS_ENABLED
aafc51fddf regulator: rtmv20: Fix .set_current_limit/.get_current_limit callbacks
4579f65176 regulator: bd71828: Fix .n_voltages settings
5a5f5cfb5f regulator: fan53880: Fix missing n_voltages setting
c365ff9761 regulator: bd718x7: Fix the BUCK7 voltage setting on BD71837
e3a502abf5 regulator: max77620: Use device_set_of_node_from_dev()
06653ebc0a regulator: core: resolve supply for boot-on/always-on regulators
7dcdfa28e1 usb: typec: tcpm: cancel frs hrtimer when unregister tcpm port
18eaf0de50 usb: typec: tcpm: cancel vdm and state machine hrtimer when unregister tcpm port
b972eff874 usb: fix various gadget panics on 10gbps cabling
4b289a0f30 usb: fix various gadgets null ptr deref on 10gbps cabling.
6bf8ff7d05 usb: gadget: eem: fix wrong eem header operation
21bee94fb9 USB: serial: cp210x: fix alternate function for CP2102N QFN20
4fa815beea USB: serial: quatech2: fix control-request directions
ef91a6bd94 USB: serial: omninet: add device id for Zyxel Omni 56K Plus
1e2d41c17f USB: serial: ftdi_sio: add NovaTech OrionMX product ID
5cead89696 usb: gadget: f_fs: Ensure io_completion_wq is idle during unbind
0b3bb7950e usb: typec: intel_pmc_mux: Add missed error check for devm_ioremap_resource()
6900ef1b10 usb: typec: intel_pmc_mux: Put fwnode in error case during ->probe()
572de10087 usb: typec: ucsi: Clear PPM capability data in ucsi_init() error path
7cee4344cb usb: typec: wcove: Use LE to CPU conversion when accessing msg->header
199af8a06d usb: musb: fix MUSB_QUIRK_B_DISCONNECT_99 handling
4704036391 usb: dwc3: ep0: fix NULL pointer exception
851dee5a5d usb: dwc3: gadget: Bail from dwc3_gadget_exit() if dwc->gadget is NULL
2af93b437a usb: dwc3: meson-g12a: Disable the regulator in the error handling path of the probe
750a0d7556 usb: dwc3-meson-g12a: fix usb2 PHY glue init when phy0 is disabled
b452e8bb7c usb: pd: Set PD_T_SINK_WAIT_CAP to 310ms
0ff5f83ae1 usb: f_ncm: only first packet of aggregate needs to start timer
0f5a20b1fd USB: f_ncm: ncm_bitrate (speed) is unsigned
1bf2c28ab2 mmc: renesas_sdhi: Fix HS400 on R-Car M3-W+
67aca230ca mmc: renesas_sdhi: abort tuning when timeout detected
9752438476 ftrace: Do not blindly read the ip address in ftrace_bug()
74d3b20b1b cgroup1: don't allow '\n' in renaming
31fe243a63 btrfs: promote debugging asserts to full-fledged checks in validate_super
ca69dc891b btrfs: return value from btrfs_mark_extent_written() in case of error
bf240fee5b async_xor: check src_offs is not NULL before updating it
8d5c0f6b7a staging: rtl8723bs: Fix uninitialized variables
7af299b977 kvm: avoid speculation-based attacks from out-of-range memslot accesses
6b6ff4d1f3 KVM: X86: MMU: Use the correct inherited permissions to get shadow page
14831b7956 perf/x86/intel/uncore: Fix M2M event umask for Ice Lake server
aa8591a58c drm: Lock pointer access in drm_master_release()
491d52e007 drm: Fix use-after-free read in drm_getunique()
afd87792db Revert "ACPI: sleep: Put the FACS table after using it"
82a8ffba54 spi: bcm2835: Fix out-of-bounds access with more than 4 slaves
05e6b71594 ALSA: hda/realtek: fix mute/micmute LEDs for HP ZBook Power G8
d62d55f394 ALSA: hda/realtek: fix mute/micmute LEDs for HP EliteBook 840 Aero G8
5573068067 ALSA: hda/realtek: fix mute/micmute LEDs and speaker for HP EliteBook x360 1040 G8
bd0fe358d1 ALSA: hda/realtek: fix mute/micmute LEDs and speaker for HP Elite Dragonfly G2
6a81e47788 ALSA: hda/realtek: headphone and mic don't work on an Acer laptop
98f842951f ALSA: firewire-lib: fix the context to call snd_pcm_stop_xrun()
bd7d88b087 ALSA: seq: Fix race of snd_seq_timer_open()
fff6af6dea i2c: mpc: implement erratum A-004447 workaround
d78b76af9f i2c: mpc: Make use of i2c_recover_bus()
fa05ba6196 spi: Cleanup on failure of initial setup
0c4d4de2da spi: Don't have controller clean up spi device before driver unbind
3a5b982463 powerpc/fsl: set fsl,i2c-erratum-a004447 flag for P1010 i2c controllers
a7c3c17867 powerpc/fsl: set fsl,i2c-erratum-a004447 flag for P2041 i2c controllers
590f718a64 nvmet: fix false keep-alive timeout when a controller is torn down
2538f06f94 nvme-tcp: remove incorrect Kconfig dep in BLK_DEV_NVME
b0308804b2 bnx2x: Fix missing error code in bnx2x_iov_init_one()
90547d5db5 dm verity: fix require_signatures module_param permissions
7519ece673 MIPS: Fix kernel hang under FUNCTION_GRAPH_TRACER and PREEMPT_TRACER
37a079a6ae nvme-fabrics: decode host pathing error for connect
f42afc0f29 net: dsa: microchip: enable phy errata workaround on 9567
ee144b7980 net: appletalk: cops: Fix data race in cops_probe1
a385cbf31e net: macb: ensure the device is available before accessing GEMGXL control registers
bbb48789b6 scsi: target: qla2xxx: Wait for stop_phase1 at WWN removal
1676363528 scsi: hisi_sas: Drop free_irq() of devm_request_irq() allocated irq
eac1d159b7 scsi: vmw_pvscsi: Set correct residual data length
30030c6ff3 scsi: bnx2fc: Return failure if io_req is already in ABTS processing
8d717c9135 net:sfc: fix non-freed irq in legacy irq mode
e806df71ee RDS tcp loopback connection can hang
4353eb4218 net/qla3xxx: fix schedule while atomic in ql_sem_spinlock
ad241cb1cf wq: handle VM suspension in stall detection
5ca472d40e cgroup: disable controllers at parse time
be23c4af3d net: mdiobus: get rid of a BUG_ON()
1d6d43d480 netlink: disable IRQs for netlink_lock_table()
42e4900138 bonding: init notify_work earlier to avoid uninitialized use
143fc72209 isdn: mISDN: netjet: Fix crash in nj_probe:
2e2145ccfb usb: chipidea: udc: assign interrupt number to USB gadget structure
06e84ea1f4 spi: sprd: Add missing MODULE_DEVICE_TABLE
369f3caa4d ASoC: sti-sas: add missing MODULE_DEVICE_TABLE
01905f3232 vfio-ccw: Serialize FSM IDLE state with I/O completion
cad3dc73c0 vfio-ccw: Reset FSM state to IDLE inside FSM
4352209ed0 ASoC: Intel: bytcr_rt5640: Add quirk for the Lenovo Miix 3-830 tablet
a5ee8f54d0 ASoC: Intel: bytcr_rt5640: Add quirk for the Glavey TM800A550L tablet
2f523cd4a9 usb: cdns3: Fix runtime PM imbalance on error
1e5cab5020 net/nfc/rawsock.c: fix a permission check bug
584b2c7ce2 bpf: Forbid trampoline attach for functions with variable arguments
fb91ab403e spi: spi-zynq-qspi: Fix stack violation bug
4b8b7bc3a7 spi: Fix spi device unregister flow
cb24d57ad5 ASoC: amd: fix for pcm_read() error
3b89db7468 ASoC: max98088: fix ni clock divider calculation
f70102cb36 proc: Track /proc/$pid/attr/ opener mm_struct
3c79e1658f Merge branch 'android12-5.10' 'android12-5.10-lts'
2935d31616 Merge branch 'android12-5.10' 'android12-5.10-lts'
3956bf29b2 ANDROID: clang: update to 12.0.5
f9761818fe ANDROID: GKI: Refresh ABI following trimmed symbol CRC fix
e913e8a922 FROMLIST: export: Make CRCs robust to symbol trimming
b0c3c31639 Merge branch 'android12-5.10' into android12-5.10-lts
e9fa24e154 ANDROID: Add GKI_HIDDEN_MM_CONFIGS to support ballooning.

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: I4cb43e42abd29006bab26b7e65ecfa052e10eed9
2021-10-21 09:45:02 +02:00

11606 lines
304 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
*
* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*
* Interactivity improvements by Mike Galbraith
* (C) 2007 Mike Galbraith <efault@gmx.de>
*
* Various enhancements by Dmitry Adamushko.
* (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
*
* Group scheduling enhancements by Srivatsa Vaddagiri
* Copyright IBM Corporation, 2007
* Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
*
* Scaled math optimizations by Thomas Gleixner
* Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
*
* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*/
#include "sched.h"
#include <trace/hooks/sched.h>
/*
* Targeted preemption latency for CPU-bound tasks:
*
* NOTE: this latency value is not the same as the concept of
* 'timeslice length' - timeslices in CFS are of variable length
* and have no persistent notion like in traditional, time-slice
* based scheduling concepts.
*
* (to see the precise effective timeslice length of your workload,
* run vmstat and monitor the context-switches (cs) field)
*
* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
*/
unsigned int sysctl_sched_latency = 6000000ULL;
EXPORT_SYMBOL_GPL(sysctl_sched_latency);
/* Pre-scaling baseline; update_sysctl() multiplies it by the CPU-count factor. */
static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
/*
 * The initial- and re-scaling of tunables is configurable
 *
 * Options are:
 *
 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
 * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
 *
 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
 */
enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
/*
 * Minimal preemption granularity for CPU-bound tasks:
 *
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity = 750000ULL;
EXPORT_SYMBOL_GPL(sysctl_sched_min_granularity);
/* Pre-scaling baseline; update_sysctl() multiplies it by the CPU-count factor. */
static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
/*
 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
 */
static unsigned int sched_nr_latency = 8;
/*
 * After fork, child runs first. If set to 0 (default) then
 * parent will (try to) run first.
 */
unsigned int sysctl_sched_child_runs_first __read_mostly;
/*
 * SCHED_OTHER wake-up granularity.
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 *
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
/* Pre-scaling baseline; update_sysctl() multiplies it by the CPU-count factor. */
static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
/*
 * Migration cost tunable, in nanoseconds; presumably used as a cache-hotness
 * threshold by the load balancer — confirm against its users further down.
 */
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
/*
 * Right-shift applied when decaying thermal pressure; configured via the
 * "sched_thermal_decay_shift=" boot parameter handled below.
 */
int sched_thermal_decay_shift;
/*
 * Parse the "sched_thermal_decay_shift=" boot parameter.
 *
 * On a parse failure we warn and keep the default of 0; in all cases the
 * resulting shift is constrained to the range [0, 10].
 */
static int __init setup_sched_thermal_decay_shift(char *str)
{
	int shift = 0;

	if (kstrtoint(str, 0, &shift))
		pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");

	sched_thermal_decay_shift = clamp(shift, 0, 10);

	return 1;
}
__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
#ifdef CONFIG_SMP
/*
* For asym packing, by default the lower numbered CPU has higher priority.
*/
/*
 * Default asym-packing priority: the negation makes lower-numbered CPUs
 * compare as higher priority.  Architectures may override this __weak
 * definition with their own ordering.
 */
int __weak arch_asym_cpu_priority(int cpu)
{
	return -cpu;
}
/*
* The margin used when comparing utilization with CPU capacity.
*
* (default: ~20%)
*/
#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
#endif
#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 * each time a cfs_rq requests quota.
 *
 * Note: in the case that the slice exceeds the runtime remaining (either due
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
 * (default: 5 msec, units: microseconds)
 *
 * NB: unlike the latency/granularity tunables above, which are in
 * nanoseconds, this one is expressed in microseconds.
 */
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
/*
 * load_weight mutation helpers: every change to ->weight clears the cached
 * ->inv_weight so that __update_inv_weight() lazily recomputes the inverse
 * the next time it is needed.
 */
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
	lw->weight += inc;
	lw->inv_weight = 0;	/* invalidate cached inverse */
}
static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
	lw->weight -= dec;
	lw->inv_weight = 0;	/* invalidate cached inverse */
}
static inline void update_load_set(struct load_weight *lw, unsigned long w)
{
	lw->weight = w;
	lw->inv_weight = 0;	/* invalidate cached inverse */
}
/*
* Increase the granularity value when there are more CPUs,
* because with more CPUs the 'effective latency' as visible
* to users decreases. But the relationship is not linear,
* so pick a second-best guess by going with the log2 of the
* number of CPUs.
*
* This idea comes from the SD scheduler of Con Kolivas:
*/
/*
 * Compute the multiplier applied to the normalized scheduler tunables,
 * according to sysctl_sched_tunable_scaling.  The CPU count is capped at 8
 * so the factor stays bounded on large machines.
 */
static unsigned int get_update_sysctl_factor(void)
{
	unsigned int ncpus = min_t(unsigned int, num_online_cpus(), 8);

	switch (sysctl_sched_tunable_scaling) {
	case SCHED_TUNABLESCALING_NONE:
		return 1;
	case SCHED_TUNABLESCALING_LINEAR:
		return ncpus;
	case SCHED_TUNABLESCALING_LOG:
	default:
		return 1 + ilog2(ncpus);
	}
}
/*
 * Re-derive the effective scheduler tunables from their normalized
 * baselines, scaled by the current CPU-count factor.
 */
static void update_sysctl(void)
{
	unsigned int factor = get_update_sysctl_factor();

	sysctl_sched_min_granularity =
		factor * normalized_sysctl_sched_min_granularity;
	sysctl_sched_latency =
		factor * normalized_sysctl_sched_latency;
	sysctl_sched_wakeup_granularity =
		factor * normalized_sysctl_sched_wakeup_granularity;
}
/* Boot-time hook: scale the scheduler tunables for the online CPU count. */
void __init sched_init_granularity(void)
{
	update_sysctl();
}
#define WMULT_CONST (~0U)
#define WMULT_SHIFT 32
/*
 * Lazily (re)compute lw->inv_weight ~= WMULT_CONST / lw->weight, the cached
 * fixed-point inverse used by __calc_delta().  A non-zero cached value is
 * taken as valid; mutators clear it to force recomputation.
 */
static void __update_inv_weight(struct load_weight *lw)
{
	unsigned long w;

	if (likely(lw->inv_weight))
		return;

	w = scale_load_down(lw->weight);

	/* On 64-bit, weights >= WMULT_CONST would make the division degenerate. */
	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) {
		lw->inv_weight = 1;
		return;
	}

	lw->inv_weight = w ? WMULT_CONST / w : WMULT_CONST;
}
/*
* delta_exec * weight / lw.weight
* OR
* (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
*
* Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
* we're guaranteed shift stays positive because inv_weight is guaranteed to
* fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
*
* Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
* weight/lw.weight <= 1, and therefore our shift will also be positive.
*/
/*
 * Compute delta_exec * weight / lw->weight as
 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT, shrinking the
 * intermediate factor to 32 bits (and reducing the shift accordingly)
 * whenever it would overflow.  See the comment above for why the shift
 * always stays positive.
 */
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
	u64 fact = scale_load_down(weight);
	int shift = WMULT_SHIFT;

	__update_inv_weight(lw);

	/* Trim the raw weight down to 32 bits. */
	while (fact >> 32) {
		fact >>= 1;
		shift--;
	}

	fact = mul_u32_u32(fact, lw->inv_weight);

	/* Trim the combined factor down to 32 bits as well. */
	while (fact >> 32) {
		fact >>= 1;
		shift--;
	}

	return mul_u64_u32_shr(delta_exec, fact, shift);
}
const struct sched_class fair_sched_class;
/**************************************************************
* CFS operations on generic schedulable entities:
*/
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Map a sched_entity back to its owning task.  Only valid for task
 * entities (group entities are not embedded in a task_struct), which the
 * SCHED_WARN_ON asserts.
 */
static inline struct task_struct *task_of(struct sched_entity *se)
{
	SCHED_WARN_ON(!entity_is_task(se));
	return container_of(se, struct task_struct, se);
}
/* Walk up scheduling entities hierarchy, from @se through its ancestors */
#define for_each_sched_entity(se) \
	for (; se; se = se->parent)

/* cfs_rq the task's own entity is (to be) queued on */
static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return p->se.cfs_rq;
}
/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	return se->cfs_rq;
}
/*
 * runqueue "owned" by this group entity — the queue its children run on.
 * NULL for task entities, which is how the two kinds are told apart.
 */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return grp->my_q;
}
/*
 * Render the cgroup path of @cfs_rq's task group into @path (at most @len
 * bytes).  Autogroups get their synthetic name; a NULL cfs_rq or a group
 * with no cgroup yields the literal "(null)".
 */
static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
{
	if (!path)
		return;

	if (!cfs_rq) {
		strlcpy(path, "(null)", len);
		return;
	}

	if (task_group_is_autogroup(cfs_rq->tg))
		autogroup_path(cfs_rq->tg, path, len);
	else if (cfs_rq->tg->css.cgroup)
		cgroup_path(cfs_rq->tg->css.cgroup, path, len);
	else
		strlcpy(path, "(null)", len);
}
/*
 * Link @cfs_rq onto its rq's leaf_cfs_rq_list, keeping the list ordered
 * child-before-parent.  Returns true when the branch @cfs_rq sits on is
 * fully connected to the list (tmp_alone_branch points back at the list
 * head), false while ancestors still need to be linked in.
 */
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	struct rq *rq = rq_of(cfs_rq);
	int cpu = cpu_of(rq);

	/* Already linked: report whether the branch is fully connected. */
	if (cfs_rq->on_list)
		return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;

	cfs_rq->on_list = 1;

	/*
	 * Ensure we either appear before our parent (if already
	 * enqueued) or force our parent to appear after us when it is
	 * enqueued. The fact that we always enqueue bottom-up
	 * reduces this to two cases and a special case for the root
	 * cfs_rq. Furthermore, it also means that we will always reset
	 * tmp_alone_branch either when the branch is connected
	 * to a tree or when we reach the top of the tree
	 */
	if (cfs_rq->tg->parent &&
	    cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
		/*
		 * If parent is already on the list, we add the child
		 * just before. Thanks to circular linked property of
		 * the list, this means to put the child at the tail
		 * of the list that starts by parent.
		 */
		list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
			&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
		/*
		 * The branch is now connected to its tree so we can
		 * reset tmp_alone_branch to the beginning of the
		 * list.
		 */
		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
		return true;
	}

	if (!cfs_rq->tg->parent) {
		/*
		 * cfs rq without parent should be put
		 * at the tail of the list.
		 */
		list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
			&rq->leaf_cfs_rq_list);
		/*
		 * We have reach the top of a tree so we can reset
		 * tmp_alone_branch to the beginning of the list.
		 */
		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
		return true;
	}

	/*
	 * The parent has not already been added so we want to
	 * make sure that it will be put after us.
	 * tmp_alone_branch points to the begin of the branch
	 * where we will add parent.
	 */
	list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
	/*
	 * update tmp_alone_branch to points to the new begin
	 * of the branch
	 */
	rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
	return false;
}
/*
 * Unlink @cfs_rq from rq->leaf_cfs_rq_list, repairing tmp_alone_branch if
 * it happened to point at the entry being removed.
 */
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (cfs_rq->on_list) {
		struct rq *rq = rq_of(cfs_rq);

		/*
		 * With cfs_rq being unthrottled/throttled during an enqueue,
		 * it can happen the tmp_alone_branch points the a leaf that
		 * we finally want to del. In this case, tmp_alone_branch moves
		 * to the prev element but it will point to rq->leaf_cfs_rq_list
		 * at the end of the enqueue.
		 */
		if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
			rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;

		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
		cfs_rq->on_list = 0;
	}
}
/* Debug check: no half-connected branch may be left on the leaf list. */
static inline void assert_list_leaf_cfs_rq(struct rq *rq)
{
	SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
}

/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
				 leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group ? */
static inline struct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	/* Siblings share a cfs_rq; return it, or NULL when unrelated. */
	if (se->cfs_rq == pse->cfs_rq)
		return se->cfs_rq;

	return NULL;
}

/* Parent group entity, NULL at the root of the hierarchy. */
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return se->parent;
}
/*
 * Walk both entities up the hierarchy (in place, via the double pointers)
 * until they are siblings under a common parent, so they can be compared
 * for preemption decisions.
 */
static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
	int se_depth, pse_depth;

	/*
	 * preemption test can be made between sibling entities who are in the
	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
	 * both tasks until we find their ancestors who are siblings of common
	 * parent.
	 */

	/* First walk up until both entities are at same depth */
	se_depth = (*se)->depth;
	pse_depth = (*pse)->depth;

	while (se_depth > pse_depth) {
		se_depth--;
		*se = parent_entity(*se);
	}

	while (pse_depth > se_depth) {
		pse_depth--;
		*pse = parent_entity(*pse);
	}

	/* Then climb both in lockstep until they share a cfs_rq. */
	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}
#else	/* !CONFIG_FAIR_GROUP_SCHED */
/*
 * Without group scheduling there is a single cfs_rq per CPU and the
 * entity hierarchy is flat; these stubs collapse the accessors.
 */

static inline struct task_struct *task_of(struct sched_entity *se)
{
	/* Every entity is a task entity in this configuration. */
	return container_of(se, struct task_struct, se);
}

/* Degenerate walk: visit the entity itself, then stop. */
#define for_each_sched_entity(se) \
		for (; se; se = NULL)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return &task_rq(p)->cfs;
}

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	struct task_struct *p = task_of(se);
	struct rq *rq = task_rq(p);

	return &rq->cfs;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return NULL;
}

static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
{
	if (path)
		strlcpy(path, "(null)", len);
}

/* No leaf list to maintain; always "connected". */
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	return true;
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

static inline void assert_list_leaf_cfs_rq(struct rq *rq)
{
}

/* Single iteration over the one-and-only cfs_rq of the runqueue. */
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return NULL;
}

static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}

#endif	/* CONFIG_FAIR_GROUP_SCHED */
static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
/**************************************************************
* Scheduling class tree data structure manipulation methods:
*/
static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
	/*
	 * Compare via a signed difference so u64 wrap-around is handled:
	 * return whichever timestamp is "later" in modular arithmetic.
	 */
	return ((s64)(vruntime - max_vruntime) > 0) ? vruntime : max_vruntime;
}
static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	/*
	 * Wrap-safe counterpart of max_vruntime(): pick whichever
	 * timestamp is "earlier" under a signed modular comparison.
	 */
	return ((s64)(vruntime - min_vruntime) < 0) ? vruntime : min_vruntime;
}
/* Does @a order before @b in the timeline (wrap-safe vruntime compare)? */
static inline int entity_before(struct sched_entity *a,
				struct sched_entity *b)
{
	return (s64)(a->vruntime - b->vruntime) < 0;
}
/*
 * Advance cfs_rq->min_vruntime to track the smallest vruntime among the
 * current task and the leftmost queued entity; it only ever moves forward.
 */
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);

	u64 vruntime = cfs_rq->min_vruntime;

	if (curr) {
		if (curr->on_rq)
			vruntime = curr->vruntime;
		else
			curr = NULL;	/* being dequeued; ignore it */
	}

	if (leftmost) { /* non-empty tree */
		struct sched_entity *se;
		se = rb_entry(leftmost, struct sched_entity, run_node);

		if (!curr)
			vruntime = se->vruntime;
		else
			vruntime = min_vruntime(vruntime, se->vruntime);
	}

	/* ensure we never gain time by being placed backwards. */
	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
	/*
	 * 32-bit reads of a u64 can tear; publish a copy after a write
	 * barrier so lockless readers can detect a consistent value.
	 */
	smp_wmb();
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
}
/*
* Enqueue an entity into the rb-tree:
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	bool leftmost = true;

	/* Android vendor hook: observe/adjust the entity before insertion. */
	trace_android_rvh_enqueue_entity(cfs_rq, se);
	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We dont care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (entity_before(se, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			/* Went right at least once: not the new leftmost. */
			leftmost = false;
		}
	}

	rb_link_node(&se->run_node, parent, link);
	/* Cached variant also updates the tree's leftmost pointer. */
	rb_insert_color_cached(&se->run_node,
			       &cfs_rq->tasks_timeline, leftmost);
}
/* Remove @se from the timeline rbtree (cached-leftmost aware). */
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/* Android vendor hook: observe the entity before removal. */
	trace_android_rvh_dequeue_entity(cfs_rq, se);
	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
}
/* Entity with the smallest vruntime, or NULL when the timeline is empty. */
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);

	return leftmost ? rb_entry(leftmost, struct sched_entity, run_node)
			: NULL;
}
/* In-order successor of @se on the timeline, or NULL at the end. */
static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
	struct rb_node *node = rb_next(&se->run_node);

	return node ? rb_entry(node, struct sched_entity, run_node) : NULL;
}
#ifdef CONFIG_SCHED_DEBUG
/* Entity with the largest vruntime, or NULL when the timeline is empty. */
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *rightmost = rb_last(&cfs_rq->tasks_timeline.rb_root);

	return rightmost ? rb_entry(rightmost, struct sched_entity, run_node)
			 : NULL;
}
/**************************************************************
 * Scheduling class statistics methods:
 */

/*
 * sysctl handler for the sched_*_granularity/latency knobs: after a write,
 * rederive sched_nr_latency and refresh the CPU-count-normalized copies.
 */
int sched_proc_update_handler(struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	unsigned int factor = get_update_sysctl_factor();

	if (ret || !write)
		return ret;

	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
					sysctl_sched_min_granularity);

	/* Keep the normalized (per-factor) values in sync with the raw ones. */
#define WRT_SYSCTL(name) \
	(normalized_sysctl_##name = sysctl_##name / (factor))
	WRT_SYSCTL(sched_min_granularity);
	WRT_SYSCTL(sched_latency);
	WRT_SYSCTL(sched_wakeup_granularity);
#undef WRT_SYSCTL

	return 0;
}
#endif
/*
* delta /= w
*/
/*
 * delta /= w
 */
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
	/* Fast path: a NICE_0_LOAD entity runs at wall-clock rate. */
	if (likely(se->load.weight == NICE_0_LOAD))
		return delta;

	return __calc_delta(delta, NICE_0_LOAD, &se->load);
}
/*
* The idea is to set a period in which each task runs once.
*
* When there are too many tasks (sched_nr_latency) we have to stretch
* this period because otherwise the slices get too small.
*
* p = (nr <= nl) ? l : l*nr/nl
*/
/*
 * The period in which every runnable task gets one slot; it stretches
 * beyond sysctl_sched_latency once there are more than sched_nr_latency
 * tasks, so each slice keeps at least the minimum granularity.
 */
static u64 __sched_period(unsigned long nr_running)
{
	if (likely(nr_running <= sched_nr_latency))
		return sysctl_sched_latency;

	return nr_running * sysctl_sched_min_granularity;
}
/*
* We calculate the wall-time slice from the period by taking a part
* proportional to the weight.
*
* s = p*P[w/rw]
*/
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	unsigned int nr_running = cfs_rq->nr_running;
	u64 slice;

	/* ALT_PERIOD: size the period by all hierarchy tasks on this CPU. */
	if (sched_feat(ALT_PERIOD))
		nr_running = rq_of(cfs_rq)->cfs.h_nr_running;

	/* +1 when @se is not yet queued: it will soon compete too. */
	slice = __sched_period(nr_running + !se->on_rq);

	/* Scale by weight/rq-weight at every level of the group hierarchy. */
	for_each_sched_entity(se) {
		struct load_weight *load;
		struct load_weight lw;

		cfs_rq = cfs_rq_of(se);
		load = &cfs_rq->load;

		if (unlikely(!se->on_rq)) {
			/* Account @se's own weight as if it were enqueued. */
			lw = cfs_rq->load;

			update_load_add(&lw, se->load.weight);
			load = &lw;
		}
		slice = __calc_delta(slice, se->load.weight, load);
	}

	/* BASE_SLICE: never hand out less than the minimum granularity. */
	if (sched_feat(BASE_SLICE))
		slice = max(slice, (u64)sysctl_sched_min_granularity);

	return slice;
}
/*
* We calculate the vruntime slice of a to-be-inserted task.
*
* vs = s/w
*/
/* Wall-time slice converted to vruntime units (weighted by 1/w). */
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
#include "pelt.h"
#ifdef CONFIG_SMP
static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
static unsigned long capacity_of(int cpu);
/* Give new sched_entity start runnable values to heavy its load in infant time */
/* Give new sched_entity start runnable values to heavy its load in infant time */
void init_entity_runnable_average(struct sched_entity *se)
{
	struct sched_avg *sa = &se->avg;

	/* Start from a clean PELT state. */
	memset(sa, 0, sizeof(*sa));

	/*
	 * Tasks are initialized with full load to be seen as heavy tasks until
	 * they get a chance to stabilize to their real load level.
	 * Group entities are initialized with zero load to reflect the fact that
	 * nothing has been attached to the task group yet.
	 */
	if (entity_is_task(se))
		sa->load_avg = scale_load_down(se->load.weight);

	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
static void attach_entity_cfs_rq(struct sched_entity *se);
/*
* With new tasks being created, their initial util_avgs are extrapolated
* based on the cfs_rq's current util_avg:
*
* util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
*
* However, in many cases, the above util_avg does not give a desired
* value. Moreover, the sum of the util_avgs may be divergent, such
* as when the series is a harmonic series.
*
* To solve this problem, we also cap the util_avg of successive tasks to
* only 1/2 of the left utilization budget:
*
* util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
*
* where n denotes the nth task and cpu_scale the CPU capacity.
*
* For example, for a CPU with 1024 of capacity, a simplest series from
* the beginning would be like:
*
* task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
* cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
*
* Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
* if util_avg > util_avg_cap.
*/
/*
 * Seed a new task's util_avg from the parent cfs_rq's utilization (see the
 * extrapolation formula in the comment block above), clamped so successive
 * forks consume at most half of the remaining capacity budget each.
 */
void post_init_entity_util_avg(struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	struct sched_avg *sa = &se->avg;
	long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
	long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;

	if (cap > 0) {
		if (cfs_rq->avg.util_avg != 0) {
			/* util_avg = cfs_rq->util_avg / (load_avg+1) * weight */
			sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
			sa->util_avg /= (cfs_rq->avg.load_avg + 1);

			if (sa->util_avg > cap)
				sa->util_avg = cap;
		} else {
			sa->util_avg = cap;
		}
	}

	sa->runnable_avg = sa->util_avg;

	if (p->sched_class != &fair_sched_class) {
		/*
		 * For !fair tasks do:
		 *
		update_cfs_rq_load_avg(now, cfs_rq);
		attach_entity_load_avg(cfs_rq, se);
		switched_from_fair(rq, p);
		 *
		 * such that the next switched_to_fair() has the
		 * expected state.
		 */
		se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
		return;
	}

	/* Hook before this se's util is attached to cfs_rq's util */
	trace_android_rvh_post_init_entity_util_avg(se);
	attach_entity_cfs_rq(se);
}
#else /* !CONFIG_SMP */
/* Without SMP there is no PELT load tracking to initialize or propagate. */
void init_entity_runnable_average(struct sched_entity *se)
{
}
void post_init_entity_util_avg(struct task_struct *p)
{
}
static void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
}
#endif /* CONFIG_SMP */
/*
* Update the current task's runtime statistics.
*/
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_clock_task(rq_of(cfs_rq));
	u64 delta_exec;

	if (unlikely(!curr))
		return;

	/* Time accrued since the last accounting point. */
	delta_exec = now - curr->exec_start;
	if (unlikely((s64)delta_exec <= 0))
		return;

	curr->exec_start = now;

	schedstat_set(curr->statistics.exec_max,
		      max(delta_exec, curr->statistics.exec_max));

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq->exec_clock, delta_exec);

	/* Advance virtual time by the weight-scaled execution delta. */
	curr->vruntime += calc_delta_fair(delta_exec, curr);
	update_min_vruntime(cfs_rq);

	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cgroup_account_cputime(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}

	/* Charge against CFS bandwidth (throttling) quota. */
	account_cfs_rq_runtime(cfs_rq, delta_exec);
}
/* sched_class hook: account runtime for the rq's current (fair) task. */
static void update_curr_fair(struct rq *rq)
{
	update_curr(cfs_rq_of(&rq->curr->se));
}
/*
 * Start (or, for a migrating task, resume) the schedstat wait-time clock
 * for an entity that just became a waiter on this cfs_rq.
 */
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 wait_start, prev_wait_start;

	if (!schedstat_enabled())
		return;

	wait_start = rq_clock(rq_of(cfs_rq));
	prev_wait_start = schedstat_val(se->statistics.wait_start);

	/*
	 * A migrating task carries its accumulated wait in wait_start;
	 * back-date the new start so that time keeps accruing.
	 */
	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
	    likely(wait_start > prev_wait_start))
		wait_start -= prev_wait_start;

	__schedstat_set(se->statistics.wait_start, wait_start);
}
/*
 * Stop the wait-time clock for @se and fold the elapsed wait into the
 * schedstat counters (max/count/sum); migrating tasks instead stash the
 * partial wait so it survives the move to another runqueue.
 */
static inline void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct task_struct *p;
	u64 delta;

	if (!schedstat_enabled())
		return;

	delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);

	if (entity_is_task(se)) {
		p = task_of(se);
		if (task_on_rq_migrating(p)) {
			/*
			 * Preserve migrating task's wait time so wait_start
			 * time stamp can be adjusted to accumulate wait time
			 * prior to migration.
			 */
			__schedstat_set(se->statistics.wait_start, delta);
			return;
		}
		trace_sched_stat_wait(p, delta);
	}

	__schedstat_set(se->statistics.wait_max,
		      max(schedstat_val(se->statistics.wait_max), delta));
	__schedstat_inc(se->statistics.wait_count);
	__schedstat_add(se->statistics.wait_sum, delta);
	__schedstat_set(se->statistics.wait_start, 0);
}
/*
 * On wakeup-enqueue, account how long @se spent sleeping (interruptible)
 * and/or blocked (uninterruptible, incl. iowait) in the schedstat counters,
 * and feed the latency profiling/tracing hooks.
 */
static inline void
update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct task_struct *tsk = NULL;
	u64 sleep_start, block_start;

	if (!schedstat_enabled())
		return;

	sleep_start = schedstat_val(se->statistics.sleep_start);
	block_start = schedstat_val(se->statistics.block_start);

	if (entity_is_task(se))
		tsk = task_of(se);

	if (sleep_start) {
		u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;

		/* Guard against clock observation across CPUs going backwards. */
		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
			__schedstat_set(se->statistics.sleep_max, delta);

		__schedstat_set(se->statistics.sleep_start, 0);
		__schedstat_add(se->statistics.sum_sleep_runtime, delta);

		if (tsk) {
			account_scheduler_latency(tsk, delta >> 10, 1);
			trace_sched_stat_sleep(tsk, delta);
		}
	}
	if (block_start) {
		u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > schedstat_val(se->statistics.block_max)))
			__schedstat_set(se->statistics.block_max, delta);

		__schedstat_set(se->statistics.block_start, 0);
		__schedstat_add(se->statistics.sum_sleep_runtime, delta);

		if (tsk) {
			if (tsk->in_iowait) {
				__schedstat_add(se->statistics.iowait_sum, delta);
				__schedstat_inc(se->statistics.iowait_count);
				trace_sched_stat_iowait(tsk, delta);
			}

			trace_sched_stat_blocked(tsk, delta);

			/*
			 * Blocking time is in units of nanosecs, so shift by
			 * 20 to get a milliseconds-range estimation of the
			 * amount of time that the task spent sleeping:
			 */
			if (unlikely(prof_on == SLEEP_PROFILING)) {
				profile_hits(SLEEP_PROFILING,
						(void *)get_wchan(tsk),
						delta >> 20);
			}
			account_scheduler_latency(tsk, delta >> 10, 0);
		}
	}
}
/*
* Task is being enqueued - update stats:
*/
static inline void
update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	if (!schedstat_enabled())
		return;

	/*
	 * Are we enqueueing a waiting task? (for current tasks
	 * a dequeue/enqueue event is a NOP)
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_start(cfs_rq, se);

	/* Wakeups additionally close out the sleep/block accounting. */
	if (flags & ENQUEUE_WAKEUP)
		update_stats_enqueue_sleeper(cfs_rq, se);
}
static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	if (!schedstat_enabled())
		return;

	/*
	 * Mark the end of the wait period if dequeueing a
	 * waiting task:
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_end(cfs_rq, se);

	/* Going to sleep: start the sleep or block clock per task state. */
	if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
		struct task_struct *tsk = task_of(se);

		if (tsk->state & TASK_INTERRUPTIBLE)
			__schedstat_set(se->statistics.sleep_start,
				      rq_clock(rq_of(cfs_rq)));
		if (tsk->state & TASK_UNINTERRUPTIBLE)
			__schedstat_set(se->statistics.block_start,
				      rq_clock(rq_of(cfs_rq)));
	}
}
/*
* We are picking a new current task - update its stats:
*/
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * We are starting a new run period:
	 */
	se->exec_start = rq_clock_task(rq_of(cfs_rq));
}
/**************************************************
* Scheduling class queueing methods:
*/
#ifdef CONFIG_NUMA_BALANCING
/*
* Approximate time to scan a full NUMA task in ms. The task scan period is
* calculated based on the tasks virtual memory size and
* numa_balancing_scan_size.
*/
unsigned int sysctl_numa_balancing_scan_period_min = 1000;
unsigned int sysctl_numa_balancing_scan_period_max = 60000;
/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
/* A set of tasks whose NUMA faults are accounted and placed together. */
struct numa_group {
	refcount_t refcount;		/* tasks + RCU holding the group alive */

	spinlock_t lock; /* nr_tasks, tasks */
	int nr_tasks;
	pid_t gid;			/* stable id exposed via task_numa_group_id() */
	int active_nodes;		/* nodes passing numa_is_active_node() */

	struct rcu_head rcu;
	unsigned long total_faults;
	unsigned long max_faults_cpu;
	/*
	 * Faults_cpu is used to decide whether memory should move
	 * towards the CPU. As a consequence, these stats are weighted
	 * more by CPU use than by memory faults.
	 */
	unsigned long *faults_cpu;
	/* Per-node fault counters; layout described above task_faults_idx(). */
	unsigned long faults[];
};
/*
 * For functions that can be called in multiple contexts that permit reading
 * ->numa_group (see struct task_struct for locking rules).
 */
static struct numa_group *deref_task_numa_group(struct task_struct *p)
{
	/* Safe for: current, or a task pinned off-CPU under its rq lock. */
	return rcu_dereference_check(p->numa_group, p == current ||
		(lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
}

/* Stricter variant: only valid when @p is the current task. */
static struct numa_group *deref_curr_numa_group(struct task_struct *p)
{
	return rcu_dereference_protected(p->numa_group, p == current);
}
static inline unsigned long group_faults_priv(struct numa_group *ng);
static inline unsigned long group_faults_shared(struct numa_group *ng);
/*
 * Number of scan windows needed to cover the task's RSS, given the
 * configured per-window scan size.
 */
static unsigned int task_nr_scan_windows(struct task_struct *p)
{
	unsigned long rss = 0;
	unsigned long nr_scan_pages;

	/*
	 * Calculations based on RSS as non-present and empty pages are skipped
	 * by the PTE scanner and NUMA hinting faults should be trapped based
	 * on resident pages
	 */
	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
	rss = get_mm_rss(p->mm);
	/* Empty RSS still yields at least one window. */
	if (!rss)
		rss = nr_scan_pages;

	rss = round_up(rss, nr_scan_pages);
	return rss / nr_scan_pages;
}
/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
#define MAX_SCAN_WINDOW 2560
/*
 * Minimum scan period (ms) for @p: the sysctl minimum spread over the
 * task's scan windows, floored so total scanning stays under
 * MAX_SCAN_WINDOW MB/sec.
 */
static unsigned int task_scan_min(struct task_struct *p)
{
	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
	unsigned int scan, floor;
	unsigned int windows = 1;

	if (scan_size < MAX_SCAN_WINDOW)
		windows = MAX_SCAN_WINDOW / scan_size;
	floor = 1000 / windows;

	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
	return max_t(unsigned int, floor, scan);
}
/*
 * Initial scan period for @p; stretched when the task's numa_group is
 * large and mostly shares memory, to avoid redundant scanning.
 */
static unsigned int task_scan_start(struct task_struct *p)
{
	unsigned long smin = task_scan_min(p);
	unsigned long period = smin;
	struct numa_group *ng;

	/* Scale the maximum scan period with the amount of shared memory. */
	rcu_read_lock();
	ng = rcu_dereference(p->numa_group);
	if (ng) {
		unsigned long shared = group_faults_shared(ng);
		unsigned long private = group_faults_priv(ng);

		/* period *= group_size * shared_fraction (approximately). */
		period *= refcount_read(&ng->refcount);
		period *= shared + 1;
		period /= private + shared + 1;
	}
	rcu_read_unlock();

	return max(smin, period);
}
/*
 * Maximum scan period for @p, scaled like task_scan_start() by the
 * group's size and shared/private fault mix; never below the minimum.
 */
static unsigned int task_scan_max(struct task_struct *p)
{
	unsigned long smin = task_scan_min(p);
	unsigned long smax;
	struct numa_group *ng;

	/* Watch for min being lower than max due to floor calculations */
	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);

	/* Scale the maximum scan period with the amount of shared memory. */
	ng = deref_curr_numa_group(p);
	if (ng) {
		unsigned long shared = group_faults_shared(ng);
		unsigned long private = group_faults_priv(ng);
		unsigned long period = smax;

		period *= refcount_read(&ng->refcount);
		period *= shared + 1;
		period /= private + shared + 1;

		smax = max(smax, period);
	}

	return max(smin, smax);
}
/* Track how many runnable tasks have (and are on) a preferred NUMA node. */
static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
	rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
}

/* Inverse of account_numa_enqueue(). */
static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
	rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}
/* Shared or private faults. */
#define NR_NUMA_HINT_FAULT_TYPES 2
/* Memory and CPU locality */
#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
/* Averaged statistics, and temporary buffers. */
#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
/* Stable id of @p's numa_group for procfs/tracing; 0 when ungrouped. */
pid_t task_numa_group_id(struct task_struct *p)
{
	struct numa_group *ng;
	pid_t gid = 0;

	rcu_read_lock();
	ng = rcu_dereference(p->numa_group);
	if (ng)
		gid = ng->gid;
	rcu_read_unlock();

	return gid;
}
/*
 * The averaged statistics, shared & private, memory & CPU,
 * occupy the first half of the array. The second half of the
 * array is for current counters, which are averaged into the
 * first set by task_numa_placement.
 */
static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
	/* Index = stat-kind block, then node, then shared(0)/private(1). */
	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}

/* Task's averaged memory faults (shared + private) on @nid. */
static inline unsigned long task_faults(struct task_struct *p, int nid)
{
	if (!p->numa_faults)
		return 0;

	return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
		p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
}

/* Group's averaged memory faults (shared + private) on @nid. */
static inline unsigned long group_faults(struct task_struct *p, int nid)
{
	struct numa_group *ng = deref_task_numa_group(p);

	if (!ng)
		return 0;

	return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
		ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
}

/* Group's CPU-weighted faults on @nid (see faults_cpu field comment). */
static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
{
	return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
}
/* Sum of the group's private memory faults across all online nodes. */
static inline unsigned long group_faults_priv(struct numa_group *ng)
{
	unsigned long faults = 0;
	int node;

	for_each_online_node(node) {
		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
	}

	return faults;
}

/* Sum of the group's shared memory faults across all online nodes. */
static inline unsigned long group_faults_shared(struct numa_group *ng)
{
	unsigned long faults = 0;
	int node;

	for_each_online_node(node) {
		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
	}

	return faults;
}
/*
 * A node triggering more than 1/3 as many NUMA faults as the maximum is
 * considered part of a numa group's pseudo-interleaving set. Migrations
 * between these nodes are slowed down, to allow things to settle down.
 */
#define ACTIVE_NODE_FRACTION 3

static bool numa_is_active_node(int nid, struct numa_group *ng)
{
	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
}
/* Handle placement on systems where not all nodes are directly connected. */
static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
					int maxdist, bool task)
{
	unsigned long score = 0;
	int node;

	/*
	 * All nodes are directly connected, and the same distance
	 * from each other. No need for fancy placement algorithms.
	 */
	if (sched_numa_topology_type == NUMA_DIRECT)
		return 0;

	/*
	 * This code is called for each node, introducing N^2 complexity,
	 * which should be ok given the number of nodes rarely exceeds 8.
	 */
	for_each_online_node(node) {
		unsigned long faults;
		int dist = node_distance(nid, node);

		/*
		 * The furthest away nodes in the system are not interesting
		 * for placement; nid was already counted.
		 */
		if (dist == sched_max_numa_distance || node == nid)
			continue;

		/*
		 * On systems with a backplane NUMA topology, compare groups
		 * of nodes, and move tasks towards the group with the most
		 * memory accesses. When comparing two nodes at distance
		 * "hoplimit", only nodes closer by than "hoplimit" are part
		 * of each group. Skip other nodes.
		 */
		if (sched_numa_topology_type == NUMA_BACKPLANE &&
					dist >= maxdist)
			continue;

		/* Add up the faults from nearby nodes. */
		if (task)
			faults = task_faults(p, node);
		else
			faults = group_faults(p, node);

		/*
		 * On systems with a glueless mesh NUMA topology, there are
		 * no fixed "groups of nodes". Instead, nodes that are not
		 * directly connected bounce traffic through intermediate
		 * nodes; a numa_group can occupy any set of nodes.
		 * The further away a node is, the less the faults count.
		 * This seems to result in good task placement.
		 */
		if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
			faults *= (sched_max_numa_distance - dist);
			faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
		}

		score += faults;
	}

	return score;
}
/*
 * These return the fraction of accesses done by a particular task, or
 * task group, on a particular numa node. The group weight is given a
 * larger multiplier, in order to group tasks together that are almost
 * evenly spread out between numa nodes.
 */
static inline unsigned long task_weight(struct task_struct *p, int nid,
					int dist)
{
	unsigned long faults, total_faults;

	if (!p->numa_faults)
		return 0;

	total_faults = p->total_numa_faults;

	if (!total_faults)
		return 0;

	faults = task_faults(p, nid);
	faults += score_nearby_nodes(p, nid, dist, true);

	/* Per-mille share of the task's faults attributable to @nid. */
	return 1000 * faults / total_faults;
}

static inline unsigned long group_weight(struct task_struct *p, int nid,
					 int dist)
{
	struct numa_group *ng = deref_task_numa_group(p);
	unsigned long faults, total_faults;

	if (!ng)
		return 0;

	total_faults = ng->total_faults;

	if (!total_faults)
		return 0;

	faults = group_faults(p, nid);
	faults += score_nearby_nodes(p, nid, dist, false);

	/* Per-mille share of the group's faults attributable to @nid. */
	return 1000 * faults / total_faults;
}
/*
 * Decide, on a NUMA hinting fault, whether @page should migrate from
 * @src_nid towards the faulting task's node (cpu_to_node(@dst_cpu)).
 */
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
				int src_nid, int dst_cpu)
{
	struct numa_group *ng = deref_curr_numa_group(p);
	int dst_nid = cpu_to_node(dst_cpu);
	int last_cpupid, this_cpupid;

	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
	/* Record this access as the page's most recent toucher. */
	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);

	/*
	 * Allow first faults or private faults to migrate immediately early in
	 * the lifetime of a task. The magic number 4 is based on waiting for
	 * two full passes of the "multi-stage node selection" test that is
	 * executed below.
	 */
	if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
	    (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
		return true;

	/*
	 * Multi-stage node selection is used in conjunction with a periodic
	 * migration fault to build a temporal task<->page relation. By using
	 * a two-stage filter we remove short/unlikely relations.
	 *
	 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
	 * a task's usage of a particular page (n_p) per total usage of this
	 * page (n_t) (in a given time-span) to a probability.
	 *
	 * Our periodic faults will sample this probability and getting the
	 * same result twice in a row, given these samples are fully
	 * independent, is then given by P(n)^2, provided our sample period
	 * is sufficiently short compared to the usage pattern.
	 *
	 * This quadric squishes small probabilities, making it less likely we
	 * act on an unlikely task<->page relation.
	 */
	if (!cpupid_pid_unset(last_cpupid) &&
				cpupid_to_nid(last_cpupid) != dst_nid)
		return false;

	/* Always allow migrate on private faults */
	if (cpupid_match_pid(p, last_cpupid))
		return true;

	/* A shared fault, but p->numa_group has not been set up yet. */
	if (!ng)
		return true;

	/*
	 * Destination node is much more heavily used than the source
	 * node? Allow migration.
	 */
	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
					ACTIVE_NODE_FRACTION)
		return true;

	/*
	 * Distribute memory according to CPU & memory use on each node,
	 * with 3/4 hysteresis to avoid unnecessary memory migrations:
	 *
	 * faults_cpu(dst)   3   faults_cpu(src)
	 * --------------- * - > ---------------
	 * faults_mem(dst)   4   faults_mem(src)
	 */
	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
/*
* 'numa_type' describes the node at the moment of load balancing.
*/
enum numa_type {
/* The node has spare capacity that can be used to run more tasks. */
node_has_spare = 0,
/*
* The node is fully used and the tasks don't compete for more CPU
* cycles. Nevertheless, some tasks might wait before running.
*/
node_fully_busy,
/*
* The node is overloaded and can't provide expected CPU cycles to all
* tasks.
*/
node_overloaded
};
/* Cached statistics for all CPUs within a node */
struct numa_stats {
unsigned long load;
unsigned long runnable;
unsigned long util;
/* Total compute capacity of CPUs on a node */
unsigned long compute_capacity;
unsigned int nr_running;
unsigned int weight;
enum numa_type node_type;
int idle_cpu;
};
/* Is the whole physical core of @cpu idle (all SMT siblings but @cpu)? */
static inline bool is_core_idle(int cpu)
{
#ifdef CONFIG_SCHED_SMT
	int sibling;

	for_each_cpu(sibling, cpu_smt_mask(cpu)) {
		if (cpu == sibling)
			continue;

		if (!idle_cpu(sibling))
			return false;
	}
#endif
	/* Without SMT, a CPU is its own core. */
	return true;
}
struct task_numa_env {
struct task_struct *p;
int src_cpu, src_nid;
int dst_cpu, dst_nid;
struct numa_stats src_stats, dst_stats;
int imbalance_pct;
int dist;
struct task_struct *best_task;
long best_imp;
int best_cpu;
};
static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq);
static unsigned long cpu_util(int cpu);
static inline long adjust_numa_imbalance(int imbalance, int nr_running);
/*
 * Classify a node's load state from its cached stats, mirroring the
 * group_type logic of the regular load balancer.
 */
static inline enum
numa_type numa_classify(unsigned int imbalance_pct,
			 struct numa_stats *ns)
{
	/* More tasks than CPUs and util/runnable over the imbalance margin. */
	if ((ns->nr_running > ns->weight) &&
	    (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
	     ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
		return node_overloaded;

	/* Fewer tasks than CPUs, or comfortably under the margin. */
	if ((ns->nr_running < ns->weight) ||
	    (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
	     ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
		return node_has_spare;

	return node_fully_busy;
}
#ifdef CONFIG_SCHED_SMT
/* Forward declarations of select_idle_sibling helpers */
static inline bool test_idle_cores(int cpu, bool def);

/*
 * Remember @cpu as a candidate fully-idle core; keeps the first one found
 * and skips the check when no idle cores are believed to exist.
 */
static inline int numa_idle_core(int idle_core, int cpu)
{
	if (!static_branch_likely(&sched_smt_present) ||
	    idle_core >= 0 || !test_idle_cores(cpu, false))
		return idle_core;

	/*
	 * Prefer cores instead of packing HT siblings
	 * and triggering future load balancing.
	 */
	if (is_core_idle(cpu))
		idle_core = cpu;

	return idle_core;
}
#else
/* Without SMT there is no core/sibling distinction to track. */
static inline int numa_idle_core(int idle_core, int cpu)
{
	return idle_core;
}
#endif
/*
 * Gather all necessary information to make NUMA balancing placement
 * decisions that are compatible with standard load balancer. This
 * borrows code and logic from update_sg_lb_stats but sharing a
 * common implementation is impractical.
 */
static void update_numa_stats(struct task_numa_env *env,
			      struct numa_stats *ns, int nid,
			      bool find_idle)
{
	int cpu, idle_core = -1;

	memset(ns, 0, sizeof(*ns));
	ns->idle_cpu = -1;

	rcu_read_lock();
	for_each_cpu(cpu, cpumask_of_node(nid)) {
		struct rq *rq = cpu_rq(cpu);

		/* Accumulate each CPU's load/utilization into the node totals. */
		ns->load += cpu_load(rq);
		ns->runnable += cpu_runnable(rq);
		ns->util += cpu_util(cpu);
		ns->nr_running += rq->cfs.h_nr_running;
		ns->compute_capacity += capacity_of(cpu);

		if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
			/*
			 * Skip CPUs already claimed by a concurrent NUMA
			 * migration, and CPUs @p is not allowed to run on.
			 */
			if (READ_ONCE(rq->numa_migrate_on) ||
			    !cpumask_test_cpu(cpu, env->p->cpus_ptr))
				continue;

			/* Remember the first eligible idle CPU... */
			if (ns->idle_cpu == -1)
				ns->idle_cpu = cpu;

			/* ...but keep looking for one on a fully idle core. */
			idle_core = numa_idle_core(idle_core, cpu);
		}
	}
	rcu_read_unlock();

	ns->weight = cpumask_weight(cpumask_of_node(nid));

	ns->node_type = numa_classify(env->imbalance_pct, ns);

	/* An idle core beats a merely idle SMT sibling as the candidate. */
	if (idle_core >= 0)
		ns->idle_cpu = idle_core;
}
/*
 * Record @p (NULL for a plain move to an idle CPU) as the best move/swap
 * candidate found so far. The destination rq is claimed by setting
 * rq->numa_migrate_on so concurrent NUMA balancers do not pick the same
 * CPU; if it is already claimed, an alternative idle CPU on the node is
 * searched for.
 */
static void task_numa_assign(struct task_numa_env *env,
			     struct task_struct *p, long imp)
{
	struct rq *rq = cpu_rq(env->dst_cpu);

	/* Check if run-queue part of active NUMA balance. */
	if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
		int cpu;
		int start = env->dst_cpu;

		/* Find alternative idle CPU. */
		for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
			if (cpu == env->best_cpu || !idle_cpu(cpu) ||
			    !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
				continue;
			}

			env->dst_cpu = cpu;
			rq = cpu_rq(env->dst_cpu);
			if (!xchg(&rq->numa_migrate_on, 1))
				goto assign;
		}

		/* Failed to find an alternative idle CPU */
		return;
	}

assign:
	/*
	 * Clear previous best_cpu/rq numa-migrate flag, since task now
	 * found a better CPU to move/swap.
	 */
	if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
		rq = cpu_rq(env->best_cpu);
		WRITE_ONCE(rq->numa_migrate_on, 0);
	}

	/* Exchange the reference held on the old candidate for the new one. */
	if (env->best_task)
		put_task_struct(env->best_task);
	if (p)
		get_task_struct(p);

	env->best_task = p;
	env->best_imp = imp;
	env->best_cpu = env->dst_cpu;
}
/*
 * Would moving @p (producing src_load/dst_load) leave the two nodes more
 * imbalanced, relative to their compute capacities, than they are now?
 */
static bool load_too_imbalanced(long src_load, long dst_load,
				struct task_numa_env *env)
{
	long src_capacity, dst_capacity;
	long cur_src_load, cur_dst_load;
	long new_imb, cur_imb;

	/*
	 * The load is corrected for the CPU capacity available on each node:
	 *
	 *  src_load        dst_load
	 * ------------ vs ---------
	 * src_capacity    dst_capacity
	 *
	 * Cross-multiplying avoids the division.
	 */
	src_capacity = env->src_stats.compute_capacity;
	dst_capacity = env->dst_stats.compute_capacity;

	new_imb = abs(dst_load * src_capacity - src_load * dst_capacity);

	cur_src_load = env->src_stats.load;
	cur_dst_load = env->dst_stats.load;

	cur_imb = abs(cur_dst_load * src_capacity - cur_src_load * dst_capacity);

	/* Would this change make things worse? */
	return new_imb > cur_imb;
}
/*
 * Maximum NUMA importance can be 1998 (2*999);
 * SMALLIMP @ 30 would be close to 1998/64.
 * Used to deter task migration.
 */
#define SMALLIMP	30

/*
 * This checks if the overall compute and NUMA accesses of the system would
 * be improved if the source tasks was migrated to the target dst_cpu taking
 * into account that it might be best if task running on the dst_cpu should
 * be exchanged with the source task
 */
static bool task_numa_compare(struct task_numa_env *env,
			      long taskimp, long groupimp, bool maymove)
{
	struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
	struct rq *dst_rq = cpu_rq(env->dst_cpu);
	long imp = p_ng ? groupimp : taskimp;
	struct task_struct *cur;
	long src_load, dst_load;
	int dist = env->dist;
	long moveimp = imp;
	long load;
	bool stopsearch = false;

	/* Destination already claimed by another NUMA migration? */
	if (READ_ONCE(dst_rq->numa_migrate_on))
		return false;

	rcu_read_lock();
	cur = rcu_dereference(dst_rq->curr);
	/* Exiting and idle tasks are not swap candidates. */
	if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
		cur = NULL;

	/*
	 * Because we have preemption enabled we can get migrated around and
	 * end try selecting ourselves (current == env->p) as a swap candidate.
	 */
	if (cur == env->p) {
		stopsearch = true;
		goto unlock;
	}

	/* No task to swap with: consider a plain move to the (idle) dst_cpu. */
	if (!cur) {
		if (maymove && moveimp >= env->best_imp)
			goto assign;
		else
			goto unlock;
	}

	/* Skip this swap candidate if cannot move to the source cpu. */
	if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
		goto unlock;

	/*
	 * Skip this swap candidate if it is not moving to its preferred
	 * node and the best task is.
	 */
	if (env->best_task &&
	    env->best_task->numa_preferred_nid == env->src_nid &&
	    cur->numa_preferred_nid != env->src_nid) {
		goto unlock;
	}

	/*
	 * "imp" is the fault differential for the source task between the
	 * source and destination node. Calculate the total differential for
	 * the source task and potential destination task. The more negative
	 * the value is, the more remote accesses that would be expected to
	 * be incurred if the tasks were swapped.
	 *
	 * If dst and source tasks are in the same NUMA group, or not
	 * in any group then look only at task weights.
	 */
	cur_ng = rcu_dereference(cur->numa_group);
	if (cur_ng == p_ng) {
		imp = taskimp + task_weight(cur, env->src_nid, dist) -
		      task_weight(cur, env->dst_nid, dist);
		/*
		 * Add some hysteresis to prevent swapping the
		 * tasks within a group over tiny differences.
		 */
		if (cur_ng)
			imp -= imp / 16;
	} else {
		/*
		 * Compare the group weights. If a task is all by itself
		 * (not part of a group), use the task weight instead.
		 */
		if (cur_ng && p_ng)
			imp += group_weight(cur, env->src_nid, dist) -
			       group_weight(cur, env->dst_nid, dist);
		else
			imp += task_weight(cur, env->src_nid, dist) -
			       task_weight(cur, env->dst_nid, dist);
	}

	/* Discourage picking a task already on its preferred node */
	if (cur->numa_preferred_nid == env->dst_nid)
		imp -= imp / 16;

	/*
	 * Encourage picking a task that moves to its preferred node.
	 * This potentially makes imp larger than it's maximum of
	 * 1998 (see SMALLIMP and task_weight for why) but in this
	 * case, it does not matter.
	 */
	if (cur->numa_preferred_nid == env->src_nid)
		imp += imp / 8;

	/* A plain move scores better than swapping with cur: drop cur. */
	if (maymove && moveimp > imp && moveimp > env->best_imp) {
		imp = moveimp;
		cur = NULL;
		goto assign;
	}

	/*
	 * Prefer swapping with a task moving to its preferred node over a
	 * task that is not.
	 */
	if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
	    env->best_task->numa_preferred_nid != env->src_nid) {
		goto assign;
	}

	/*
	 * If the NUMA importance is less than SMALLIMP,
	 * task migration might only result in ping pong
	 * of tasks and also hurt performance due to cache
	 * misses.
	 */
	if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
		goto unlock;

	/*
	 * In the overloaded case, try and keep the load balanced.
	 */
	load = task_h_load(env->p) - task_h_load(cur);
	if (!load)
		goto assign;

	dst_load = env->dst_stats.load + load;
	src_load = env->src_stats.load - load;

	if (load_too_imbalanced(src_load, dst_load, env))
		goto unlock;

assign:
	/* Evaluate an idle CPU for a task numa move. */
	if (!cur) {
		int cpu = env->dst_stats.idle_cpu;

		/* Nothing cached so current CPU went idle since the search. */
		if (cpu < 0)
			cpu = env->dst_cpu;

		/*
		 * If the CPU is no longer truly idle and the previous best CPU
		 * is, keep using it.
		 */
		if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
		    idle_cpu(env->best_cpu)) {
			cpu = env->best_cpu;
		}

		env->dst_cpu = cpu;
	}

	task_numa_assign(env, cur, imp);

	/*
	 * If a move to idle is allowed because there is capacity or load
	 * balance improves then stop the search. While a better swap
	 * candidate may exist, a search is not free.
	 */
	if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
		stopsearch = true;

	/*
	 * If a swap candidate must be identified and the current best task
	 * moves its preferred node then stop the search.
	 */
	if (!maymove && env->best_task &&
	    env->best_task->numa_preferred_nid == env->src_nid) {
		stopsearch = true;
	}
unlock:
	rcu_read_unlock();

	return stopsearch;
}
/*
 * Decide whether a plain move of env->p to env->dst_nid (no task swap)
 * would be acceptable to the load balancer, then walk the destination
 * node's CPUs looking for the best move or swap candidate.
 */
static void task_numa_find_cpu(struct task_numa_env *env,
			       long taskimp, long groupimp)
{
	bool maymove = false;
	int cpu;

	/*
	 * If dst node has spare capacity, then check if there is an
	 * imbalance that would be overruled by the load balancer.
	 */
	if (env->dst_stats.node_type == node_has_spare) {
		unsigned int imbalance;
		int src_running, dst_running;

		/*
		 * Would movement cause an imbalance? Note that if src has
		 * more running tasks that the imbalance is ignored as the
		 * move improves the imbalance from the perspective of the
		 * CPU load balancer.
		 * */
		src_running = env->src_stats.nr_running - 1;
		dst_running = env->dst_stats.nr_running + 1;
		imbalance = max(0, dst_running - src_running);
		imbalance = adjust_numa_imbalance(imbalance, dst_running);

		/* Use idle CPU if there is no imbalance */
		if (!imbalance) {
			maymove = true;
			if (env->dst_stats.idle_cpu >= 0) {
				env->dst_cpu = env->dst_stats.idle_cpu;
				task_numa_assign(env, NULL, 0);
				return;
			}
		}
	} else {
		long src_load, dst_load, load;
		/*
		 * If the improvement from just moving env->p direction is better
		 * than swapping tasks around, check if a move is possible.
		 */
		load = task_h_load(env->p);
		dst_load = env->dst_stats.load + load;
		src_load = env->src_stats.load - load;
		maymove = !load_too_imbalanced(src_load, dst_load, env);
	}

	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
		/* Skip this CPU if the source task cannot migrate */
		if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
			continue;

		env->dst_cpu = cpu;
		/* task_numa_compare() returns true once the search should stop. */
		if (task_numa_compare(env, taskimp, groupimp, maymove))
			break;
	}
}
/*
 * Attempt to move (or swap) @p towards its preferred NUMA node, falling
 * back to other online nodes when the preferred node has no room or the
 * task's numa_group spans multiple nodes. Returns 0 on success, -EINVAL
 * when the task is trapped in a non-NUMA balance domain, or -EAGAIN when
 * no better CPU was found.
 */
static int task_numa_migrate(struct task_struct *p)
{
	struct task_numa_env env = {
		.p = p,

		.src_cpu = task_cpu(p),
		.src_nid = task_node(p),

		.imbalance_pct = 112,

		.best_task = NULL,
		.best_imp = 0,
		.best_cpu = -1,
	};
	unsigned long taskweight, groupweight;
	struct sched_domain *sd;
	long taskimp, groupimp;
	struct numa_group *ng;
	struct rq *best_rq;
	int nid, ret, dist;

	/*
	 * Pick the lowest SD_NUMA domain, as that would have the smallest
	 * imbalance and would be the first to start moving tasks about.
	 *
	 * And we want to avoid any moving of tasks about, as that would create
	 * random movement of tasks -- counter the numa conditions we're trying
	 * to satisfy here.
	 */
	rcu_read_lock();
	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
	if (sd)
		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
	rcu_read_unlock();

	/*
	 * Cpusets can break the scheduler domain tree into smaller
	 * balance domains, some of which do not cross NUMA boundaries.
	 * Tasks that are "trapped" in such domains cannot be migrated
	 * elsewhere, so there is no point in (re)trying.
	 */
	if (unlikely(!sd)) {
		sched_setnuma(p, task_node(p));
		return -EINVAL;
	}

	/* Score the preferred node relative to where @p runs now. */
	env.dst_nid = p->numa_preferred_nid;
	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
	taskweight = task_weight(p, env.src_nid, dist);
	groupweight = group_weight(p, env.src_nid, dist);
	update_numa_stats(&env, &env.src_stats, env.src_nid, false);
	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
	update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);

	/* Try to find a spot on the preferred nid. */
	task_numa_find_cpu(&env, taskimp, groupimp);

	/*
	 * Look at other nodes in these cases:
	 * - there is no space available on the preferred_nid
	 * - the task is part of a numa_group that is interleaved across
	 *   multiple NUMA nodes; in order to better consolidate the group,
	 *   we need to check other locations.
	 */
	ng = deref_curr_numa_group(p);
	if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
		for_each_online_node(nid) {
			if (nid == env.src_nid || nid == p->numa_preferred_nid)
				continue;

			/*
			 * NOTE(review): the distance is computed against
			 * env.dst_nid (the node evaluated on the previous
			 * iteration), not against @nid. This matches the
			 * upstream code, but looks surprising -- verify
			 * against kernel history before "fixing" it.
			 */
			dist = node_distance(env.src_nid, env.dst_nid);
			if (sched_numa_topology_type == NUMA_BACKPLANE &&
			    dist != env.dist) {
				taskweight = task_weight(p, env.src_nid, dist);
				groupweight = group_weight(p, env.src_nid, dist);
			}

			/* Only consider nodes where both task and groups benefit */
			taskimp = task_weight(p, nid, dist) - taskweight;
			groupimp = group_weight(p, nid, dist) - groupweight;
			if (taskimp < 0 && groupimp < 0)
				continue;

			env.dist = dist;
			env.dst_nid = nid;
			update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
			task_numa_find_cpu(&env, taskimp, groupimp);
		}
	}

	/*
	 * If the task is part of a workload that spans multiple NUMA nodes,
	 * and is migrating into one of the workload's active nodes, remember
	 * this node as the task's preferred numa node, so the workload can
	 * settle down.
	 * A task that migrated to a second choice node will be better off
	 * trying for a better one later. Do not set the preferred node here.
	 */
	if (ng) {
		if (env.best_cpu == -1)
			nid = env.src_nid;
		else
			nid = cpu_to_node(env.best_cpu);

		if (nid != p->numa_preferred_nid)
			sched_setnuma(p, nid);
	}

	/* No better CPU than the current one was found. */
	if (env.best_cpu == -1) {
		trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
		return -EAGAIN;
	}

	best_rq = cpu_rq(env.best_cpu);
	if (env.best_task == NULL) {
		/* Plain move to an idle CPU. */
		ret = migrate_task_to(p, env.best_cpu);
		WRITE_ONCE(best_rq->numa_migrate_on, 0);
		if (ret != 0)
			trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
		return ret;
	}

	/* Swap @p with the chosen task on the destination CPU. */
	ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
	WRITE_ONCE(best_rq->numa_migrate_on, 0);

	if (ret != 0)
		trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
	put_task_struct(env.best_task);
	return ret;
}
/* Attempt to migrate a task to a CPU on the preferred node. */
static void numa_migrate_preferred(struct task_struct *p)
{
	unsigned long retry_interval;

	/* This task has no NUMA fault statistics yet */
	if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
		return;

	/*
	 * Schedule the next retry: a sixteenth of the scan period,
	 * capped at one second (HZ jiffies).
	 */
	retry_interval = msecs_to_jiffies(p->numa_scan_period) / 16;
	if (retry_interval > HZ)
		retry_interval = HZ;
	p->numa_migrate_retry = jiffies + retry_interval;

	/* Success if task is already running on preferred CPU */
	if (task_node(p) == p->numa_preferred_nid)
		return;

	/* Otherwise, try migrate to a CPU on the preferred node */
	task_numa_migrate(p);
}
/*
 * Find out how many nodes on the workload is actively running on. Do this by
 * tracking the nodes from which NUMA hinting faults are triggered. This can
 * be different from the set of nodes where the workload's memory is currently
 * located.
 */
static void numa_group_count_active_nodes(struct numa_group *numa_group)
{
	unsigned long node_faults, top_faults = 0;
	int nid, nr_active = 0;

	/* First pass: find the busiest node's CPU-fault count. */
	for_each_online_node(nid) {
		node_faults = group_faults_cpu(numa_group, nid);
		if (node_faults > top_faults)
			top_faults = node_faults;
	}

	/* Second pass: count nodes within 1/ACTIVE_NODE_FRACTION of the max. */
	for_each_online_node(nid) {
		node_faults = group_faults_cpu(numa_group, nid);
		if (node_faults * ACTIVE_NODE_FRACTION > top_faults)
			nr_active++;
	}

	numa_group->max_faults_cpu = top_faults;
	numa_group->active_nodes = nr_active;
}
/*
 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
 * increments. The more local the fault statistics are, the higher the scan
 * period will be for the next scan window. If local/(local+remote) ratio is
 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
 * the scan period will decrease. Aim for 70% local accesses.
 */
#define NUMA_PERIOD_SLOTS 10
#define NUMA_PERIOD_THRESHOLD 7

/*
 * Increase the scan period (slow down scanning) if the majority of
 * our memory is already on our local node, or if the majority of
 * the page accesses are shared with other processes.
 * Otherwise, decrease the scan period.
 */
static void update_task_scan_period(struct task_struct *p,
				    unsigned long shared, unsigned long private)
{
	unsigned int period_slot;
	int lr_ratio, ps_ratio;
	int diff;

	unsigned long remote = p->numa_faults_locality[0];
	unsigned long local = p->numa_faults_locality[1];

	/*
	 * If there were no record hinting faults then either the task is
	 * completely idle or all activity is areas that are not of interest
	 * to automatic numa balancing. Related to that, if there were failed
	 * migration then it implies we are migrating too quickly or the local
	 * node is overloaded. In either case, scan slower
	 */
	if (local + shared == 0 || p->numa_faults_locality[2]) {
		p->numa_scan_period = min(p->numa_scan_period_max,
			p->numa_scan_period << 1);

		p->mm->numa_next_scan = jiffies +
			msecs_to_jiffies(p->numa_scan_period);

		return;
	}

	/*
	 * Prepare to scale scan period relative to the current period.
	 *	 == NUMA_PERIOD_THRESHOLD scan period stays the same
	 *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
	 */
	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
	lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
	ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);

	if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
		/*
		 * Most memory accesses are local. There is no need to
		 * do fast NUMA scanning, since memory is already local.
		 */
		int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
		if (!slot)
			slot = 1;
		diff = slot * period_slot;
	} else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
		/*
		 * Most memory accesses are shared with other tasks.
		 * There is no point in continuing fast NUMA scanning,
		 * since other tasks may just move the memory elsewhere.
		 */
		int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
		if (!slot)
			slot = 1;
		diff = slot * period_slot;
	} else {
		/*
		 * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
		 * yet they are not on the local NUMA node. Speed up
		 * NUMA scanning to get the memory moved over.
		 */
		int ratio = max(lr_ratio, ps_ratio);
		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
	}

	/* Clamp the adjusted period to the task's min/max scan bounds. */
	p->numa_scan_period = clamp(p->numa_scan_period + diff,
			task_scan_min(p), task_scan_max(p));
	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
/*
 * Get the fraction of time the task has been running since the last
 * NUMA placement cycle. The scheduler keeps similar statistics, but
 * decays those on a 32ms period, which is orders of magnitude off
 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
 * stats only if the task is so new there are no NUMA statistics yet.
 *
 * Returns the runtime delta and stores the elapsed wall time in *@period.
 */
static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
{
	u64 runtime, delta, now;

	/* Use the start of this time slice to avoid calculations. */
	now = p->se.exec_start;
	runtime = p->se.sum_exec_runtime;

	if (p->last_task_numa_placement) {
		delta = runtime - p->last_sum_exec_runtime;
		*period = now - p->last_task_numa_placement;

		/* Avoid time going backwards, prevent potential divide error: */
		if (unlikely((s64)*period < 0))
			*period = 0;
	} else {
		/* First placement cycle: fall back to the PELT load stats. */
		delta = p->se.avg.load_sum;
		*period = LOAD_AVG_MAX;
	}

	/* Remember the snapshot for the next cycle's delta. */
	p->last_sum_exec_runtime = runtime;
	p->last_task_numa_placement = now;

	return delta;
}
/*
 * Determine the preferred nid for a task in a numa_group. This needs to
 * be done in a way that produces consistent results with group_weight,
 * otherwise workloads might not converge.
 */
static int preferred_group_nid(struct task_struct *p, int nid)
{
	nodemask_t nodes;
	int dist;

	/* Direct connections between all NUMA nodes. */
	if (sched_numa_topology_type == NUMA_DIRECT)
		return nid;

	/*
	 * On a system with glueless mesh NUMA topology, group_weight
	 * scores nodes according to the number of NUMA hinting faults on
	 * both the node itself, and on nearby nodes.
	 */
	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
		unsigned long score, max_score = 0;
		int node, max_node = nid;

		dist = sched_max_numa_distance;

		/* Pick the highest-scoring node at maximum distance. */
		for_each_online_node(node) {
			score = group_weight(p, node, dist);
			if (score > max_score) {
				max_score = score;
				max_node = node;
			}
		}
		return max_node;
	}

	/*
	 * Finding the preferred nid in a system with NUMA backplane
	 * interconnect topology is more involved. The goal is to locate
	 * tasks from numa_groups near each other in the system, and
	 * untangle workloads from different sides of the system. This requires
	 * searching down the hierarchy of node groups, recursively searching
	 * inside the highest scoring group of nodes. The nodemask tricks
	 * keep the complexity of the search down.
	 */
	nodes = node_online_map;
	for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
		unsigned long max_faults = 0;
		nodemask_t max_group = NODE_MASK_NONE;
		int a, b;

		/* Are there nodes at this distance from each other? */
		if (!find_numa_distance(dist))
			continue;

		/* Partition the remaining nodes into distance-<dist groups. */
		for_each_node_mask(a, nodes) {
			unsigned long faults = 0;
			nodemask_t this_group;
			nodes_clear(this_group);

			/* Sum group's NUMA faults; includes a==b case. */
			for_each_node_mask(b, nodes) {
				if (node_distance(a, b) < dist) {
					faults += group_faults(p, b);
					node_set(b, this_group);
					node_clear(b, nodes);
				}
			}

			/* Remember the top group. */
			if (faults > max_faults) {
				max_faults = faults;
				max_group = this_group;
				/*
				 * subtle: at the smallest distance there is
				 * just one node left in each "group", the
				 * winner is the preferred nid.
				 */
				nid = a;
			}
		}
		/* Next round, evaluate the nodes within max_group. */
		if (!max_faults)
			break;
		nodes = max_group;
	}
	return nid;
}
/*
 * Fold the hinting faults recorded during the scan window that just
 * completed into the task's (and its numa_group's) decayed per-node
 * counters, then make the node with the most faults the task's new
 * preferred node and rescale the scan period.
 */
static void task_numa_placement(struct task_struct *p)
{
	int seq, nid, max_nid = NUMA_NO_NODE;
	unsigned long max_faults = 0;
	unsigned long fault_types[2] = { 0, 0 };
	unsigned long total_faults;
	u64 runtime, period;
	spinlock_t *group_lock = NULL;
	struct numa_group *ng;

	/*
	 * The p->mm->numa_scan_seq field gets updated without
	 * exclusive access. Use READ_ONCE() here to ensure
	 * that the field is read in a single access:
	 */
	seq = READ_ONCE(p->mm->numa_scan_seq);
	/* Only run once per completed scan window. */
	if (p->numa_scan_seq == seq)
		return;
	p->numa_scan_seq = seq;
	p->numa_scan_period_max = task_scan_max(p);

	total_faults = p->numa_faults_locality[0] +
		       p->numa_faults_locality[1];
	runtime = numa_get_avg_runtime(p, &period);

	/* If the task is part of a group prevent parallel updates to group stats */
	ng = deref_curr_numa_group(p);
	if (ng) {
		group_lock = &ng->lock;
		spin_lock_irq(group_lock);
	}

	/* Find the node with the highest number of faults */
	for_each_online_node(nid) {
		/* Keep track of the offsets in numa_faults array */
		int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
		unsigned long faults = 0, group_faults = 0;
		int priv;

		for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
			long diff, f_diff, f_weight;

			mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
			membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
			cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
			cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);

			/* Decay existing window, copy faults since last scan */
			diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
			fault_types[priv] += p->numa_faults[membuf_idx];
			p->numa_faults[membuf_idx] = 0;

			/*
			 * Normalize the faults_from, so all tasks in a group
			 * count according to CPU use, instead of by the raw
			 * number of faults. Tasks with little runtime have
			 * little over-all impact on throughput, and thus their
			 * faults are less important.
			 */
			f_weight = div64_u64(runtime << 16, period + 1);
			f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
				   (total_faults + 1);
			f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
			p->numa_faults[cpubuf_idx] = 0;

			p->numa_faults[mem_idx] += diff;
			p->numa_faults[cpu_idx] += f_diff;
			faults += p->numa_faults[mem_idx];
			p->total_numa_faults += diff;
			if (ng) {
				/*
				 * safe because we can only change our own group
				 *
				 * mem_idx represents the offset for a given
				 * nid and priv in a specific region because it
				 * is at the beginning of the numa_faults array.
				 */
				ng->faults[mem_idx] += diff;
				ng->faults_cpu[mem_idx] += f_diff;
				ng->total_faults += diff;
				group_faults += ng->faults[mem_idx];
			}
		}

		/* Track the node with the most (task or group) faults. */
		if (!ng) {
			if (faults > max_faults) {
				max_faults = faults;
				max_nid = nid;
			}
		} else if (group_faults > max_faults) {
			max_faults = group_faults;
			max_nid = nid;
		}
	}

	if (ng) {
		numa_group_count_active_nodes(ng);
		spin_unlock_irq(group_lock);
		max_nid = preferred_group_nid(p, max_nid);
	}

	if (max_faults) {
		/* Set the new preferred node */
		if (max_nid != p->numa_preferred_nid)
			sched_setnuma(p, max_nid);
	}

	update_task_scan_period(p, fault_types[0], fault_types[1]);
}
/* Take a reference on @grp unless its refcount already dropped to zero. */
static inline int get_numa_group(struct numa_group *grp)
{
	return refcount_inc_not_zero(&grp->refcount);
}

/* Drop a reference; the group is freed via RCU once the last one is gone. */
static inline void put_numa_group(struct numa_group *grp)
{
	if (refcount_dec_and_test(&grp->refcount))
		kfree_rcu(grp, rcu);
}
/*
 * A hinting fault on a page last touched by another task (identified by
 * @cpupid) suggests the two tasks share memory. Create @p's numa_group on
 * first use, and join the peer's group when it is the larger of the two,
 * folding @p's fault statistics into it. *@priv is set to !join so the
 * caller can treat a joined fault as shared rather than private.
 */
static void task_numa_group(struct task_struct *p, int cpupid, int flags,
			    int *priv)
{
	struct numa_group *grp, *my_grp;
	struct task_struct *tsk;
	bool join = false;
	int cpu = cpupid_to_cpu(cpupid);
	int i;

	/* Lazily allocate @p's own group on the first grouped fault. */
	if (unlikely(!deref_curr_numa_group(p))) {
		unsigned int size = sizeof(struct numa_group) +
				    4*nr_node_ids*sizeof(unsigned long);

		grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
		if (!grp)
			return;

		refcount_set(&grp->refcount, 1);
		grp->active_nodes = 1;
		grp->max_faults_cpu = 0;
		spin_lock_init(&grp->lock);
		grp->gid = p->pid;
		/* Second half of the array tracks nids where faults happen */
		grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
						nr_node_ids;

		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
			grp->faults[i] = p->numa_faults[i];

		grp->total_faults = p->total_numa_faults;

		grp->nr_tasks++;
		rcu_assign_pointer(p->numa_group, grp);
	}

	rcu_read_lock();
	tsk = READ_ONCE(cpu_rq(cpu)->curr);

	/* The peer must still be the task the cpupid cookie refers to. */
	if (!cpupid_match_pid(tsk, cpupid))
		goto no_join;

	grp = rcu_dereference(tsk->numa_group);
	if (!grp)
		goto no_join;

	my_grp = deref_curr_numa_group(p);
	if (grp == my_grp)
		goto no_join;

	/*
	 * Only join the other group if its bigger; if we're the bigger group,
	 * the other task will join us.
	 */
	if (my_grp->nr_tasks > grp->nr_tasks)
		goto no_join;

	/*
	 * Tie-break on the grp address.
	 */
	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
		goto no_join;

	/* Always join threads in the same process. */
	if (tsk->mm == current->mm)
		join = true;

	/* Simple filter to avoid false positives due to PID collisions */
	if (flags & TNF_SHARED)
		join = true;

	/* Update priv based on whether false sharing was detected */
	*priv = !join;

	if (join && !get_numa_group(grp))
		goto no_join;

	rcu_read_unlock();

	if (!join)
		return;

	/* Move @p's fault statistics from its old group into the new one. */
	BUG_ON(irqs_disabled());
	double_lock_irq(&my_grp->lock, &grp->lock);

	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
		my_grp->faults[i] -= p->numa_faults[i];
		grp->faults[i] += p->numa_faults[i];
	}
	my_grp->total_faults -= p->total_numa_faults;
	grp->total_faults += p->total_numa_faults;

	my_grp->nr_tasks--;
	grp->nr_tasks++;

	spin_unlock(&my_grp->lock);
	spin_unlock_irq(&grp->lock);

	rcu_assign_pointer(p->numa_group, grp);

	put_numa_group(my_grp);
	return;

no_join:
	rcu_read_unlock();
	return;
}
/*
 * Get rid of NUMA staticstics associated with a task (either current or dead).
 * If @final is set, the task is dead and has reached refcount zero, so we can
 * safely free all relevant data structures. Otherwise, there might be
 * concurrent reads from places like load balancing and procfs, and we should
 * reset the data back to default state without freeing ->numa_faults.
 */
void task_numa_free(struct task_struct *p, bool final)
{
	/* safe: p either is current or is being freed by current */
	struct numa_group *grp = rcu_dereference_raw(p->numa_group);
	unsigned long *numa_faults = p->numa_faults;
	unsigned long flags;
	int i;

	if (!numa_faults)
		return;

	if (grp) {
		/* Remove @p's contribution from the group and detach from it. */
		spin_lock_irqsave(&grp->lock, flags);
		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
			grp->faults[i] -= p->numa_faults[i];
		grp->total_faults -= p->total_numa_faults;

		grp->nr_tasks--;
		spin_unlock_irqrestore(&grp->lock, flags);
		RCU_INIT_POINTER(p->numa_group, NULL);
		put_numa_group(grp);
	}

	if (final) {
		/* Task is fully dead: the buffer can really be freed. */
		p->numa_faults = NULL;
		kfree(numa_faults);
	} else {
		/* Keep the buffer alive for concurrent readers; just zero it. */
		p->total_numa_faults = 0;
		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
			numa_faults[i] = 0;
	}
}
/*
 * Got a PROT_NONE fault for a page on @node.
 *
 * @last_cpupid: cpu/pid cookie of the page's previous accessor
 * @mem_node:    node the faulting page currently resides on
 * @pages:       number of base pages covered by the fault
 * @flags:       TNF_* flags describing the fault
 */
void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
{
	struct task_struct *p = current;
	bool migrated = flags & TNF_MIGRATED;
	int cpu_node = task_node(current);
	int local = !!(flags & TNF_FAULT_LOCAL);
	struct numa_group *ng;
	int priv;

	if (!static_branch_likely(&sched_numa_balancing))
		return;

	/* for example, ksmd faulting in a user's mm */
	if (!p->mm)
		return;

	/* Allocate buffer to track faults on a per-node basis */
	if (unlikely(!p->numa_faults)) {
		int size = sizeof(*p->numa_faults) *
			   NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;

		p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
		if (!p->numa_faults)
			return;

		p->total_numa_faults = 0;
		memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
	}

	/*
	 * First accesses are treated as private, otherwise consider accesses
	 * to be private if the accessing pid has not changed
	 */
	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
		priv = 1;
	} else {
		priv = cpupid_match_pid(p, last_cpupid);
		if (!priv && !(flags & TNF_NO_GROUP))
			task_numa_group(p, last_cpupid, flags, &priv);
	}

	/*
	 * If a workload spans multiple NUMA nodes, a shared fault that
	 * occurs wholly within the set of nodes that the workload is
	 * actively using should be counted as local. This allows the
	 * scan rate to slow down when a workload has settled down.
	 */
	ng = deref_curr_numa_group(p);
	if (!priv && !local && ng && ng->active_nodes > 1 &&
	    numa_is_active_node(cpu_node, ng) &&
	    numa_is_active_node(mem_node, ng))
		local = 1;

	/*
	 * Retry to migrate task to preferred node periodically, in case it
	 * previously failed, or the scheduler moved us.
	 */
	if (time_after(jiffies, p->numa_migrate_retry)) {
		task_numa_placement(p);
		numa_migrate_preferred(p);
	}

	if (migrated)
		p->numa_pages_migrated += pages;
	if (flags & TNF_MIGRATE_FAIL)
		p->numa_faults_locality[2] += pages;

	/* Record the fault in the per-window buffers for the next placement. */
	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
	p->numa_faults_locality[local] += pages;
}
/* Restart the mm's NUMA pte scan from the start of the address space. */
static void reset_ptenuma_scan(struct task_struct *p)
{
	/*
	 * We only did a read acquisition of the mmap sem, so
	 * p->mm->numa_scan_seq is written to without exclusive access
	 * and the update is not guaranteed to be atomic. That's not
	 * much of an issue though, since this is just used for
	 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
	 * expensive, to avoid any form of compiler optimizations:
	 */
	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
	p->mm->numa_scan_offset = 0;
}
/*
* The expensive part of numa migration is done from task_work context.
* Triggered from task_tick_numa().
*/
static void task_numa_work(struct callback_head *work)
{
	unsigned long migrate, next_scan, now = jiffies;
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	u64 runtime = p->se.sum_exec_runtime;
	struct vm_area_struct *vma;
	unsigned long start, end;
	unsigned long nr_pte_updates = 0;
	long pages, virtpages;

	SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));

	/* Mark the callback idle (next == self) so it can be queued again. */
	work->next = work;
	/*
	 * Who cares about NUMA placement when they're dying.
	 *
	 * NOTE: make sure not to dereference p->mm before this check,
	 * exit_task_work() happens _after_ exit_mm() so we could be called
	 * without p->mm even though we still had it when we enqueued this
	 * work.
	 */
	if (p->flags & PF_EXITING)
		return;

	if (!mm->numa_next_scan) {
		mm->numa_next_scan = now +
			msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
	}

	/*
	 * Enforce maximal scan/migration frequency..
	 */
	migrate = mm->numa_next_scan;
	if (time_before(now, migrate))
		return;

	if (p->numa_scan_period == 0) {
		p->numa_scan_period_max = task_scan_max(p);
		p->numa_scan_period = task_scan_start(p);
	}

	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
	/* Only one thread of this mm wins the right to scan this period. */
	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
		return;

	/*
	 * Delay this task enough that another task of this mm will likely win
	 * the next time around.
	 */
	p->node_stamp += 2 * TICK_NSEC;

	start = mm->numa_scan_offset;
	pages = sysctl_numa_balancing_scan_size;
	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
	virtpages = pages * 8;	   /* Scan up to this much virtual space */
	if (!pages)
		return;

	/* Best effort: do not block on mmap_lock contention. */
	if (!mmap_read_trylock(mm))
		return;
	vma = find_vma(mm, start);
	if (!vma) {
		reset_ptenuma_scan(p);
		start = 0;
		vma = mm->mmap;
	}
	for (; vma; vma = vma->vm_next) {
		if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
			is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
			continue;
		}

		/*
		 * Shared library pages mapped by multiple processes are not
		 * migrated as it is expected they are cache replicated. Avoid
		 * hinting faults in read-only file-backed mappings or the vdso
		 * as migrating the pages will be of marginal benefit.
		 */
		if (!vma->vm_mm ||
		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
			continue;

		/*
		 * Skip inaccessible VMAs to avoid any confusion between
		 * PROT_NONE and NUMA hinting ptes
		 */
		if (!vma_is_accessible(vma))
			continue;

		do {
			start = max(start, vma->vm_start);
			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
			end = min(end, vma->vm_end);
			nr_pte_updates = change_prot_numa(vma, start, end);

			/*
			 * Try to scan sysctl_numa_balancing_size worth of
			 * hpages that have at least one present PTE that
			 * is not already pte-numa. If the VMA contains
			 * areas that are unused or already full of prot_numa
			 * PTEs, scan up to virtpages, to skip through those
			 * areas faster.
			 */
			if (nr_pte_updates)
				pages -= (end - start) >> PAGE_SHIFT;
			virtpages -= (end - start) >> PAGE_SHIFT;

			start = end;
			if (pages <= 0 || virtpages <= 0)
				goto out;

			cond_resched();
		} while (end != vma->vm_end);
	}

out:
	/*
	 * It is possible to reach the end of the VMA list but the last few
	 * VMAs are not guaranteed to the vma_migratable. If they are not, we
	 * would find the !migratable VMA on the next scan but not reset the
	 * scanner to the start so check it now.
	 */
	if (vma)
		mm->numa_scan_offset = start;
	else
		reset_ptenuma_scan(p);
	mmap_read_unlock(mm);

	/*
	 * Make sure tasks use at least 32x as much time to run other code
	 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
	 * Usually update_task_scan_period slows down scanning enough; on an
	 * overloaded system we need to limit overhead on a per task basis.
	 */
	if (unlikely(p->se.sum_exec_runtime != runtime)) {
		u64 diff = p->se.sum_exec_runtime - runtime;
		p->node_stamp += 32 * diff;
	}
}
/*
 * Initialize the NUMA-balancing state of a freshly forked task @p.
 * @clone_flags tells us whether the child shares its parent's address
 * space (CLONE_VM) or starts with a fresh one.
 */
void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
{
	int mm_users = 0;
	struct mm_struct *mm = p->mm;

	if (mm) {
		mm_users = atomic_read(&mm->mm_users);
		if (mm_users == 1) {
			/* First user of this mm: restart the per-mm scan clock. */
			mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
			mm->numa_scan_seq = 0;
		}
	}
	p->node_stamp = 0;
	p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
	/* Protect against double add, see task_tick_numa and task_numa_work */
	p->numa_work.next = &p->numa_work;
	p->numa_faults = NULL;
	RCU_INIT_POINTER(p->numa_group, NULL);
	p->last_task_numa_placement = 0;
	p->last_sum_exec_runtime = 0;

	init_task_work(&p->numa_work, task_numa_work);

	/* New address space, reset the preferred nid */
	if (!(clone_flags & CLONE_VM)) {
		p->numa_preferred_nid = NUMA_NO_NODE;
		return;
	}

	/*
	 * New thread, keep existing numa_preferred_nid which should be copied
	 * already by arch_dup_task_struct but stagger when scans start.
	 */
	if (mm) {
		unsigned int delay;

		delay = min_t(unsigned int, task_scan_max(current),
			current->numa_scan_period * mm_users * NSEC_PER_MSEC);
		delay += 2 * TICK_NSEC;
		p->node_stamp = delay;
	}
}
/*
* Drive the periodic memory faults..
*/
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
	struct callback_head *work = &curr->numa_work;
	u64 period, now;

	/*
	 * We don't care about NUMA placement if we don't have memory.
	 * work->next != work means the callback is already queued.
	 */
	if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
		return;

	/*
	 * Using runtime rather than walltime has the dual advantage that
	 * we (mostly) drive the selection from busy threads and that the
	 * task needs to have done some actual work before we bother with
	 * NUMA placement.
	 */
	now = curr->se.sum_exec_runtime;
	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;

	if (now > curr->node_stamp + period) {
		if (!curr->node_stamp)
			curr->numa_scan_period = task_scan_start(curr);
		curr->node_stamp += period;

		/* Queue task_numa_work() to run on return to userspace. */
		if (!time_before(jiffies, curr->mm->numa_next_scan))
			task_work_add(curr, work, TWA_RESUME);
	}
}
/*
 * Called when @p is about to migrate to @new_cpu: restart the NUMA
 * hinting-fault scan period if the task moved across nodes early in
 * its life (before a full scan completed).
 */
static void update_scan_period(struct task_struct *p, int new_cpu)
{
	int src_nid = cpu_to_node(task_cpu(p));
	int dst_nid = cpu_to_node(new_cpu);

	if (!static_branch_likely(&sched_numa_balancing))
		return;

	/* No address space or no recorded faults: nothing to adjust. */
	if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
		return;

	/* Intra-node moves do not change NUMA placement. */
	if (src_nid == dst_nid)
		return;

	/*
	 * Allow resets if faults have been trapped before one scan
	 * has completed. This is most likely due to a new task that
	 * is pulled cross-node due to wakeups or load balancing.
	 */
	if (p->numa_scan_seq) {
		/*
		 * Avoid scan adjustments if moving to the preferred
		 * node or if the task was not previously running on
		 * the preferred node.
		 */
		if (dst_nid == p->numa_preferred_nid ||
		    (p->numa_preferred_nid != NUMA_NO_NODE &&
			src_nid != p->numa_preferred_nid))
			return;
	}

	p->numa_scan_period = task_scan_start(p);
}
#else
/*
 * !CONFIG_NUMA_BALANCING: empty stubs so callers need no #ifdefs.
 */
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}

static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
}

static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
}

static inline void update_scan_period(struct task_struct *p, int new_cpu)
{
}

#endif /* CONFIG_NUMA_BALANCING */
/* Account @se's weight (and task presence) on enqueue to @cfs_rq. */
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_add(&cfs_rq->load, se->load.weight);
#ifdef CONFIG_SMP
	if (entity_is_task(se)) {
		struct rq *rq = rq_of(cfs_rq);

		account_numa_enqueue(rq, task_of(se));
		list_add(&se->group_node, &rq->cfs_tasks);
	}
#endif
	cfs_rq->nr_running++;
}

/* Undo account_entity_enqueue() when @se leaves @cfs_rq. */
static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_sub(&cfs_rq->load, se->load.weight);
#ifdef CONFIG_SMP
	if (entity_is_task(se)) {
		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
		list_del_init(&se->group_node);
	}
#endif
	cfs_rq->nr_running--;
}
/*
 * Signed add and clamp on underflow.
 *
 * Explicitly do a load-store to ensure the intermediate value never hits
 * memory. This allows lockless observations without ever seeing the negative
 * values.
 *
 * Underflow is detected by the sum moving the wrong way (res > var while
 * adding a negative val), in which case the result is clamped to 0.
 */
#define add_positive(_ptr, _val) do {				\
	typeof(_ptr) ptr = (_ptr);				\
	typeof(_val) val = (_val);				\
	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
								\
	res = var + val;					\
								\
	if (val < 0 && res > var)				\
		res = 0;					\
								\
	WRITE_ONCE(*ptr, res);					\
} while (0)

/*
 * Unsigned subtract and clamp on underflow.
 *
 * Explicitly do a load-store to ensure the intermediate value never hits
 * memory. This allows lockless observations without ever seeing the negative
 * values.
 *
 * Note that _val is first converted to typeof(*_ptr); wrap-around of the
 * unsigned subtraction (res > var) signals underflow and clamps to 0.
 */
#define sub_positive(_ptr, _val) do {				\
	typeof(_ptr) ptr = (_ptr);				\
	typeof(*ptr) val = (_val);				\
	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
	res = var - val;					\
	if (res > var)						\
		res = 0;					\
	WRITE_ONCE(*ptr, res);					\
} while (0)

/*
 * Remove and clamp on negative, from a local variable.
 *
 * A variant of sub_positive(), which does not use explicit load-store
 * and is thus optimized for local variable updates.
 */
#define lsub_positive(_ptr, _val) do {				\
	typeof(_ptr) ptr = (_ptr);				\
	*ptr -= min_t(typeof(*ptr), *ptr, _val);		\
} while (0)
#ifdef CONFIG_SMP
/* Fold @se's load-average contribution into @cfs_rq's PELT sums. */
static inline void
enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	cfs_rq->avg.load_avg += se->avg.load_avg;
	cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
}

/* Remove @se's load-average contribution, clamping at zero on underflow. */
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
	sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
}
#else
static inline void
enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
#endif
/*
 * Change @se's weight to @weight while keeping the cfs_rq load and the
 * PELT load average consistent across the transition.
 */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
			    unsigned long weight)
{
	if (se->on_rq) {
		/* commit outstanding execution time */
		if (cfs_rq->curr == se)
			update_curr(cfs_rq);
		update_load_sub(&cfs_rq->load, se->load.weight);
	}
	dequeue_load_avg(cfs_rq, se);

	update_load_set(&se->load, weight);

#ifdef CONFIG_SMP
	do {
		u32 divider = get_pelt_divider(&se->avg);

		/* Recompute load_avg from load_sum under the new weight. */
		se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
	} while (0);
#endif

	enqueue_load_avg(cfs_rq, se);
	if (se->on_rq)
		update_load_add(&cfs_rq->load, se->load.weight);
}
/* Re-weight @p's sched entity after a priority change to @prio. */
void reweight_task(struct task_struct *p, int prio)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	struct load_weight *load = &se->load;
	unsigned long weight = scale_load(sched_prio_to_weight[prio]);

	reweight_entity(cfs_rq, se, weight);
	/* Keep the cached inverse weight in sync with the new weight. */
	load->inv_weight = sched_prio_to_wmult[prio];
}
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
/*
* All this does is approximate the hierarchical proportion which includes that
* global sum we all love to hate.
*
* That is, the weight of a group entity, is the proportional share of the
* group weight based on the group runqueue weights. That is:
*
* tg->weight * grq->load.weight
* ge->load.weight = ----------------------------- (1)
* \Sum grq->load.weight
*
* Now, because computing that sum is prohibitively expensive to compute (been
* there, done that) we approximate it with this average stuff. The average
* moves slower and therefore the approximation is cheaper and more stable.
*
* So instead of the above, we substitute:
*
* grq->load.weight -> grq->avg.load_avg (2)
*
* which yields the following:
*
* tg->weight * grq->avg.load_avg
* ge->load.weight = ------------------------------ (3)
* tg->load_avg
*
* Where: tg->load_avg ~= \Sum grq->avg.load_avg
*
* That is shares_avg, and it is right (given the approximation (2)).
*
* The problem with it is that because the average is slow -- it was designed
* to be exactly that of course -- this leads to transients in boundary
* conditions. In specific, the case where the group was idle and we start the
* one task. It takes time for our CPU's grq->avg.load_avg to build up,
* yielding bad latency etc..
*
* Now, in that special case (1) reduces to:
*
* tg->weight * grq->load.weight
* ge->load.weight = ----------------------------- = tg->weight (4)
* grp->load.weight
*
* That is, the sum collapses because all other CPUs are idle; the UP scenario.
*
* So what we do is modify our approximation (3) to approach (4) in the (near)
* UP case, like:
*
* ge->load.weight =
*
* tg->weight * grq->load.weight
* --------------------------------------------------- (5)
* tg->load_avg - grq->avg.load_avg + grq->load.weight
*
* But because grq->load.weight can drop to 0, resulting in a divide by zero,
* we need to use grq->avg.load_avg as its lower bound, which then gives:
*
*
* tg->weight * grq->load.weight
* ge->load.weight = ----------------------------- (6)
* tg_load_avg'
*
* Where:
*
* tg_load_avg' = tg->load_avg - grq->avg.load_avg +
* max(grq->load.weight, grq->avg.load_avg)
*
* And that is shares_weight and is icky. In the (near) UP case it approaches
* (4) while in the normal case it approaches (3). It consistently
* overestimates the ge->load.weight and therefore:
*
* \Sum ge->load.weight >= tg->weight
*
* hence icky!
*/
/*
 * Compute the weight of @cfs_rq's group entity per approximation (6)
 * in the comment above: tg->shares scaled by this runqueue's share of
 * the (adjusted) group-wide load.
 */
static long calc_group_shares(struct cfs_rq *cfs_rq)
{
	long eff_tg_weight, tg_shares, grq_load, shares;
	struct task_group *tg = cfs_rq->tg;

	tg_shares = READ_ONCE(tg->shares);

	/* Lower-bound the runqueue weight by its load average, see (6). */
	grq_load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);

	eff_tg_weight = atomic_long_read(&tg->load_avg);

	/* Swap this cfs_rq's stale contribution for the fresh value. */
	eff_tg_weight -= cfs_rq->tg_load_avg_contrib;
	eff_tg_weight += grq_load;

	shares = (tg_shares * grq_load);
	if (eff_tg_weight)
		shares /= eff_tg_weight;

	/*
	 * MIN_SHARES has to be unscaled here to support per-CPU partitioning
	 * of a group with small tg->shares value. It is a floor value which is
	 * assigned as a minimum load.weight to the sched_entity representing
	 * the group on a CPU.
	 *
	 * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
	 * on an 8-core system with 8 tasks each runnable on one CPU shares has
	 * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
	 * case no task is runnable on a CPU MIN_SHARES=2 should be returned
	 * instead of 0.
	 */
	return clamp_t(long, shares, MIN_SHARES, tg_shares);
}
#endif /* CONFIG_SMP */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
/*
* Recomputes the group entity based on the current state of its group
* runqueue.
*/
static void update_cfs_group(struct sched_entity *se)
{
	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
	long shares;

	/* Only group entities own a group runqueue; tasks have none. */
	if (!gcfs_rq)
		return;

	if (throttled_hierarchy(gcfs_rq))
		return;

#ifndef CONFIG_SMP
	shares = READ_ONCE(gcfs_rq->tg->shares);

	/* On UP the group weight is simply tg->shares; skip if unchanged. */
	if (likely(se->load.weight == shares))
		return;
#else
	shares = calc_group_shares(gcfs_rq);
#endif

	reweight_entity(cfs_rq_of(se), se, shares);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
/* Without group scheduling there is no group weight to recompute. */
static inline void update_cfs_group(struct sched_entity *se)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
{
	struct rq *rq = rq_of(cfs_rq);

	/* Only the root cfs_rq drives cpufreq utilization updates. */
	if (&rq->cfs != cfs_rq)
		return;

	/*
	 * There are a few boundary cases this might miss but it should
	 * get called often enough that that should (hopefully) not be
	 * a real problem.
	 *
	 * It will not get called when we go idle, because the idle
	 * thread is a different class (!fair), nor will the utilization
	 * number include things like RT tasks.
	 *
	 * As is, the util number is not freq-invariant (we'd have to
	 * implement arch_scale_freq_capacity() for that).
	 *
	 * See cpu_util().
	 */
	cpufreq_update_util(rq, flags);
}
#ifdef CONFIG_SMP
#ifdef CONFIG_FAIR_GROUP_SCHED
/**
* update_tg_load_avg - update the tg's load avg
* @cfs_rq: the cfs_rq whose avg changed
*
* This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
* However, because tg->load_avg is a global value there are performance
* considerations.
*
* In order to avoid having to look at the other cfs_rq's, we use a
* differential update where we store the last value we propagated. This in
* turn allows skipping updates if the differential is 'small'.
*
* Updating tg's load_avg is necessary before update_cfs_share().
*/
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
	long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;

	/*
	 * No need to update load_avg for root_task_group as it is not used.
	 */
	if (cfs_rq->tg == &root_task_group)
		return;

	/* Only propagate when the change exceeds ~1.5% of the last value. */
	if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
		atomic_long_add(delta, &cfs_rq->tg->load_avg);
		cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
	}
}
/*
* Called within set_task_rq() right before setting a task's CPU. The
* caller only guarantees p->pi_lock is held; no other assumptions,
* including the state of rq->lock, should be made.
*/
void set_task_rq_fair(struct sched_entity *se,
		      struct cfs_rq *prev, struct cfs_rq *next)
{
	u64 p_last_update_time;
	u64 n_last_update_time;

	if (!sched_feat(ATTACH_AGE_LOAD))
		return;

	/*
	 * We are supposed to update the task to "current" time, then its up to
	 * date and ready to go to new CPU/cfs_rq. But we have difficulty in
	 * getting what current time is, so simply throw away the out-of-date
	 * time. This will result in the wakee task is less decayed, but giving
	 * the wakee more load sounds not bad.
	 */
	if (!(se->avg.last_update_time && prev))
		return;

#ifndef CONFIG_64BIT
	{
		u64 p_last_update_time_copy;
		u64 n_last_update_time_copy;

		/*
		 * On 32-bit the u64 last_update_time cannot be read
		 * atomically: retry until value and copy agree (pairs with
		 * the smp_wmb() in update_cfs_rq_load_avg()).
		 */
		do {
			p_last_update_time_copy = prev->load_last_update_time_copy;
			n_last_update_time_copy = next->load_last_update_time_copy;

			smp_rmb();

			p_last_update_time = prev->avg.last_update_time;
			n_last_update_time = next->avg.last_update_time;

		} while (p_last_update_time != p_last_update_time_copy ||
			 n_last_update_time != n_last_update_time_copy);
	}
#else
	p_last_update_time = prev->avg.last_update_time;
	n_last_update_time = next->avg.last_update_time;
#endif
	/* Age the entity's load to prev's clock, then adopt next's clock. */
	__update_load_avg_blocked_se(p_last_update_time, se);
	se->avg.last_update_time = n_last_update_time;
}
/*
* When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
* propagate its contribution. The key to this propagation is the invariant
* that for each group:
*
* ge->avg == grq->avg (1)
*
* _IFF_ we look at the pure running and runnable sums. Because they
* represent the very same entity, just at different points in the hierarchy.
*
* Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial
* and simply copies the running/runnable sum over (but still wrong, because
* the group entity and group rq do not have their PELT windows aligned).
*
* However, update_tg_cfs_load() is more complex. So we have:
*
* ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
*
* And since, like util, the runnable part should be directly transferable,
* the following would _appear_ to be the straight forward approach:
*
* grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3)
*
* And per (1) we have:
*
* ge->avg.runnable_avg == grq->avg.runnable_avg
*
* Which gives:
*
* ge->load.weight * grq->avg.load_avg
* ge->avg.load_avg = ----------------------------------- (4)
* grq->load.weight
*
* Except that is wrong!
*
* Because while for entities historical weight is not important and we
* really only care about our future and therefore can consider a pure
* runnable sum, runqueues can NOT do this.
*
* We specifically want runqueues to have a load_avg that includes
* historical weights. Those represent the blocked load, the load we expect
* to (shortly) return to us. This only works by keeping the weights as
* integral part of the sum. We therefore cannot decompose as per (3).
*
* Another reason this doesn't work is that runnable isn't a 0-sum entity.
* Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
* rq itself is runnable anywhere between 2/3 and 1 depending on how the
* runnable section of these tasks overlap (or not). If they were to perfectly
* align the rq as a whole would be runnable 2/3 of the time. If however we
* always have at least 1 runnable task, the rq as a whole is always runnable.
*
* So we'll have to approximate.. :/
*
* Given the constraint:
*
* ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
*
* We can construct a rule that adds runnable to a rq by assuming minimal
* overlap.
*
* On removal, we'll assume each task is equally runnable; which yields:
*
* grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
*
* XXX: only do this for the part of runnable > running ?
*
*/
/* Copy the group runqueue's utilization to its group entity and parent. */
static inline void
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
	long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
	u32 divider;

	/* Nothing to update */
	if (!delta)
		return;

	/*
	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
	 * See ___update_load_avg() for details.
	 */
	divider = get_pelt_divider(&cfs_rq->avg);

	/* Set new sched_entity's utilization */
	se->avg.util_avg = gcfs_rq->avg.util_avg;
	se->avg.util_sum = se->avg.util_avg * divider;

	/* Update parent cfs_rq utilization */
	add_positive(&cfs_rq->avg.util_avg, delta);
	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
}
/* Copy the group runqueue's runnable average to its group entity and parent. */
static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
	long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
	u32 divider;

	/* Nothing to update */
	if (!delta)
		return;

	/*
	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
	 * See ___update_load_avg() for details.
	 */
	divider = get_pelt_divider(&cfs_rq->avg);

	/* Set new sched_entity's runnable */
	se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
	se->avg.runnable_sum = se->avg.runnable_avg * divider;

	/* Update parent cfs_rq runnable */
	add_positive(&cfs_rq->avg.runnable_avg, delta);
	cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
}
/*
 * Propagate a change in the group runqueue's weighted load to its group
 * entity and the parent cfs_rq. This is an approximation; see the long
 * comment above for why an exact transfer is not possible.
 */
static inline void
update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
	long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
	unsigned long load_avg;
	u64 load_sum = 0;
	u32 divider;

	if (!runnable_sum)
		return;

	/* Consume the pending propagation. */
	gcfs_rq->prop_runnable_sum = 0;

	/*
	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
	 * See ___update_load_avg() for details.
	 */
	divider = get_pelt_divider(&cfs_rq->avg);

	if (runnable_sum >= 0) {
		/*
		 * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
		 * the CPU is saturated running == runnable.
		 */
		runnable_sum += se->avg.load_sum;
		runnable_sum = min_t(long, runnable_sum, divider);
	} else {
		/*
		 * Estimate the new unweighted runnable_sum of the gcfs_rq by
		 * assuming all tasks are equally runnable.
		 */
		if (scale_load_down(gcfs_rq->load.weight)) {
			load_sum = div_s64(gcfs_rq->avg.load_sum,
				scale_load_down(gcfs_rq->load.weight));
		}

		/* But make sure to not inflate se's runnable */
		runnable_sum = min(se->avg.load_sum, load_sum);
	}

	/*
	 * runnable_sum can't be lower than running_sum
	 * Rescale running sum to be in the same range as runnable sum
	 * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT]
	 * runnable_sum is in [0 : LOAD_AVG_MAX]
	 */
	running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
	runnable_sum = max(runnable_sum, running_sum);

	load_sum = (s64)se_weight(se) * runnable_sum;
	load_avg = div_s64(load_sum, divider);

	delta = load_avg - se->avg.load_avg;

	se->avg.load_sum = runnable_sum;
	se->avg.load_avg = load_avg;

	add_positive(&cfs_rq->avg.load_avg, delta);
	cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
}
/* Record @runnable_sum on @cfs_rq for later propagation up the hierarchy. */
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
{
	cfs_rq->propagate = 1;
	cfs_rq->prop_runnable_sum += runnable_sum;
}
/* Update task and its cfs_rq load average */
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq, *gcfs_rq;

	/* Tasks have no children; nothing to propagate. */
	if (entity_is_task(se))
		return 0;

	gcfs_rq = group_cfs_rq(se);
	if (!gcfs_rq->propagate)
		return 0;

	gcfs_rq->propagate = 0;

	cfs_rq = cfs_rq_of(se);

	/* Forward the pending sum one level up before consuming it here. */
	add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);

	update_tg_cfs_util(cfs_rq, se, gcfs_rq);
	update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
	update_tg_cfs_load(cfs_rq, se, gcfs_rq);

	trace_pelt_cfs_tp(cfs_rq);
	trace_pelt_se_tp(se);

	/* Non-zero: the caller must treat the averages as changed. */
	return 1;
}
/*
* Check if we need to update the load and the utilization of a blocked
* group_entity:
*/
static inline bool skip_blocked_update(struct sched_entity *se)
{
	struct cfs_rq *gcfs_rq = group_cfs_rq(se);

	/*
	 * A blocked group entity can be skipped only when it has neither
	 * residual load nor utilization left to decay, and its group
	 * runqueue has no propagation pending. Otherwise decaying it
	 * would be wasted work.
	 */
	return !se->avg.load_avg && !se->avg.util_avg && !gcfs_rq->propagate;
}
#else /* CONFIG_FAIR_GROUP_SCHED */
/* Without group scheduling there is no hierarchy to propagate through. */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}

static inline int propagate_entity_load_avg(struct sched_entity *se)
{
	return 0;
}

static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
/**
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
* @now: current time, as per cfs_rq_clock_pelt()
* @cfs_rq: cfs_rq to update
*
* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
* avg. The immediate corollary is that all (fair) tasks must be attached, see
* post_init_entity_util_avg().
*
* cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
*
* Returns true if the load decayed or we removed load.
*
* Since both these conditions indicate a changed cfs_rq->avg.load we should
* call update_tg_load_avg() when this function returns true.
*/
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
	unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
	struct sched_avg *sa = &cfs_rq->avg;
	int decayed = 0;

	if (cfs_rq->removed.nr) {
		unsigned long r;
		u32 divider = get_pelt_divider(&cfs_rq->avg);

		/* Claim the accumulated totals of departed entities. */
		raw_spin_lock(&cfs_rq->removed.lock);
		swap(cfs_rq->removed.util_avg, removed_util);
		swap(cfs_rq->removed.load_avg, removed_load);
		swap(cfs_rq->removed.runnable_avg, removed_runnable);
		cfs_rq->removed.nr = 0;
		raw_spin_unlock(&cfs_rq->removed.lock);

		r = removed_load;
		sub_positive(&sa->load_avg, r);
		sa->load_sum = sa->load_avg * divider;

		r = removed_util;
		sub_positive(&sa->util_avg, r);
		sa->util_sum = sa->util_avg * divider;

		r = removed_runnable;
		sub_positive(&sa->runnable_avg, r);
		sa->runnable_sum = sa->runnable_avg * divider;

		/*
		 * removed_runnable is the unweighted version of removed_load so we
		 * can use it to estimate removed_load_sum.
		 */
		add_tg_cfs_propagate(cfs_rq,
			-(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);

		decayed = 1;
	}

	decayed |= __update_load_avg_cfs_rq(now, cfs_rq);

#ifndef CONFIG_64BIT
	/* Publish last_update_time for the 32-bit retry-loop readers. */
	smp_wmb();
	cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif

	return decayed;
}
/**
* attach_entity_load_avg - attach this entity to its cfs_rq load avg
* @cfs_rq: cfs_rq to attach to
* @se: sched_entity to attach
*
* Must call update_cfs_rq_load_avg() before this, since we rely on
* cfs_rq->avg.last_update_time being current.
*/
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
	 * See ___update_load_avg() for details.
	 */
	u32 divider = get_pelt_divider(&cfs_rq->avg);

	/*
	 * When we attach the @se to the @cfs_rq, we must align the decay
	 * window because without that, really weird and wonderful things can
	 * happen.
	 *
	 * XXX illustrate
	 */
	se->avg.last_update_time = cfs_rq->avg.last_update_time;
	se->avg.period_contrib = cfs_rq->avg.period_contrib;

	/*
	 * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
	 * period_contrib. This isn't strictly correct, but since we're
	 * entirely outside of the PELT hierarchy, nobody cares if we truncate
	 * _sum a little.
	 */
	se->avg.util_sum = se->avg.util_avg * divider;

	se->avg.runnable_sum = se->avg.runnable_avg * divider;

	/* load_sum = load_avg * divider / weight (guard the zero-weight case). */
	se->avg.load_sum = divider;
	if (se_weight(se)) {
		se->avg.load_sum =
			div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
	}

	enqueue_load_avg(cfs_rq, se);
	cfs_rq->avg.util_avg += se->avg.util_avg;
	cfs_rq->avg.util_sum += se->avg.util_sum;
	cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
	cfs_rq->avg.runnable_sum += se->avg.runnable_sum;

	/* Announce the newly attached load for propagation up the tree. */
	add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);

	cfs_rq_util_change(cfs_rq, 0);

	trace_pelt_cfs_tp(cfs_rq);
}
/**
* detach_entity_load_avg - detach this entity from its cfs_rq load avg
* @cfs_rq: cfs_rq to detach from
* @se: sched_entity to detach
*
* Must call update_cfs_rq_load_avg() before this, since we rely on
* cfs_rq->avg.last_update_time being current.
*/
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
	 * See ___update_load_avg() for details.
	 */
	u32 divider = get_pelt_divider(&cfs_rq->avg);

	dequeue_load_avg(cfs_rq, se);
	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
	sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
	cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;

	/* Announce the removed load for propagation up the tree. */
	add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);

	cfs_rq_util_change(cfs_rq, 0);

	trace_pelt_cfs_tp(cfs_rq);
}
/*
 * Optional action to be done while updating the load average
 */
#define UPDATE_TG	0x1	/* also update the owning task group's load_avg */
#define SKIP_AGE_LOAD	0x2	/* do not age the entity's own load first */
#define DO_ATTACH	0x4	/* attach the entity if it has just migrated */
/* Update task and its cfs_rq load average */
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	u64 now = cfs_rq_clock_pelt(cfs_rq);
	int decayed;

	/* Android vendor hooks bracket the entity load update. */
	trace_android_vh_prepare_update_load_avg_se(se, flags);

	/*
	 * Track task load average for carrying it to new CPU after migrated, and
	 * track group sched_entity load average for task_h_load calc in migration
	 */
	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
		__update_load_avg_se(now, cfs_rq, se);

	trace_android_vh_finish_update_load_avg_se(se, flags);

	decayed = update_cfs_rq_load_avg(now, cfs_rq);
	decayed |= propagate_entity_load_avg(se);

	if (!se->avg.last_update_time && (flags & DO_ATTACH)) {

		/*
		 * DO_ATTACH means we're here from enqueue_entity().
		 * !last_update_time means we've passed through
		 * migrate_task_rq_fair() indicating we migrated.
		 *
		 * IOW we're enqueueing a task on a new CPU.
		 */
		attach_entity_load_avg(cfs_rq, se);
		update_tg_load_avg(cfs_rq);

	} else if (decayed) {
		cfs_rq_util_change(cfs_rq, 0);

		if (flags & UPDATE_TG)
			update_tg_load_avg(cfs_rq);
	}
}
#ifndef CONFIG_64BIT
/*
 * Read cfs_rq->avg.last_update_time consistently on 32-bit: retry until
 * the value and its copy agree (pairs with the smp_wmb() in
 * update_cfs_rq_load_avg()).
 */
static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
{
	u64 last_update_time_copy;
	u64 last_update_time;

	do {
		last_update_time_copy = cfs_rq->load_last_update_time_copy;
		smp_rmb();
		last_update_time = cfs_rq->avg.last_update_time;
	} while (last_update_time != last_update_time_copy);

	return last_update_time;
}
#else
/* 64-bit: the u64 read is a single load, no retry needed. */
static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
{
	return cfs_rq->avg.last_update_time;
}
#endif

/*
 * Synchronize entity load avg of dequeued entity without locking
 * the previous rq.
 */
static void sync_entity_load_avg(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	u64 last_update_time;

	last_update_time = cfs_rq_last_update_time(cfs_rq);
	__update_load_avg_blocked_se(last_update_time, se);
}
/*
* Task first catches up with cfs_rq, and then subtract
* itself from the cfs_rq (task must be off the queue now).
*/
static void remove_entity_load_avg(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	unsigned long flags;

	/*
	 * tasks cannot exit without having gone through wake_up_new_task() ->
	 * post_init_entity_util_avg() which will have added things to the
	 * cfs_rq, so we can remove unconditionally.
	 */

	/* Bring the entity's averages up to date before recording them. */
	sync_entity_load_avg(se);

	raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
	++cfs_rq->removed.nr;
	cfs_rq->removed.util_avg += se->avg.util_avg;
	cfs_rq->removed.load_avg += se->avg.load_avg;
	cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
	raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
}
/* Runnable average of @cfs_rq (unweighted, see update_cfs_rq_load_avg()). */
static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
{
	return cfs_rq->avg.runnable_avg;
}

/* PELT load average of @cfs_rq. */
static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
{
	return cfs_rq->avg.load_avg;
}

static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);

/* Instantaneous PELT utilization of task @p. */
static inline unsigned long task_util(struct task_struct *p)
{
	return READ_ONCE(p->se.avg.util_avg);
}
static inline unsigned long _task_util_est(struct task_struct *p)
{
	struct util_est ue = READ_ONCE(p->se.avg.util_est);
	unsigned int enqueued = ue.enqueued & ~UTIL_AVG_UNCHANGED;

	/* The estimate is the larger of the EWMA and the enqueued snapshot. */
	return max(ue.ewma, enqueued);
}
/* Utilization estimate of @p: max of its PELT and util_est values. */
static inline unsigned long task_util_est(struct task_struct *p)
{
	return max(task_util(p), _task_util_est(p));
}
#ifdef CONFIG_UCLAMP_TASK
/* Utilization estimate of @p clamped to its effective uclamp bounds. */
static inline unsigned long uclamp_task_util(struct task_struct *p)
{
	return clamp(task_util_est(p),
		     uclamp_eff_value(p, UCLAMP_MIN),
		     uclamp_eff_value(p, UCLAMP_MAX));
}
#else
/* Without util clamping the estimate is used as-is. */
static inline unsigned long uclamp_task_util(struct task_struct *p)
{
	return task_util_est(p);
}
#endif
/*
 * Add @p's utilization estimate to the root cfs_rq's util_est sum
 * at enqueue time. No-op when the UTIL_EST feature is disabled.
 */
static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
				    struct task_struct *p)
{
	unsigned int enqueued;

	if (!sched_feat(UTIL_EST))
		return;

	/* Update root cfs_rq's estimated utilization */
	enqueued  = cfs_rq->avg.util_est.enqueued;
	enqueued += _task_util_est(p);
	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);

	trace_sched_util_est_cfs_tp(cfs_rq);
}

/*
 * Remove @p's utilization estimate from the root cfs_rq's util_est
 * sum at dequeue time; min_t() guards against underflow.
 */
static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
				    struct task_struct *p)
{
	unsigned int enqueued;

	if (!sched_feat(UTIL_EST))
		return;

	/* Update root cfs_rq's estimated utilization */
	enqueued  = cfs_rq->avg.util_est.enqueued;
	enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);

	trace_sched_util_est_cfs_tp(cfs_rq);
}
#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
/*
 * Check whether a (signed) value lies strictly inside the open interval
 * (-margin, margin), i.e. abs(value) < margin, using a single unsigned
 * comparison:
 *
 *   abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
 *
 * NOTE: this only works when value + margin < INT_MAX.
 */
static inline bool within_margin(int value, int margin)
{
	unsigned int shifted = (unsigned int)(value + margin - 1);
	unsigned int bound = 2 * margin - 1;

	return shifted < bound;
}
/*
 * Fold a new utilization sample for @p into its util_est EWMA when the
 * task goes to sleep (completes an activation). Skipped entirely if a
 * vendor hook claims the update or UTIL_EST is disabled.
 */
static inline void util_est_update(struct cfs_rq *cfs_rq,
				   struct task_struct *p,
				   bool task_sleep)
{
	long last_ewma_diff, last_enqueued_diff;
	struct util_est ue;
	int ret = 0;

	/* Android vendor hook may take over the whole update. */
	trace_android_rvh_util_est_update(cfs_rq, p, task_sleep, &ret);
	if (ret)
		return;

	if (!sched_feat(UTIL_EST))
		return;

	/*
	 * Skip update of task's estimated utilization when the task has not
	 * yet completed an activation, e.g. being migrated.
	 */
	if (!task_sleep)
		return;

	/*
	 * If the PELT values haven't changed since enqueue time,
	 * skip the util_est update.
	 */
	ue = p->se.avg.util_est;
	if (ue.enqueued & UTIL_AVG_UNCHANGED)
		return;

	last_enqueued_diff = ue.enqueued;

	/*
	 * Reset EWMA on utilization increases, the moving average is used only
	 * to smooth utilization decreases.
	 */
	ue.enqueued = task_util(p);
	if (sched_feat(UTIL_EST_FASTUP)) {
		if (ue.ewma < ue.enqueued) {
			ue.ewma = ue.enqueued;
			goto done;
		}
	}

	/*
	 * Skip update of task's estimated utilization when its members are
	 * already ~1% close to its last activation value.
	 */
	last_ewma_diff = ue.enqueued - ue.ewma;
	last_enqueued_diff -= ue.enqueued;
	if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
		/* Still write back if the enqueued value moved notably. */
		if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
			goto done;

		return;
	}

	/*
	 * To avoid overestimation of actual task utilization, skip updates if
	 * we cannot grant there is idle time in this CPU.
	 */
	if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
		return;

	/*
	 * Update Task's estimated utilization
	 *
	 * When *p completes an activation we can consolidate another sample
	 * of the task size. This is done by storing the current PELT value
	 * as ue.enqueued and by using this value to update the Exponential
	 * Weighted Moving Average (EWMA):
	 *
	 *  ewma(t) =  w *  task_util(p) + (1-w) * ewma(t-1)
	 *          =  w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
	 *          =  w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
	 *          =  w * (      last_ewma_diff            ) +     ewma(t-1)
	 *          =  w * (last_ewma_diff  +  ewma(t-1) / w)
	 *
	 * Where 'w' is the weight of new samples, which is configured to be
	 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
	 */
	ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
	ue.ewma  += last_ewma_diff;
	ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
done:
	/* Mark the sample as consumed until the next PELT change. */
	ue.enqueued |= UTIL_AVG_UNCHANGED;
	WRITE_ONCE(p->se.avg.util_est, ue);

	trace_sched_util_est_se_tp(&p->se);
}
/* Does @p's (clamped) utilization estimate fit within @capacity? */
static inline int task_fits_capacity(struct task_struct *p, long capacity)
{
	return fits_capacity(uclamp_task_util(p), capacity);
}

/*
 * Record on @rq whether @p is a "misfit" task (too big for this CPU's
 * capacity); only meaningful on asymmetric-capacity systems. A vendor
 * hook may veto the update.
 */
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
{
	bool need_update = true;

	trace_android_rvh_update_misfit_status(p, rq, &need_update);
	if (!static_branch_unlikely(&sched_asym_cpucapacity) || !need_update)
		return;

	/* Pinned tasks cannot be migrated anyway, so never misfit. */
	if (!p || p->nr_cpus_allowed == 1) {
		rq->misfit_task_load = 0;
		return;
	}

	if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
		rq->misfit_task_load = 0;
		return;
	}

	/*
	 * Make sure that misfit_task_load will not be null even if
	 * task_h_load() returns 0.
	 */
	rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
}
#else /* CONFIG_SMP */

/* No PELT on UP: load-tracking flags and helpers collapse to no-ops. */
#define UPDATE_TG	0x0
#define SKIP_AGE_LOAD	0x0
#define DO_ATTACH	0x0

static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
{
	cfs_rq_util_change(cfs_rq, 0);
}

static inline void remove_entity_load_avg(struct sched_entity *se) {}

static inline void
attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}

/* Nothing to pull from on UP. */
static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
{
	return 0;
}

static inline void
util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}

static inline void
util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}

static inline void
util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
		bool task_sleep) {}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}

#endif /* CONFIG_SMP */
/*
 * Debug statistic: count entities whose vruntime strays more than
 * 3 * sysctl_sched_latency from the cfs_rq's min_vruntime.
 */
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	s64 d = se->vruntime - cfs_rq->min_vruntime;

	if (d < 0)
		d = -d;

	if (d > 3*sysctl_sched_latency)
		schedstat_inc(cfs_rq->nr_spread_over);
#endif
}
/*
 * Compute the initial vruntime for @se relative to the cfs_rq's
 * min_vruntime: new tasks are pushed back by a slice (START_DEBIT),
 * wakers are credited up to one latency period of sleep.
 */
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
	u64 vruntime = cfs_rq->min_vruntime;

	/*
	 * The 'current' period is already promised to the current tasks,
	 * however the extra weight of the new task will slow them down a
	 * little, place the new task so that it fits in the slot that
	 * stays open at the end.
	 */
	if (initial && sched_feat(START_DEBIT))
		vruntime += sched_vslice(cfs_rq, se);

	/* sleeps up to a single latency don't count. */
	if (!initial) {
		unsigned long thresh = sysctl_sched_latency;

		/*
		 * Halve their sleep time's effect, to allow
		 * for a gentler effect of sleepers:
		 */
		if (sched_feat(GENTLE_FAIR_SLEEPERS))
			thresh >>= 1;

		vruntime -= thresh;
	}

	/* ensure we never gain time by being placed backwards. */
	se->vruntime = max_vruntime(se->vruntime, vruntime);
	trace_android_rvh_place_entity(cfs_rq, se, initial, vruntime);
}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
/*
 * Warn once if schedstat-dependent tracepoints are active while
 * schedstats collection is disabled.
 */
static inline void check_schedstat_required(void)
{
#ifdef CONFIG_SCHEDSTATS
	if (schedstat_enabled())
		return;

	/* Force schedstat enabled if a dependent tracepoint is active */
	if (trace_sched_stat_wait_enabled()    ||
			trace_sched_stat_sleep_enabled()   ||
			trace_sched_stat_iowait_enabled()  ||
			trace_sched_stat_blocked_enabled() ||
			trace_sched_stat_runtime_enabled())  {
		printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
			     "stat_blocked and stat_runtime require the "
			     "kernel parameter schedstats=enable or "
			     "kernel.sched_schedstats=1\n");
	}
#endif
}
static inline bool cfs_bandwidth_used(void);
/*
* MIGRATION
*
* dequeue
* update_curr()
* update_min_vruntime()
* vruntime -= min_vruntime
*
* enqueue
* update_curr()
* update_min_vruntime()
* vruntime += min_vruntime
*
* this way the vruntime transition between RQs is done when both
* min_vruntime are up-to-date.
*
* WAKEUP (remote)
*
* ->migrate_task_rq_fair() (p->state == TASK_WAKING)
* vruntime -= min_vruntime
*
* enqueue
* update_curr()
* update_min_vruntime()
* vruntime += min_vruntime
*
* this way we don't have the most up-to-date min_vruntime on the originating
* CPU and an up-to-date min_vruntime on the destination CPU.
*/
/*
 * Insert @se into @cfs_rq's rb-tree and update all per-entity and
 * per-cfs_rq accounting. See the MIGRATION comment above for when
 * vruntime is renormalised against min_vruntime.
 */
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
	bool curr = cfs_rq->curr == se;

	/*
	 * If we're the current task, we must renormalise before calling
	 * update_curr().
	 */
	if (renorm && curr)
		se->vruntime += cfs_rq->min_vruntime;

	update_curr(cfs_rq);

	/*
	 * Otherwise, renormalise after, such that we're placed at the current
	 * moment in time, instead of some random moment in the past. Being
	 * placed in the past could significantly boost this task to the
	 * fairness detriment of existing tasks.
	 */
	if (renorm && !curr)
		se->vruntime += cfs_rq->min_vruntime;

	/*
	 * When enqueuing a sched_entity, we must:
	 *   - Update loads to have both entity and cfs_rq synced with now.
	 *   - Add its load to cfs_rq->runnable_avg
	 *   - For group_entity, update its weight to reflect the new share of
	 *     its group cfs_rq
	 *   - Add its new weight to cfs_rq->load.weight
	 */
	update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
	se_update_runnable(se);
	update_cfs_group(se);
	account_entity_enqueue(cfs_rq, se);

	if (flags & ENQUEUE_WAKEUP)
		place_entity(cfs_rq, se, 0);

	check_schedstat_required();
	update_stats_enqueue(cfs_rq, se, flags);
	check_spread(cfs_rq, se);
	if (!curr)
		__enqueue_entity(cfs_rq, se);
	se->on_rq = 1;

	/*
	 * When bandwidth control is enabled, cfs might have been removed
	 * because of a parent being throttled but cfs->nr_running > 1. Try to
	 * add it unconditionally.
	 */
	if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
		list_add_leaf_cfs_rq(cfs_rq);

	if (cfs_rq->nr_running == 1)
		check_enqueue_throttle(cfs_rq);
}
/*
 * The __clear_buddies_*() helpers walk up the sched_entity hierarchy
 * and clear the corresponding buddy pointer at each level, stopping
 * at the first level where @se is no longer that buddy.
 */
static void __clear_buddies_last(struct sched_entity *se)
{
	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);
		if (cfs_rq->last != se)
			break;

		cfs_rq->last = NULL;
	}
}

static void __clear_buddies_next(struct sched_entity *se)
{
	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);
		if (cfs_rq->next != se)
			break;

		cfs_rq->next = NULL;
	}
}

static void __clear_buddies_skip(struct sched_entity *se)
{
	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);
		if (cfs_rq->skip != se)
			break;

		cfs_rq->skip = NULL;
	}
}

/* Drop any last/next/skip buddy references to @se on @cfs_rq. */
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->last == se)
		__clear_buddies_last(se);

	if (cfs_rq->next == se)
		__clear_buddies_next(se);

	if (cfs_rq->skip == se)
		__clear_buddies_skip(se);
}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
/*
 * Remove @se from @cfs_rq, reversing the accounting performed by
 * enqueue_entity(). vruntime is denormalised only for non-sleep
 * dequeues (see the MIGRATION comment above).
 */
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

	/*
	 * When dequeuing a sched_entity, we must:
	 *   - Update loads to have both entity and cfs_rq synced with now.
	 *   - Subtract its load from the cfs_rq->runnable_avg.
	 *   - Subtract its previous weight from cfs_rq->load.weight.
	 *   - For group entity, update its weight to reflect the new share
	 *     of its group cfs_rq.
	 */
	update_load_avg(cfs_rq, se, UPDATE_TG);
	se_update_runnable(se);

	update_stats_dequeue(cfs_rq, se, flags);

	clear_buddies(cfs_rq, se);

	if (se != cfs_rq->curr)
		__dequeue_entity(cfs_rq, se);
	se->on_rq = 0;
	account_entity_dequeue(cfs_rq, se);

	/*
	 * Normalize after update_curr(); which will also have moved
	 * min_vruntime if @se is the one holding it back. But before doing
	 * update_min_vruntime() again, which will discount @se's position and
	 * can move min_vruntime forward still more.
	 */
	if (!(flags & DEQUEUE_SLEEP))
		se->vruntime -= cfs_rq->min_vruntime;

	/* return excess runtime on last dequeue */
	return_cfs_rq_runtime(cfs_rq);

	update_cfs_group(se);

	/*
	 * Now advance min_vruntime if @se was the entity holding it back,
	 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
	 * put back on, and if we advance min_vruntime, we'll be placed back
	 * further than we started -- ie. we'll be penalized.
	 */
	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
		update_min_vruntime(cfs_rq);
}
/*
* Preempt the current task with a newly woken task if needed:
*/
/*
 * Preempt the current task with a newly woken task if needed:
 *
 * Reschedules when @curr has exceeded its ideal slice, or when it is
 * more than a slice ahead of the leftmost entity. An Android vendor
 * hook may adjust the slice or suppress the preemption entirely.
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long ideal_runtime, delta_exec;
	struct sched_entity *se;
	s64 delta;
	bool skip_preempt = false;

	ideal_runtime = sched_slice(cfs_rq, curr);
	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
	trace_android_rvh_check_preempt_tick(current, &ideal_runtime, &skip_preempt,
			delta_exec, cfs_rq, curr, sysctl_sched_min_granularity);
	if (skip_preempt)
		return;
	if (delta_exec > ideal_runtime) {
		resched_curr(rq_of(cfs_rq));
		/*
		 * The current task ran long enough, ensure it doesn't get
		 * re-elected due to buddy favours.
		 */
		clear_buddies(cfs_rq, curr);
		return;
	}

	/*
	 * Ensure that a task that missed wakeup preemption by a
	 * narrow margin doesn't have to wait for a full slice.
	 * This also mitigates buddy induced latencies under load.
	 */
	if (delta_exec < sysctl_sched_min_granularity)
		return;

	se = __pick_first_entity(cfs_rq);
	delta = curr->vruntime - se->vruntime;

	if (delta < 0)
		return;

	if (delta > ideal_runtime)
		resched_curr(rq_of(cfs_rq));
}
/*
 * Make @se the currently running entity of @cfs_rq: take it out of the
 * rb-tree, start its runtime stats, and snapshot prev_sum_exec_runtime
 * as the slice baseline for check_preempt_tick().
 */
void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/* 'current' is not kept within the tree. */
	if (se->on_rq) {
		/*
		 * Any task has to be enqueued before it get to execute on
		 * a CPU. So account for the time it spent waiting on the
		 * runqueue.
		 */
		update_stats_wait_end(cfs_rq, se);
		__dequeue_entity(cfs_rq, se);
		update_load_avg(cfs_rq, se, UPDATE_TG);
	}

	update_stats_curr_start(cfs_rq, se);
	cfs_rq->curr = se;

	/*
	 * Track our maximum slice length, if the CPU's load is at
	 * least twice that of our own weight (i.e. dont track it
	 * when there are only lesser-weight tasks around):
	 */
	if (schedstat_enabled() &&
	    rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
		schedstat_set(se->statistics.slice_max,
			max((u64)schedstat_val(se->statistics.slice_max),
			    se->sum_exec_runtime - se->prev_sum_exec_runtime));
	}

	se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
EXPORT_SYMBOL_GPL(set_next_entity);
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);

/*
 * Pick the next process, keeping these things in mind, in this order:
 * 1) keep things fair between processes/task groups
 * 2) pick the "next" process, since someone really wants that to run
 * 3) pick the "last" process, for cache locality
 * 4) do not run the "skip" process, if something else is available
 *
 * An Android vendor hook may pick the entity outright, bypassing the
 * buddy logic below.
 */
static struct sched_entity *
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	struct sched_entity *left = __pick_first_entity(cfs_rq);
	struct sched_entity *se = NULL;

	trace_android_rvh_pick_next_entity(cfs_rq, curr, &se);
	if (se)
		goto done;

	/*
	 * If curr is set we have to see if its left of the leftmost entity
	 * still in the tree, provided there was anything in the tree at all.
	 */
	if (!left || (curr && entity_before(curr, left)))
		left = curr;

	se = left; /* ideally we run the leftmost entity */

	/*
	 * Avoid running the skip buddy, if running something else can
	 * be done without getting too unfair.
	 */
	if (cfs_rq->skip == se) {
		struct sched_entity *second;

		if (se == curr) {
			second = __pick_first_entity(cfs_rq);
		} else {
			second = __pick_next_entity(se);
			if (!second || (curr && entity_before(curr, second)))
				second = curr;
		}

		if (second && wakeup_preempt_entity(second, left) < 1)
			se = second;
	}

	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
		/*
		 * Someone really wants this to run. If it's not unfair, run it.
		 */
		se = cfs_rq->next;
	} else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
		/*
		 * Prefer last buddy, try to return the CPU to a preempted task.
		 */
		se = cfs_rq->last;
	}

done:
	clear_buddies(cfs_rq, se);

	return se;
}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
/*
 * Put the previously running entity back: finish its runtime
 * accounting, check bandwidth throttling, and re-insert it into the
 * rb-tree if it is still runnable.
 */
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
	/*
	 * If still on the runqueue then deactivate_task()
	 * was not called and update_curr() has to be done:
	 */
	if (prev->on_rq)
		update_curr(cfs_rq);

	/* throttle cfs_rqs exceeding runtime */
	check_cfs_rq_runtime(cfs_rq);

	check_spread(cfs_rq, prev);

	if (prev->on_rq) {
		update_stats_wait_start(cfs_rq, prev);
		/* Put 'current' back into the tree. */
		__enqueue_entity(cfs_rq, prev);
		/* in !on_rq case, update occurred at dequeue */
		update_load_avg(cfs_rq, prev, 0);
	}
	cfs_rq->curr = NULL;
}
/*
 * Periodic (or hrtick-queued) tick work for the running entity:
 * update runtime stats and load averages, then consider preemption.
 */
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

	/*
	 * Ensure that runnable average is periodically updated.
	 */
	update_load_avg(cfs_rq, curr, UPDATE_TG);
	update_cfs_group(curr);

#ifdef CONFIG_SCHED_HRTICK
	/*
	 * queued ticks are scheduled to match the slice, so don't bother
	 * validating it and just reschedule.
	 */
	if (queued) {
		resched_curr(rq_of(cfs_rq));
		return;
	}
	/*
	 * don't let the period tick interfere with the hrtick preemption
	 */
	if (!sched_feat(DOUBLE_TICK) &&
			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
		return;
#endif

	if (cfs_rq->nr_running > 1)
		check_preempt_tick(cfs_rq, curr);
	trace_android_rvh_entity_tick(cfs_rq, curr);
}
/**************************************************
* CFS bandwidth control machinery
*/
#ifdef CONFIG_CFS_BANDWIDTH
#ifdef CONFIG_JUMP_LABEL
/*
 * Static key tracking whether any task group has bandwidth limits, so
 * the common (unconstrained) case costs only a patched NOP.
 */
static struct static_key __cfs_bandwidth_used;

static inline bool cfs_bandwidth_used(void)
{
	return static_key_false(&__cfs_bandwidth_used);
}

void cfs_bandwidth_usage_inc(void)
{
	static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
}

void cfs_bandwidth_usage_dec(void)
{
	static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
}
#else /* CONFIG_JUMP_LABEL */
/* Without jump labels, always take the bandwidth paths. */
static bool cfs_bandwidth_used(void)
{
	return true;
}

void cfs_bandwidth_usage_inc(void) {}
void cfs_bandwidth_usage_dec(void) {}
#endif /* CONFIG_JUMP_LABEL */
/*
 * default period for cfs group bandwidth.
 * default: 0.1s, units: nanoseconds
 */
static inline u64 default_cfs_period(void)
{
	return 100000000ULL;
}

/* Quota slice handed to a cfs_rq per refill, converted to nanoseconds. */
static inline u64 sched_cfs_bandwidth_slice(void)
{
	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
}
/*
 * Replenish runtime according to assigned quota. We use sched_clock_cpu
 * directly instead of rq->clock to avoid adding additional synchronization
 * around rq->lock.
 *
 * requires cfs_b->lock
 */
void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
{
	if (cfs_b->quota != RUNTIME_INF)
		cfs_b->runtime = cfs_b->quota;
}

/* Return the bandwidth pool of @tg. */
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
{
	return &tg->cfs_bandwidth;
}
/*
 * Transfer up to @target_runtime worth of quota from the global pool
 * @cfs_b into @cfs_rq->runtime_remaining.
 *
 * returns 0 on failure to allocate runtime
 */
static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
				   struct cfs_rq *cfs_rq, u64 target_runtime)
{
	u64 min_amount, amount = 0;

	lockdep_assert_held(&cfs_b->lock);

	/* note: this is a positive sum as runtime_remaining <= 0 */
	min_amount = target_runtime - cfs_rq->runtime_remaining;

	if (cfs_b->quota == RUNTIME_INF)
		amount = min_amount;
	else {
		/* Kick the period timer so quota keeps being refilled. */
		start_cfs_bandwidth(cfs_b);

		if (cfs_b->runtime > 0) {
			amount = min(cfs_b->runtime, min_amount);
			cfs_b->runtime -= amount;
			cfs_b->idle = 0;
		}
	}

	cfs_rq->runtime_remaining += amount;

	return cfs_rq->runtime_remaining > 0;
}

/*
 * Locked wrapper: grab one bandwidth slice for @cfs_rq.
 *
 * returns 0 on failure to allocate runtime
 */
static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
	int ret;

	raw_spin_lock(&cfs_b->lock);
	ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
	raw_spin_unlock(&cfs_b->lock);

	return ret;
}
/*
 * Charge @delta_exec of execution time against @cfs_rq's local quota;
 * if the quota runs dry and cannot be replenished, force a reschedule
 * so the hierarchy can be throttled.
 */
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
	/* dock delta_exec before expiring quota (as it could span periods) */
	cfs_rq->runtime_remaining -= delta_exec;

	if (likely(cfs_rq->runtime_remaining > 0))
		return;

	if (cfs_rq->throttled)
		return;
	/*
	 * if we're unable to extend our runtime we resched so that the active
	 * hierarchy can be throttled
	 */
	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
		resched_curr(rq_of(cfs_rq));
}

/* Fast-path wrapper: no-op unless bandwidth control is in use here. */
static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
		return;

	__account_cfs_rq_runtime(cfs_rq, delta_exec);
}
/* Is this cfs_rq itself throttled? */
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
	return cfs_bandwidth_used() && cfs_rq->throttled;
}

/* check whether cfs_rq, or any parent, is throttled */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
	return cfs_bandwidth_used() && cfs_rq->throttle_count;
}

/*
 * Ensure that neither of the group entities corresponding to src_cpu or
 * dest_cpu are members of a throttled hierarchy when performing group
 * load-balance operations.
 */
static inline int throttled_lb_pair(struct task_group *tg,
				    int src_cpu, int dest_cpu)
{
	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;

	src_cfs_rq = tg->cfs_rq[src_cpu];
	dest_cfs_rq = tg->cfs_rq[dest_cpu];

	return throttled_hierarchy(src_cfs_rq) ||
	       throttled_hierarchy(dest_cfs_rq);
}
/*
 * walk_tg_tree callback: decrement the per-cpu throttle count on the
 * way up an unthrottle, accumulating throttled time and re-adding the
 * cfs_rq to the leaf list when the count drops to zero.
 */
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
	struct rq *rq = data;
	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];

	cfs_rq->throttle_count--;
	if (!cfs_rq->throttle_count) {
		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
					     cfs_rq->throttled_clock_task;

		/* Add cfs_rq with already running entity in the list */
		if (cfs_rq->nr_running >= 1)
			list_add_leaf_cfs_rq(cfs_rq);
	}

	return 0;
}

/*
 * walk_tg_tree callback: increment the per-cpu throttle count on the
 * way down a throttle, recording the throttle start time at the first
 * level and removing the cfs_rq from the leaf list.
 */
static int tg_throttle_down(struct task_group *tg, void *data)
{
	struct rq *rq = data;
	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];

	/* group is entering throttled state, stop time */
	if (!cfs_rq->throttle_count) {
		cfs_rq->throttled_clock_task = rq_clock_task(rq);
		list_del_leaf_cfs_rq(cfs_rq);
	}
	cfs_rq->throttle_count++;

	return 0;
}
/*
 * Throttle @cfs_rq: dequeue its hierarchy from the rq and park it on
 * the bandwidth pool's throttled list. Returns false if bandwidth
 * became available in the meantime and no throttle was needed.
 */
static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
	struct rq *rq = rq_of(cfs_rq);
	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
	struct sched_entity *se;
	long task_delta, idle_task_delta, dequeue = 1;

	raw_spin_lock(&cfs_b->lock);
	/* This will start the period timer if necessary */
	if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
		/*
		 * We have raced with bandwidth becoming available, and if we
		 * actually throttled the timer might not unthrottle us for an
		 * entire period. We additionally needed to make sure that any
		 * subsequent check_cfs_rq_runtime calls agree not to throttle
		 * us, as we may commit to do cfs put_prev+pick_next, so we ask
		 * for 1ns of runtime rather than just check cfs_b.
		 */
		dequeue = 0;
	} else {
		list_add_tail_rcu(&cfs_rq->throttled_list,
				  &cfs_b->throttled_cfs_rq);
	}
	raw_spin_unlock(&cfs_b->lock);

	if (!dequeue)
		return false;  /* Throttle no longer required. */

	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];

	/* freeze hierarchy runnable averages while throttled */
	rcu_read_lock();
	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
	rcu_read_unlock();

	task_delta = cfs_rq->h_nr_running;
	idle_task_delta = cfs_rq->idle_h_nr_running;
	for_each_sched_entity(se) {
		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
		/* throttled entity or throttle-on-deactivate */
		if (!se->on_rq)
			break;

		if (dequeue) {
			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
		} else {
			update_load_avg(qcfs_rq, se, 0);
			se_update_runnable(se);
		}

		qcfs_rq->h_nr_running -= task_delta;
		qcfs_rq->idle_h_nr_running -= idle_task_delta;

		/* Once an ancestor still has weight, stop dequeuing. */
		if (qcfs_rq->load.weight)
			dequeue = 0;
	}

	if (!se)
		sub_nr_running(rq, task_delta);

	/*
	 * Note: distribution will already see us throttled via the
	 * throttled-list.  rq->lock protects completion.
	 */
	cfs_rq->throttled = 1;
	cfs_rq->throttled_clock = rq_clock(rq);
	return true;
}
/*
 * Unthrottle @cfs_rq: re-enqueue its hierarchy, repair the leaf-cfs_rq
 * list, and kick the CPU if it was idling while tasks became runnable.
 */
void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
{
	struct rq *rq = rq_of(cfs_rq);
	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
	struct sched_entity *se;
	long task_delta, idle_task_delta;

	se = cfs_rq->tg->se[cpu_of(rq)];

	cfs_rq->throttled = 0;

	update_rq_clock(rq);

	raw_spin_lock(&cfs_b->lock);
	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
	list_del_rcu(&cfs_rq->throttled_list);
	raw_spin_unlock(&cfs_b->lock);

	/* update hierarchical throttle state */
	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);

	/* Nothing runnable below us: no enqueue work to do. */
	if (!cfs_rq->load.weight)
		return;

	task_delta = cfs_rq->h_nr_running;
	idle_task_delta = cfs_rq->idle_h_nr_running;
	for_each_sched_entity(se) {
		if (se->on_rq)
			break;
		cfs_rq = cfs_rq_of(se);
		enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);

		cfs_rq->h_nr_running += task_delta;
		cfs_rq->idle_h_nr_running += idle_task_delta;

		/* end evaluation on encountering a throttled cfs_rq */
		if (cfs_rq_throttled(cfs_rq))
			goto unthrottle_throttle;
	}

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);

		update_load_avg(cfs_rq, se, UPDATE_TG);
		se_update_runnable(se);

		cfs_rq->h_nr_running += task_delta;
		cfs_rq->idle_h_nr_running += idle_task_delta;

		/* end evaluation on encountering a throttled cfs_rq */
		if (cfs_rq_throttled(cfs_rq))
			goto unthrottle_throttle;

		/*
		 * One parent has been throttled and cfs_rq removed from the
		 * list. Add it back to not break the leaf list.
		 */
		if (throttled_hierarchy(cfs_rq))
			list_add_leaf_cfs_rq(cfs_rq);
	}

	/* At this point se is NULL and we are at root level*/
	add_nr_running(rq, task_delta);

unthrottle_throttle:
	/*
	 * The cfs_rq_throttled() breaks in the above iteration can result in
	 * incomplete leaf list maintenance, resulting in triggering the
	 * assertion below.
	 */
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);

		if (list_add_leaf_cfs_rq(cfs_rq))
			break;
	}

	assert_list_leaf_cfs_rq(rq);

	/* Determine whether we need to wake up potentially idle CPU: */
	if (rq->curr == rq->idle && rq->cfs.nr_running)
		resched_curr(rq);
}
/*
 * Hand out the global pool's runtime to throttled cfs_rqs, unthrottling
 * each one that receives enough to go positive. Stops once the pool is
 * exhausted. Takes each rq's lock; cfs_b->lock is only nested briefly.
 */
static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
{
	struct cfs_rq *cfs_rq;
	u64 runtime, remaining = 1;

	rcu_read_lock();
	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
				throttled_list) {
		struct rq *rq = rq_of(cfs_rq);
		struct rq_flags rf;

		rq_lock_irqsave(rq, &rf);
		if (!cfs_rq_throttled(cfs_rq))
			goto next;

		/* By the above check, this should never be true */
		SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);

		raw_spin_lock(&cfs_b->lock);
		/* +1 so the cfs_rq ends up strictly positive and unthrottles. */
		runtime = -cfs_rq->runtime_remaining + 1;
		if (runtime > cfs_b->runtime)
			runtime = cfs_b->runtime;
		cfs_b->runtime -= runtime;
		remaining = cfs_b->runtime;
		raw_spin_unlock(&cfs_b->lock);

		cfs_rq->runtime_remaining += runtime;

		/* we check whether we're throttled above */
		if (cfs_rq->runtime_remaining > 0)
			unthrottle_cfs_rq(cfs_rq);

next:
		rq_unlock_irqrestore(rq, &rf);

		if (!remaining)
			break;
	}
	rcu_read_unlock();
}
/*
 * Responsible for refilling a task_group's bandwidth and unthrottling its
 * cfs_rqs as appropriate. If there has been no activity within the last
 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
 * used to track this state.
 *
 * Called with cfs_b->lock held (irqsave via @flags); returns 1 when the
 * period timer should be deactivated, 0 to keep it running.
 */
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
{
	int throttled;

	/* no need to continue the timer with no bandwidth constraint */
	if (cfs_b->quota == RUNTIME_INF)
		goto out_deactivate;

	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
	cfs_b->nr_periods += overrun;

	/*
	 * idle depends on !throttled (for the case of a large deficit), and if
	 * we're going inactive then everything else can be deferred
	 */
	if (cfs_b->idle && !throttled)
		goto out_deactivate;

	__refill_cfs_bandwidth_runtime(cfs_b);

	if (!throttled) {
		/* mark as potentially idle for the upcoming period */
		cfs_b->idle = 1;
		return 0;
	}

	/* account preceding periods in which throttling occurred */
	cfs_b->nr_throttled += overrun;

	/*
	 * This check is repeated as we release cfs_b->lock while we unthrottle.
	 */
	while (throttled && cfs_b->runtime > 0) {
		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
		/* we can't nest cfs_b->lock while distributing bandwidth */
		distribute_cfs_runtime(cfs_b);
		raw_spin_lock_irqsave(&cfs_b->lock, flags);

		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
	}

	/*
	 * While we are ensured activity in the period following an
	 * unthrottle, this also covers the case in which the new bandwidth is
	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
	 * timer to remain active while there are any throttled entities.)
	 */
	cfs_b->idle = 0;

	return 0;

out_deactivate:
	return 1;
}
/* a cfs_rq won't donate quota below this amount */
static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
/* minimum remaining period time to redistribute slack quota */
static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
/* how long we wait to gather additional slack before distributing */
static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;

/*
 * Are we near the end of the current quota period?
 *
 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
 * hrtimer base being cleared by hrtimer_start. In the case of
 * migrate_hrtimers, base is never cleared, so we are fine.
 */
static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
{
	struct hrtimer *refresh_timer = &cfs_b->period_timer;
	s64 remaining;

	/* if the call-back is running a quota refresh is already occurring */
	if (hrtimer_callback_running(refresh_timer))
		return 1;

	/* is a quota refresh about to occur? */
	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
	if (remaining < (s64)min_expire)
		return 1;

	return 0;
}
/*
 * Arm the slack timer to redistribute returned quota, unless a period
 * refresh is imminent or a slack distribution is already pending.
 */
static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
{
	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;

	/* if there's a quota refresh soon don't bother with slack */
	if (runtime_refresh_within(cfs_b, min_left))
		return;

	/* don't push forwards an existing deferred unthrottle */
	if (cfs_b->slack_started)
		return;
	cfs_b->slack_started = true;

	hrtimer_start(&cfs_b->slack_timer,
			ns_to_ktime(cfs_bandwidth_slack_period),
			HRTIMER_MODE_REL);
}
/* we know any runtime found here is valid as update_curr() precedes return */
static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
	/* Keep min_cfs_rq_runtime locally; donate only the excess. */
	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;

	if (slack_runtime <= 0)
		return;

	raw_spin_lock(&cfs_b->lock);
	if (cfs_b->quota != RUNTIME_INF) {
		cfs_b->runtime += slack_runtime;

		/* we are under rq->lock, defer unthrottling using a timer */
		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
		    !list_empty(&cfs_b->throttled_cfs_rq))
			start_cfs_slack_bandwidth(cfs_b);
	}
	raw_spin_unlock(&cfs_b->lock);

	/* even if it's not valid for return we don't want to try again */
	cfs_rq->runtime_remaining -= slack_runtime;
}

/* Return surplus quota when the cfs_rq goes empty. */
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
	if (!cfs_bandwidth_used())
		return;

	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
		return;

	__return_cfs_rq_runtime(cfs_rq);
}
/*
 * This is done with a timer (instead of inline with bandwidth return) since
 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
 */
static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
{
	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
	unsigned long flags;

	/* confirm we're still not at a refresh boundary */
	raw_spin_lock_irqsave(&cfs_b->lock, flags);
	cfs_b->slack_started = false;

	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
		return;
	}

	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
		runtime = cfs_b->runtime;

	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);

	/* Nothing worth distributing. */
	if (!runtime)
		return;

	distribute_cfs_runtime(cfs_b);

	raw_spin_lock_irqsave(&cfs_b->lock, flags);
	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
}
/*
 * When a group wakes up we want to make sure that its quota is not already
 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
 * runtime as update_curr() throttling can not trigger until it's on-rq.
 */
static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
{
	if (!cfs_bandwidth_used())
		return;

	/* an active group must be handled by the update_curr()->put() path */
	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
		return;

	/* ensure the group is not already throttled */
	if (cfs_rq_throttled(cfs_rq))
		return;

	/* update runtime allocation */
	account_cfs_rq_runtime(cfs_rq, 0);
	if (cfs_rq->runtime_remaining <= 0)
		throttle_cfs_rq(cfs_rq);
}
/*
 * Propagate the parent's throttle state to a newly created child
 * cfs_rq on @cpu so the hierarchy's throttle counts stay consistent.
 */
static void sync_throttle(struct task_group *tg, int cpu)
{
	struct cfs_rq *pcfs_rq, *cfs_rq;

	if (!cfs_bandwidth_used())
		return;

	if (!tg->parent)
		return;

	cfs_rq = tg->cfs_rq[cpu];
	pcfs_rq = tg->parent->cfs_rq[cpu];

	cfs_rq->throttle_count = pcfs_rq->throttle_count;
	cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
}
/*
 * conditionally throttle active cfs_rq's from put_prev_entity()
 *
 * Returns true if the cfs_rq is (or just became) throttled.
 */
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
	if (!cfs_bandwidth_used())
		return false;

	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
		return false;

	/*
	 * it's possible for a throttled entity to be forced into a running
	 * state (e.g. set_curr_task), in this case we're finished.
	 */
	if (cfs_rq_throttled(cfs_rq))
		return true;

	return throttle_cfs_rq(cfs_rq);
}
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
{
	/* Recover the bandwidth structure embedding this slack timer. */
	struct cfs_bandwidth *cfs_b;

	cfs_b = container_of(timer, struct cfs_bandwidth, slack_timer);
	do_sched_cfs_slack_timer(cfs_b);

	/* Slack distribution is one-shot; the timer is re-armed on demand. */
	return HRTIMER_NORESTART;
}
extern const u64 max_cfs_quota_period;

/*
 * Period timer callback: replenish quota for every period boundary that has
 * elapsed.  If the timer keeps overrunning back-to-back (the period is too
 * short for the work done under cfs_b->lock), scale period and quota up by
 * the same factor so the configured quota/period ratio is preserved.
 */
static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
	struct cfs_bandwidth *cfs_b =
		container_of(timer, struct cfs_bandwidth, period_timer);
	unsigned long flags;
	int overrun;
	int idle = 0;
	int count = 0;	/* consecutive overruns seen in this invocation */

	raw_spin_lock_irqsave(&cfs_b->lock, flags);
	for (;;) {
		/* Advance the timer; 0 overruns means we caught up. */
		overrun = hrtimer_forward_now(timer, cfs_b->period);
		if (!overrun)
			break;

		idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);

		/* More than 3 consecutive overruns: period is too short. */
		if (++count > 3) {
			u64 new, old = ktime_to_ns(cfs_b->period);

			/*
			 * Grow period by a factor of 2 to avoid losing precision.
			 * Precision loss in the quota/period ratio can cause __cfs_schedulable
			 * to fail.
			 */
			new = old * 2;
			if (new < max_cfs_quota_period) {
				cfs_b->period = ns_to_ktime(new);
				cfs_b->quota *= 2;

				pr_warn_ratelimited(
	"cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
					smp_processor_id(),
					div_u64(new, NSEC_PER_USEC),
					div_u64(cfs_b->quota, NSEC_PER_USEC));
			} else {
				pr_warn_ratelimited(
	"cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
					smp_processor_id(),
					div_u64(old, NSEC_PER_USEC),
					div_u64(cfs_b->quota, NSEC_PER_USEC));
			}

			/* reset count so we don't come right back in here */
			count = 0;
		}
	}
	/* Fully idle: let the timer lapse; it is restarted on next use. */
	if (idle)
		cfs_b->period_active = 0;
	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);

	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
/* One-time initialization of a task group's bandwidth-control state. */
void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
	raw_spin_lock_init(&cfs_b->lock);

	/* Start with no accumulated runtime and an unlimited quota. */
	cfs_b->runtime = 0;
	cfs_b->quota = RUNTIME_INF;
	cfs_b->period = ns_to_ktime(default_cfs_period());

	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);

	/* The period timer replenishes quota every cfs_b->period. */
	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
	cfs_b->period_timer.function = sched_cfs_period_timer;

	/* The slack timer redistributes runtime returned by idle cfs_rqs. */
	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	cfs_b->slack_timer.function = sched_cfs_slack_timer;
	cfs_b->slack_started = false;
}
/* Per-cfs_rq bandwidth state: start with throttling disabled. */
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
	cfs_rq->runtime_enabled = 0;
	INIT_LIST_HEAD(&cfs_rq->throttled_list);
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
	lockdep_assert_held(&cfs_b->lock);

	/* Arm the period timer only if it isn't already running. */
	if (!cfs_b->period_active) {
		cfs_b->period_active = 1;
		hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
		hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
	}
}
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
	/* A NULL list head means init_cfs_bandwidth() was never called. */
	if (cfs_b->throttled_cfs_rq.next) {
		hrtimer_cancel(&cfs_b->period_timer);
		hrtimer_cancel(&cfs_b->slack_timer);
	}
}
/*
* Both these CPU hotplug callbacks race against unregister_fair_sched_group()
*
* The race is harmless, since modifying bandwidth settings of unhooked group
* bits doesn't do much.
*/
/* cpu online callback */
/*
 * Re-derive every group's runtime_enabled flag for this runqueue's CPU from
 * its current quota setting.  Called with rq->lock held; walks the global
 * task_groups list under RCU, taking each group's bandwidth lock briefly.
 */
static void __maybe_unused update_runtime_enabled(struct rq *rq)
{
	struct task_group *tg;

	lockdep_assert_held(&rq->lock);

	rcu_read_lock();
	list_for_each_entry_rcu(tg, &task_groups, list) {
		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];

		/* Quota set to anything but "infinite" enables throttling. */
		raw_spin_lock(&cfs_b->lock);
		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
		raw_spin_unlock(&cfs_b->lock);
	}
	rcu_read_unlock();
}
/* cpu offline callback */
/*
 * Unthrottle every throttled cfs_rq on a CPU that is going offline so that
 * its tasks can be migrated away.  Called with rq->lock held; walks the
 * global task_groups list under RCU.
 */
static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
{
	struct task_group *tg;

	lockdep_assert_held(&rq->lock);

	rcu_read_lock();
	list_for_each_entry_rcu(tg, &task_groups, list) {
		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];

		if (!cfs_rq->runtime_enabled)
			continue;

		/*
		 * clock_task is not advancing so we just need to make sure
		 * there's some valid quota amount
		 */
		cfs_rq->runtime_remaining = 1;
		/*
		 * Offline rq is schedulable till CPU is completely disabled
		 * in take_cpu_down(), so we prevent new cfs throttling here.
		 */
		cfs_rq->runtime_enabled = 0;

		if (cfs_rq_throttled(cfs_rq))
			unthrottle_cfs_rq(cfs_rq);
	}
	rcu_read_unlock();
}
#else /* CONFIG_CFS_BANDWIDTH */
/*
 * CONFIG_CFS_BANDWIDTH=n: no-op stubs so the rest of this file can call the
 * bandwidth hooks unconditionally.
 */
static inline bool cfs_bandwidth_used(void)
{
	return false;
}

static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}

/* Without bandwidth control nothing is ever throttled. */
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
	return 0;
}

static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
	return 0;
}

static inline int throttled_lb_pair(struct task_group *tg,
				    int src_cpu, int dest_cpu)
{
	return 0;
}

void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}

#ifdef CONFIG_FAIR_GROUP_SCHED
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
#endif

static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
{
	return NULL;
}
static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
static inline void update_runtime_enabled(struct rq *rq) {}
static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
#endif /* CONFIG_CFS_BANDWIDTH */
/**************************************************
* CFS operations on tasks:
*/
#ifdef CONFIG_SCHED_HRTICK
/*
 * Arm the high-resolution tick to fire when @p's current slice should end,
 * so preemption happens at slice granularity rather than at the next jiffy.
 * If the slice is already exhausted, reschedule immediately instead.
 */
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

	SCHED_WARN_ON(task_rq(p) != rq);

	/* With a single runnable task there is nothing to preempt for. */
	if (rq->cfs.h_nr_running > 1) {
		u64 slice = sched_slice(cfs_rq, se);
		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
		s64 delta = slice - ran;	/* time left in this slice */

		if (delta < 0) {
			if (rq->curr == p)
				resched_curr(rq);
			return;
		}
		hrtick_start(rq, delta);
	}
}
/*
 * Re-arm the hrtick from the enqueue/dequeue paths, but only when the
 * running task belongs to this class and the runqueue is short enough
 * for slice-granular preemption to matter.
 */
static void hrtick_update(struct rq *rq)
{
	struct task_struct *running = rq->curr;

	if (hrtick_enabled(rq) &&
	    running->sched_class == &fair_sched_class &&
	    cfs_rq_of(&running->se)->nr_running < sched_nr_latency)
		hrtick_start_fair(rq, running);
}
#else /* !CONFIG_SCHED_HRTICK */
/* CONFIG_SCHED_HRTICK=n: high-resolution tick preemption is disabled. */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}

static inline void hrtick_update(struct rq *rq)
{
}
#endif
#ifdef CONFIG_SMP
static inline unsigned long cpu_util(int cpu);
static inline bool cpu_overutilized(int cpu)
{
	int vendor_hint = -1;

	/* A vendor hook may override the generic capacity-fit check. */
	trace_android_rvh_cpu_overutilized(cpu, &vendor_hint);
	if (vendor_hint != -1)
		return vendor_hint;

	/* Overutilized when utilization no longer fits the CPU's capacity. */
	return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
}
/*
 * Latch the root domain's overutilized flag (sticky until load balancing
 * clears it) when this runqueue's CPU becomes overutilized.
 */
static inline void update_overutilized_status(struct rq *rq)
{
	if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
		WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
		trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
	}
}
#else
static inline void update_overutilized_status(struct rq *rq) { }
#endif
/* Runqueue only has SCHED_IDLE tasks enqueued */
static int sched_idle_rq(struct rq *rq)
{
	/* Non-empty runqueue where every runnable task is SCHED_IDLE. */
	return unlikely(rq->nr_running &&
			rq->nr_running == rq->cfs.idle_h_nr_running);
}
#ifdef CONFIG_SMP
/* Convenience wrapper: is @cpu's runqueue SCHED_IDLE-only? */
static int sched_idle_cpu(int cpu)
{
	return sched_idle_rq(cpu_rq(cpu));
}
#endif
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
* then put the task into the rbtree:
*/
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;
	int idle_h_nr_running = task_has_idle_policy(p);
	int task_new = !(flags & ENQUEUE_WAKEUP);
	int should_iowait_boost;

	/*
	 * The code below (indirectly) updates schedutil which looks at
	 * the cfs_rq utilization to select a frequency.
	 * Let's add the task's estimated utilization to the cfs_rq's
	 * estimated utilization, before we update schedutil.
	 */
	util_est_enqueue(&rq->cfs, p);

	/*
	 * If in_iowait is set, the code below may not trigger any cpufreq
	 * utilization updates, so do it here explicitly with the IOWAIT flag
	 * passed.
	 */
	should_iowait_boost = p->in_iowait;
	trace_android_rvh_set_iowait(p, &should_iowait_boost);
	if (should_iowait_boost)
		cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);

	/*
	 * First pass: enqueue the entity and each not-yet-queued ancestor,
	 * bumping hierarchical run counts along the way.
	 */
	for_each_sched_entity(se) {
		if (se->on_rq)
			break;
		cfs_rq = cfs_rq_of(se);
		enqueue_entity(cfs_rq, se, flags);

		cfs_rq->h_nr_running++;
		cfs_rq->idle_h_nr_running += idle_h_nr_running;

		/* end evaluation on encountering a throttled cfs_rq */
		if (cfs_rq_throttled(cfs_rq))
			goto enqueue_throttle;

		flags = ENQUEUE_WAKEUP;
	}

	trace_android_rvh_enqueue_task_fair(rq, p, flags);
	/*
	 * Second pass: ancestors that were already queued still need their
	 * load averages, group weights and run counts refreshed.
	 */
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);

		update_load_avg(cfs_rq, se, UPDATE_TG);
		se_update_runnable(se);
		update_cfs_group(se);

		cfs_rq->h_nr_running++;
		cfs_rq->idle_h_nr_running += idle_h_nr_running;

		/* end evaluation on encountering a throttled cfs_rq */
		if (cfs_rq_throttled(cfs_rq))
			goto enqueue_throttle;

		/*
		 * One parent has been throttled and cfs_rq removed from the
		 * list. Add it back to not break the leaf list.
		 */
		if (throttled_hierarchy(cfs_rq))
			list_add_leaf_cfs_rq(cfs_rq);
	}

	/* At this point se is NULL and we are at root level*/
	add_nr_running(rq, 1);

	/*
	 * Since new tasks are assigned an initial util_avg equal to
	 * half of the spare capacity of their CPU, tiny tasks have the
	 * ability to cross the overutilized threshold, which will
	 * result in the load balancer ruining all the task placement
	 * done by EAS. As a way to mitigate that effect, do not account
	 * for the first enqueue operation of new tasks during the
	 * overutilized flag detection.
	 *
	 * A better way of solving this problem would be to wait for
	 * the PELT signals of tasks to converge before taking them
	 * into account, but that is not straightforward to implement,
	 * and the following generally works well enough in practice.
	 */
	if (!task_new)
		update_overutilized_status(rq);

enqueue_throttle:
	if (cfs_bandwidth_used()) {
		/*
		 * When bandwidth control is enabled; the cfs_rq_throttled()
		 * breaks in the above iteration can result in incomplete
		 * leaf list maintenance, resulting in triggering the assertion
		 * below.
		 */
		for_each_sched_entity(se) {
			cfs_rq = cfs_rq_of(se);

			if (list_add_leaf_cfs_rq(cfs_rq))
				break;
		}
	}

	assert_list_leaf_cfs_rq(rq);

	hrtick_update(rq);
}
static void set_next_buddy(struct sched_entity *se);
/*
* The dequeue_task method is called before nr_running is
* decreased. We remove the task from the rbtree and
* update the fair scheduling stats:
*/
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;
	int task_sleep = flags & DEQUEUE_SLEEP;
	int idle_h_nr_running = task_has_idle_policy(p);
	bool was_sched_idle = sched_idle_rq(rq);

	util_est_dequeue(&rq->cfs, p);

	/*
	 * First pass: dequeue the entity and each ancestor that becomes
	 * empty, decrementing hierarchical run counts along the way.
	 */
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		dequeue_entity(cfs_rq, se, flags);

		cfs_rq->h_nr_running--;
		cfs_rq->idle_h_nr_running -= idle_h_nr_running;

		/* end evaluation on encountering a throttled cfs_rq */
		if (cfs_rq_throttled(cfs_rq))
			goto dequeue_throttle;

		/* Don't dequeue parent if it has other entities besides us */
		if (cfs_rq->load.weight) {
			/* Avoid re-evaluating load for this entity: */
			se = parent_entity(se);
			/*
			 * Bias pick_next to pick a task from this cfs_rq, as
			 * p is sleeping when it is within its sched_slice.
			 */
			if (task_sleep && se && !throttled_hierarchy(cfs_rq))
				set_next_buddy(se);
			break;
		}
		flags |= DEQUEUE_SLEEP;
	}

	trace_android_rvh_dequeue_task_fair(rq, p, flags);
	/*
	 * Second pass: refresh load averages, group weights and run counts
	 * for the ancestors that remain queued.
	 */
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);

		update_load_avg(cfs_rq, se, UPDATE_TG);
		se_update_runnable(se);
		update_cfs_group(se);

		cfs_rq->h_nr_running--;
		cfs_rq->idle_h_nr_running -= idle_h_nr_running;

		/* end evaluation on encountering a throttled cfs_rq */
		if (cfs_rq_throttled(cfs_rq))
			goto dequeue_throttle;

	}

	/* At this point se is NULL and we are at root level*/
	sub_nr_running(rq, 1);

	/* balance early to pull high priority tasks */
	if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
		rq->next_balance = jiffies;

dequeue_throttle:
	util_est_update(&rq->cfs, p, task_sleep);
	hrtick_update(rq);
}
#ifdef CONFIG_SMP
/* Working cpumask for: load_balance, load_balance_newidle. */
DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
/* Scratch cpumask for the select_idle_*() wakeup scans. */
DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);

#ifdef CONFIG_NO_HZ_COMMON

/* Global bookkeeping for nohz-idle CPUs and nohz load balancing. */
static struct {
	cpumask_var_t idle_cpus_mask;
	atomic_t nr_cpus;
	int has_blocked; /* Idle CPUS has blocked load */
	unsigned long next_balance; /* in jiffy units */
	unsigned long next_blocked; /* Next update of blocked load in jiffies */
} nohz ____cacheline_aligned;

#endif /* CONFIG_NO_HZ_COMMON */
/* Load of a CPU: the load average of its root cfs_rq. */
static unsigned long cpu_load(struct rq *rq)
{
	return cfs_rq_load_avg(&rq->cfs);
}
/*
* cpu_load_without - compute CPU load without any contributions from *p
* @cpu: the CPU which load is requested
* @p: the task which load should be discounted
*
* The load of a CPU is defined by the load of tasks currently enqueued on that
* CPU as well as tasks which are currently sleeping after an execution on that
* CPU.
*
* This method returns the load of the specified CPU by discounting the load of
* the specified task, whenever the task is currently contributing to the CPU
* load.
*/
static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
{
	unsigned int load;

	/* Task has no contribution here (wrong CPU) or is brand new. */
	if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
		return cpu_load(rq);

	load = READ_ONCE(rq->cfs.avg.load_avg);

	/* Remove the task's own contribution, clamping at zero. */
	lsub_positive(&load, task_h_load(p));

	return load;
}
/* Runnable load of a CPU: the runnable average of its root cfs_rq. */
static unsigned long cpu_runnable(struct rq *rq)
{
	return cfs_rq_runnable_avg(&rq->cfs);
}
/* CPU runnable average with @p's own contribution discounted. */
static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p)
{
	unsigned int runnable;

	/* Task has no contribution here (wrong CPU) or is brand new. */
	if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
		return cpu_runnable(rq);

	runnable = READ_ONCE(rq->cfs.avg.runnable_avg);

	/* Remove the task's own contribution, clamping at zero. */
	lsub_positive(&runnable, p->se.avg.runnable_avg);

	return runnable;
}
/* Capacity of @cpu that is available for CFS tasks. */
static unsigned long capacity_of(int cpu)
{
	return cpu_rq(cpu)->cpu_capacity;
}
/*
 * Track how often the current task switches which task it wakes ("flips");
 * used by wake_wide() to detect 1:N waker/wakee relationships.
 */
static void record_wakee(struct task_struct *p)
{
	/*
	 * Only decay a single time; tasks that have less then 1 wakeup per
	 * jiffy will not have built up many flips.
	 */
	if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
		current->wakee_flips >>= 1;
		current->wakee_flip_decay_ts = jiffies;
	}

	/* A different wakee than last time counts as one flip. */
	if (current->last_wakee != p) {
		current->last_wakee = p;
		current->wakee_flips++;
	}
}
/*
* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
*
* A waker of many should wake a different task than the one last awakened
* at a frequency roughly N times higher than one of its wakees.
*
* In order to determine whether we should let the load spread vs consolidating
* to shared cache, we look for a minimum 'flip' frequency of llc_size in one
* partner, and a factor of lls_size higher frequency in the other.
*
* With both conditions met, we can be relatively sure that the relationship is
* non-monogamous, with partner count exceeding socket size.
*
* Waker/wakee being client/server, worker/dispatcher, interrupt source or
* whatever is irrelevant, spread criteria is apparent partner count exceeds
* socket size.
*/
static int wake_wide(struct task_struct *p)
{
	unsigned int waker_flips = current->wakee_flips;
	unsigned int wakee_flips = p->wakee_flips;
	int factor = __this_cpu_read(sd_llc_size);

	/* Treat whichever side flips more as the "master". */
	if (waker_flips < wakee_flips)
		swap(waker_flips, wakee_flips);

	/*
	 * Spread (return 1) only when the slave flips at least llc_size
	 * times and the master a factor of llc_size more than that.
	 */
	return wakee_flips >= factor && waker_flips >= wakee_flips * factor;
}
/*
* The purpose of wake_affine() is to quickly determine on which CPU we can run
* soonest. For the purpose of speed we only consider the waking and previous
* CPU.
*
* wake_affine_idle() - only considers 'now', it check if the waking CPU is
* cache-affine and is (or will be) idle.
*
* wake_affine_weight() - considers the weight to reflect the average
* scheduling latency of the CPUs. This seems to work
* for the overloaded case.
*/
/*
 * Idle-based wake-affinity decision.  Returns the chosen CPU, or
 * nr_cpumask_bits to signal "no decision" to wake_affine().
 */
static int
wake_affine_idle(int this_cpu, int prev_cpu, int sync)
{
	/*
	 * If this_cpu is idle, it implies the wakeup is from interrupt
	 * context. Only allow the move if cache is shared. Otherwise an
	 * interrupt intensive workload could force all tasks onto one
	 * node depending on the IO topology or IRQ affinity settings.
	 *
	 * If the prev_cpu is idle and cache affine then avoid a migration.
	 * There is no guarantee that the cache hot data from an interrupt
	 * is more important than cache hot data on the prev_cpu and from
	 * a cpufreq perspective, it's better to have higher utilisation
	 * on one CPU.
	 */
	if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
		return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;

	/* A sync wakeup with only the waker running: stack on this CPU. */
	if (sync && cpu_rq(this_cpu)->nr_running == 1)
		return this_cpu;

	return nr_cpumask_bits;
}
/*
 * Load-based wake-affinity decision: compare the effective load of waking
 * on this_cpu vs leaving the task on prev_cpu, each scaled by the other
 * CPU's capacity.  Returns the chosen CPU or nr_cpumask_bits.
 */
static int
wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
		   int this_cpu, int prev_cpu, int sync)
{
	s64 this_eff_load, prev_eff_load;
	unsigned long task_load;

	this_eff_load = cpu_load(cpu_rq(this_cpu));

	/* A sync wakeup means the waker is about to sleep: discount it. */
	if (sync) {
		unsigned long current_load = task_h_load(current);

		if (current_load > this_eff_load)
			return this_cpu;

		this_eff_load -= current_load;
	}

	task_load = task_h_load(p);

	this_eff_load += task_load;
	if (sched_feat(WA_BIAS))
		this_eff_load *= 100;
	this_eff_load *= capacity_of(prev_cpu);

	prev_eff_load = cpu_load(cpu_rq(prev_cpu));
	prev_eff_load -= task_load;
	if (sched_feat(WA_BIAS))
		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
	prev_eff_load *= capacity_of(this_cpu);

	/*
	 * If sync, adjust the weight of prev_eff_load such that if
	 * prev_eff == this_eff that select_idle_sibling() will consider
	 * stacking the wakee on top of the waker if no other CPU is
	 * idle.
	 */
	if (sync)
		prev_eff_load += 1;

	return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
}
/*
 * Decide between waking on this_cpu or prev_cpu: try the idle heuristic
 * first, fall back to the load-based one, and default to prev_cpu when
 * neither makes a choice.
 */
static int wake_affine(struct sched_domain *sd, struct task_struct *p,
		       int this_cpu, int prev_cpu, int sync)
{
	int target = nr_cpumask_bits;

	if (sched_feat(WA_IDLE))
		target = wake_affine_idle(this_cpu, prev_cpu, sync);

	if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
		target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);

	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
	if (target == nr_cpumask_bits)
		return prev_cpu;

	schedstat_inc(sd->ttwu_move_affine);
	schedstat_inc(p->se.statistics.nr_wakeups_affine);
	return target;
}
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
/*
* find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
*/
static int
find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
	unsigned long load, min_load = ULONG_MAX;
	unsigned int min_exit_latency = UINT_MAX;
	u64 latest_idle_timestamp = 0;
	int least_loaded_cpu = this_cpu;
	int shallowest_idle_cpu = -1;
	int i;

	/* Check if we have any choice: */
	if (group->group_weight == 1)
		return cpumask_first(sched_group_span(group));

	/* Traverse only the allowed CPUs */
	for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
		/* A SCHED_IDLE-only CPU is as good as idle: take it. */
		if (sched_idle_cpu(i))
			return i;

		if (available_idle_cpu(i)) {
			struct rq *rq = cpu_rq(i);
			struct cpuidle_state *idle = idle_get_state(rq);
			if (idle && idle->exit_latency < min_exit_latency) {
				/*
				 * We give priority to a CPU whose idle state
				 * has the smallest exit latency irrespective
				 * of any idle timestamp.
				 */
				min_exit_latency = idle->exit_latency;
				latest_idle_timestamp = rq->idle_stamp;
				shallowest_idle_cpu = i;
			} else if ((!idle || idle->exit_latency == min_exit_latency) &&
				   rq->idle_stamp > latest_idle_timestamp) {
				/*
				 * If equal or no active idle state, then
				 * the most recently idled CPU might have
				 * a warmer cache.
				 */
				latest_idle_timestamp = rq->idle_stamp;
				shallowest_idle_cpu = i;
			}
		} else if (shallowest_idle_cpu == -1) {
			/* No idle CPU seen yet: track the least loaded one. */
			load = cpu_load(cpu_rq(i));
			if (load < min_load) {
				min_load = load;
				least_loaded_cpu = i;
			}
		}
	}

	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
/*
 * Walk down the domain hierarchy from @sd, at each level picking the idlest
 * group and its idlest CPU, until no better candidate is found.  Returns
 * the chosen CPU (falls back to @prev_cpu when the task's affinity does not
 * intersect the domain at all).
 */
static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
				  int cpu, int prev_cpu, int sd_flag)
{
	int new_cpu = cpu;

	if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
		return prev_cpu;

	/*
	 * We need task's util for cpu_util_without, sync it up to
	 * prev_cpu's last_update_time.
	 */
	if (!(sd_flag & SD_BALANCE_FORK))
		sync_entity_load_avg(&p->se);

	while (sd) {
		struct sched_group *group;
		struct sched_domain *tmp;
		int weight;

		/* Domain doesn't handle this balance type: descend. */
		if (!(sd->flags & sd_flag)) {
			sd = sd->child;
			continue;
		}

		group = find_idlest_group(sd, p, cpu);
		if (!group) {
			sd = sd->child;
			continue;
		}

		new_cpu = find_idlest_group_cpu(group, p, cpu);
		if (new_cpu == cpu) {
			/* Now try balancing at a lower domain level of 'cpu': */
			sd = sd->child;
			continue;
		}

		/* Now try balancing at a lower domain level of 'new_cpu': */
		cpu = new_cpu;
		weight = sd->span_weight;
		sd = NULL;
		/* Find the narrowest domain of new_cpu that handles sd_flag. */
		for_each_domain(cpu, tmp) {
			if (weight <= tmp->span_weight)
				break;
			if (tmp->flags & sd_flag)
				sd = tmp;
		}
	}

	return new_cpu;
}
#ifdef CONFIG_SCHED_SMT
DEFINE_STATIC_KEY_FALSE(sched_smt_present);
EXPORT_SYMBOL_GPL(sched_smt_present);
/* Record whether @cpu's LLC has any fully-idle cores. */
static inline void set_idle_cores(int cpu, int val)
{
	struct sched_domain_shared *sds =
		rcu_dereference(per_cpu(sd_llc_shared, cpu));

	if (sds)
		WRITE_ONCE(sds->has_idle_cores, val);
}
/* Does @cpu's LLC have idle cores?  Falls back to @def if state is absent. */
static inline bool test_idle_cores(int cpu, bool def)
{
	struct sched_domain_shared *sds =
		rcu_dereference(per_cpu(sd_llc_shared, cpu));

	return sds ? READ_ONCE(sds->has_idle_cores) : def;
}
/*
* Scans the local SMT mask to see if the entire core is idle, and records this
* information in sd_llc_shared->has_idle_cores.
*
* Since SMT siblings share all cache levels, inspecting this limited remote
* state should be fairly cheap.
*/
void __update_idle_core(struct rq *rq)
{
	int core = cpu_of(rq);
	int cpu;

	rcu_read_lock();
	/* Already known to have idle cores: nothing to update. */
	if (test_idle_cores(core, true))
		goto unlock;

	/* Any busy sibling means the core is not fully idle. */
	for_each_cpu(cpu, cpu_smt_mask(core)) {
		if (cpu == core)
			continue;

		if (!available_idle_cpu(cpu))
			goto unlock;
	}

	set_idle_cores(core, 1);
unlock:
	rcu_read_unlock();
}
/*
* Scan the entire LLC domain for idle cores; this dynamically switches off if
* there are no idle cores left in the system; tracked through
* sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
*/
static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
{
	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
	int core, cpu;

	if (!static_branch_likely(&sched_smt_present))
		return -1;

	/* Fast bail-out when the LLC is known to have no idle cores. */
	if (!test_idle_cores(target, false))
		return -1;

	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);

	/* Scan cores starting at @target; a core counts only if ALL its
	 * siblings are idle. */
	for_each_cpu_wrap(core, cpus, target) {
		bool idle = true;

		for_each_cpu(cpu, cpu_smt_mask(core)) {
			if (!available_idle_cpu(cpu)) {
				idle = false;
				break;
			}
		}
		/* Whole core examined: drop its siblings from the scan set. */
		cpumask_andnot(cpus, cpus, cpu_smt_mask(core));

		if (idle)
			return core;
	}

	/*
	 * Failed to find an idle core; stop looking for one.
	 */
	set_idle_cores(target, 0);

	return -1;
}
/*
* Scan the local SMT mask for idle CPUs.
*/
/*
 * Scan the local SMT mask for idle CPUs.
 */
static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
	int cpu;

	if (!static_branch_likely(&sched_smt_present))
		return -1;

	for_each_cpu(cpu, cpu_smt_mask(target)) {
		/* Honour both the task's affinity and the domain span. */
		if (cpumask_test_cpu(cpu, p->cpus_ptr) &&
		    cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
		    (available_idle_cpu(cpu) || sched_idle_cpu(cpu)))
			return cpu;
	}

	return -1;
}
#else /* CONFIG_SCHED_SMT */
/* CONFIG_SCHED_SMT=n: no SMT siblings, so these scans always fail. */
static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
{
	return -1;
}

static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
	return -1;
}
#endif /* CONFIG_SCHED_SMT */
/*
* Scan the LLC domain for idle CPUs; this is dynamically regulated by
* comparing the average scan cost (tracked in sd->avg_scan_cost) against the
* average idle time for this rq (as found in rq->avg_idle).
*/
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
{
	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
	struct sched_domain *this_sd;
	u64 avg_cost, avg_idle;
	u64 time;
	int this = smp_processor_id();
	int cpu, nr = INT_MAX;	/* scan budget, possibly capped below */

	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
	if (!this_sd)
		return -1;

	/*
	 * Due to large variance we need a large fuzz factor; hackbench in
	 * particularly is sensitive here.
	 */
	avg_idle = this_rq()->avg_idle / 512;
	avg_cost = this_sd->avg_scan_cost + 1;

	/* Scanning costs more than we expect to stay idle: skip it. */
	if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
		return -1;

	/* Proportional budget: scan as many CPUs as the idle time affords. */
	if (sched_feat(SIS_PROP)) {
		u64 span_avg = sd->span_weight * avg_idle;
		if (span_avg > 4*avg_cost)
			nr = div_u64(span_avg, avg_cost);
		else
			nr = 4;
	}

	time = cpu_clock(this);

	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);

	for_each_cpu_wrap(cpu, cpus, target) {
		if (!--nr)
			return -1;
		if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
			break;
	}

	/* Feed the measured scan cost back into the running average. */
	time = cpu_clock(this) - time;
	update_avg(&this_sd->avg_scan_cost, time);

	return cpu;
}
/*
* Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
* the task fits. If no CPU is big enough, but there are idle ones, try to
* maximize capacity.
*/
static int
select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
{
	unsigned long task_util, best_cap = 0;
	int cpu, best_cpu = -1;
	struct cpumask *cpus;

	cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);

	task_util = uclamp_task_util(p);

	for_each_cpu_wrap(cpu, cpus, target) {
		unsigned long cpu_cap = capacity_of(cpu);

		if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
			continue;
		/* First idle CPU that fits the task wins. */
		if (fits_capacity(task_util, cpu_cap))
			return cpu;

		/* Otherwise remember the biggest idle CPU seen so far. */
		if (cpu_cap > best_cap) {
			best_cap = cpu_cap;
			best_cpu = cpu;
		}
	}

	return best_cpu;
}
static inline bool asym_fits_capacity(int task_util, int cpu)
{
	/* On symmetric-capacity systems every CPU trivially "fits". */
	if (!static_branch_unlikely(&sched_asym_cpucapacity))
		return true;

	return fits_capacity(task_util, capacity_of(cpu));
}
/*
* Try and locate an idle core/thread in the LLC cache domain.
*/
static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
	struct sched_domain *sd;
	/*
	 * Initialize to 0 so asym_fits_capacity() is never handed an
	 * indeterminate value when sched_asym_cpucapacity is disabled
	 * (it ignores the argument there, but passing an uninitialized
	 * variable by value is undefined behaviour per the C standard).
	 */
	unsigned long task_util = 0;
	int i, recent_used_cpu;

	/*
	 * On asymmetric system, update task utilization because we will check
	 * that the task fits with cpu's capacity.
	 */
	if (static_branch_unlikely(&sched_asym_cpucapacity)) {
		sync_entity_load_avg(&p->se);
		task_util = uclamp_task_util(p);
	}

	if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
	    asym_fits_capacity(task_util, target))
		return target;

	/*
	 * If the previous CPU is cache affine and idle, don't be stupid:
	 */
	if (prev != target && cpus_share_cache(prev, target) &&
	    (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
	    asym_fits_capacity(task_util, prev))
		return prev;

	/*
	 * Allow a per-cpu kthread to stack with the wakee if the
	 * kworker thread and the tasks previous CPUs are the same.
	 * The assumption is that the wakee queued work for the
	 * per-cpu kthread that is now complete and the wakeup is
	 * essentially a sync wakeup. An obvious example of this
	 * pattern is IO completions.
	 */
	if (is_per_cpu_kthread(current) &&
	    prev == smp_processor_id() &&
	    this_rq()->nr_running <= 1) {
		return prev;
	}

	/* Check a recently used CPU as a potential idle candidate: */
	recent_used_cpu = p->recent_used_cpu;
	if (recent_used_cpu != prev &&
	    recent_used_cpu != target &&
	    cpus_share_cache(recent_used_cpu, target) &&
	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
	    cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
	    asym_fits_capacity(task_util, recent_used_cpu)) {
		/*
		 * Replace recent_used_cpu with prev as it is a potential
		 * candidate for the next wake:
		 */
		p->recent_used_cpu = prev;
		return recent_used_cpu;
	}

	/*
	 * For asymmetric CPU capacity systems, our domain of interest is
	 * sd_asym_cpucapacity rather than sd_llc.
	 */
	if (static_branch_unlikely(&sched_asym_cpucapacity)) {
		sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
		/*
		 * On an asymmetric CPU capacity system where an exclusive
		 * cpuset defines a symmetric island (i.e. one unique
		 * capacity_orig value through the cpuset), the key will be set
		 * but the CPUs within that cpuset will not have a domain with
		 * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
		 * capacity path.
		 */
		if (sd) {
			i = select_idle_capacity(p, sd, target);
			return ((unsigned)i < nr_cpumask_bits) ? i : target;
		}
	}

	sd = rcu_dereference(per_cpu(sd_llc, target));
	if (!sd)
		return target;

	/* Prefer a fully idle core, then any idle CPU, then an idle sibling. */
	i = select_idle_core(p, sd, target);
	if ((unsigned)i < nr_cpumask_bits)
		return i;

	i = select_idle_cpu(p, sd, target);
	if ((unsigned)i < nr_cpumask_bits)
		return i;

	i = select_idle_smt(p, sd, target);
	if ((unsigned)i < nr_cpumask_bits)
		return i;

	return target;
}
/**
* Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
* @cpu: the CPU to get the utilization of
*
* The unit of the return value must be the one of capacity so we can compare
* the utilization with the capacity of the CPU that is available for CFS task
* (ie cpu_capacity).
*
* cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
* recent utilization of currently non-runnable tasks on a CPU. It represents
* the amount of utilization of a CPU in the range [0..capacity_orig] where
* capacity_orig is the cpu_capacity available at the highest frequency
* (arch_scale_freq_capacity()).
* The utilization of a CPU converges towards a sum equal to or less than the
* current capacity (capacity_curr <= capacity_orig) of the CPU because it is
* the running time on this CPU scaled by capacity_curr.
*
* The estimated utilization of a CPU is defined to be the maximum between its
* cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
* currently RUNNABLE on that CPU.
* This allows to properly represent the expected utilization of a CPU which
* has just got a big task running since a long sleep period. At the same time
* however it preserves the benefits of the "blocked utilization" in
* describing the potential for other tasks waking up on the same CPU.
*
* Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
* higher than capacity_orig because of unfortunate rounding in
* cfs.avg.util_avg or just after migrating tasks and new task wakeups until
* the average stabilizes with the new running time. We need to check that the
* utilization stays within the range of [0..capacity_orig] and cap it if
* necessary. Without utilization capping, a group could be seen as overloaded
* (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
* available capacity. We allow utilization to overshoot capacity_curr (but not
 * capacity_orig) as it is useful for predicting the capacity required after task
* migrations (scheduler-driven DVFS).
*
* Return: the (estimated) utilization for the specified CPU
*/
static inline unsigned long cpu_util(int cpu)
{
	struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
	unsigned int util = READ_ONCE(cfs_rq->avg.util_avg);

	/* Take the estimated utilization into account when enabled. */
	if (sched_feat(UTIL_EST))
		util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));

	/* Clamp to the CPU's original (max-frequency) capacity. */
	return min_t(unsigned long, util, capacity_orig_of(cpu));
}
/*
* cpu_util_without: compute cpu utilization without any contributions from *p
* @cpu: the CPU which utilization is requested
* @p: the task which utilization should be discounted
*
* The utilization of a CPU is defined by the utilization of tasks currently
* enqueued on that CPU as well as tasks which are currently sleeping after an
* execution on that CPU.
*
* This method returns the utilization of the specified CPU by discounting the
* utilization of the specified task, whenever the task is currently
* contributing to the CPU utilization.
*/
static unsigned long cpu_util_without(int cpu, struct task_struct *p)
{
	struct cfs_rq *cfs_rq;
	unsigned int util;
	/* Task has no contribution or is new */
	if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
		return cpu_util(cpu);
	cfs_rq = &cpu_rq(cpu)->cfs;
	util = READ_ONCE(cfs_rq->avg.util_avg);
	/* Discount task's util from CPU's util */
	/* (lsub_positive() clamps at zero, so a stale task_util() cannot underflow) */
	lsub_positive(&util, task_util(p));
	/*
	 * Covered cases:
	 *
	 * a) if *p is the only task sleeping on this CPU, then:
	 *      cpu_util (== task_util) > util_est (== 0)
	 *    and thus we return:
	 *      cpu_util_without = (cpu_util - task_util) = 0
	 *
	 * b) if other tasks are SLEEPING on this CPU, which is now exiting
	 *    IDLE, then:
	 *      cpu_util >= task_util
	 *      cpu_util > util_est (== 0)
	 *    and thus we discount *p's blocked utilization to return:
	 *      cpu_util_without = (cpu_util - task_util) >= 0
	 *
	 * c) if other tasks are RUNNABLE on that CPU and
	 *      util_est > cpu_util
	 *    then we use util_est since it returns a more restrictive
	 *    estimation of the spare capacity on that CPU, by just
	 *    considering the expected utilization of tasks already
	 *    runnable on that CPU.
	 *
	 * Cases a) and b) are covered by the above code, while case c) is
	 * covered by the following code when estimated utilization is
	 * enabled.
	 */
	if (sched_feat(UTIL_EST)) {
		unsigned int estimated =
			READ_ONCE(cfs_rq->avg.util_est.enqueued);
		/*
		 * Despite the following checks we still have a small window
		 * for a possible race, when an execl's select_task_rq_fair()
		 * races with LB's detach_task():
		 *
		 *   detach_task()
		 *     p->on_rq = TASK_ON_RQ_MIGRATING;
		 *     ---------------------------------- A
		 *     deactivate_task()                   \
		 *       dequeue_task()                     + RaceTime
		 *         util_est_dequeue()              /
		 *     ---------------------------------- B
		 *
		 * The additional check on "current == p" is required to
		 * properly fix the execl regression and it helps in further
		 * reducing the chances for the above race.
		 */
		if (unlikely(task_on_rq_queued(p) || current == p))
			lsub_positive(&estimated, _task_util_est(p));
		util = max(util, estimated);
	}
	/*
	 * Utilization (estimated) can exceed the CPU capacity, thus let's
	 * clamp to the maximum CPU capacity to ensure consistency with
	 * the cpu_util call.
	 */
	return min_t(unsigned long, util, capacity_orig_of(cpu));
}
/*
* Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
* to @dst_cpu.
*/
static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
{
struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
/*
* If @p migrates from @cpu to another, remove its contribution. Or,
* if @p migrates from another CPU to @cpu, add its contribution. In
* the other cases, @cpu is not impacted by the migration, so the
* util_avg should already be correct.
*/
if (task_cpu(p) == cpu && dst_cpu != cpu)
sub_positive(&util, task_util(p));
else if (task_cpu(p) != cpu && dst_cpu == cpu)
util += task_util(p);
if (sched_feat(UTIL_EST)) {
util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
/*
* During wake-up, the task isn't enqueued yet and doesn't
* appear in the cfs_rq->avg.util_est.enqueued of any rq,
* so just add it (if needed) to "simulate" what will be
* cpu_util() after the task has been enqueued.
*/
if (dst_cpu == cpu)
util_est += _task_util_est(p);
util = max(util, util_est);
}
return min(util, capacity_orig_of(cpu));
}
/*
* compute_energy(): Estimates the energy that @pd would consume if @p was
* migrated to @dst_cpu. compute_energy() predicts what will be the utilization
* landscape of @pd's CPUs after the task migration, and uses the Energy Model
* to compute what would be the energy if we decided to actually migrate that
* task.
*/
static long
compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
{
	struct cpumask *pd_mask = perf_domain_span(pd);
	/* All CPUs of a pd share the same original capacity; use the first. */
	unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
	unsigned long max_util = 0, sum_util = 0;
	unsigned long energy = 0;
	int cpu;
	/*
	 * The capacity state of CPUs of the current rd can be driven by CPUs
	 * of another rd if they belong to the same pd. So, account for the
	 * utilization of these CPUs too by masking pd with cpu_online_mask
	 * instead of the rd span.
	 *
	 * If an entire pd is outside of the current rd, it will not appear in
	 * its pd list and will not be accounted by compute_energy().
	 */
	for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
		unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
		/* uclamp of @p only matters on the CPU it would run on. */
		struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
		/*
		 * Busy time computation: utilization clamping is not
		 * required since the ratio (sum_util / cpu_capacity)
		 * is already enough to scale the EM reported power
		 * consumption at the (eventually clamped) cpu_capacity.
		 */
		sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
					       ENERGY_UTIL, NULL);
		/*
		 * Performance domain frequency: utilization clamping
		 * must be considered since it affects the selection
		 * of the performance domain frequency.
		 * NOTE: in case RT tasks are running, by default the
		 * FREQUENCY_UTIL's utilization can be max OPP.
		 */
		cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
					      FREQUENCY_UTIL, tsk);
		max_util = max(max_util, cpu_util);
	}
	/* Vendor hook may supply the energy; fall back to the Energy Model. */
	trace_android_vh_em_cpu_energy(pd->em_pd, max_util, sum_util, &energy);
	if (!energy)
		energy = em_cpu_energy(pd->em_pd, max_util, sum_util);
	return energy;
}
/*
* find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
* waking task. find_energy_efficient_cpu() looks for the CPU with maximum
* spare capacity in each performance domain and uses it as a potential
* candidate to execute the task. Then, it uses the Energy Model to figure
* out which of the CPU candidates is the most energy-efficient.
*
* The rationale for this heuristic is as follows. In a performance domain,
* all the most energy efficient CPU candidates (according to the Energy
* Model) are those for which we'll request a low frequency. When there are
* several CPUs for which the frequency request will be the same, we don't
* have enough data to break the tie between them, because the Energy Model
* only includes active power costs. With this model, if we assume that
* frequency requests follow utilization (e.g. using schedutil), the CPU with
* the maximum spare capacity in a performance domain is guaranteed to be among
* the best candidates of the performance domain.
*
* In practice, it could be preferable from an energy standpoint to pack
* small tasks on a CPU in order to let other CPUs go in deeper idle states,
* but that could also hurt our chances to go cluster idle, and we have no
* ways to tell with the current Energy Model if this is actually a good
* idea or not. So, find_energy_efficient_cpu() basically favors
* cluster-packing, and spreading inside a cluster. That should at least be
* a good thing for latency, and this is consistent with the idea that most
* of the energy savings of EAS come from the asymmetry of the system, and
* not so much from breaking the tie between identical CPUs. That's also the
* reason why EAS is enabled in the topology code only for systems where
* SD_ASYM_CPUCAPACITY is set.
*
* NOTE: Forkees are not accepted in the energy-aware wake-up path because
* they don't have any useful utilization data yet and it's not possible to
* forecast their impact on energy consumption. Consequently, they will be
* placed by find_idlest_cpu() on the least loaded CPU, which might turn out
* to be energy-inefficient in some use-cases. The alternative would be to
* bias new tasks towards specific types of CPUs first, or to try to infer
* their util_avg from the parent task, but those heuristics could hurt
* other use-cases too. So, until someone finds a better way to solve this,
* let's keep things simple by re-using the existing slow path.
*/
static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sync)
{
	unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
	int max_spare_cap_cpu_ls = prev_cpu, best_idle_cpu = -1;
	unsigned long max_spare_cap_ls = 0, target_cap;
	unsigned long cpu_cap, util, base_energy = 0;
	bool boosted, latency_sensitive = false;
	unsigned int min_exit_lat = UINT_MAX;
	int cpu, best_energy_cpu = prev_cpu;
	struct cpuidle_state *idle;
	struct sched_domain *sd;
	struct perf_domain *pd;
	int new_cpu = INT_MAX;
	sync_entity_load_avg(&p->se);
	/* Vendor hook: may take over placement entirely. */
	trace_android_rvh_find_energy_efficient_cpu(p, prev_cpu, sync, &new_cpu);
	if (new_cpu != INT_MAX)
		return new_cpu;
	rcu_read_lock();
	pd = rcu_dereference(rd->pd);
	/* EAS placement is only meaningful while the system isn't overutilized. */
	if (!pd || READ_ONCE(rd->overutilized))
		goto fail;
	/*
	 * Sync-wakeup fast path: the waker is about to sleep, so place the
	 * wakee here if this CPU is allowed and big enough for the task.
	 */
	cpu = smp_processor_id();
	if (sync && cpu_rq(cpu)->nr_running == 1 &&
	    cpumask_test_cpu(cpu, p->cpus_ptr) &&
	    task_fits_capacity(p, capacity_of(cpu))) {
		rcu_read_unlock();
		return cpu;
	}
	/*
	 * Energy-aware wake-up happens on the lowest sched_domain starting
	 * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
	 */
	sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
	while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
		sd = sd->parent;
	if (!sd)
		goto fail;
	/* A task with no utilization can go anywhere; keep prev_cpu. */
	if (!task_util_est(p))
		goto unlock;
	latency_sensitive = uclamp_latency_sensitive(p);
	boosted = uclamp_boosted(p);
	/* Boosted tasks look for the biggest idle CPU, others for the smallest. */
	target_cap = boosted ? 0 : ULONG_MAX;
	for (; pd; pd = pd->next) {
		unsigned long cur_delta, spare_cap, max_spare_cap = 0;
		unsigned long base_energy_pd;
		int max_spare_cap_cpu = -1;
		/* Compute the 'base' energy of the pd, without @p */
		base_energy_pd = compute_energy(p, -1, pd);
		base_energy += base_energy_pd;
		for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
			if (!cpumask_test_cpu(cpu, p->cpus_ptr))
				continue;
			util = cpu_util_next(cpu, p, cpu);
			cpu_cap = capacity_of(cpu);
			spare_cap = cpu_cap;
			lsub_positive(&spare_cap, util);
			/*
			 * Skip CPUs that cannot satisfy the capacity request.
			 * IOW, placing the task there would make the CPU
			 * overutilized. Take uclamp into account to see how
			 * much capacity we can get out of the CPU; this is
			 * aligned with schedutil_cpu_util().
			 */
			util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
			if (!fits_capacity(util, cpu_cap))
				continue;
			/* Always use prev_cpu as a candidate. */
			if (!latency_sensitive && cpu == prev_cpu) {
				prev_delta = compute_energy(p, prev_cpu, pd);
				prev_delta -= base_energy_pd;
				best_delta = min(best_delta, prev_delta);
			}
			/*
			 * Find the CPU with the maximum spare capacity in
			 * the performance domain
			 */
			if (spare_cap > max_spare_cap) {
				max_spare_cap = spare_cap;
				max_spare_cap_cpu = cpu;
			}
			if (!latency_sensitive)
				continue;
			/*
			 * Latency-sensitive task: prefer an idle CPU of the
			 * "right" size (per 'boosted'), breaking capacity
			 * ties by the shallowest idle state (exit latency).
			 */
			if (idle_cpu(cpu)) {
				cpu_cap = capacity_orig_of(cpu);
				if (boosted && cpu_cap < target_cap)
					continue;
				if (!boosted && cpu_cap > target_cap)
					continue;
				idle = idle_get_state(cpu_rq(cpu));
				if (idle && idle->exit_latency > min_exit_lat &&
				    cpu_cap == target_cap)
					continue;
				if (idle)
					min_exit_lat = idle->exit_latency;
				target_cap = cpu_cap;
				best_idle_cpu = cpu;
			} else if (spare_cap > max_spare_cap_ls) {
				max_spare_cap_ls = spare_cap;
				max_spare_cap_cpu_ls = cpu;
			}
		}
		/* Evaluate the energy impact of using this CPU. */
		if (!latency_sensitive && max_spare_cap_cpu >= 0 &&
		    max_spare_cap_cpu != prev_cpu) {
			cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
			cur_delta -= base_energy_pd;
			if (cur_delta < best_delta) {
				best_delta = cur_delta;
				best_energy_cpu = max_spare_cap_cpu;
			}
		}
	}
unlock:
	rcu_read_unlock();
	if (latency_sensitive)
		return best_idle_cpu >= 0 ? best_idle_cpu : max_spare_cap_cpu_ls;
	/*
	 * Pick the best CPU if prev_cpu cannot be used, or if it saves at
	 * least 6% of the energy used by prev_cpu.
	 */
	if (prev_delta == ULONG_MAX)
		return best_energy_cpu;
	/* ">> 4" is 1/16 = 6.25% of (prev_delta + base_energy), i.e. "~6%". */
	if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
		return best_energy_cpu;
	return prev_cpu;
fail:
	rcu_read_unlock();
	return -1;
}
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
* SD_BALANCE_FORK, or SD_BALANCE_EXEC.
*
* Balances load by selecting the idlest CPU in the idlest group, or under
* certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
*
* Returns the target CPU number.
*
* preempt must be disabled.
*/
static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
{
	struct sched_domain *tmp, *sd = NULL;
	int cpu = smp_processor_id();
	int new_cpu = prev_cpu;
	int want_affine = 0;
	/* A WF_SYNC hint from an exiting waker carries no information; drop it. */
	int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
	int target_cpu = -1;
	if (trace_android_rvh_select_task_rq_fair_enabled() &&
	    !(sd_flag & SD_BALANCE_FORK))
		sync_entity_load_avg(&p->se);
	/* Vendor hook: may pick the target CPU outright. */
	trace_android_rvh_select_task_rq_fair(p, prev_cpu, sd_flag,
			wake_flags, &target_cpu);
	if (target_cpu >= 0)
		return target_cpu;
	if (sd_flag & SD_BALANCE_WAKE) {
		record_wakee(p);
		/* Try energy-aware placement first; fall through on failure (<0). */
		if (sched_energy_enabled()) {
			new_cpu = find_energy_efficient_cpu(p, prev_cpu, sync);
			if (new_cpu >= 0)
				return new_cpu;
			new_cpu = prev_cpu;
		}
		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
	}
	rcu_read_lock();
	for_each_domain(cpu, tmp) {
		/*
		 * If both 'cpu' and 'prev_cpu' are part of this domain,
		 * cpu is a valid SD_WAKE_AFFINE target.
		 */
		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
			if (cpu != prev_cpu)
				new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
			sd = NULL; /* Prefer wake_affine over balance flags */
			break;
		}
		if (tmp->flags & sd_flag)
			sd = tmp;
		else if (!want_affine)
			break;
	}
	if (unlikely(sd)) {
		/* Slow path */
		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
	} else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
		/* Fast path */
		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
		if (want_affine)
			current->recent_used_cpu = cpu;
	}
	rcu_read_unlock();
	return new_cpu;
}
static void detach_entity_cfs_rq(struct sched_entity *se);
/*
* Called immediately before a task is migrated to a new CPU; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
* previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
*/
static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
	/*
	 * As blocked tasks retain absolute vruntime the migration needs to
	 * deal with this by subtracting the old and adding the new
	 * min_vruntime -- the latter is done by enqueue_entity() when placing
	 * the task on the new runqueue.
	 */
	if (p->state == TASK_WAKING) {
		struct sched_entity *se = &p->se;
		struct cfs_rq *cfs_rq = cfs_rq_of(se);
		u64 min_vruntime;
#ifndef CONFIG_64BIT
		u64 min_vruntime_copy;
		/*
		 * 32-bit: a 64-bit load can tear, so retry until the value
		 * and its copy agree (NOTE(review): assumes a paired write
		 * barrier on the updater side — confirm at the store site).
		 */
		do {
			min_vruntime_copy = cfs_rq->min_vruntime_copy;
			smp_rmb();
			min_vruntime = cfs_rq->min_vruntime;
		} while (min_vruntime != min_vruntime_copy);
#else
		min_vruntime = cfs_rq->min_vruntime;
#endif
		se->vruntime -= min_vruntime;
	}
	if (p->on_rq == TASK_ON_RQ_MIGRATING) {
		/*
		 * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
		 * rq->lock and can modify state directly.
		 */
		lockdep_assert_held(&task_rq(p)->lock);
		detach_entity_cfs_rq(&p->se);
	} else {
		/*
		 * We are supposed to update the task to "current" time, then
		 * its up to date and ready to go to new CPU/cfs_rq. But we
		 * have difficulty in getting what current time is, so simply
		 * throw away the out-of-date time. This will result in the
		 * wakee task is less decayed, but giving the wakee more load
		 * sounds not bad.
		 */
		remove_entity_load_avg(&p->se);
	}
	/* Tell new CPU we are migrated */
	p->se.avg.last_update_time = 0;
	/* We have migrated, no longer consider this task hot */
	p->se.exec_start = 0;
	update_scan_period(p, new_cpu);
}
/* Class hook for an exiting task: drop its load contribution from its cfs_rq. */
static void task_dead_fair(struct task_struct *p)
{
	remove_entity_load_avg(&p->se);
}
static int
balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
	int has_work = rq->nr_running != 0;

	/* Nothing queued here: try to pull work before going idle. */
	if (!has_work)
		has_work = newidle_balance(rq, rf) != 0;

	return has_work;
}
#endif /* CONFIG_SMP */
static unsigned long wakeup_gran(struct sched_entity *se)
{
	unsigned long gran = sysctl_sched_wakeup_granularity;
	/*
	 * Since it's curr that is running now, convert the gran from
	 * real-time to virtual-time in its units.
	 *
	 * By using 'se' instead of 'curr' we penalize light tasks, so
	 * they get preempted easier. That is, if 'se' < 'curr' then
	 * the resulting gran will be larger, therefore penalizing the
	 * lighter, if otoh 'se' > 'curr' then the resulting gran will
	 * be smaller, again penalizing the lighter task.
	 *
	 * This is especially important for buddies when the leftmost
	 * task is higher priority than the buddy.
	 */
	return calc_delta_fair(gran, se);
}
/*
* Should 'se' preempt 'curr'.
*
* |s1
* |s2
* |s3
* g
* |<--->|c
*
* w(c, s1) = -1
* w(c, s2) = 0
* w(c, s3) = 1
*
*/
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
{
s64 gran, vdiff = curr->vruntime - se->vruntime;
if (vdiff <= 0)
return -1;
gran = wakeup_gran(se);
if (vdiff > gran)
return 1;
return 0;
}
/*
 * Record @se as the "last" buddy in cfs_rq->last at every level of its
 * hierarchy (consumed when picking the next entity). Idle-policy tasks
 * are never nominated.
 */
static void set_last_buddy(struct sched_entity *se)
{
	if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
		return;
	for_each_sched_entity(se) {
		/* A buddy must be on the runqueue; bail out (and warn) if not. */
		if (SCHED_WARN_ON(!se->on_rq))
			return;
		cfs_rq_of(se)->last = se;
	}
}
/*
 * Record @se as the "next" buddy in cfs_rq->next at every level of its
 * hierarchy (consumed when picking the next entity). Idle-policy tasks
 * are never nominated.
 */
static void set_next_buddy(struct sched_entity *se)
{
	if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
		return;
	for_each_sched_entity(se) {
		/* A buddy must be on the runqueue; bail out (and warn) if not. */
		if (SCHED_WARN_ON(!se->on_rq))
			return;
		cfs_rq_of(se)->next = se;
	}
}
/* Mark @se (at every hierarchy level) to be skipped; used by sched_yield(). */
static void set_skip_buddy(struct sched_entity *se)
{
	for_each_sched_entity(se)
		cfs_rq_of(se)->skip = se;
}
/*
* Preempt the current task with a newly woken task if needed:
*/
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
	struct task_struct *curr = rq->curr;
	struct sched_entity *se = &curr->se, *pse = &p->se;
	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
	int scale = cfs_rq->nr_running >= sched_nr_latency;
	int next_buddy_marked = 0;
	bool preempt = false, nopreempt = false;
	bool ignore = false;
	if (unlikely(se == pse))
		return;
	/* Vendor hook: may suppress this preemption check entirely. */
	trace_android_rvh_check_preempt_wakeup_ignore(curr, &ignore);
	if (ignore)
		return;
	/*
	 * This is possible from callers such as attach_tasks(), in which we
	 * unconditionally check_preempt_curr() after an enqueue (which may have
	 * lead to a throttle). This both saves work and prevents false
	 * next-buddy nomination below.
	 */
	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
		return;
	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
		set_next_buddy(pse);
		next_buddy_marked = 1;
	}
	/*
	 * We can come here with TIF_NEED_RESCHED already set from new task
	 * wake up path.
	 *
	 * Note: this also catches the edge-case of curr being in a throttled
	 * group (e.g. via set_curr_task), since update_curr() (in the
	 * enqueue of curr) will have resulted in resched being set. This
	 * prevents us from potentially nominating it as a false LAST_BUDDY
	 * below.
	 */
	if (test_tsk_need_resched(curr))
		return;
	/* Idle tasks are by definition preempted by non-idle tasks. */
	if (unlikely(task_has_idle_policy(curr)) &&
	    likely(!task_has_idle_policy(p)))
		goto preempt;
	/*
	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
	 * is driven by the tick):
	 */
	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
		return;
	/* Compare the two entities at their deepest common hierarchy level. */
	find_matching_se(&se, &pse);
	update_curr(cfs_rq_of(se));
	/* Vendor hook: may force ('preempt') or veto ('nopreempt') the decision. */
	trace_android_rvh_check_preempt_wakeup(rq, p, &preempt, &nopreempt,
			wake_flags, se, pse, next_buddy_marked, sysctl_sched_wakeup_granularity);
	if (preempt)
		goto preempt;
	if (nopreempt)
		return;
	BUG_ON(!pse);
	if (wakeup_preempt_entity(se, pse) == 1) {
		/*
		 * Bias pick_next to pick the sched entity that is
		 * triggering this preemption.
		 */
		if (!next_buddy_marked)
			set_next_buddy(pse);
		goto preempt;
	}
	return;
preempt:
	resched_curr(rq);
	/*
	 * Only set the backward buddy when the current task is still
	 * on the rq. This can happen when a wakeup gets interleaved
	 * with schedule on the ->pre_schedule() or idle_balance()
	 * point, either of which can drop the rq lock.
	 *
	 * Also, during early boot the idle thread is in the fair class,
	 * for obvious reasons it's a bad idea to schedule back to it.
	 */
	if (unlikely(!se->on_rq || curr == rq->idle))
		return;
	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
		set_last_buddy(se);
}
struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
	struct cfs_rq *cfs_rq = &rq->cfs;
	struct sched_entity *se = NULL;
	struct task_struct *p = NULL;
	int new_tasks;
	bool repick = false;
again:
	if (!sched_fair_runnable(rq))
		goto idle;
#ifdef CONFIG_FAIR_GROUP_SCHED
	if (!prev || prev->sched_class != &fair_sched_class)
		goto simple;
	/*
	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
	 * likely that a next task is from the same cgroup as the current.
	 *
	 * Therefore attempt to avoid putting and setting the entire cgroup
	 * hierarchy, only change the part that actually changes.
	 */
	do {
		struct sched_entity *curr = cfs_rq->curr;
		/*
		 * Since we got here without doing put_prev_entity() we also
		 * have to consider cfs_rq->curr. If it is still a runnable
		 * entity, update_curr() will update its vruntime, otherwise
		 * forget we've ever seen it.
		 */
		if (curr) {
			if (curr->on_rq)
				update_curr(cfs_rq);
			else
				curr = NULL;
			/*
			 * This call to check_cfs_rq_runtime() will do the
			 * throttle and dequeue its entity in the parent(s).
			 * Therefore the nr_running test will indeed
			 * be correct.
			 */
			if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
				cfs_rq = &rq->cfs;
				if (!cfs_rq->nr_running)
					goto idle;
				goto simple;
			}
		}
		se = pick_next_entity(cfs_rq, curr);
		cfs_rq = group_cfs_rq(se);
	} while (cfs_rq);
	p = task_of(se);
	/* Vendor hook: may substitute a different task/entity for the pick. */
	trace_android_rvh_replace_next_task_fair(rq, &p, &se, &repick, false, prev);
	/*
	 * Since we haven't yet done put_prev_entity and if the selected task
	 * is a different task than we started out with, try and touch the
	 * least amount of cfs_rqs.
	 */
	if (prev != p) {
		struct sched_entity *pse = &prev->se;
		/* Walk both hierarchies upward until they meet in one group. */
		while (!(cfs_rq = is_same_group(se, pse))) {
			int se_depth = se->depth;
			int pse_depth = pse->depth;
			if (se_depth <= pse_depth) {
				put_prev_entity(cfs_rq_of(pse), pse);
				pse = parent_entity(pse);
			}
			if (se_depth >= pse_depth) {
				set_next_entity(cfs_rq_of(se), se);
				se = parent_entity(se);
			}
		}
		put_prev_entity(cfs_rq, pse);
		set_next_entity(cfs_rq, se);
	}
	goto done;
simple:
#endif
	if (prev)
		put_prev_task(rq, prev);
	/* Vendor hook (simple path): may pre-pick @p/@se ('repick'). */
	trace_android_rvh_replace_next_task_fair(rq, &p, &se, &repick, true, prev);
	if (repick) {
		for_each_sched_entity(se)
			set_next_entity(cfs_rq_of(se), se);
		goto done;
	}
	do {
		se = pick_next_entity(cfs_rq, NULL);
		set_next_entity(cfs_rq, se);
		cfs_rq = group_cfs_rq(se);
	} while (cfs_rq);
	p = task_of(se);
done: __maybe_unused;
#ifdef CONFIG_SMP
	/*
	 * Move the next running task to the front of
	 * the list, so our cfs_tasks list becomes MRU
	 * one.
	 */
	list_move(&p->se.group_node, &rq->cfs_tasks);
#endif
	if (hrtick_enabled(rq))
		hrtick_start_fair(rq, p);
	update_misfit_status(p, rq);
	return p;
idle:
	if (!rf)
		return NULL;
	new_tasks = newidle_balance(rq, rf);
	/*
	 * Because newidle_balance() releases (and re-acquires) rq->lock, it is
	 * possible for any higher priority task to appear. In that case we
	 * must re-start the pick_next_entity() loop.
	 */
	if (new_tasks < 0)
		return RETRY_TASK;
	if (new_tasks > 0)
		goto again;
	/*
	 * rq is about to be idle, check if we need to update the
	 * lost_idle_time of clock_pelt
	 */
	update_idle_rq_clock_pelt(rq);
	return NULL;
}
/* Class hook: pick the next fair task with no previous task / rq_flags context. */
static struct task_struct *__pick_next_task_fair(struct rq *rq)
{
	return pick_next_task_fair(rq, NULL, NULL);
}
/*
 * Account for a descheduled task:
 */
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{
	struct sched_entity *se = &prev->se;
	struct cfs_rq *cfs_rq;
	/* Put back every level of @prev's hierarchy, leaf to root. */
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		put_prev_entity(cfs_rq, se);
	}
}
/*
* sched_yield() is very simple
*
* The magic of dealing with the ->skip buddy is in pick_next_entity.
*/
static void yield_task_fair(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
	struct sched_entity *se = &curr->se;
	/*
	 * Are we the only task in the tree?
	 */
	if (unlikely(rq->nr_running == 1))
		return;
	clear_buddies(cfs_rq, se);
	/* Update runtime accounting unless this is a SCHED_BATCH task. */
	if (curr->policy != SCHED_BATCH) {
		update_rq_clock(rq);
		/*
		 * Update run-time statistics of the 'current'.
		 */
		update_curr(cfs_rq);
		/*
		 * Tell update_rq_clock() that we've just updated,
		 * so we don't do microscopic update in schedule()
		 * and double the fastpath cost.
		 */
		rq_clock_skip_update(rq);
	}
	set_skip_buddy(se);
}
static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	bool runnable = se->on_rq && !throttled_hierarchy(cfs_rq_of(se));

	/* A dequeued or throttled target cannot be yielded to. */
	if (!runnable)
		return false;

	/* Nominate @p as the next buddy, then yield the current task. */
	set_next_buddy(se);
	yield_task_fair(rq);

	return true;
}
#ifdef CONFIG_SMP
/**************************************************
* Fair scheduling class load-balancing methods.
*
* BASICS
*
* The purpose of load-balancing is to achieve the same basic fairness the
* per-CPU scheduler provides, namely provide a proportional amount of compute
* time to each task. This is expressed in the following equation:
*
* W_i,n/P_i == W_j,n/P_j for all i,j (1)
*
* Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
* W_i,0 is defined as:
*
* W_i,0 = \Sum_j w_i,j (2)
*
* Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
* is derived from the nice value as per sched_prio_to_weight[].
*
* The weight average is an exponential decay average of the instantaneous
* weight:
*
* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
*
* C_i is the compute capacity of CPU i, typically it is the
* fraction of 'recent' time available for SCHED_OTHER task execution. But it
* can also include other factors [XXX].
*
* To achieve this balance we define a measure of imbalance which follows
* directly from (1):
*
* imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
*
* We then move tasks around to minimize the imbalance. In the continuous
* function space it is obvious this converges, in the discrete case we get
* a few fun cases generally called infeasible weight scenarios.
*
* [XXX expand on:
* - infeasible weights;
* - local vs global optima in the discrete case. ]
*
*
* SCHED DOMAINS
*
* In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
* for all i,j solution, we create a tree of CPUs that follows the hardware
* topology where each level pairs two lower groups (or better). This results
* in O(log n) layers. Furthermore we reduce the number of CPUs going up the
* tree to only the first of the previous level and we decrease the frequency
* of load-balance at each level inv. proportional to the number of CPUs in
* the groups.
*
* This yields:
*
* log_2 n 1 n
* \Sum { --- * --- * 2^i } = O(n) (5)
* i = 0 2^i 2^i
* `- size of each group
* | | `- number of CPUs doing load-balance
* | `- freq
* `- sum over all levels
*
* Coupled with a limit on how many tasks we can migrate every balance pass,
* this makes (5) the runtime complexity of the balancer.
*
* An important property here is that each CPU is still (indirectly) connected
* to every other CPU in at most O(log n) steps:
*
* The adjacency matrix of the resulting graph is given by:
*
* log_2 n
* A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
* k = 0
*
* And you'll find that:
*
* A^(log_2 n)_i,j != 0 for all i,j (7)
*
* Showing there's indeed a path between every CPU in at most O(log n) steps.
* The task movement gives a factor of O(m), giving a convergence complexity
* of:
*
* O(nm log n), n := nr_cpus, m := nr_tasks (8)
*
*
* WORK CONSERVING
*
* In order to avoid CPUs going idle while there's still work to do, new idle
* balancing is more aggressive and has the newly idle CPU iterate up the domain
* tree itself instead of relying on other CPUs to bring it work.
*
* This adds some complexity to both (5) and (8) but it reduces the total idle
* time.
*
* [XXX more?]
*
*
* CGROUPS
*
* Cgroups make a horror show out of (2), instead of a simple sum we get:
*
* s_k,i
* W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
* S_k
*
* Where
*
* s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
*
* w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
*
* The big problem is S_k, it's a global sum needed to compute a local (W_i)
* property.
*
* [XXX write more on how we solve this.. _after_ merging pjt's patches that
* rewrite all of this once again.]
*/
/* Ceiling on the load-balance interval, in jiffies (HZ/10 == 100ms). */
unsigned long __read_mostly max_load_balance_interval = HZ/10;
EXPORT_SYMBOL_GPL(max_load_balance_interval);
/*
 * Classes of busiest-queue candidates considered during load balancing.
 * NOTE(review): looks tied to NUMA preferred-node accounting — confirm at
 * the fbq_classify_*() definitions.
 */
enum fbq_type { regular, remote, all };
/*
 * 'group_type' describes the group of CPUs at the moment of load balancing.
 *
 * The enum is ordered by pulling priority, with the group with lowest priority
 * first so the group_type can simply be compared when selecting the busiest
 * group. See update_sd_pick_busiest().
 *
 * (Do not reorder values: the relational comparisons depend on this order.)
 */
enum group_type {
	/* The group has spare capacity that can be used to run more tasks. */
	group_has_spare = 0,
	/*
	 * The group is fully used and the tasks don't compete for more CPU
	 * cycles. Nevertheless, some tasks might wait before running.
	 */
	group_fully_busy,
	/*
	 * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
	 * and must be migrated to a more powerful CPU.
	 */
	group_misfit_task,
	/*
	 * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
	 * and the task should be migrated to it instead of running on the
	 * current CPU.
	 */
	group_asym_packing,
	/*
	 * The tasks' affinity constraints previously prevented the scheduler
	 * from balancing the load across the system.
	 */
	group_imbalanced,
	/*
	 * The CPU is overloaded and can't provide expected CPU cycles to all
	 * tasks.
	 */
	group_overloaded
};
/* Which quantity a load-balance pass is trying to move. */
enum migration_type {
	migrate_load = 0,
	migrate_util,
	migrate_task,
	migrate_misfit
};
/* lb_env::flags bits. */
#define LBF_ALL_PINNED	0x01
#define LBF_NEED_BREAK	0x02
#define LBF_DST_PINNED	0x04
#define LBF_SOME_PINNED	0x08
#define LBF_NOHZ_STATS	0x10
#define LBF_NOHZ_AGAIN	0x20
/* Working context threaded through one load-balance attempt. */
struct lb_env {
	struct sched_domain	*sd;
	struct rq		*src_rq;	/* busiest rq, tasks pulled from here */
	int			src_cpu;
	int			dst_cpu;
	struct rq		*dst_rq;	/* this rq, tasks pulled to here */
	struct cpumask		*dst_grpmask;
	int			new_dst_cpu;	/* set with LBF_DST_PINNED */
	enum cpu_idle_type	idle;
	long			imbalance;
	/* The set of CPUs under consideration for load-balancing */
	struct cpumask		*cpus;
	unsigned int		flags;		/* LBF_* bits above */
	unsigned int		loop;
	unsigned int		loop_break;
	unsigned int		loop_max;
	enum fbq_type		fbq_type;
	enum migration_type	migration_type;
	struct list_head	tasks;
	struct rq_flags		*src_rq_rf;
};
/*
 * Is this task likely cache-hot:
 */
static int task_hot(struct task_struct *p, struct lb_env *env)
{
	s64 delta;
	lockdep_assert_held(&env->src_rq->lock);
	if (p->sched_class != &fair_sched_class)
		return 0;
	if (unlikely(task_has_idle_policy(p)))
		return 0;
	/* SMT siblings share cache */
	if (env->sd->flags & SD_SHARE_CPUCAPACITY)
		return 0;
	/*
	 * Buddy candidates are cache hot:
	 */
	if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
	    (&p->se == cfs_rq_of(&p->se)->next ||
	     &p->se == cfs_rq_of(&p->se)->last))
		return 1;
	/* sysctl sentinels: -1 == everything is hot, 0 == nothing is hot. */
	if (sysctl_sched_migration_cost == -1)
		return 1;
	if (sysctl_sched_migration_cost == 0)
		return 0;
	/* Hot if the task ran more recently than the migration-cost window. */
	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
	return delta < (s64)sysctl_sched_migration_cost;
}
#ifdef CONFIG_NUMA_BALANCING
/*
 * Returns 1, if task migration degrades locality
 * Returns 0, if task migration improves locality i.e migration preferred.
 * Returns -1, if task migration is not affected by locality.
 */
static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
	struct numa_group *numa_group = rcu_dereference(p->numa_group);
	unsigned long src_weight, dst_weight;
	int src_nid, dst_nid, dist;
	if (!static_branch_likely(&sched_numa_balancing))
		return -1;
	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
		return -1;
	src_nid = cpu_to_node(env->src_cpu);
	dst_nid = cpu_to_node(env->dst_cpu);
	/* Intra-node migration: locality is unchanged. */
	if (src_nid == dst_nid)
		return -1;
	/* Migrating away from the preferred node is always bad. */
	if (src_nid == p->numa_preferred_nid) {
		if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
			return 1;
		else
			return -1;
	}
	/* Encourage migration to the preferred node. */
	if (dst_nid == p->numa_preferred_nid)
		return 0;
	/* Leaving a core idle is often worse than degrading locality. */
	if (env->idle == CPU_IDLE)
		return -1;
	dist = node_distance(src_nid, dst_nid);
	/* Use group-wide fault weights when @p shares a numa_group. */
	if (numa_group) {
		src_weight = group_weight(p, src_nid, dist);
		dst_weight = group_weight(p, dst_nid, dist);
	} else {
		src_weight = task_weight(p, src_nid, dist);
		dst_weight = task_weight(p, dst_nid, dist);
	}
	return dst_weight < src_weight;
}
#else
static inline int migrate_degrades_locality(struct task_struct *p,
struct lb_env *env)
{
return -1;
}
#endif
/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 */
static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
	int tsk_cache_hot;
	int can_migrate = 1;

	lockdep_assert_held(&env->src_rq->lock);

	/* Give the vendor hook a chance to veto this migration. */
	trace_android_rvh_can_migrate_task(p, env->dst_cpu, &can_migrate);
	if (!can_migrate)
		return 0;

	/*
	 * We do not migrate tasks that are:
	 * 1) throttled_lb_pair, or
	 * 2) cannot be migrated to this CPU due to cpus_ptr, or
	 * 3) running (obviously), or
	 * 4) are cache-hot on their current CPU.
	 */
	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
		return 0;

	/* Disregard pcpu kthreads; they are where they need to be. */
	if (kthread_is_per_cpu(p))
		return 0;

	if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
		int cpu;

		schedstat_inc(p->se.statistics.nr_failed_migrations_affine);

		/* At least one task is pinned away from dst_cpu. */
		env->flags |= LBF_SOME_PINNED;

		/*
		 * Remember if this task can be migrated to any other CPU in
		 * our sched_group. We may want to revisit it if we couldn't
		 * meet load balance goals by pulling other tasks on src_cpu.
		 *
		 * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
		 * already computed one in current iteration.
		 */
		if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
			return 0;

		/* Prevent to re-select dst_cpu via env's CPUs: */
		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
			if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
				env->flags |= LBF_DST_PINNED;
				env->new_dst_cpu = cpu;
				break;
			}
		}

		return 0;
	}

	/* Record that we found atleast one task that could run on dst_cpu */
	env->flags &= ~LBF_ALL_PINNED;

	/* The currently running task cannot be detached. */
	if (task_running(env->src_rq, p)) {
		schedstat_inc(p->se.statistics.nr_failed_migrations_running);
		return 0;
	}

	/*
	 * Aggressive migration if:
	 * 1) destination numa is preferred
	 * 2) task is cache cold, or
	 * 3) too many balance attempts have failed.
	 */
	tsk_cache_hot = migrate_degrades_locality(p, env);
	if (tsk_cache_hot == -1)
		/* Locality is neutral; fall back to plain cache hotness. */
		tsk_cache_hot = task_hot(p, env);

	if (tsk_cache_hot <= 0 ||
	    env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
		if (tsk_cache_hot == 1) {
			/* Migration forced despite hotness; account it. */
			schedstat_inc(env->sd->lb_hot_gained[env->idle]);
			schedstat_inc(p->se.statistics.nr_forced_migrations);
		}
		return 1;
	}

	schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
	return 0;
}
/*
 * detach_task() -- detach the task for the migration specified in env
 */
static void detach_task(struct task_struct *p, struct lb_env *env)
{
	int detached = 0;

	lockdep_assert_held(&env->src_rq->lock);

	/*
	 * The vendor hook may drop the lock temporarily, so
	 * pass the rq flags to unpin lock. We expect the
	 * rq lock to be held after return.
	 */
	trace_android_rvh_migrate_queued_task(env->src_rq, env->src_rq_rf, p,
					      env->dst_cpu, &detached);
	/* The hook may have performed the whole migration itself. */
	if (detached)
		return;

	/* DEQUEUE_NOCLOCK: no rq clock update is done here. */
	deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
	set_task_cpu(p, env->dst_cpu);
}
/*
 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
 * part of active balancing operations within "domain".
 *
 * Returns a task if successful and NULL otherwise.
 */
static struct task_struct *detach_one_task(struct lb_env *env)
{
	struct task_struct *task;

	lockdep_assert_held(&env->src_rq->lock);

	list_for_each_entry_reverse(task, &env->src_rq->cfs_tasks,
				    se.group_node) {
		if (can_migrate_task(task, env)) {
			detach_task(task, env);

			/*
			 * Right now, this is only the second place where
			 * lb_gained[env->idle] is updated (other is detach_tasks)
			 * so we can safely collect stats here rather than
			 * inside detach_tasks().
			 */
			schedstat_inc(env->sd->lb_gained[env->idle]);
			return task;
		}
	}

	return NULL;
}
static const unsigned int sched_nr_migrate_break = 32;

/*
 * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
 * busiest_rq, as part of a balancing operation within domain "sd".
 *
 * Returns number of detached tasks if successful and 0 otherwise.
 */
static int detach_tasks(struct lb_env *env)
{
	struct list_head *tasks = &env->src_rq->cfs_tasks;
	unsigned long util, load;
	struct task_struct *p;
	int detached = 0;

	lockdep_assert_held(&env->src_rq->lock);

	/* Nothing to pull. */
	if (env->imbalance <= 0)
		return 0;

	while (!list_empty(tasks)) {
		/*
		 * We don't want to steal all, otherwise we may be treated likewise,
		 * which could at worst lead to a livelock crash.
		 */
		if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
			break;

		p = list_last_entry(tasks, struct task_struct, se.group_node);

		env->loop++;
		/* We've more or less seen every task there is, call it quits */
		if (env->loop > env->loop_max)
			break;

		/* take a breather every nr_migrate tasks */
		if (env->loop > env->loop_break) {
			env->loop_break += sched_nr_migrate_break;
			env->flags |= LBF_NEED_BREAK;
			break;
		}

		if (!can_migrate_task(p, env))
			goto next;

		/* How much is detached depends on what we balance by. */
		switch (env->migration_type) {
		case migrate_load:
			/*
			 * Depending of the number of CPUs and tasks and the
			 * cgroup hierarchy, task_h_load() can return a null
			 * value. Make sure that env->imbalance decreases
			 * otherwise detach_tasks() will stop only after
			 * detaching up to loop_max tasks.
			 */
			load = max_t(unsigned long, task_h_load(p), 1);

			/* LB_MIN: skip tiny loads unless balancing struggles. */
			if (sched_feat(LB_MIN) &&
			    load < 16 && !env->sd->nr_balance_failed)
				goto next;

			/*
			 * Make sure that we don't migrate too much load.
			 * Nevertheless, let relax the constraint if
			 * scheduler fails to find a good waiting task to
			 * migrate.
			 */
			if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
				goto next;

			env->imbalance -= load;
			break;

		case migrate_util:
			util = task_util_est(p);

			if (util > env->imbalance)
				goto next;

			env->imbalance -= util;
			break;

		case migrate_task:
			/* Each detached task counts as one. */
			env->imbalance--;
			break;

		case migrate_misfit:
			/* This is not a misfit task */
			if (task_fits_capacity(p, capacity_of(env->src_cpu)))
				goto next;

			/* One misfit task fully resolves the imbalance. */
			env->imbalance = 0;
			break;
		}

		detach_task(p, env);
		list_add(&p->se.group_node, &env->tasks);

		detached++;

#ifdef CONFIG_PREEMPTION
		/*
		 * NEWIDLE balancing is a source of latency, so preemptible
		 * kernels will stop after the first task is detached to minimize
		 * the critical section.
		 */
		if (env->idle == CPU_NEWLY_IDLE)
			break;
#endif

		/*
		 * We only want to steal up to the prescribed amount of
		 * load/util/tasks.
		 */
		if (env->imbalance <= 0)
			break;

		continue;
next:
		/* Not migratable right now: rotate it to the list head. */
		list_move(&p->se.group_node, tasks);
	}

	/*
	 * Right now, this is one of only two places we collect this stat
	 * so we can safely collect detach_one_task() stats here rather
	 * than inside detach_one_task().
	 */
	schedstat_add(env->sd->lb_gained[env->idle], detached);

	return detached;
}
/*
 * attach_task() -- attach the task detached by detach_task() to its new rq.
 */
static void attach_task(struct rq *rq, struct task_struct *p)
{
	lockdep_assert_held(&rq->lock);

	/* p must already point at rq via set_task_cpu() in detach_task(). */
	BUG_ON(task_rq(p) != rq);
	/* ENQUEUE_NOCLOCK: caller is expected to have updated the rq clock. */
	activate_task(rq, p, ENQUEUE_NOCLOCK);
	check_preempt_curr(rq, p, 0);
}
/*
 * attach_one_task() -- attaches the task returned from detach_one_task() to
 * its new rq.
 */
static void attach_one_task(struct rq *rq, struct task_struct *p)
{
	struct rq_flags rf;

	rq_lock(rq, &rf);
	/* attach_task() enqueues with ENQUEUE_NOCLOCK; refresh the clock. */
	update_rq_clock(rq);
	attach_task(rq, p);
	rq_unlock(rq, &rf);
}
/*
 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
 * new rq.
 */
static void attach_tasks(struct lb_env *env)
{
	struct rq *dst_rq = env->dst_rq;
	struct list_head *tasks = &env->tasks;
	struct task_struct *task;
	struct rq_flags rf;

	rq_lock(dst_rq, &rf);
	update_rq_clock(dst_rq);

	/* Drain the detached-task list front to back. */
	while (!list_empty(tasks)) {
		task = list_first_entry(tasks, struct task_struct,
					se.group_node);
		list_del_init(&task->se.group_node);

		attach_task(dst_rq, task);
	}

	rq_unlock(dst_rq, &rf);
}
#ifdef CONFIG_NO_HZ_COMMON
/* Does this cfs_rq still carry blocked load or utilization? */
static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
{
	return cfs_rq->avg.load_avg || cfs_rq->avg.util_avg;
}

/* Do the non-CFS PELT signals of @rq still carry blocked contributions? */
static inline bool others_have_blocked(struct rq *rq)
{
	bool blocked = READ_ONCE(rq->avg_rt.util_avg) ||
		       READ_ONCE(rq->avg_dl.util_avg) ||
		       thermal_load_avg(rq);

#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
	blocked = blocked || READ_ONCE(rq->avg_irq.util_avg);
#endif

	return blocked;
}

/* Record the update time and clear the blocked-load flag when drained. */
static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
{
	rq->last_blocked_load_update_tick = jiffies;

	if (!has_blocked)
		rq->has_blocked_load = 0;
}
#else
static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
static inline bool others_have_blocked(struct rq *rq) { return false; }
static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
#endif
/*
 * Decay/refresh the non-CFS PELT signals (RT, DL, thermal, IRQ) of @rq.
 * Clears *done when any of them still carries blocked contributions.
 * Returns true if any signal actually decayed.
 */
static bool __update_blocked_others(struct rq *rq, bool *done)
{
	const struct sched_class *curr_class;
	u64 now = rq_clock_pelt(rq);
	unsigned long thermal_pressure;
	bool decayed;

	/*
	 * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
	 * DL and IRQ signals have been updated before updating CFS.
	 */
	curr_class = rq->curr->sched_class;

	thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));

	/*
	 * Bitwise '|' on purpose: every update_*() call must run, even when
	 * an earlier one already reported a decay.
	 */
	decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
		  update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
		  update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
		  update_irq_load_avg(rq, 0);

	if (others_have_blocked(rq))
		*done = false;

	return decayed;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * A cfs_rq is fully decayed once its weight and all of its PELT sums
 * have reached zero; it then contributes nothing and may be dropped
 * from the leaf list.
 */
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
{
	return !cfs_rq->load.weight &&
	       !cfs_rq->avg.load_sum &&
	       !cfs_rq->avg.util_sum &&
	       !cfs_rq->avg.runnable_sum;
}
/*
 * Decay/refresh the blocked load of every leaf cfs_rq on @rq. Clears *done
 * when some cfs_rq still carries blocked load. Returns true when the root
 * cfs_rq average decayed.
 */
static bool __update_blocked_fair(struct rq *rq, bool *done)
{
	struct cfs_rq *cfs_rq, *pos;
	bool decayed = false;
	int cpu = cpu_of(rq);

	/*
	 * Iterates the task_group tree in a bottom up fashion, see
	 * list_add_leaf_cfs_rq() for details.
	 */
	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
		struct sched_entity *se;

		if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
			update_tg_load_avg(cfs_rq);

			/* Only a root cfs_rq decay is reported to the caller. */
			if (cfs_rq == &rq->cfs)
				decayed = true;
		}

		/* Propagate pending load changes to the parent, if any: */
		se = cfs_rq->tg->se[cpu];
		if (se && !skip_blocked_update(se))
			update_load_avg(cfs_rq_of(se), se, UPDATE_TG);

		/*
		 * There can be a lot of idle CPU cgroups. Don't let fully
		 * decayed cfs_rqs linger on the list.
		 */
		if (cfs_rq_is_decayed(cfs_rq))
			list_del_leaf_cfs_rq(cfs_rq);

		/* Don't need periodic decay once load/util_avg are null */
		if (cfs_rq_has_blocked(cfs_rq))
			*done = false;
	}

	return decayed;
}
/*
 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
 * This needs to be done in a top-down fashion because the load of a child
 * group is a fraction of its parents load.
 */
static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
{
	struct rq *rq = rq_of(cfs_rq);
	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
	unsigned long now = jiffies;
	unsigned long load;

	/* h_load is refreshed at most once per jiffy. */
	if (cfs_rq->last_h_load_update == now)
		return;

	/*
	 * First pass (bottom-up): record the path to the first up-to-date
	 * ancestor via h_load_next so the second pass can walk back down.
	 */
	WRITE_ONCE(cfs_rq->h_load_next, NULL);
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		WRITE_ONCE(cfs_rq->h_load_next, se);
		if (cfs_rq->last_h_load_update == now)
			break;
	}

	/* Reached the root: its h_load is simply its own load average. */
	if (!se) {
		cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
		cfs_rq->last_h_load_update = now;
	}

	/*
	 * Second pass (top-down): each level's h_load is the parent's h_load
	 * scaled by this entity's share of the parent's load.
	 */
	while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
		load = cfs_rq->h_load;
		load = div64_ul(load * se->avg.load_avg,
				cfs_rq_load_avg(cfs_rq) + 1);
		cfs_rq = group_cfs_rq(se);
		cfs_rq->h_load = load;
		cfs_rq->last_h_load_update = now;
	}
}
/*
 * task_h_load - hierarchical load of @p: its load average scaled by the
 * cumulative share of its cfs_rq hierarchy.
 */
static unsigned long task_h_load(struct task_struct *p)
{
	struct cfs_rq *cfs_rq = task_cfs_rq(p);

	update_cfs_rq_h_load(cfs_rq);
	/* The +1 avoids a division by zero when the cfs_rq load is 0. */
	return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
			cfs_rq_load_avg(cfs_rq) + 1);
}
#else
/* !CONFIG_FAIR_GROUP_SCHED: only the root cfs_rq exists. */
static bool __update_blocked_fair(struct rq *rq, bool *done)
{
	struct cfs_rq *cfs_rq = &rq->cfs;
	bool decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);

	if (cfs_rq_has_blocked(cfs_rq))
		*done = false;

	return decayed;
}

/* Without group scheduling, a task's hierarchical load is its own load. */
static unsigned long task_h_load(struct task_struct *p)
{
	return p->se.avg.load_avg;
}
#endif
/*
 * Decay the blocked (non-runnable) PELT averages of all classes on @cpu and
 * notify cpufreq when something actually decayed.
 */
static void update_blocked_averages(int cpu)
{
	bool decayed = false, done = true;
	struct rq *rq = cpu_rq(cpu);
	struct rq_flags rf;

	rq_lock_irqsave(rq, &rf);
	update_rq_clock(rq);

	decayed |= __update_blocked_others(rq, &done);
	decayed |= __update_blocked_fair(rq, &done);

	/* 'done' means no blocked load is left anywhere on this rq. */
	update_blocked_load_status(rq, !done);
	if (decayed)
		cpufreq_update_util(rq, 0);
	rq_unlock_irqrestore(rq, &rf);
}
/********** Helpers for find_busiest_group ************************/

/*
 * sg_lb_stats - stats of a sched_group required for load_balancing
 */
struct sg_lb_stats {
	unsigned long avg_load; /*Avg load across the CPUs of the group */
	unsigned long group_load; /* Total load over the CPUs of the group */
	unsigned long group_capacity; /* Capacity of the group (group->sgc->capacity) */
	unsigned long group_util; /* Total utilization over the CPUs of the group */
	unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
	unsigned int sum_nr_running; /* Nr of tasks running in the group */
	unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
	unsigned int idle_cpus; /* Nr of idle CPUs in the group */
	unsigned int group_weight; /* Nr of CPUs (group->group_weight) */
	enum group_type group_type; /* Classification, see group_classify() */
	unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
	unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
	unsigned int nr_numa_running; /* Sum of rq->nr_numa_running */
	unsigned int nr_preferred_running; /* Sum of rq->nr_preferred_running */
#endif
};
/*
 * sd_lb_stats - Structure to store the statistics of a sched_domain
 * during load balancing.
 *
 * Reset by init_sd_lb_stats() at the start of each balance pass.
 */
struct sd_lb_stats {
	struct sched_group *busiest; /* Busiest group in this sd */
	struct sched_group *local; /* Local group in this sd */
	unsigned long total_load; /* Total load of all groups in sd */
	unsigned long total_capacity; /* Total capacity of all groups in sd */
	unsigned long avg_load; /* Average load across all groups in sd */
	unsigned int prefer_sibling; /* tasks should go to sibling first */
	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
	struct sg_lb_stats local_stat; /* Statistics of the local group */
};
static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
{
/*
* Skimp on the clearing to avoid duplicate work. We can avoid clearing
* local_stat because update_sg_lb_stats() does a full clear/assignment.
* We must however set busiest_stat::group_type and
* busiest_stat::idle_cpus to the worst busiest group because
* update_sd_pick_busiest() reads these before assignment.
*/
*sds = (struct sd_lb_stats){
.busiest = NULL,
.local = NULL,
.total_load = 0UL,
.total_capacity = 0UL,
.busiest_stat = {
.idle_cpus = UINT_MAX,
.group_type = group_has_spare,
},
};
}
/*
 * scale_rt_capacity - capacity left for CFS on @cpu after subtracting the
 * RT, DL, thermal and IRQ pressure. Never returns 0.
 */
static unsigned long scale_rt_capacity(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long max_cap = arch_scale_cpu_capacity(cpu);
	unsigned long pressure, free_cap;
	unsigned long irq_util;

	irq_util = cpu_util_irq(rq);

	/* IRQ activity alone saturates the CPU: report minimal capacity. */
	if (unlikely(irq_util >= max_cap))
		return 1;

	/*
	 * avg_rt.util_avg and avg_dl.util_avg track binary signals
	 * (running and not running) with weights 0 and 1024 respectively.
	 * avg_thermal.load_avg tracks thermal pressure and the weighted
	 * average uses the actual delta max capacity(load).
	 */
	pressure = READ_ONCE(rq->avg_rt.util_avg);
	pressure += READ_ONCE(rq->avg_dl.util_avg);
	pressure += thermal_load_avg(rq);

	if (unlikely(pressure >= max_cap))
		return 1;

	free_cap = max_cap - pressure;

	return scale_irq_capacity(free_cap, irq_util, max_cap);
}
/*
 * update_cpu_capacity - refresh the CFS capacity of @cpu and mirror it into
 * the bottom-level sched_group's capacity fields.
 */
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
	unsigned long capacity = scale_rt_capacity(cpu);
	struct sched_group *sdg = sd->groups;

	cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);

	/* Never advertise zero capacity. */
	if (!capacity)
		capacity = 1;

	/* The vendor hook may override the computed capacity. */
	trace_android_rvh_update_cpu_capacity(cpu, &capacity);
	cpu_rq(cpu)->cpu_capacity = capacity;
	trace_sched_cpu_capacity_tp(cpu_rq(cpu));

	/* Single-CPU group: min == max == capacity. */
	sdg->sgc->capacity = capacity;
	sdg->sgc->min_capacity = capacity;
	sdg->sgc->max_capacity = capacity;
}
/*
 * update_group_capacity - refresh sd->groups->sgc capacity values, either
 * straight from the CPU (bottom level) or by aggregating the child groups.
 */
void update_group_capacity(struct sched_domain *sd, int cpu)
{
	struct sched_domain *child = sd->child;
	struct sched_group *group, *sdg = sd->groups;
	unsigned long capacity, min_capacity, max_capacity;
	unsigned long interval;

	/* Schedule the next refresh, bounded by the global maximum. */
	interval = msecs_to_jiffies(sd->balance_interval);
	interval = clamp(interval, 1UL, max_load_balance_interval);
	sdg->sgc->next_update = jiffies + interval;

	if (!child) {
		/* Bottom level: read the capacity straight off the CPU. */
		update_cpu_capacity(sd, cpu);
		return;
	}

	capacity = 0;
	min_capacity = ULONG_MAX;
	max_capacity = 0;

	if (child->flags & SD_OVERLAP) {
		/*
		 * SD_OVERLAP domains cannot assume that child groups
		 * span the current group.
		 */
		for_each_cpu(cpu, sched_group_span(sdg)) {
			unsigned long cpu_cap = capacity_of(cpu);

			capacity += cpu_cap;
			min_capacity = min(cpu_cap, min_capacity);
			max_capacity = max(cpu_cap, max_capacity);
		}
	} else {
		/*
		 * !SD_OVERLAP domains can assume that child groups
		 * span the current group.
		 */
		group = child->groups;
		do {
			struct sched_group_capacity *sgc = group->sgc;

			capacity += sgc->capacity;
			min_capacity = min(sgc->min_capacity, min_capacity);
			max_capacity = max(sgc->max_capacity, max_capacity);
			group = group->next;
		} while (group != child->groups);
	}

	sdg->sgc->capacity = capacity;
	sdg->sgc->min_capacity = min_capacity;
	sdg->sgc->max_capacity = max_capacity;
}
/*
 * Check whether the capacity of the rq has been noticeably reduced by side
 * activity. The imbalance_pct is used for the threshold.
 * Return true if the capacity is reduced.
 */
static inline int
check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
{
	unsigned long scaled_cap = rq->cpu_capacity * sd->imbalance_pct;

	return scaled_cap < rq->cpu_capacity_orig * 100;
}
/*
 * Check whether a rq has a misfit task and if it looks like we can actually
 * help that task: we can migrate the task to a CPU of higher capacity, or
 * the task's current CPU is heavily pressured.
 */
static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
{
	if (!rq->misfit_task_load)
		return 0;

	/* A bigger CPU exists elsewhere in the root domain. */
	if (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity)
		return 1;

	return check_cpu_capacity(rq, sd);
}
/*
 * Group imbalance indicates (and tries to solve) the problem where balancing
 * groups is inadequate due to ->cpus_ptr constraints.
 *
 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
 * Something like:
 *
 *	{ 0 1 2 3 } { 4 5 6 7 }
 *	        *     * * *
 *
 * If we were to balance group-wise we'd place two tasks in the first group and
 * two tasks in the second group. Clearly this is undesired as it will overload
 * cpu 3 and leave one of the CPUs in the second group unused.
 *
 * The current solution to this issue is detecting the skew in the first group
 * by noticing the lower domain failed to reach balance and had difficulty
 * moving tasks due to affinity constraints.
 *
 * When this is so detected; this group becomes a candidate for busiest; see
 * update_sd_pick_busiest(). And calculate_imbalance() and
 * find_busiest_group() avoid some of the usual balance conditions to allow it
 * to create an effective group imbalance.
 *
 * This is a somewhat tricky proposition since the next run might not find the
 * group imbalance and decide the groups need to be balanced again. A most
 * subtle and fragile situation.
 */
static inline int sg_imbalanced(struct sched_group *group)
{
	/* Flag set by the lower-domain balance pass, as described above. */
	return group->sgc->imbalance;
}
/*
 * group_has_capacity returns true if the group has spare capacity that could
 * be used by some tasks.
 * We consider that a group has spare capacity if the * number of task is
 * smaller than the number of CPUs or if the utilization is lower than the
 * available capacity for CFS tasks.
 * For the latter, we use a threshold to stabilize the state, to take into
 * account the variance of the tasks' load and to return true if the available
 * capacity in meaningful for the load balancer.
 * As an example, an available capacity of 1% can appear but it doesn't make
 * any benefit for the load balance.
 */
static inline bool
group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
{
	/* Fewer tasks than CPUs: trivially has room. */
	if (sgs->sum_nr_running < sgs->group_weight)
		return true;

	/* Runnable time pressure dominates: no spare capacity. */
	if (sgs->group_capacity * imbalance_pct <
	    sgs->group_runnable * 100)
		return false;

	/* Otherwise: spare capacity iff utilization leaves enough headroom. */
	return sgs->group_capacity * 100 >
	       sgs->group_util * imbalance_pct;
}
/*
 *  group_is_overloaded returns true if the group has more tasks than it can
 *  handle.
 *  group_is_overloaded is not equals to !group_has_capacity because a group
 *  with the exact right number of tasks, has no more spare capacity but is not
 *  overloaded so both group_has_capacity and group_is_overloaded return
 *  false.
 */
static inline bool
group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
{
	/* At most one task per CPU: cannot be overloaded. */
	if (sgs->sum_nr_running <= sgs->group_weight)
		return false;

	/* Utilization exceeds the thresholded capacity. */
	if (sgs->group_capacity * 100 <
	    sgs->group_util * imbalance_pct)
		return true;

	/* Or runnable time exceeds the thresholded capacity. */
	return sgs->group_capacity * imbalance_pct <
	       sgs->group_runnable * 100;
}
/*
 * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
 * per-CPU capacity than sched_group ref.
 *
 * NOTE(review): fits_capacity() presumably applies a headroom margin rather
 * than a plain '<' — confirm against its definition.
 */
static inline bool
group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
{
	return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
}
/*
 * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller
 * per-CPU capacity_orig than sched_group ref.
 *
 * Mirrors group_smaller_min_cpu_capacity() but compares the biggest CPU of
 * each group instead of the smallest.
 */
static inline bool
group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
{
	return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
}
/*
 * group_classify - map a group's statistics onto a group_type.
 *
 * The checks are ordered; the first matching (most constrained) state wins.
 */
static inline enum
group_type group_classify(unsigned int imbalance_pct,
			  struct sched_group *group,
			  struct sg_lb_stats *sgs)
{
	if (group_is_overloaded(imbalance_pct, sgs))
		return group_overloaded;

	if (sg_imbalanced(group))
		return group_imbalanced;

	if (sgs->group_asym_packing)
		return group_asym_packing;

	if (sgs->group_misfit_task_load)
		return group_misfit_task;

	if (!group_has_capacity(imbalance_pct, sgs))
		return group_fully_busy;

	return group_has_spare;
}
/*
 * update_nohz_stats - refresh the blocked load of a nohz-idle @rq.
 * @rq: runqueue to update.
 * @force: update even when the tick-based rate limit has not expired.
 *
 * Returns true when @rq still has blocked load after the update (or when the
 * rate limit skipped the update), false when there is nothing left to decay.
 */
static bool update_nohz_stats(struct rq *rq, bool force)
{
#ifdef CONFIG_NO_HZ_COMMON
	unsigned int cpu = rq->cpu;

	if (!rq->has_blocked_load)
		return false;

	/* Only CPUs parked in the nohz-idle mask are handled here. */
	if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
		return false;

	/* Rate limit: the load was refreshed recently enough. */
	if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
		return true;

	update_blocked_averages(cpu);

	return rq->has_blocked_load;
#else
	return false;
#endif
}
/**
 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
 * @env: The load balancing environment.
 * @group: sched_group whose statistics are to be updated.
 * @sgs: variable to hold the statistics for this group.
 * @sg_status: Holds flag indicating the status of the sched_group
 */
static inline void update_sg_lb_stats(struct lb_env *env,
				      struct sched_group *group,
				      struct sg_lb_stats *sgs,
				      int *sg_status)
{
	int i, nr_running, local_group;

	memset(sgs, 0, sizeof(*sgs));

	local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));

	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
		struct rq *rq = cpu_rq(i);

		/* Refresh stale blocked load of nohz-idle CPUs if needed. */
		if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
			env->flags |= LBF_NOHZ_AGAIN;

		sgs->group_load += cpu_load(rq);
		sgs->group_util += cpu_util(i);
		sgs->group_runnable += cpu_runnable(rq);
		sgs->sum_h_nr_running += rq->cfs.h_nr_running;

		nr_running = rq->nr_running;
		sgs->sum_nr_running += nr_running;

		if (nr_running > 1)
			*sg_status |= SG_OVERLOAD;

		if (cpu_overutilized(i))
			*sg_status |= SG_OVERUTILIZED;

#ifdef CONFIG_NUMA_BALANCING
		sgs->nr_numa_running += rq->nr_numa_running;
		sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
		/*
		 * No need to call idle_cpu() if nr_running is not 0
		 */
		if (!nr_running && idle_cpu(i)) {
			sgs->idle_cpus++;
			/* Idle cpu can't have misfit task */
			continue;
		}

		/* Misfit load is only tracked for remote groups. */
		if (local_group)
			continue;

		/* Check for a misfit task on the cpu */
		if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
		    sgs->group_misfit_task_load < rq->misfit_task_load) {
			sgs->group_misfit_task_load = rq->misfit_task_load;
			*sg_status |= SG_OVERLOAD;
		}
	}

	/* Check if dst CPU is idle and preferred to this group */
	if (env->sd->flags & SD_ASYM_PACKING &&
	    env->idle != CPU_NOT_IDLE &&
	    sgs->sum_h_nr_running &&
	    sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
		sgs->group_asym_packing = 1;
	}

	sgs->group_capacity = group->sgc->capacity;

	sgs->group_weight = group->group_weight;

	sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);

	/* Computing avg_load makes sense only when group is overloaded */
	if (sgs->group_type == group_overloaded)
		sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
				sgs->group_capacity;
}
/**
 * update_sd_pick_busiest - return 1 on busiest group
 * @env: The load balancing environment.
 * @sds: sched_domain statistics
 * @sg: sched_group candidate to be checked for being the busiest
 * @sgs: sched_group statistics
 *
 * Determine if @sg is a busier group than the previously selected
 * busiest group.
 *
 * Return: %true if @sg is a busier group than the previously selected
 * busiest group. %false otherwise.
 */
static bool update_sd_pick_busiest(struct lb_env *env,
				   struct sd_lb_stats *sds,
				   struct sched_group *sg,
				   struct sg_lb_stats *sgs)
{
	struct sg_lb_stats *busiest = &sds->busiest_stat;

	/* Make sure that there is at least one task to pull */
	if (!sgs->sum_h_nr_running)
		return false;

	/*
	 * Don't try to pull misfit tasks we can't help.
	 * We can use max_capacity here as reduction in capacity on some
	 * CPUs in the group should either be possible to resolve
	 * internally or be covered by avg_load imbalance (eventually).
	 */
	if (sgs->group_type == group_misfit_task &&
	    (!group_smaller_max_cpu_capacity(sg, sds->local) ||
	     sds->local_stat.group_type != group_has_spare))
		return false;

	/* A more constrained group_type always wins. */
	if (sgs->group_type > busiest->group_type)
		return true;

	if (sgs->group_type < busiest->group_type)
		return false;

	/*
	 * The candidate and the current busiest group are the same type of
	 * group. Let check which one is the busiest according to the type.
	 */
	switch (sgs->group_type) {
	case group_overloaded:
		/* Select the overloaded group with highest avg_load. */
		if (sgs->avg_load <= busiest->avg_load)
			return false;
		break;

	case group_imbalanced:
		/*
		 * Select the 1st imbalanced group as we don't have any way to
		 * choose one more than another.
		 */
		return false;

	case group_asym_packing:
		/* Prefer to move from lowest priority CPU's work */
		if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
			return false;
		break;

	case group_misfit_task:
		/*
		 * If we have more than one misfit sg go with the biggest
		 * misfit.
		 */
		if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
			return false;
		break;

	case group_fully_busy:
		/*
		 * Select the fully busy group with highest avg_load. In
		 * theory, there is no need to pull task from such kind of
		 * group because tasks have all compute capacity that they need
		 * but we can still improve the overall throughput by reducing
		 * contention when accessing shared HW resources.
		 *
		 * XXX for now avg_load is not computed and always 0 so we
		 * select the 1st one.
		 */
		if (sgs->avg_load <= busiest->avg_load)
			return false;
		break;

	case group_has_spare:
		/*
		 * Select not overloaded group with lowest number of idle cpus
		 * and highest number of running tasks. We could also compare
		 * the spare capacity which is more stable but it can end up
		 * that the group has less spare capacity but finally more idle
		 * CPUs which means less opportunity to pull tasks.
		 */
		if (sgs->idle_cpus > busiest->idle_cpus)
			return false;
		else if ((sgs->idle_cpus == busiest->idle_cpus) &&
			 (sgs->sum_nr_running <= busiest->sum_nr_running))
			return false;

		break;
	}

	/*
	 * Candidate sg has no more than one task per CPU and has higher
	 * per-CPU capacity. Migrating tasks to less capable CPUs may harm
	 * throughput. Maximize throughput, power/energy consequences are not
	 * considered.
	 */
	if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
	    (sgs->group_type <= group_fully_busy) &&
	    (group_smaller_min_cpu_capacity(sds->local, sg)))
		return false;

	return true;
}
#ifdef CONFIG_NUMA_BALANCING
/* Classify a group by how its tasks relate to the NUMA placement counters. */
static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
{
	if (sgs->sum_h_nr_running > sgs->nr_numa_running)
		return regular;

	return sgs->sum_h_nr_running > sgs->nr_preferred_running ? remote : all;
}

/* Same classification as above, applied to a single runqueue. */
static inline enum fbq_type fbq_classify_rq(struct rq *rq)
{
	if (rq->nr_running > rq->nr_numa_running)
		return regular;

	return rq->nr_running > rq->nr_preferred_running ? remote : all;
}
#else
static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
{
	return all;
}

static inline enum fbq_type fbq_classify_rq(struct rq *rq)
{
	return regular;
}
#endif /* CONFIG_NUMA_BALANCING */
struct sg_lb_stats;

/*
 * task_running_on_cpu - return 1 if @p is running on @cpu.
 */
static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
{
	/* Task has no contribution or is new */
	if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
		return 0;

	return task_on_rq_queued(p) ? 1 : 0;
}
/**
 * idle_cpu_without - would a given CPU be idle without p ?
 * @cpu: the processor on which idleness is tested.
 * @p: task which should be ignored.
 *
 * Return: 1 if the CPU would be idle. 0 otherwise.
 */
static int idle_cpu_without(int cpu, struct task_struct *p)
{
	struct rq *rq = cpu_rq(cpu);

	/* Anything other than the idle task or @p itself means busy. */
	if (rq->curr != rq->idle && rq->curr != p)
		return 0;

	/*
	 * rq->nr_running can't be used but an updated version without the
	 * impact of p on cpu must be used instead. The updated nr_running
	 * must be computed and tested before calling idle_cpu_without().
	 */

#ifdef CONFIG_SMP
	/* A pending queued wakeup is about to make this CPU busy. */
	if (rq->ttwu_pending)
		return 0;
#endif

	return 1;
}
/*
 * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
 * @sd: The sched_domain level to look for idlest group.
 * @group: sched_group whose statistics are to be updated.
 * @sgs: variable to hold the statistics for this group.
 * @p: The task for which we look for the idlest group/CPU.
 */
static inline void update_sg_wakeup_stats(struct sched_domain *sd,
					  struct sched_group *group,
					  struct sg_lb_stats *sgs,
					  struct task_struct *p)
{
	int i, nr_running;

	memset(sgs, 0, sizeof(*sgs));

	for_each_cpu(i, sched_group_span(group)) {
		struct rq *rq = cpu_rq(i);
		unsigned int local;

		/* All stats are computed as if @p were not on this CPU. */
		sgs->group_load += cpu_load_without(rq, p);
		sgs->group_util += cpu_util_without(i, p);
		sgs->group_runnable += cpu_runnable_without(rq, p);
		local = task_running_on_cpu(i, p);
		sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;

		nr_running = rq->nr_running - local;
		sgs->sum_nr_running += nr_running;

		/*
		 * No need to call idle_cpu_without() if nr_running is not 0
		 */
		if (!nr_running && idle_cpu_without(i, p))
			sgs->idle_cpus++;

	}

	/* Check if task fits in the group */
	if (sd->flags & SD_ASYM_CPUCAPACITY &&
	    !task_fits_capacity(p, group->sgc->max_capacity)) {
		/* Any non-zero value marks the group as misfit. */
		sgs->group_misfit_task_load = 1;
	}

	sgs->group_capacity = group->sgc->capacity;

	sgs->group_weight = group->group_weight;

	sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);

	/*
	 * Computing avg_load makes sense only when group is fully busy or
	 * overloaded
	 */
	if (sgs->group_type == group_fully_busy ||
	    sgs->group_type == group_overloaded)
		sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
				sgs->group_capacity;
}
/*
 * update_pick_idlest - return true when @group is idler than the currently
 * selected @idlest/@idlest_sgs and should replace it.
 */
static bool update_pick_idlest(struct sched_group *idlest,
			       struct sg_lb_stats *idlest_sgs,
			       struct sched_group *group,
			       struct sg_lb_stats *sgs)
{
	/* A lower group_type always wins. */
	if (sgs->group_type < idlest_sgs->group_type)
		return true;

	if (sgs->group_type > idlest_sgs->group_type)
		return false;

	/*
	 * The candidate and the current idlest group are the same type of
	 * group. Let check which one is the idlest according to the type.
	 */

	switch (sgs->group_type) {
	case group_overloaded:
	case group_fully_busy:
		/* Select the group with lowest avg_load. */
		if (idlest_sgs->avg_load <= sgs->avg_load)
			return false;
		break;

	case group_imbalanced:
	case group_asym_packing:
		/* Those types are not used in the slow wakeup path */
		return false;

	case group_misfit_task:
		/* Select group with the highest max capacity */
		if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
			return false;
		break;

	case group_has_spare:
		/* Select group with most idle CPUs */
		if (idlest_sgs->idle_cpus > sgs->idle_cpus)
			return false;

		/* Select group with lowest group_util */
		if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
		    idlest_sgs->group_util <= sgs->group_util)
			return false;

		break;
	}

	return true;
}
/*
* find_idlest_group() finds and returns the least busy CPU group within the
* domain.
*
* Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
{
	struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
	struct sg_lb_stats local_sgs, tmp_sgs;
	struct sg_lb_stats *sgs;
	unsigned long imbalance;
	/* Sentinel: any real candidate group beats this initial value. */
	struct sg_lb_stats idlest_sgs = {
			.avg_load = UINT_MAX,
			.group_type = group_overloaded,
	};
	/* Load slack tolerated on the remote group before pushing the task. */
	imbalance = scale_load_down(NICE_0_LOAD) *
				(sd->imbalance_pct-100) / 100;
	/* Visit every group once, recording local and idlest-remote stats. */
	do {
		int local_group;
		/* Skip over this group if it has no CPUs allowed */
		if (!cpumask_intersects(sched_group_span(group),
					p->cpus_ptr))
			continue;
		local_group = cpumask_test_cpu(this_cpu,
					       sched_group_span(group));
		if (local_group) {
			sgs = &local_sgs;
			local = group;
		} else {
			sgs = &tmp_sgs;
		}
		update_sg_wakeup_stats(sd, group, sgs, p);
		if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
			idlest = group;
			idlest_sgs = *sgs;
		}
	} while (group = group->next, group != sd->groups);
	/* There is no idlest group to push tasks to */
	if (!idlest)
		return NULL;
	/* The local group has been skipped because of CPU affinity */
	if (!local)
		return idlest;
	/*
	 * If the local group is idler than the selected idlest group
	 * don't try and push the task.
	 */
	if (local_sgs.group_type < idlest_sgs.group_type)
		return NULL;
	/*
	 * If the local group is busier than the selected idlest group
	 * try and push the task.
	 */
	if (local_sgs.group_type > idlest_sgs.group_type)
		return idlest;
	/* Same group type: tie-break according to that type. */
	switch (local_sgs.group_type) {
	case group_overloaded:
	case group_fully_busy:
		/*
		 * When comparing groups across NUMA domains, it's possible for
		 * the local domain to be very lightly loaded relative to the
		 * remote domains but "imbalance" skews the comparison making
		 * remote CPUs look much more favourable. When considering
		 * cross-domain, add imbalance to the load on the remote node
		 * and consider staying local.
		 */
		if ((sd->flags & SD_NUMA) &&
		    ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
			return NULL;
		/*
		 * If the local group is less loaded than the selected
		 * idlest group don't try and push any tasks.
		 */
		if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
			return NULL;
		if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
			return NULL;
		break;
	case group_imbalanced:
	case group_asym_packing:
		/* Those types are not used in the slow wakeup path */
		return NULL;
	case group_misfit_task:
		/* Select group with the highest max capacity */
		if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
			return NULL;
		break;
	case group_has_spare:
		if (sd->flags & SD_NUMA) {
#ifdef CONFIG_NUMA_BALANCING
			int idlest_cpu;
			/*
			 * If there is spare capacity at NUMA, try to select
			 * the preferred node
			 */
			if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
				return NULL;
			idlest_cpu = cpumask_first(sched_group_span(idlest));
			if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
				return idlest;
#endif
			/*
			 * Otherwise, keep the task on this node to stay close
			 * its wakeup source and improve locality. If there is
			 * a real need of migration, periodic load balance will
			 * take care of it.
			 */
			if (local_sgs.idle_cpus)
				return NULL;
		}
		/*
		 * Select group with highest number of idle CPUs. We could also
		 * compare the utilization which is more stable but it can end
		 * up that the group has less spare capacity but finally more
		 * idle CPUs which means more opportunity to run task.
		 */
		if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
			return NULL;
		break;
	}
	return idlest;
}
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
* @sds: variable to hold the statistics for this sched_domain.
*/
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
	struct sched_domain *child = env->sd->child;
	struct sched_group *sg = env->sd->groups;
	struct sg_lb_stats *local = &sds->local_stat;
	struct sg_lb_stats tmp_sgs;
	int sg_status = 0;
#ifdef CONFIG_NO_HZ_COMMON
	/* Newly-idle balance may also refresh nohz blocked-load stats. */
	if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
		env->flags |= LBF_NOHZ_STATS;
#endif
	/* Walk every group of the domain exactly once. */
	do {
		struct sg_lb_stats *sgs = &tmp_sgs;
		int local_group;
		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
		if (local_group) {
			sds->local = sg;
			sgs = local;
			/* Rate-limit capacity refresh unless newly idle. */
			if (env->idle != CPU_NEWLY_IDLE ||
			    time_after_eq(jiffies, sg->sgc->next_update))
				update_group_capacity(env->sd, env->dst_cpu);
		}
		update_sg_lb_stats(env, sg, sgs, &sg_status);
		/* The local group is never a candidate for busiest. */
		if (local_group)
			goto next_group;
		if (update_sd_pick_busiest(env, sds, sg, sgs)) {
			sds->busiest = sg;
			sds->busiest_stat = *sgs;
		}
next_group:
		/* Now, start updating sd_lb_stats */
		sds->total_load += sgs->group_load;
		sds->total_capacity += sgs->group_capacity;
		sg = sg->next;
	} while (sg != env->sd->groups);
	/* Tag domain that child domain prefers tasks go to siblings first */
	sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
#ifdef CONFIG_NO_HZ_COMMON
	/* All idle CPUs covered by this domain: defer next blocked update. */
	if ((env->flags & LBF_NOHZ_AGAIN) &&
	    cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
		WRITE_ONCE(nohz.next_blocked,
			   jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
	}
#endif
	if (env->sd->flags & SD_NUMA)
		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
	if (!env->sd->parent) {
		struct root_domain *rd = env->dst_rq->rd;
		/* update overload indicator if we are at root domain */
		WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
		/* Update over-utilization (tipping point, U >= 0) indicator */
		WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
		trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
	} else if (sg_status & SG_OVERUTILIZED) {
		struct root_domain *rd = env->dst_rq->rd;
		/* Propagate over-utilization even below the root domain. */
		WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
		trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
	}
}
/*
 * Possibly forgive a small imbalance between NUMA groups.
 *
 * A simple pair of communicating tasks is allowed to remain local while
 * the source domain is almost idle: with at most two running tasks the
 * reported imbalance is zero, otherwise @imbalance is passed through.
 */
static inline long adjust_numa_imbalance(int imbalance, int nr_running)
{
	/* Up to this many running tasks, the imbalance is ignored. */
	const unsigned int allowed_pair = 2;
	return (nr_running <= allowed_pair) ? 0 : imbalance;
}
/**
* calculate_imbalance - Calculate the amount of imbalance present within the
* groups of a given sched_domain during load balance.
* @env: load balance environment
* @sds: statistics of the sched_domain whose imbalance is to be calculated.
*/
static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
	struct sg_lb_stats *local, *busiest;
	local = &sds->local_stat;
	busiest = &sds->busiest_stat;
	if (busiest->group_type == group_misfit_task) {
		/* Set imbalance to allow misfit tasks to be balanced. */
		env->migration_type = migrate_misfit;
		env->imbalance = 1;
		return;
	}
	if (busiest->group_type == group_asym_packing) {
		/*
		 * In case of asym capacity, we will try to migrate all load to
		 * the preferred CPU.
		 */
		env->migration_type = migrate_task;
		env->imbalance = busiest->sum_h_nr_running;
		return;
	}
	if (busiest->group_type == group_imbalanced) {
		/*
		 * In the group_imb case we cannot rely on group-wide averages
		 * to ensure CPU-load equilibrium, try to move any task to fix
		 * the imbalance. The next load balance will take care of
		 * balancing back the system.
		 */
		env->migration_type = migrate_task;
		env->imbalance = 1;
		return;
	}
	/*
	 * Try to use spare capacity of local group without overloading it or
	 * emptying busiest.
	 */
	if (local->group_type == group_has_spare) {
		if ((busiest->group_type > group_fully_busy) &&
		    !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
			/*
			 * If busiest is overloaded, try to fill spare
			 * capacity. This might end up creating spare capacity
			 * in busiest or busiest still being overloaded but
			 * there is no simple way to directly compute the
			 * amount of load to migrate in order to balance the
			 * system.
			 */
			env->migration_type = migrate_util;
			env->imbalance = max(local->group_capacity, local->group_util) -
					 local->group_util;
			/*
			 * In some cases, the group's utilization is max or even
			 * higher than capacity because of migrations but the
			 * local CPU is (newly) idle. There is at least one
			 * waiting task in this overloaded busiest group. Let's
			 * try to pull it.
			 */
			if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
				env->migration_type = migrate_task;
				env->imbalance = 1;
			}
			return;
		}
		if (busiest->group_weight == 1 || sds->prefer_sibling) {
			unsigned int nr_diff = busiest->sum_nr_running;
			/*
			 * When prefer sibling, evenly spread running tasks on
			 * groups.
			 */
			env->migration_type = migrate_task;
			/* Move half the task-count difference (clamped at 0). */
			lsub_positive(&nr_diff, local->sum_nr_running);
			env->imbalance = nr_diff >> 1;
		} else {
			/*
			 * If there is no overload, we just want to even the number of
			 * idle cpus.
			 */
			env->migration_type = migrate_task;
			env->imbalance = max_t(long, 0, (local->idle_cpus -
						 busiest->idle_cpus) >> 1);
		}
		/* Consider allowing a small imbalance between NUMA groups */
		if (env->sd->flags & SD_NUMA)
			env->imbalance = adjust_numa_imbalance(env->imbalance,
						busiest->sum_nr_running);
		return;
	}
	/*
	 * Local is fully busy but has to take more load to relieve the
	 * busiest group
	 */
	if (local->group_type < group_overloaded) {
		/*
		 * Local will become overloaded so the avg_load metrics are
		 * finally needed.
		 */
		local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
				  local->group_capacity;
		sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
				sds->total_capacity;
		/*
		 * If the local group is more loaded than the selected
		 * busiest group don't try to pull any tasks.
		 */
		if (local->avg_load >= busiest->avg_load) {
			env->imbalance = 0;
			return;
		}
	}
	/*
	 * Both groups are or will become overloaded and we're trying to get all
	 * the CPUs to the average_load, so we don't want to push ourselves
	 * above the average load, nor do we wish to reduce the max loaded CPU
	 * below the average load. At the same time, we also don't want to
	 * reduce the group load below the group capacity. Thus we look for
	 * the minimum possible imbalance.
	 */
	env->migration_type = migrate_load;
	env->imbalance = min(
		(busiest->avg_load - sds->avg_load) * busiest->group_capacity,
		(sds->avg_load - local->avg_load) * local->group_capacity
	) / SCHED_CAPACITY_SCALE;
}
/******* find_busiest_group() helpers end here *********************/
/*
* Decision matrix according to the local and busiest group type:
*
* busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
* has_spare nr_idle balanced N/A N/A balanced balanced
* fully_busy nr_idle nr_idle N/A N/A balanced balanced
* misfit_task force N/A N/A N/A force force
* asym_packing force force N/A N/A force force
* imbalanced force force N/A N/A force force
* overloaded force force N/A N/A force avg_load
*
* N/A : Not Applicable because already filtered while updating
* statistics.
* balanced : The system is balanced for these 2 groups.
* force : Calculate the imbalance as load migration is probably needed.
* avg_load : Only if imbalance is significant enough.
* nr_idle : dst_cpu is not busy and the number of idle CPUs is quite
* different in groups.
*/
/**
* find_busiest_group - Returns the busiest group within the sched_domain
* if there is an imbalance.
*
* Also calculates the amount of runnable load which should be moved
* to restore balance.
*
* @env: The load balancing environment.
*
* Return: - The busiest group if imbalance exists.
*/
static struct sched_group *find_busiest_group(struct lb_env *env)
{
	struct sg_lb_stats *local, *busiest;
	struct sd_lb_stats sds;
	init_sd_lb_stats(&sds);
	/*
	 * Compute the various statistics relevant for load balancing at
	 * this level.
	 */
	update_sd_lb_stats(env, &sds);
	if (sched_energy_enabled()) {
		struct root_domain *rd = env->dst_rq->rd;
		int out_balance = 1;
		/* Android vendor hook: may veto the EAS early bail-out. */
		trace_android_rvh_find_busiest_group(sds.busiest, env->dst_rq,
					&out_balance);
		/*
		 * With an energy model present and the system not
		 * over-utilized, placement is left to energy-aware wakeups.
		 */
		if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)
					&& out_balance)
			goto out_balanced;
	}
	local = &sds.local_stat;
	busiest = &sds.busiest_stat;
	/* There is no busy sibling group to pull tasks from */
	if (!sds.busiest)
		goto out_balanced;
	/* Misfit tasks should be dealt with regardless of the avg load */
	if (busiest->group_type == group_misfit_task)
		goto force_balance;
	/* ASYM feature bypasses nice load balance check */
	if (busiest->group_type == group_asym_packing)
		goto force_balance;
	/*
	 * If the busiest group is imbalanced the below checks don't
	 * work because they assume all things are equal, which typically
	 * isn't true due to cpus_ptr constraints and the like.
	 */
	if (busiest->group_type == group_imbalanced)
		goto force_balance;
	/*
	 * If the local group is busier than the selected busiest group
	 * don't try and pull any tasks.
	 */
	if (local->group_type > busiest->group_type)
		goto out_balanced;
	/*
	 * When groups are overloaded, use the avg_load to ensure fairness
	 * between tasks.
	 */
	if (local->group_type == group_overloaded) {
		/*
		 * If the local group is more loaded than the selected
		 * busiest group don't try to pull any tasks.
		 */
		if (local->avg_load >= busiest->avg_load)
			goto out_balanced;
		/* XXX broken for overlapping NUMA groups */
		sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
				sds.total_capacity;
		/*
		 * Don't pull any tasks if this group is already above the
		 * domain average load.
		 */
		if (local->avg_load >= sds.avg_load)
			goto out_balanced;
		/*
		 * If the busiest group is more loaded, use imbalance_pct to be
		 * conservative.
		 */
		if (100 * busiest->avg_load <=
				env->sd->imbalance_pct * local->avg_load)
			goto out_balanced;
	}
	/* Try to move all excess tasks to child's sibling domain */
	if (sds.prefer_sibling && local->group_type == group_has_spare &&
	    busiest->sum_nr_running > local->sum_nr_running + 1)
		goto force_balance;
	if (busiest->group_type != group_overloaded) {
		if (env->idle == CPU_NOT_IDLE)
			/*
			 * If the busiest group is not overloaded (and as a
			 * result the local one too) but this CPU is already
			 * busy, let another idle CPU try to pull task.
			 */
			goto out_balanced;
		if (busiest->group_weight > 1 &&
		    local->idle_cpus <= (busiest->idle_cpus + 1))
			/*
			 * If the busiest group is not overloaded
			 * and there is no imbalance between this and busiest
			 * group wrt idle CPUs, it is balanced. The imbalance
			 * becomes significant if the diff is greater than 1
			 * otherwise we might end up to just move the imbalance
			 * on another group. Of course this applies only if
			 * there is more than 1 CPU per group.
			 */
			goto out_balanced;
		if (busiest->sum_h_nr_running == 1)
			/*
			 * busiest doesn't have any tasks waiting to run
			 */
			goto out_balanced;
	}
force_balance:
	/* Looks like there is an imbalance. Compute it */
	calculate_imbalance(env, &sds);
	return env->imbalance ? sds.busiest : NULL;
out_balanced:
	env->imbalance = 0;
	return NULL;
}
/*
* find_busiest_queue - find the busiest runqueue among the CPUs in the group.
*/
static struct rq *find_busiest_queue(struct lb_env *env,
				     struct sched_group *group)
{
	struct rq *busiest = NULL, *rq;
	unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
	unsigned int busiest_nr = 0;
	int i, done = 0;
	/* Android vendor hook: may pick the busiest rq itself (done != 0). */
	trace_android_rvh_find_busiest_queue(env->dst_cpu, group, env->cpus,
					     &busiest, &done);
	if (done)
		return busiest;
	/* Scan the group's CPUs that are still eligible for this balance. */
	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
		unsigned long capacity, load, util;
		unsigned int nr_running;
		enum fbq_type rt;
		rq = cpu_rq(i);
		rt = fbq_classify_rq(rq);
		/*
		 * We classify groups/runqueues into three groups:
		 * - regular: there are !numa tasks
		 * - remote: there are numa tasks that run on the 'wrong' node
		 * - all: there is no distinction
		 *
		 * In order to avoid migrating ideally placed numa tasks,
		 * ignore those when there's better options.
		 *
		 * If we ignore the actual busiest queue to migrate another
		 * task, the next balance pass can still reduce the busiest
		 * queue by moving tasks around inside the node.
		 *
		 * If we cannot move enough load due to this classification
		 * the next pass will adjust the group classification and
		 * allow migration of more tasks.
		 *
		 * Both cases only affect the total convergence complexity.
		 */
		if (rt > env->fbq_type)
			continue;
		capacity = capacity_of(i);
		nr_running = rq->cfs.h_nr_running;
		/*
		 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
		 * eventually lead to active_balancing high->low capacity.
		 * Higher per-CPU capacity is considered better than balancing
		 * average load.
		 */
		if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
		    capacity_of(env->dst_cpu) < capacity &&
		    nr_running == 1)
			continue;
		/* Pick the candidate according to the migration type. */
		switch (env->migration_type) {
		case migrate_load:
			/*
			 * When comparing with load imbalance, use cpu_load()
			 * which is not scaled with the CPU capacity.
			 */
			load = cpu_load(rq);
			if (nr_running == 1 && load > env->imbalance &&
			    !check_cpu_capacity(rq, env->sd))
				break;
			/*
			 * For the load comparisons with the other CPUs,
			 * consider the cpu_load() scaled with the CPU
			 * capacity, so that the load can be moved away
			 * from the CPU that is potentially running at a
			 * lower capacity.
			 *
			 * Thus we're looking for max(load_i / capacity_i),
			 * crosswise multiplication to rid ourselves of the
			 * division works out to:
			 * load_i * capacity_j > load_j * capacity_i;
			 * where j is our previous maximum.
			 */
			if (load * busiest_capacity > busiest_load * capacity) {
				busiest_load = load;
				busiest_capacity = capacity;
				busiest = rq;
			}
			break;
		case migrate_util:
			util = cpu_util(cpu_of(rq));
			/*
			 * Don't try to pull utilization from a CPU with one
			 * running task. Whatever its utilization, we will fail
			 * detach the task.
			 */
			if (nr_running <= 1)
				continue;
			if (busiest_util < util) {
				busiest_util = util;
				busiest = rq;
			}
			break;
		case migrate_task:
			/* Highest runnable-task count wins. */
			if (busiest_nr < nr_running) {
				busiest_nr = nr_running;
				busiest = rq;
			}
			break;
		case migrate_misfit:
			/*
			 * For ASYM_CPUCAPACITY domains with misfit tasks we
			 * simply seek the "biggest" misfit task.
			 */
			if (rq->misfit_task_load > busiest_load) {
				busiest_load = rq->misfit_task_load;
				busiest = rq;
			}
			break;
		}
	}
	return busiest;
}
/*
* Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
* so long as it is large enough.
*/
#define MAX_PINNED_INTERVAL 512
static inline bool
asym_active_balance(struct lb_env *env)
{
/*
* ASYM_PACKING needs to force migrate tasks from busy but
* lower priority CPUs in order to pack all tasks in the
* highest priority CPUs.
*/
return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
sched_asym_prefer(env->dst_cpu, env->src_cpu);
}
/*
 * True when an active balance is worthwhile on its own merits rather
 * than because of repeated balance failures.
 */
static inline bool
voluntary_active_balance(struct lb_env *env)
{
	struct sched_domain *sd = env->sd;
	/* Asymmetric packing always justifies an active balance. */
	if (asym_active_balance(env))
		return 1;
	/*
	 * dst_cpu is idle and src_cpu runs exactly one CFS task: migrating
	 * it pays off when src_cpu's capacity is reduced by other sched
	 * classes or IRQs while dst_cpu still has more capacity available.
	 */
	if (env->idle != CPU_NOT_IDLE &&
	    env->src_rq->cfs.h_nr_running == 1 &&
	    check_cpu_capacity(env->src_rq, sd) &&
	    capacity_of(env->src_cpu) * sd->imbalance_pct <
	    capacity_of(env->dst_cpu) * 100)
		return 1;
	/* Misfit migrations are always voluntary. */
	return env->migration_type == migrate_misfit;
}
/*
 * Decide whether to resort to an active balance: either there is a
 * voluntary reason, or regular balancing has failed repeatedly.
 */
static int need_active_balance(struct lb_env *env)
{
	if (voluntary_active_balance(env))
		return 1;
	return unlikely(env->sd->nr_balance_failed >
			env->sd->cache_nice_tries + 2);
}
static int active_load_balance_cpu_stop(void *data);
/*
 * Decide whether this CPU is the one that should run the load balance
 * for its group: the first idle CPU of the balance mask, or the group's
 * designated balance CPU when none is idle. Newly-idle CPUs always
 * balance for themselves.
 */
static int should_we_balance(struct lb_env *env)
{
	struct sched_group *sg = env->sd->groups;
	int balance_cpu = -1;
	int cpu;
	/*
	 * Bail out when the balancing environment is inconsistent; this can
	 * happen when the softirq triggers 'during' hotplug.
	 */
	if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
		return 0;
	/* Every CPU may run the newly-idle balance on its own behalf. */
	if (env->idle == CPU_NEWLY_IDLE)
		return 1;
	/* Look for the first idle CPU in the balance mask. */
	for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
		if (idle_cpu(cpu)) {
			balance_cpu = cpu;
			break;
		}
	}
	/* No idle CPU: fall back to the group's designated balance CPU. */
	if (balance_cpu < 0)
		balance_cpu = group_balance_cpu(sg);
	return balance_cpu == env->dst_cpu;
}
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
static int load_balance(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *continue_balancing)
{
	int ld_moved, cur_ld_moved, active_balance = 0;
	struct sched_domain *sd_parent = sd->parent;
	struct sched_group *group;
	struct rq *busiest;
	struct rq_flags rf;
	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
	struct lb_env env = {
		.sd		= sd,
		.dst_cpu	= this_cpu,
		.dst_rq		= this_rq,
		.dst_grpmask    = sched_group_span(sd->groups),
		.idle		= idle,
		.loop_break	= sched_nr_migrate_break,
		.cpus		= cpus,
		.fbq_type	= all,
		.tasks		= LIST_HEAD_INIT(env.tasks),
	};
	/* Only consider CPUs of this domain that are still active. */
	cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
	schedstat_inc(sd->lb_count[idle]);
redo:
	if (!should_we_balance(&env)) {
		*continue_balancing = 0;
		goto out_balanced;
	}
	group = find_busiest_group(&env);
	if (!group) {
		schedstat_inc(sd->lb_nobusyg[idle]);
		goto out_balanced;
	}
	busiest = find_busiest_queue(&env, group);
	if (!busiest) {
		schedstat_inc(sd->lb_nobusyq[idle]);
		goto out_balanced;
	}
	BUG_ON(busiest == env.dst_rq);
	schedstat_add(sd->lb_imbalance[idle], env.imbalance);
	env.src_cpu = busiest->cpu;
	env.src_rq = busiest;
	ld_moved = 0;
	if (busiest->nr_running > 1) {
		/*
		 * Attempt to move tasks. If find_busiest_group has found
		 * an imbalance but busiest->nr_running <= 1, the group is
		 * still unbalanced. ld_moved simply stays zero, so it is
		 * correctly treated as an imbalance.
		 */
		env.flags |= LBF_ALL_PINNED;
		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
		rq_lock_irqsave(busiest, &rf);
		env.src_rq_rf = &rf;
		update_rq_clock(busiest);
		/*
		 * cur_ld_moved - load moved in current iteration
		 * ld_moved    - cumulative load moved across iterations
		 */
		cur_ld_moved = detach_tasks(&env);
		/*
		 * We've detached some tasks from busiest_rq. Every
		 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
		 * unlock busiest->lock, and we are able to be sure
		 * that nobody can manipulate the tasks in parallel.
		 * See task_rq_lock() family for the details.
		 */
		rq_unlock(busiest, &rf);
		if (cur_ld_moved) {
			attach_tasks(&env);
			ld_moved += cur_ld_moved;
		}
		local_irq_restore(rf.flags);
		/* detach_tasks() hit loop_break: take another pass. */
		if (env.flags & LBF_NEED_BREAK) {
			env.flags &= ~LBF_NEED_BREAK;
			goto more_balance;
		}
		/*
		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
		 * us and move them to an alternate dst_cpu in our sched_group
		 * where they can run. The upper limit on how many times we
		 * iterate on same src_cpu is dependent on number of CPUs in our
		 * sched_group.
		 *
		 * This changes load balance semantics a bit on who can move
		 * load to a given_cpu. In addition to the given_cpu itself
		 * (or a ilb_cpu acting on its behalf where given_cpu is
		 * nohz-idle), we now have balance_cpu in a position to move
		 * load to given_cpu. In rare situations, this may cause
		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
		 * _independently_ and at _same_ time to move some load to
		 * given_cpu) causing excess load to be moved to given_cpu.
		 * This however should not happen so much in practice and
		 * moreover subsequent load balance cycles should correct the
		 * excess load moved.
		 */
		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
			/* Prevent to re-select dst_cpu via env's CPUs */
			__cpumask_clear_cpu(env.dst_cpu, env.cpus);
			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
			env.dst_cpu	 = env.new_dst_cpu;
			env.flags	&= ~LBF_DST_PINNED;
			env.loop	 = 0;
			env.loop_break	 = sched_nr_migrate_break;
			/*
			 * Go back to "more_balance" rather than "redo" since we
			 * need to continue with same src_cpu.
			 */
			goto more_balance;
		}
		/*
		 * We failed to reach balance because of affinity.
		 */
		if (sd_parent) {
			int *group_imbalance = &sd_parent->groups->sgc->imbalance;
			if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
				*group_imbalance = 1;
		}
		/* All tasks on this runqueue were pinned by CPU affinity */
		if (unlikely(env.flags & LBF_ALL_PINNED)) {
			__cpumask_clear_cpu(cpu_of(busiest), cpus);
			/*
			 * Attempting to continue load balancing at the current
			 * sched_domain level only makes sense if there are
			 * active CPUs remaining as possible busiest CPUs to
			 * pull load from which are not contained within the
			 * destination group that is receiving any migrated
			 * load.
			 */
			if (!cpumask_subset(cpus, env.dst_grpmask)) {
				env.loop = 0;
				env.loop_break = sched_nr_migrate_break;
				goto redo;
			}
			goto out_all_pinned;
		}
	}
	if (!ld_moved) {
		schedstat_inc(sd->lb_failed[idle]);
		/*
		 * Increment the failure counter only on periodic balance.
		 * We do not want newidle balance, which can be very
		 * frequent, pollute the failure counter causing
		 * excessive cache_hot migrations and active balances.
		 */
		if (idle != CPU_NEWLY_IDLE)
			sd->nr_balance_failed++;
		if (need_active_balance(&env)) {
			unsigned long flags;
			raw_spin_lock_irqsave(&busiest->lock, flags);
			/*
			 * Don't kick the active_load_balance_cpu_stop,
			 * if the curr task on busiest CPU can't be
			 * moved to this_cpu:
			 */
			if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
				raw_spin_unlock_irqrestore(&busiest->lock,
							   flags);
				env.flags |= LBF_ALL_PINNED;
				goto out_one_pinned;
			}
			/*
			 * ->active_balance synchronizes accesses to
			 * ->active_balance_work.  Once set, it's cleared
			 * only after active load balance is finished.
			 */
			if (!busiest->active_balance) {
				busiest->active_balance = 1;
				busiest->push_cpu = this_cpu;
				active_balance = 1;
			}
			raw_spin_unlock_irqrestore(&busiest->lock, flags);
			if (active_balance) {
				stop_one_cpu_nowait(cpu_of(busiest),
					active_load_balance_cpu_stop, busiest,
					&busiest->active_balance_work);
			}
			/* We've kicked active balancing, force task migration. */
			sd->nr_balance_failed = sd->cache_nice_tries+1;
		}
	} else
		sd->nr_balance_failed = 0;
	if (likely(!active_balance) || voluntary_active_balance(&env)) {
		/* We were unbalanced, so reset the balancing interval */
		sd->balance_interval = sd->min_interval;
	} else {
		/*
		 * If we've begun active balancing, start to back off. This
		 * case may not be covered by the all_pinned logic if there
		 * is only 1 task on the busy runqueue (because we don't call
		 * detach_tasks).
		 */
		if (sd->balance_interval < sd->max_interval)
			sd->balance_interval *= 2;
	}
	goto out;
out_balanced:
	/*
	 * We reach balance although we may have faced some affinity
	 * constraints. Clear the imbalance flag only if other tasks got
	 * a chance to move and fix the imbalance.
	 */
	if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
		int *group_imbalance = &sd_parent->groups->sgc->imbalance;
		if (*group_imbalance)
			*group_imbalance = 0;
	}
out_all_pinned:
	/*
	 * We reach balance because all tasks are pinned at this level so
	 * we can't migrate them. Let the imbalance flag set so parent level
	 * can try to migrate them.
	 */
	schedstat_inc(sd->lb_balanced[idle]);
	sd->nr_balance_failed = 0;
out_one_pinned:
	ld_moved = 0;
	/*
	 * newidle_balance() disregards balance intervals, so we could
	 * repeatedly reach this code, which would lead to balance_interval
	 * skyrocketing in a short amount of time. Skip the balance_interval
	 * increase logic to avoid that.
	 */
	if (env.idle == CPU_NEWLY_IDLE)
		goto out;
	/* tune up the balancing interval */
	if ((env.flags & LBF_ALL_PINNED &&
	     sd->balance_interval < MAX_PINNED_INTERVAL) ||
	    sd->balance_interval < sd->max_interval)
		sd->balance_interval *= 2;
out:
	return ld_moved;
}
/*
 * Compute the balance interval for @sd in jiffies, stretched by the
 * busy factor when the CPU is busy and clamped to a sane range.
 */
static inline unsigned long
get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
{
	unsigned long msecs = sd->balance_interval;
	unsigned long jifs;
	/* Busy CPUs balance less frequently. */
	if (cpu_busy)
		msecs *= sd->busy_factor;
	/* scale ms to jiffies */
	jifs = msecs_to_jiffies(msecs);
	/*
	 * Reduce likelihood of busy balancing at higher domains racing with
	 * balancing at lower domains by preventing their balancing periods
	 * from being multiples of each other.
	 */
	if (cpu_busy)
		jifs -= 1;
	return clamp(jifs, 1UL, max_load_balance_interval);
}
/*
 * Pull *next_balance earlier when @sd is due to rebalance sooner than
 * the currently recorded deadline. Used by idle balance (cpu_busy = 0).
 */
static inline void
update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
{
	unsigned long candidate;
	candidate = sd->last_balance + get_sd_balance_interval(sd, 0);
	if (time_after(*next_balance, candidate))
		*next_balance = candidate;
}
/*
* active_load_balance_cpu_stop is run by the CPU stopper. It pushes
* running tasks off the busiest CPU onto idle CPUs. It requires at
* least 1 task to be running on each physical CPU where possible, and
* avoids physical / logical imbalances.
*/
static int active_load_balance_cpu_stop(void *data)
{
	struct rq *busiest_rq = data;
	int busiest_cpu = cpu_of(busiest_rq);
	int target_cpu = busiest_rq->push_cpu;
	struct rq *target_rq = cpu_rq(target_cpu);
	struct sched_domain *sd;
	struct task_struct *p = NULL;
	struct rq_flags rf;
	rq_lock_irq(busiest_rq, &rf);
	/*
	 * Between queueing the stop-work and running it is a hole in which
	 * CPUs can become inactive. We should not move tasks from or to
	 * inactive CPUs.
	 */
	if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
		goto out_unlock;
	/* Make sure the requested CPU hasn't gone down in the meantime: */
	if (unlikely(busiest_cpu != smp_processor_id() ||
		     !busiest_rq->active_balance))
		goto out_unlock;
	/* Is there any task to move? */
	if (busiest_rq->nr_running <= 1)
		goto out_unlock;
	/*
	 * This condition is "impossible", if it occurs
	 * we need to fix it. Originally reported by
	 * Bjorn Helgaas on a 128-CPU setup.
	 */
	BUG_ON(busiest_rq == target_rq);
	/* Search for an sd spanning us and the target CPU. */
	rcu_read_lock();
	for_each_domain(target_cpu, sd) {
		if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
			break;
	}
	if (likely(sd)) {
		struct lb_env env = {
			.sd		= sd,
			.dst_cpu	= target_cpu,
			.dst_rq		= target_rq,
			.src_cpu	= busiest_rq->cpu,
			.src_rq		= busiest_rq,
			.idle		= CPU_IDLE,
			/*
			 * can_migrate_task() doesn't need to compute new_dst_cpu
			 * for active balancing. Since we have CPU_IDLE, but no
			 * @dst_grpmask we need to make that test go away with lying
			 * about DST_PINNED.
			 */
			.flags		= LBF_DST_PINNED,
			.src_rq_rf	= &rf,
		};
		schedstat_inc(sd->alb_count);
		update_rq_clock(busiest_rq);
		/* Pull at most one task off the busiest runqueue. */
		p = detach_one_task(&env);
		if (p) {
			schedstat_inc(sd->alb_pushed);
			/* Active balancing done, reset the failure counter. */
			sd->nr_balance_failed = 0;
		} else {
			schedstat_inc(sd->alb_failed);
		}
	}
	rcu_read_unlock();
out_unlock:
	busiest_rq->active_balance = 0;
	rq_unlock(busiest_rq, &rf);
	/* Attach outside busiest_rq->lock; target_rq is locked inside. */
	if (p)
		attach_one_task(target_rq, p);
	local_irq_enable();
	return 0;
}
static DEFINE_SPINLOCK(balancing);
/*
* Scale the max load_balance interval with the number of CPUs in the system.
* This trades load-balance latency on larger machines for less cross talk.
*/
/*
 * Scale the maximum load_balance interval with the number of active
 * CPUs: larger machines trade balance latency for less cross talk.
 */
void update_max_interval(void)
{
	unsigned int cpus = num_active_cpus();
	max_load_balance_interval = HZ * cpus / 10;
}
/*
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
*
* Balancing parameters are set up in init_sched_domains.
*/
static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
{
	int continue_balancing = 1;
	int cpu = rq->cpu;
	int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
	unsigned long interval;
	struct sched_domain *sd;
	/* Earliest time when we have to do rebalance again */
	unsigned long next_balance = jiffies + 60*HZ;
	int update_next_balance = 0;
	int need_serialize, need_decay = 0;
	u64 max_cost = 0;
	/* Android vendor hook: may veto rebalancing this rq entirely. */
	trace_android_rvh_sched_rebalance_domains(rq, &continue_balancing);
	if (!continue_balancing)
		return;
	rcu_read_lock();
	/* Walk the domain hierarchy bottom-up. */
	for_each_domain(cpu, sd) {
		/*
		 * Decay the newidle max times here because this is a regular
		 * visit to all the domains. Decay ~1% per second.
		 */
		if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
			sd->max_newidle_lb_cost =
				(sd->max_newidle_lb_cost * 253) / 256;
			sd->next_decay_max_lb_cost = jiffies + HZ;
			need_decay = 1;
		}
		max_cost += sd->max_newidle_lb_cost;
		/*
		 * Stop the load balance at this level. There is another
		 * CPU in our sched group which is doing load balancing more
		 * actively.
		 */
		if (!continue_balancing) {
			/* Keep iterating only to finish decaying costs. */
			if (need_decay)
				continue;
			break;
		}
		interval = get_sd_balance_interval(sd, busy);
		need_serialize = sd->flags & SD_SERIALIZE;
		if (need_serialize) {
			/* SD_SERIALIZE domains balance one at a time. */
			if (!spin_trylock(&balancing))
				goto out;
		}
		if (time_after_eq(jiffies, sd->last_balance + interval)) {
			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
				/*
				 * The LBF_DST_PINNED logic could have changed
				 * env->dst_cpu, so we can't know our idle
				 * state even if we migrated tasks. Update it.
				 */
				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
				busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
			}
			sd->last_balance = jiffies;
			interval = get_sd_balance_interval(sd, busy);
		}
		if (need_serialize)
			spin_unlock(&balancing);
out:
		if (time_after(next_balance, sd->last_balance + interval)) {
			next_balance = sd->last_balance + interval;
			update_next_balance = 1;
		}
	}
	if (need_decay) {
		/*
		 * Ensure the rq-wide value also decays but keep it at a
		 * reasonable floor to avoid funnies with rq->avg_idle.
		 */
		rq->max_idle_balance_cost =
			max((u64)sysctl_sched_migration_cost, max_cost);
	}
	rcu_read_unlock();
	/*
	 * next_balance will be updated only when there is a need.
	 * When the cpu is attached to null domain for ex, it will not be
	 * updated.
	 */
	if (likely(update_next_balance)) {
		rq->next_balance = next_balance;
#ifdef CONFIG_NO_HZ_COMMON
		/*
		 * If this CPU has been elected to perform the nohz idle
		 * balance. Other idle CPUs have already rebalanced with
		 * nohz_idle_balance() and nohz.next_balance has been
		 * updated accordingly. This CPU is now running the idle load
		 * balance for itself and we need to update the
		 * nohz.next_balance accordingly.
		 */
		if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
			nohz.next_balance = rq->next_balance;
#endif
	}
}
static inline int on_null_domain(struct rq *rq)
{
return unlikely(!rcu_dereference_sched(rq->sd));
}
#ifdef CONFIG_NO_HZ_COMMON
/*
* idle load balancing details
* - When one of the busy CPUs notice that there may be an idle rebalancing
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
* - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set
* anywhere yet.
*/
/*
 * Pick the CPU that will run the idle load balance.
 * Returns a CPU number, or nr_cpu_ids when no eligible CPU exists.
 */
static inline int find_new_ilb(void)
{
	int ilb = -1;

	/* Vendor hook may nominate an ILB CPU directly. */
	trace_android_rvh_find_new_ilb(nohz.idle_cpus_mask, &ilb);
	if (ilb >= 0)
		return ilb;

	/* Otherwise: first idle CPU in the HK_FLAG_MISC housekeeping set. */
	for_each_cpu_and(ilb, nohz.idle_cpus_mask,
			      housekeeping_cpumask(HK_FLAG_MISC)) {
		if (idle_cpu(ilb))
			return ilb;
	}

	/* Nothing suitable: sentinel meaning "no ILB CPU". */
	return nr_cpu_ids;
}
/*
* Kick a CPU to do the nohz balancing, if it is time for it. We pick any
* idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
*/
static void kick_ilb(unsigned int flags)
{
	int ilb_cpu;

	/*
	 * Increase nohz.next_balance only when a full ilb is triggered but
	 * not if we only update stats.
	 */
	if (flags & NOHZ_BALANCE_KICK)
		nohz.next_balance = jiffies+1;

	ilb_cpu = find_new_ilb();

	/* No idle housekeeping CPU available: nothing to kick. */
	if (ilb_cpu >= nr_cpu_ids)
		return;

	/*
	 * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
	 * the first flag owns it; cleared by nohz_csd_func().
	 */
	flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));

	/* A kick was already pending for this CPU; its owner will handle it. */
	if (flags & NOHZ_KICK_MASK)
		return;

	/*
	 * This way we generate an IPI on the target CPU which
	 * is idle. And the softirq performing nohz idle load balance
	 * will be run before returning from the IPI.
	 */
	smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
}
/*
* Current decision point for kicking the idle load balancer in the presence
* of idle CPUs in the system.
*/
static void nohz_balancer_kick(struct rq *rq)
{
	unsigned long now = jiffies;
	struct sched_domain_shared *sds;
	struct sched_domain *sd;
	int nr_busy, i, cpu = rq->cpu;
	unsigned int flags = 0;
	int done = 0;

	/* An idle CPU does not kick others; it is a kick target, not a source. */
	if (unlikely(rq->idle_balance))
		return;

	/*
	 * We may be recently in ticked or tickless idle mode. At the first
	 * busy tick after returning from idle, we will update the busy stats.
	 */
	nohz_balance_exit_idle(rq);

	/*
	 * None are in tickless mode and hence no need for NOHZ idle load
	 * balancing.
	 */
	if (likely(!atomic_read(&nohz.nr_cpus)))
		return;

	/* Blocked-load decay is overdue: at minimum request a stats update. */
	if (READ_ONCE(nohz.has_blocked) &&
	    time_after(now, READ_ONCE(nohz.next_blocked)))
		flags = NOHZ_STATS_KICK;

	if (time_before(now, nohz.next_balance))
		goto out;

	/* Vendor hook may decide the flags (and finish the decision here). */
	trace_android_rvh_sched_nohz_balancer_kick(rq, &flags, &done);
	if (done)
		goto out;

	/* More than one runnable task: obvious candidate for a full kick. */
	if (rq->nr_running >= 2) {
		flags = NOHZ_KICK_MASK;
		goto out;
	}

	rcu_read_lock();

	sd = rcu_dereference(rq->sd);
	if (sd) {
		/*
		 * If there's a CFS task and the current CPU has reduced
		 * capacity; kick the ILB to see if there's a better CPU to run
		 * on.
		 */
		if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
			flags = NOHZ_KICK_MASK;
			goto unlock;
		}
	}

	sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
	if (sd) {
		/*
		 * When ASYM_PACKING; see if there's a more preferred CPU
		 * currently idle; in which case, kick the ILB to move tasks
		 * around.
		 */
		for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
			if (sched_asym_prefer(i, cpu)) {
				flags = NOHZ_KICK_MASK;
				goto unlock;
			}
		}
	}

	sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
	if (sd) {
		/*
		 * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
		 * to run the misfit task on.
		 */
		if (check_misfit_status(rq, sd)) {
			flags = NOHZ_KICK_MASK;
			goto unlock;
		}

		/*
		 * For asymmetric systems, we do not want to nicely balance
		 * cache use, instead we want to embrace asymmetry and only
		 * ensure tasks have enough CPU capacity.
		 *
		 * Skip the LLC logic because it's not relevant in that case.
		 */
		goto unlock;
	}

	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
	if (sds) {
		/*
		 * If there is an imbalance between LLC domains (IOW we could
		 * increase the overall cache use), we need some less-loaded LLC
		 * domain to pull some load. Likewise, we may need to spread
		 * load within the current LLC domain (e.g. packed SMT cores but
		 * other CPUs are idle). We can't really know from here how busy
		 * the others are - so just get a nohz balance going if it looks
		 * like this LLC domain has tasks we could move.
		 */
		nr_busy = atomic_read(&sds->nr_busy_cpus);
		if (nr_busy > 1) {
			flags = NOHZ_KICK_MASK;
			goto unlock;
		}
	}
unlock:
	rcu_read_unlock();
out:
	if (flags)
		kick_ilb(flags);
}
/* Mark @cpu's LLC domain as busy and bump its shared busy-CPU count. */
static void set_cpu_sd_state_busy(int cpu)
{
	struct sched_domain *sd;

	rcu_read_lock();
	sd = rcu_dereference(per_cpu(sd_llc, cpu));

	/* Only transition if the domain was actually marked idle. */
	if (sd && sd->nohz_idle) {
		sd->nohz_idle = 0;
		atomic_inc(&sd->shared->nr_busy_cpus);
	}
	rcu_read_unlock();
}
/*
 * Take @rq's CPU back out of the nohz idle set: clear the tick-stopped
 * marker, remove it from nohz.idle_cpus_mask and re-account it as busy
 * in its LLC domain. Must run on the local CPU (see SCHED_WARN_ON).
 */
void nohz_balance_exit_idle(struct rq *rq)
{
	SCHED_WARN_ON(rq != this_rq());

	/* Fast path: tick never stopped, nothing was accounted. */
	if (likely(!rq->nohz_tick_stopped))
		return;

	rq->nohz_tick_stopped = 0;
	cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
	atomic_dec(&nohz.nr_cpus);

	set_cpu_sd_state_busy(rq->cpu);
}
/* Mark @cpu's LLC domain as idle and drop it from the shared busy count. */
static void set_cpu_sd_state_idle(int cpu)
{
	struct sched_domain *sd;

	rcu_read_lock();
	sd = rcu_dereference(per_cpu(sd_llc, cpu));

	/* Only transition if the domain was not already marked idle. */
	if (sd && !sd->nohz_idle) {
		sd->nohz_idle = 1;
		atomic_dec(&sd->shared->nr_busy_cpus);
	}
	rcu_read_unlock();
}
/*
* This routine will record that the CPU is going idle with tick stopped.
* This info will be used in performing idle load balancing in the future.
*/
void nohz_balance_enter_idle(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	SCHED_WARN_ON(cpu != smp_processor_id());

	if (!cpu_active(cpu)) {
		/*
		 * A CPU can be paused while it is idle with it's tick
		 * stopped. nohz_balance_exit_idle() should be called
		 * from the local CPU, so it can't be called during
		 * pause. This results in paused CPU participating in
		 * the nohz idle balance, which should be avoided.
		 *
		 * When the paused CPU exits idle and enters again,
		 * exempt the paused CPU from nohz_balance_exit_idle.
		 */
		nohz_balance_exit_idle(rq);
		return;
	}

	/* Spare idle load balancing on CPUs that don't want to be disturbed: */
	if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
		return;

	/*
	 * Can be set safely without rq->lock held
	 * If a clear happens, it will have evaluated last additions because
	 * rq->lock is held during the check and the clear
	 */
	rq->has_blocked_load = 1;

	/*
	 * The tick is still stopped but load could have been added in the
	 * meantime. We set the nohz.has_blocked flag to trig a check of the
	 * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
	 * of nohz.has_blocked can only happen after checking the new load
	 */
	if (rq->nohz_tick_stopped)
		goto out;

	/* If we're a completely isolated CPU, we don't play: */
	if (on_null_domain(rq))
		return;

	rq->nohz_tick_stopped = 1;

	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
	atomic_inc(&nohz.nr_cpus);

	/*
	 * Ensures that if nohz_idle_balance() fails to observe our
	 * @idle_cpus_mask store, it must observe the @has_blocked
	 * store.
	 */
	smp_mb__after_atomic();

	set_cpu_sd_state_idle(cpu);

out:
	/*
	 * Each time a cpu enter idle, we assume that it has blocked load and
	 * enable the periodic update of the load of idle cpus
	 */
	WRITE_ONCE(nohz.has_blocked, 1);
}
/*
* Internal function that runs load balance for all idle cpus. The load balance
* can be a simple update of blocked load or a complete load balance with
* tasks movement depending of flags.
* The function returns false if the loop has stopped before running
* through all idle CPUs.
*/
static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
			       enum cpu_idle_type idle)
{
	/* Earliest time when we have to do rebalance again */
	unsigned long now = jiffies;
	unsigned long next_balance = now + 60*HZ;
	bool has_blocked_load = false;
	int update_next_balance = 0;
	int this_cpu = this_rq->cpu;
	int balance_cpu;
	int ret = false;
	struct rq *rq;

	/* NOHZ_BALANCE_KICK alone (without NOHZ_STATS_KICK) is a caller bug. */
	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);

	/*
	 * We assume there will be no idle load after this update and clear
	 * the has_blocked flag. If a cpu enters idle in the mean time, it will
	 * set the has_blocked flag and trig another update of idle load.
	 * Because a cpu that becomes idle, is added to idle_cpus_mask before
	 * setting the flag, we are sure to not clear the state and not
	 * check the load of an idle cpu.
	 */
	WRITE_ONCE(nohz.has_blocked, 0);

	/*
	 * Ensures that if we miss the CPU, we must see the has_blocked
	 * store from nohz_balance_enter_idle().
	 */
	smp_mb();

	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
		/* Skip ourselves and CPUs that have become busy again. */
		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
			continue;

		/*
		 * If this CPU gets work to do, stop the load balancing
		 * work being done for other CPUs. Next load
		 * balancing owner will pick it up.
		 */
		if (need_resched()) {
			has_blocked_load = true;
			goto abort;
		}

		rq = cpu_rq(balance_cpu);

		has_blocked_load |= update_nohz_stats(rq, true);

		/*
		 * If time for next balance is due,
		 * do the balance.
		 */
		if (time_after_eq(jiffies, rq->next_balance)) {
			struct rq_flags rf;

			/* Refresh the remote rq's clock before balancing it. */
			rq_lock_irqsave(rq, &rf);
			update_rq_clock(rq);
			rq_unlock_irqrestore(rq, &rf);

			if (flags & NOHZ_BALANCE_KICK)
				rebalance_domains(rq, CPU_IDLE);
		}

		if (time_after(next_balance, rq->next_balance)) {
			next_balance = rq->next_balance;
			update_next_balance = 1;
		}
	}

	/*
	 * next_balance will be updated only when there is a need.
	 * When the CPU is attached to null domain for ex, it will not be
	 * updated.
	 */
	if (likely(update_next_balance))
		nohz.next_balance = next_balance;

	/* Newly idle CPU doesn't need an update */
	if (idle != CPU_NEWLY_IDLE) {
		update_blocked_averages(this_cpu);
		has_blocked_load |= this_rq->has_blocked_load;
	}

	if (flags & NOHZ_BALANCE_KICK)
		rebalance_domains(this_rq, CPU_IDLE);

	WRITE_ONCE(nohz.next_blocked,
		now + msecs_to_jiffies(LOAD_AVG_PERIOD));

	/* The full idle balance loop has been done */
	ret = true;

abort:
	/* There is still blocked load, enable periodic update */
	if (has_blocked_load)
		WRITE_ONCE(nohz.has_blocked, 1);

	return ret;
}
/*
* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
* rebalancing for all the cpus for whom scheduler ticks are stopped.
*/
static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
	unsigned int flags = this_rq->nohz_idle_balance;

	if (flags) {
		/* Consume the pending kick before acting on it. */
		this_rq->nohz_idle_balance = 0;

		/* Only an idle CPU services a nohz kick. */
		if (idle == CPU_IDLE) {
			_nohz_idle_balance(this_rq, flags, idle);
			return true;
		}
	}

	return false;
}
/*
 * Update blocked load of idle CPUs from a CPU that is about to go idle,
 * running the ilb pass locally rather than waking another idle CPU.
 * Temporarily drops this_rq->lock.
 */
static void nohz_newidle_balance(struct rq *this_rq)
{
	int this_cpu = this_rq->cpu;

	/*
	 * This CPU doesn't want to be disturbed by scheduler
	 * housekeeping
	 */
	if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
		return;

	/* Will wake up very soon. No time for doing anything else*/
	if (this_rq->avg_idle < sysctl_sched_migration_cost)
		return;

	/* Don't need to update blocked load of idle CPUs*/
	if (!READ_ONCE(nohz.has_blocked) ||
	    time_before(jiffies, READ_ONCE(nohz.next_blocked)))
		return;

	raw_spin_unlock(&this_rq->lock);
	/*
	 * This CPU is going to be idle and blocked load of idle CPUs
	 * need to be updated. Run the ilb locally as it is a good
	 * candidate for ilb instead of waking up another idle CPU.
	 * Kick an normal ilb if we failed to do the update.
	 */
	if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
		kick_ilb(NOHZ_STATS_KICK);
	raw_spin_lock(&this_rq->lock);
}
#else /* !CONFIG_NO_HZ_COMMON */
/* NOHZ disabled: idle load-balance kicks and passes are no-ops. */
static inline void nohz_balancer_kick(struct rq *rq) { }

static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
	return false;
}

static inline void nohz_newidle_balance(struct rq *this_rq) { }
#endif /* CONFIG_NO_HZ_COMMON */
/*
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*
* Returns:
* < 0 - we released the lock and there are !fair tasks present
* 0 - failed, no new tasks
* > 0 - success, new (fair) tasks present
*/
static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
{
	unsigned long next_balance = jiffies + HZ;
	int this_cpu = this_rq->cpu;
	struct sched_domain *sd;
	int pulled_task = 0;
	u64 curr_cost = 0;
	int done = 0;

	/* Vendor hook may take over the entire newidle pass. */
	trace_android_rvh_sched_newidle_balance(this_rq, rf, &pulled_task, &done);
	if (done)
		return pulled_task;

	update_misfit_status(NULL, this_rq);
	/*
	 * We must set idle_stamp _before_ calling idle_balance(), such that we
	 * measure the duration of idle_balance() as idle time.
	 */
	this_rq->idle_stamp = rq_clock(this_rq);

	/*
	 * Do not pull tasks towards !active CPUs...
	 */
	if (!cpu_active(this_cpu))
		return 0;

	/*
	 * This is OK, because current is on_cpu, which avoids it being picked
	 * for load-balance and preemption/IRQs are still disabled avoiding
	 * further scheduler activity on it and we're being very careful to
	 * re-start the picking loop.
	 */
	rq_unpin_lock(this_rq, rf);

	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
	    !READ_ONCE(this_rq->rd->overload)) {
		/* Balancing would cost more than the expected idle time. */
		rcu_read_lock();
		sd = rcu_dereference_check_sched_domain(this_rq->sd);
		if (sd)
			update_next_balance(sd, &next_balance);
		rcu_read_unlock();

		nohz_newidle_balance(this_rq);

		goto out;
	}

	raw_spin_unlock(&this_rq->lock);

	update_blocked_averages(this_cpu);
	rcu_read_lock();
	for_each_domain(this_cpu, sd) {
		int continue_balancing = 1;
		u64 t0, domain_cost;

		/* Stop once the accumulated cost exceeds the idle headroom. */
		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
			update_next_balance(sd, &next_balance);
			break;
		}

		if (sd->flags & SD_BALANCE_NEWIDLE) {
			t0 = sched_clock_cpu(this_cpu);

			pulled_task = load_balance(this_cpu, this_rq,
						   sd, CPU_NEWLY_IDLE,
						   &continue_balancing);

			/* Track the worst-case cost of balancing this level. */
			domain_cost = sched_clock_cpu(this_cpu) - t0;
			if (domain_cost > sd->max_newidle_lb_cost)
				sd->max_newidle_lb_cost = domain_cost;

			curr_cost += domain_cost;
		}

		update_next_balance(sd, &next_balance);

		/*
		 * Stop searching for tasks to pull if there are
		 * now runnable tasks on this rq.
		 */
		if (pulled_task || this_rq->nr_running > 0)
			break;
	}
	rcu_read_unlock();

	raw_spin_lock(&this_rq->lock);

	if (curr_cost > this_rq->max_idle_balance_cost)
		this_rq->max_idle_balance_cost = curr_cost;

out:
	/*
	 * While browsing the domains, we released the rq lock, a task could
	 * have been enqueued in the meantime. Since we're not going idle,
	 * pretend we pulled a task.
	 */
	if (this_rq->cfs.h_nr_running && !pulled_task)
		pulled_task = 1;

	/* Move the next balance forward */
	if (time_after(this_rq->next_balance, next_balance))
		this_rq->next_balance = next_balance;

	/* Is there a task of a high priority class? */
	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
		pulled_task = -1;

	if (pulled_task)
		this_rq->idle_stamp = 0;

	rq_repin_lock(this_rq, rf);

	return pulled_task;
}
/*
* run_rebalance_domains is triggered when needed from the scheduler tick.
* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
*/
static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
{
	struct rq *rq = this_rq();
	enum cpu_idle_type idle;

	idle = rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE;

	/*
	 * A pending nohz_balance_kick is serviced first, on behalf of the
	 * tick-stopped idle CPUs. Balancing the local hierarchy first could
	 * pull load here and abort the nohz pass altogether, so the nohz
	 * pass gets priority and ends the softirq when it ran.
	 */
	if (nohz_idle_balance(rq, idle))
		return;

	/* Plain periodic load balance for this CPU. */
	update_blocked_averages(rq->cpu);
	rebalance_domains(rq, idle);
}
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
*/
/*
 * Called from scheduler_tick(): raise SCHED_SOFTIRQ when the periodic
 * balance is due, and possibly kick the nohz idle load balancer.
 */
void trigger_load_balance(struct rq *rq)
{
	/* Don't need to rebalance while attached to NULL domain */
	if (unlikely(on_null_domain(rq)))
		return;

	if (time_after_eq(jiffies, rq->next_balance))
		raise_softirq(SCHED_SOFTIRQ);

	nohz_balancer_kick(rq);
}
/* CPU came online: refresh scheduler tunables and CFS runtime state. */
static void rq_online_fair(struct rq *rq)
{
	update_sysctl();

	update_runtime_enabled(rq);
}
/* CPU going offline: refresh tunables and release throttled groups. */
static void rq_offline_fair(struct rq *rq)
{
	update_sysctl();

	/* Ensure any throttled groups are reachable by pick_next_task */
	unthrottle_offline_cfs_rqs(rq);
}
#endif /* CONFIG_SMP */
/*
* scheduler tick hitting a task of our scheduling class.
*
* NOTE: This function can be called remotely by the tick offload that
* goes along full dynticks. Therefore no local assumption can be made
* and everything must be accessed through the @rq and @curr passed in
* parameters.
*/
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
	struct sched_entity *entity = &curr->se;
	struct cfs_rq *group_cfs;

	/* Tick every entity on the path from the task up to the root. */
	for_each_sched_entity(entity) {
		group_cfs = cfs_rq_of(entity);
		entity_tick(group_cfs, entity, queued);
	}

	if (static_branch_unlikely(&sched_numa_balancing))
		task_tick_numa(rq, curr);

	update_misfit_status(curr, rq);
	update_overutilized_status(task_rq(curr));
}
/*
* called on fork with the child task as argument from the parent's context
* - child not yet on the tasklist
* - preemption disabled
*/
static void task_fork_fair(struct task_struct *p)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se, *curr;
	struct rq *rq = this_rq();
	struct rq_flags rf;

	rq_lock(rq, &rf);
	update_rq_clock(rq);

	/* The child starts from the parent's (current's) cfs_rq. */
	cfs_rq = task_cfs_rq(current);
	curr = cfs_rq->curr;
	if (curr) {
		update_curr(cfs_rq);
		/* Seed the child's vruntime from the parent's. */
		se->vruntime = curr->vruntime;
	}
	place_entity(cfs_rq, se, 1);

	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
		/*
		 * Upon rescheduling, sched_class::put_prev_task() will place
		 * 'current' within the tree based on its new key value.
		 */
		swap(curr->vruntime, se->vruntime);
		resched_curr(rq);
	}

	/*
	 * Store vruntime relative to min_vruntime; it is re-normalized when
	 * the child is enqueued (possibly on a different CPU/cfs_rq).
	 */
	se->vruntime -= cfs_rq->min_vruntime;
	rq_unlock(rq, &rf);
}
/*
* Priority of the task has changed. Check to see if we preempt
* the current task.
*/
static void
prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
{
	/* Nothing to do for dequeued tasks or a runqueue with a lone task. */
	if (!task_on_rq_queued(p) || rq->cfs.nr_running == 1)
		return;

	/*
	 * Reschedule if we are currently running on this runqueue and
	 * our priority decreased, or if we are not currently running on
	 * this runqueue and our priority is higher than the current's
	 */
	if (rq->curr != p)
		check_preempt_curr(rq, p, 0);
	else if (p->prio > oldprio)
		resched_curr(rq);
}
static inline bool vruntime_normalized(struct task_struct *p)
{
	struct sched_entity *se = &p->se;

	/*
	 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
	 * the dequeue_entity(.flags=0) will already have normalized the
	 * vruntime.
	 */
	if (p->on_rq)
		return true;

	/*
	 * A forked child that has never run yet is still normalized:
	 * it is waiting to be woken by wake_up_new_task().
	 */
	if (!se->sum_exec_runtime)
		return true;

	/*
	 * So is a task woken by try_to_wake_up() that is still waiting
	 * to actually be woken by sched_ttwu_pending().
	 */
	if (p->state == TASK_WAKING && p->sched_remote_wakeup)
		return true;

	return false;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* Propagate the changes of the sched_entity across the tg tree to make it
* visible to the root
*/
static void propagate_entity_cfs_rq(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq;

	list_add_leaf_cfs_rq(cfs_rq_of(se));

	/* Start to propagate at parent */
	se = se->parent;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);

		if (cfs_rq_throttled(cfs_rq)) {
			/*
			 * Throttled: don't update the average, but stop
			 * walking up once the cfs_rq is already linked.
			 */
			if (list_add_leaf_cfs_rq(cfs_rq))
				break;
			continue;
		}

		update_load_avg(cfs_rq, se, UPDATE_TG);
		list_add_leaf_cfs_rq(cfs_rq);
	}
}
#else
/* !CONFIG_FAIR_GROUP_SCHED: no task-group hierarchy, nothing to propagate. */
static void propagate_entity_cfs_rq(struct sched_entity *se) { }
#endif
/* Remove @se's load contribution from its cfs_rq and propagate upward. */
static void detach_entity_cfs_rq(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

	/* Catch up with the cfs_rq and remove our load when we leave */
	update_load_avg(cfs_rq, se, 0);
	detach_entity_load_avg(cfs_rq, se);
	update_tg_load_avg(cfs_rq);
	propagate_entity_cfs_rq(se);
}
/* Add @se's load contribution to its cfs_rq and propagate upward. */
static void attach_entity_cfs_rq(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

#ifdef CONFIG_FAIR_GROUP_SCHED
	/*
	 * Since the real-depth could have been changed (only FAIR
	 * class maintain depth value), reset depth properly.
	 */
	se->depth = se->parent ? se->parent->depth + 1 : 0;
#endif

	/* Synchronize entity with its cfs_rq */
	update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
	attach_entity_load_avg(cfs_rq, se);
	update_tg_load_avg(cfs_rq);
	propagate_entity_cfs_rq(se);
}
/* Task leaves CFS (class or group change): denormalize vruntime and detach. */
static void detach_task_cfs_rq(struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

	if (!vruntime_normalized(p)) {
		/*
		 * Fix up our vruntime so that the current sleep doesn't
		 * cause 'unlimited' sleep bonus.
		 */
		place_entity(cfs_rq, se, 0);
		se->vruntime -= cfs_rq->min_vruntime;
	}

	detach_entity_cfs_rq(se);
}
/* Task (re)joins CFS: attach load and re-normalize vruntime if needed. */
static void attach_task_cfs_rq(struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

	attach_entity_cfs_rq(se);

	/* Undo the relative offset applied at detach/migration time. */
	if (!vruntime_normalized(p))
		se->vruntime += cfs_rq->min_vruntime;
}
/* sched_class callback: @p is leaving the fair class. */
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
	detach_task_cfs_rq(p);
}
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
	attach_task_cfs_rq(p);

	if (!task_on_rq_queued(p))
		return;

	/*
	 * We were most likely switched from sched_rt, so
	 * kick off the schedule if running, otherwise just see
	 * if we can still preempt the current task.
	 */
	if (rq->curr == p)
		resched_curr(rq);
	else
		check_preempt_curr(rq, p, 0);
}
/* Account for a task changing its policy or group.
*
* This routine is mostly called to set cfs_rq->curr field when a task
* migrates between groups/classes.
*/
static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
{
	struct sched_entity *se = &p->se;

#ifdef CONFIG_SMP
	if (task_on_rq_queued(p)) {
		/*
		 * Move the next running task to the front of the list, so our
		 * cfs_tasks list becomes MRU one.
		 */
		list_move(&se->group_node, &rq->cfs_tasks);
	}
#endif

	/* Make @p current at every level of its group hierarchy. */
	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);

		set_next_entity(cfs_rq, se);
		/* ensure bandwidth has been allocated on our new cfs_rq */
		account_cfs_rq_runtime(cfs_rq, 0);
	}
}
/* Initialize a freshly allocated (or embedded) cfs_rq. */
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
	cfs_rq->tasks_timeline = RB_ROOT_CACHED;
	/* Start well below zero so early vruntimes can't wrap comparisons. */
	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
#ifndef CONFIG_64BIT
	/* 32-bit needs a seqcount-style copy for lockless readers. */
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
	raw_spin_lock_init(&cfs_rq->removed.lock);
#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
/* Set @p's runqueue/group pointers for its (new) task group. */
static void task_set_group_fair(struct task_struct *p)
{
	struct sched_entity *se = &p->se;

	set_task_rq(p, task_cpu(p));
	/* Depth follows the parent entity in the group hierarchy. */
	se->depth = se->parent ? se->parent->depth + 1 : 0;
}
/* Move @p between task groups: detach from the old, attach to the new. */
static void task_move_group_fair(struct task_struct *p)
{
	detach_task_cfs_rq(p);
	set_task_rq(p, task_cpu(p));

#ifdef CONFIG_SMP
	/* Tell se's cfs_rq has been changed -- migrated */
	p->se.avg.last_update_time = 0;
#endif
	attach_task_cfs_rq(p);
}
/* Dispatch a task-group change to the set or move handler. */
static void task_change_group_fair(struct task_struct *p, int type)
{
	if (type == TASK_SET_GROUP)
		task_set_group_fair(p);
	else if (type == TASK_MOVE_GROUP)
		task_move_group_fair(p);
}
void free_fair_sched_group(struct task_group *tg)
{
int i;
destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
for_each_possible_cpu(i) {
if (tg->cfs_rq)
kfree(tg->cfs_rq[i]);
if (tg->se)
kfree(tg->se[i]);
}
kfree(tg->cfs_rq);
kfree(tg->se);
}
/*
 * Allocate and initialize per-CPU cfs_rq/sched_entity pairs for a new
 * task group under @parent. Returns 1 on success, 0 on failure (the
 * caller frees partial allocations via free_fair_sched_group()).
 */
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	struct sched_entity *se;
	struct cfs_rq *cfs_rq;
	int i;

	tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
	if (!tg->cfs_rq)
		goto err;
	tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
	if (!tg->se)
		goto err;

	tg->shares = NICE_0_LOAD;

	init_cfs_bandwidth(tg_cfs_bandwidth(tg));

	for_each_possible_cpu(i) {
		/* Allocate on the CPU's own node to keep accesses local. */
		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
				      GFP_KERNEL, cpu_to_node(i));
		if (!cfs_rq)
			goto err;

		se = kzalloc_node(sizeof(struct sched_entity),
				  GFP_KERNEL, cpu_to_node(i));
		if (!se)
			goto err_free_rq;

		init_cfs_rq(cfs_rq);
		/* NOTE(review): assumes @parent is non-NULL here - confirm callers. */
		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
		init_entity_runnable_average(se);
	}

	return 1;

err_free_rq:
	kfree(cfs_rq);
err:
	return 0;
}
/* Attach a newly created task group's entities on every possible CPU. */
void online_fair_sched_group(struct task_group *tg)
{
	struct sched_entity *se;
	struct rq_flags rf;
	struct rq *rq;
	int i;

	for_each_possible_cpu(i) {
		rq = cpu_rq(i);
		se = tg->se[i];
		rq_lock_irq(rq, &rf);
		update_rq_clock(rq);
		attach_entity_cfs_rq(se);
		/* Inherit the parent's current throttle state. */
		sync_throttle(tg, i);
		rq_unlock_irq(rq, &rf);
	}
}
/* Tear down a task group's per-CPU state before it is freed. */
void unregister_fair_sched_group(struct task_group *tg)
{
	unsigned long flags;
	struct rq *rq;
	int cpu;

	for_each_possible_cpu(cpu) {
		if (tg->se[cpu])
			remove_entity_load_avg(tg->se[cpu]);

		/*
		 * Only empty task groups can be destroyed; so we can speculatively
		 * check on_list without danger of it being re-added.
		 */
		if (!tg->cfs_rq[cpu]->on_list)
			continue;

		rq = cpu_rq(cpu);

		raw_spin_lock_irqsave(&rq->lock, flags);
		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
		raw_spin_unlock_irqrestore(&rq->lock, flags);
	}
}
/*
 * Wire a (cfs_rq, se) pair into task group @tg for @cpu, linking the
 * entity under @parent (NULL for a first-level group under the root).
 */
void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
			struct sched_entity *se, int cpu,
			struct sched_entity *parent)
{
	struct rq *rq = cpu_rq(cpu);

	cfs_rq->tg = tg;
	cfs_rq->rq = rq;
	init_cfs_rq_runtime(cfs_rq);

	tg->cfs_rq[cpu] = cfs_rq;
	tg->se[cpu] = se;

	/* se could be NULL for root_task_group */
	if (!se)
		return;

	if (!parent) {
		/* First level groups queue directly on the CPU's root cfs_rq. */
		se->cfs_rq = &rq->cfs;
		se->depth = 0;
	} else {
		se->cfs_rq = parent->my_q;
		se->depth = parent->depth + 1;
	}

	se->my_q = cfs_rq;
	/* guarantee group entities always have weight */
	update_load_set(&se->load, NICE_0_LOAD);
	se->parent = parent;
}
/* Serializes concurrent updates of task-group shares. */
static DEFINE_MUTEX(shares_mutex);

/*
 * Set the CPU weight ("shares") of task group @tg and propagate the new
 * contribution up the group hierarchy on every possible CPU.
 * Returns 0 on success, -EINVAL for the root cgroup.
 */
int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
	int i;

	/*
	 * We can't change the weight of the root cgroup.
	 */
	if (!tg->se[0])
		return -EINVAL;

	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));

	mutex_lock(&shares_mutex);
	if (tg->shares == shares)
		goto done;

	tg->shares = shares;
	for_each_possible_cpu(i) {
		struct rq *rq = cpu_rq(i);
		struct sched_entity *se = tg->se[i];
		struct rq_flags rf;

		/* Propagate contribution to hierarchy */
		rq_lock_irqsave(rq, &rf);
		update_rq_clock(rq);
		for_each_sched_entity(se) {
			update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
			update_cfs_group(se);
		}
		rq_unlock_irqrestore(rq, &rf);
	}

done:
	mutex_unlock(&shares_mutex);
	return 0;
}
#else /* CONFIG_FAIR_GROUP_SCHED */
/* Group scheduling disabled: lifecycle hooks degenerate to no-ops. */
void free_fair_sched_group(struct task_group *tg) { }

int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}

void online_fair_sched_group(struct task_group *tg) { }

void unregister_fair_sched_group(struct task_group *tg) { }

#endif /* CONFIG_FAIR_GROUP_SCHED */
static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
{
	struct sched_entity *se = &task->se;

	/*
	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
	 * idle runqueue:
	 */
	if (!rq->cfs.load.weight)
		return 0;

	return NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
}
/*
* All the scheduling class methods:
*/
/* The CFS (SCHED_NORMAL/SCHED_BATCH/SCHED_IDLE) scheduling class vtable. */
const struct sched_class fair_sched_class
	__section("__fair_sched_class") = {
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,
	.yield_to_task		= yield_to_task_fair,

	.check_preempt_curr	= check_preempt_wakeup,

	.pick_next_task		= __pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,
	.set_next_task          = set_next_task_fair,

#ifdef CONFIG_SMP
	.balance		= balance_fair,
	.select_task_rq		= select_task_rq_fair,
	.migrate_task_rq	= migrate_task_rq_fair,

	.rq_online		= rq_online_fair,
	.rq_offline		= rq_offline_fair,

	.task_dead		= task_dead_fair,
	.set_cpus_allowed	= set_cpus_allowed_common,
#endif

	.task_tick		= task_tick_fair,
	.task_fork		= task_fork_fair,

	.prio_changed		= prio_changed_fair,
	.switched_from		= switched_from_fair,
	.switched_to		= switched_to_fair,

	.get_rr_interval	= get_rr_interval_fair,

	.update_curr		= update_curr_fair,

#ifdef CONFIG_FAIR_GROUP_SCHED
	.task_change_group	= task_change_group_fair,
#endif

#ifdef CONFIG_UCLAMP_TASK
	.uclamp_enabled		= 1,
#endif
};
#ifdef CONFIG_SCHED_DEBUG
/* Dump every leaf cfs_rq of @cpu to the debug seq_file. */
void print_cfs_stats(struct seq_file *m, int cpu)
{
	struct cfs_rq *cfs_rq, *pos;

	rcu_read_lock();
	for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
		print_cfs_rq(m, cpu, cfs_rq);
	rcu_read_unlock();
}
#ifdef CONFIG_NUMA_BALANCING
/*
 * Print per-node NUMA fault statistics for @p: task shared/private faults
 * (tsf/tpf) and, when @p belongs to a numa_group, group shared/private
 * faults (gsf/gpf). Values from a previous node deliberately carry over
 * when the corresponding fault array is absent (they stay at their last
 * assigned value, starting from 0).
 */
void show_numa_stats(struct task_struct *p, struct seq_file *m)
{
	int node;
	unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
	struct numa_group *ng;

	rcu_read_lock();
	ng = rcu_dereference(p->numa_group);
	for_each_online_node(node) {
		if (p->numa_faults) {
			tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
			tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
		}
		if (ng) {
			/* Was a stray comma operator; one statement each. */
			gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
			gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
		}
		print_numa_stats(m, node, tsf, tpf, gsf, gpf);
	}
	rcu_read_unlock();
}
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
/* Boot-time setup: register the balance softirq and seed nohz state. */
__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);

#ifdef CONFIG_NO_HZ_COMMON
	nohz.next_balance = jiffies;
	nohz.next_blocked = jiffies;
	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
#endif
#endif /* SMP */
}
/*
* Helper functions to facilitate extracting info from tracepoints.
*/
/* Tracepoint helper: expose @cfs_rq's PELT averages (NULL if unavailable). */
const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
{
#ifdef CONFIG_SMP
	return cfs_rq ? &cfs_rq->avg : NULL;
#else
	return NULL;
#endif
}
EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
/*
 * Tracepoint helper: format @cfs_rq's task-group path into @str.
 * Returns @str, or NULL when both @cfs_rq and @str are NULL.
 */
char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
{
	if (!cfs_rq) {
		if (str)
			strlcpy(str, "(null)", len);
		else
			return NULL;
	}

	/*
	 * NOTE(review): when cfs_rq is NULL but str is valid we still reach
	 * this call with a NULL cfs_rq; presumably cfs_rq_tg_path() tolerates
	 * that - confirm against its definition.
	 */
	cfs_rq_tg_path(cfs_rq, str, len);

	return str;
}
EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
/* Tracepoint helper: CPU number of @cfs_rq's runqueue, or -1 for NULL. */
int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
{
	return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
}
EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
/* Tracepoint helper: @rq's RT PELT averages (NULL if unavailable). */
const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
{
#ifdef CONFIG_SMP
	return rq ? &rq->avg_rt : NULL;
#else
	return NULL;
#endif
}
EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
/* Tracepoint helper: @rq's deadline-class PELT averages (NULL if unavailable). */
const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
{
#ifdef CONFIG_SMP
	return rq ? &rq->avg_dl : NULL;
#else
	return NULL;
#endif
}
EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
/* Tracepoint helper: @rq's IRQ PELT averages (NULL unless IRQ PELT is built). */
const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
{
#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
	return rq ? &rq->avg_irq : NULL;
#else
	return NULL;
#endif
}
EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
/* Tracepoint helper: CPU number of @rq, or -1 for NULL. */
int sched_trace_rq_cpu(struct rq *rq)
{
	return rq ? cpu_of(rq) : -1;
}
EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
/*
 * Tracepoint helper: @rq's CPU capacity, or -1 for NULL.
 * On !SMP every CPU reports the full SCHED_CAPACITY_SCALE.
 */
int sched_trace_rq_cpu_capacity(struct rq *rq)
{
	return rq ?
#ifdef CONFIG_SMP
		rq->cpu_capacity
#else
		SCHED_CAPACITY_SCALE
#endif
		: -1;
}
EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
/* Tracepoint helper: cpumask spanned by root domain @rd (NULL if unavailable). */
const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
{
#ifdef CONFIG_SMP
	return rd ? rd->span : NULL;
#else
	return NULL;
#endif
}
EXPORT_SYMBOL_GPL(sched_trace_rd_span);
/* Tracepoint helper: number of runnable tasks on @rq, or -1 for NULL. */
int sched_trace_rq_nr_running(struct rq *rq)
{
	return rq ? rq->nr_running : -1;
}
EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);