Files
android_kernel_xiaomi_sm8450/net/ipv4/tcp_ipv4.c
Greg Kroah-Hartman 477f5e6b9e Merge 5.10.188 into android12-5.10-lts
Changes in 5.10.188
	media: atomisp: fix "variable dereferenced before check 'asd'"
	x86/smp: Use dedicated cache-line for mwait_play_dead()
	can: isotp: isotp_sendmsg(): fix return error fix on TX path
	video: imsttfb: check for ioremap() failures
	fbdev: imsttfb: Fix use after free bug in imsttfb_probe
	HID: wacom: Use ktime_t rather than int when dealing with timestamps
	HID: logitech-hidpp: add HIDPP_QUIRK_DELAYED_INIT for the T651.
	Revert "thermal/drivers/mediatek: Use devm_of_iomap to avoid resource leak in mtk_thermal_probe"
	scripts/tags.sh: Resolve gtags empty index generation
	drm/amdgpu: Validate VM ioctl flags.
	nubus: Partially revert proc_create_single_data() conversion
	fs: pipe: reveal missing function protoypes
	x86/resctrl: Only show tasks' pid in current pid namespace
	blk-iocost: use spin_lock_irqsave in adjust_inuse_and_calc_cost
	md/raid10: check slab-out-of-bounds in md_bitmap_get_counter
	md/raid10: fix overflow of md/safe_mode_delay
	md/raid10: fix wrong setting of max_corr_read_errors
	md/raid10: fix null-ptr-deref of mreplace in raid10_sync_request
	md/raid10: fix io loss while replacement replace rdev
	irqchip/jcore-aic: Kill use of irq_create_strict_mappings()
	irqchip/jcore-aic: Fix missing allocation of IRQ descriptors
	posix-timers: Prevent RT livelock in itimer_delete()
	tracing/timer: Add missing hrtimer modes to decode_hrtimer_mode().
	clocksource/drivers/cadence-ttc: Fix memory leak in ttc_timer_probe
	PM: domains: fix integer overflow issues in genpd_parse_state()
	perf/arm-cmn: Fix DTC reset
	powercap: RAPL: Fix CONFIG_IOSF_MBI dependency
	ARM: 9303/1: kprobes: avoid missing-declaration warnings
	cpufreq: intel_pstate: Fix energy_performance_preference for passive
	thermal/drivers/sun8i: Fix some error handling paths in sun8i_ths_probe()
	rcuscale: Console output claims too few grace periods
	rcuscale: Always log error message
	rcuscale: Move shutdown from wait_event() to wait_event_idle()
	rcu/rcuscale: Move rcu_scale_*() after kfree_scale_cleanup()
	rcu/rcuscale: Stop kfree_scale_thread thread(s) after unloading rcuscale
	perf/ibs: Fix interface via core pmu events
	x86/mm: Fix __swp_entry_to_pte() for Xen PV guests
	evm: Complete description of evm_inode_setattr()
	ima: Fix build warnings
	pstore/ram: Add check for kstrdup
	igc: Enable and fix RX hash usage by netstack
	wifi: ath9k: fix AR9003 mac hardware hang check register offset calculation
	wifi: ath9k: avoid referencing uninit memory in ath9k_wmi_ctrl_rx
	samples/bpf: Fix buffer overflow in tcp_basertt
	spi: spi-geni-qcom: Correct CS_TOGGLE bit in SPI_TRANS_CFG
	wifi: wilc1000: fix for absent RSN capabilities WFA testcase
	wifi: mwifiex: Fix the size of a memory allocation in mwifiex_ret_802_11_scan()
	bpf: Remove extra lock_sock for TCP_ZEROCOPY_RECEIVE
	sctp: add bpf_bypass_getsockopt proto callback
	libbpf: fix offsetof() and container_of() to work with CO-RE
	nfc: constify several pointers to u8, char and sk_buff
	nfc: llcp: fix possible use of uninitialized variable in nfc_llcp_send_connect()
	bpftool: JIT limited misreported as negative value on aarch64
	regulator: core: Fix more error checking for debugfs_create_dir()
	regulator: core: Streamline debugfs operations
	wifi: orinoco: Fix an error handling path in spectrum_cs_probe()
	wifi: orinoco: Fix an error handling path in orinoco_cs_probe()
	wifi: atmel: Fix an error handling path in atmel_probe()
	wl3501_cs: Fix misspelling and provide missing documentation
	net: create netdev->dev_addr assignment helpers
	wl3501_cs: use eth_hw_addr_set()
	wifi: wl3501_cs: Fix an error handling path in wl3501_probe()
	wifi: ray_cs: Utilize strnlen() in parse_addr()
	wifi: ray_cs: Drop useless status variable in parse_addr()
	wifi: ray_cs: Fix an error handling path in ray_probe()
	wifi: ath9k: don't allow to overwrite ENDPOINT0 attributes
	wifi: rsi: Do not configure WoWlan in shutdown hook if not enabled
	wifi: rsi: Do not set MMC_PM_KEEP_POWER in shutdown
	watchdog/perf: define dummy watchdog_update_hrtimer_threshold() on correct config
	watchdog/perf: more properly prevent false positives with turbo modes
	kexec: fix a memory leak in crash_shrink_memory()
	memstick r592: make memstick_debug_get_tpc_name() static
	wifi: ath9k: Fix possible stall on ath9k_txq_list_has_key()
	rtnetlink: extend RTEXT_FILTER_SKIP_STATS to IFLA_VF_INFO
	wifi: iwlwifi: pull from TXQs with softirqs disabled
	wifi: cfg80211: rewrite merging of inherited elements
	wifi: ath9k: convert msecs to jiffies where needed
	igc: Fix race condition in PTP tx code
	net: stmmac: fix double serdes powerdown
	netlink: fix potential deadlock in netlink_set_err()
	netlink: do not hard code device address lenth in fdb dumps
	selftests: rtnetlink: remove netdevsim device after ipsec offload test
	gtp: Fix use-after-free in __gtp_encap_destroy().
	net: axienet: Move reset before 64-bit DMA detection
	sfc: fix crash when reading stats while NIC is resetting
	nfc: llcp: simplify llcp_sock_connect() error paths
	net: nfc: Fix use-after-free caused by nfc_llcp_find_local
	lib/ts_bm: reset initial match offset for every block of text
	netfilter: conntrack: dccp: copy entire header to stack buffer, not just basic one
	netfilter: nf_conntrack_sip: fix the ct_sip_parse_numerical_param() return value.
	ipvlan: Fix return value of ipvlan_queue_xmit()
	netlink: Add __sock_i_ino() for __netlink_diag_dump().
	radeon: avoid double free in ci_dpm_init()
	drm/amd/display: Explicitly specify update type per plane info change
	Input: drv260x - sleep between polling GO bit
	drm/bridge: tc358768: always enable HS video mode
	drm/bridge: tc358768: fix PLL parameters computation
	drm/bridge: tc358768: fix PLL target frequency
	drm/bridge: tc358768: fix TCLK_ZEROCNT computation
	drm/bridge: tc358768: Add atomic_get_input_bus_fmts() implementation
	drm/bridge: tc358768: fix TCLK_TRAILCNT computation
	drm/bridge: tc358768: fix THS_ZEROCNT computation
	drm/bridge: tc358768: fix TXTAGOCNT computation
	drm/bridge: tc358768: fix THS_TRAILCNT computation
	drm/vram-helper: fix function names in vram helper doc
	ARM: dts: BCM5301X: Drop "clock-names" from the SPI node
	ARM: dts: meson8b: correct uart_B and uart_C clock references
	Input: adxl34x - do not hardcode interrupt trigger type
	drm: sun4i_tcon: use devm_clk_get_enabled in `sun4i_tcon_init_clocks`
	drm/panel: sharp-ls043t1le01: adjust mode settings
	ARM: dts: stm32: Move ethernet MAC EEPROM from SoM to carrier boards
	bus: ti-sysc: Fix dispc quirk masking bool variables
	arm64: dts: microchip: sparx5: do not use PSCI on reference boards
	RDMA/bnxt_re: Disable/kill tasklet only if it is enabled
	RDMA/bnxt_re: Fix to remove unnecessary return labels
	RDMA/bnxt_re: Use unique names while registering interrupts
	RDMA/bnxt_re: Remove a redundant check inside bnxt_re_update_gid
	RDMA/bnxt_re: Fix to remove an unnecessary log
	ARM: dts: gta04: Move model property out of pinctrl node
	arm64: dts: qcom: msm8916: correct camss unit address
	arm64: dts: qcom: msm8994: correct SPMI unit address
	arm64: dts: qcom: msm8996: correct camss unit address
	drm/panel: simple: fix active size for Ampire AM-480272H3TMQW-T01H
	ARM: ep93xx: fix missing-prototype warnings
	ARM: omap2: fix missing tick_broadcast() prototype
	arm64: dts: qcom: apq8096: fix fixed regulator name property
	ARM: dts: stm32: Shorten the AV96 HDMI sound card name
	memory: brcmstb_dpfe: fix testing array offset after use
	ASoC: es8316: Increment max value for ALC Capture Target Volume control
	ASoC: es8316: Do not set rate constraints for unsupported MCLKs
	ARM: dts: meson8: correct uart_B and uart_C clock references
	soc/fsl/qe: fix usb.c build errors
	IB/hfi1: Use bitmap_zalloc() when applicable
	IB/hfi1: Fix sdma.h tx->num_descs off-by-one errors
	IB/hfi1: Fix wrong mmu_node used for user SDMA packet after invalidate
	RDMA: Remove uverbs_ex_cmd_mask values that are linked to functions
	RDMA/hns: Fix coding style issues
	RDMA/hns: Use refcount_t APIs for HEM
	RDMA/hns: Clean the hardware related code for HEM
	RDMA/hns: Fix hns_roce_table_get return value
	ARM: dts: iwg20d-q7-common: Fix backlight pwm specifier
	arm64: dts: renesas: ulcb-kf: Remove flow control for SCIF1
	fbdev: omapfb: lcd_mipid: Fix an error handling path in mipid_spi_probe()
	arm64: dts: ti: k3-j7200: Fix physical address of pin
	ARM: dts: stm32: Fix audio routing on STM32MP15xx DHCOM PDK2
	ARM: dts: stm32: fix i2s endpoint format property for stm32mp15xx-dkx
	hwmon: (gsc-hwmon) fix fan pwm temperature scaling
	hwmon: (adm1275) enable adm1272 temperature reporting
	hwmon: (adm1275) Allow setting sample averaging
	hwmon: (pmbus/adm1275) Fix problems with temperature monitoring on ADM1272
	ARM: dts: BCM5301X: fix duplex-full => full-duplex
	drm/amdkfd: Fix potential deallocation of previously deallocated memory.
	drm/radeon: fix possible division-by-zero errors
	amdgpu: validate offset_in_bo of drm_amdgpu_gem_va
	RDMA/bnxt_re: wraparound mbox producer index
	RDMA/bnxt_re: Avoid calling wake_up threads from spin_lock context
	clk: imx: clk-imx8mn: fix memory leak in imx8mn_clocks_probe
	clk: imx: clk-imx8mp: improve error handling in imx8mp_clocks_probe()
	clk: tegra: tegra124-emc: Fix potential memory leak
	ALSA: ac97: Fix possible NULL dereference in snd_ac97_mixer
	drm/msm/dpu: do not enable color-management if DSPPs are not available
	drm/msm/dp: Free resources after unregistering them
	clk: vc5: check memory returned by kasprintf()
	clk: cdce925: check return value of kasprintf()
	clk: si5341: Allow different output VDD_SEL values
	clk: si5341: Add sysfs properties to allow checking/resetting device faults
	clk: si5341: return error if one synth clock registration fails
	clk: si5341: check return value of {devm_}kasprintf()
	clk: si5341: free unused memory on probe failure
	clk: keystone: sci-clk: check return value of kasprintf()
	clk: ti: clkctrl: check return value of kasprintf()
	drivers: meson: secure-pwrc: always enable DMA domain
	ovl: update of dentry revalidate flags after copy up
	ASoC: imx-audmix: check return value of devm_kasprintf()
	PCI: cadence: Fix Gen2 Link Retraining process
	scsi: qedf: Fix NULL dereference in error handling
	pinctrl: bcm2835: Handle gpiochip_add_pin_range() errors
	PCI/ASPM: Disable ASPM on MFD function removal to avoid use-after-free
	scsi: 3w-xxxx: Add error handling for initialization failure in tw_probe()
	PCI: pciehp: Cancel bringup sequence if card is not present
	PCI: ftpci100: Release the clock resources
	PCI: Add pci_clear_master() stub for non-CONFIG_PCI
	perf bench: Use unbuffered output when pipe/tee'ing to a file
	perf bench: Add missing setlocale() call to allow usage of %'d style formatting
	pinctrl: cherryview: Return correct value if pin in push-pull mode
	kcsan: Don't expect 64 bits atomic builtins from 32 bits architectures
	perf script: Fixup 'struct evsel_script' method prefix
	perf script: Fix allocation of evsel->priv related to per-event dump files
	perf dwarf-aux: Fix off-by-one in die_get_varname()
	pinctrl: at91-pio4: check return value of devm_kasprintf()
	powerpc/powernv/sriov: perform null check on iov before dereferencing iov
	mm: rename pud_page_vaddr to pud_pgtable and make it return pmd_t *
	mm: rename p4d_page_vaddr to p4d_pgtable and make it return pud_t *
	powerpc/book3s64/mm: Fix DirectMap stats in /proc/meminfo
	powerpc/mm/dax: Fix the condition when checking if altmap vmemap can cross-boundary
	hwrng: virtio - add an internal buffer
	hwrng: virtio - don't wait on cleanup
	hwrng: virtio - don't waste entropy
	hwrng: virtio - always add a pending request
	hwrng: virtio - Fix race on data_avail and actual data
	crypto: nx - fix build warnings when DEBUG_FS is not enabled
	modpost: fix section mismatch message for R_ARM_ABS32
	modpost: fix section mismatch message for R_ARM_{PC24,CALL,JUMP24}
	crypto: marvell/cesa - Fix type mismatch warning
	modpost: fix off by one in is_executable_section()
	ARC: define ASM_NL and __ALIGN(_STR) outside #ifdef __ASSEMBLY__ guard
	NFSv4.1: freeze the session table upon receiving NFS4ERR_BADSESSION
	dax: Fix dax_mapping_release() use after free
	dax: Introduce alloc_dev_dax_id()
	hwrng: st - keep clock enabled while hwrng is registered
	io_uring: ensure IOPOLL locks around deferred work
	USB: serial: option: add LARA-R6 01B PIDs
	usb: dwc3: gadget: Propagate core init errors to UDC during pullup
	phy: tegra: xusb: Clear the driver reference in usb-phy dev
	block: fix signed int overflow in Amiga partition support
	block: change all __u32 annotations to __be32 in affs_hardblocks.h
	SUNRPC: Fix UAF in svc_tcp_listen_data_ready()
	w1: w1_therm: fix locking behavior in convert_t
	w1: fix loop in w1_fini()
	sh: j2: Use ioremap() to translate device tree address into kernel memory
	serial: 8250: omap: Fix freeing of resources on failed register
	clk: qcom: gcc-ipq6018: Use floor ops for sdcc clocks
	media: usb: Check az6007_read() return value
	media: videodev2.h: Fix struct v4l2_input tuner index comment
	media: usb: siano: Fix warning due to null work_func_t function pointer
	clk: qcom: reset: Allow specifying custom reset delay
	clk: qcom: reset: support resetting multiple bits
	clk: qcom: ipq6018: fix networking resets
	usb: dwc3: qcom: Fix potential memory leak
	usb: gadget: u_serial: Add null pointer check in gserial_suspend
	extcon: Fix kernel doc of property fields to avoid warnings
	extcon: Fix kernel doc of property capability fields to avoid warnings
	usb: phy: phy-tahvo: fix memory leak in tahvo_usb_probe()
	usb: hide unused usbfs_notify_suspend/resume functions
	serial: 8250: lock port for stop_rx() in omap8250_irq()
	serial: 8250: lock port for UART_IER access in omap8250_irq()
	kernfs: fix missing kernfs_idr_lock to remove an ID from the IDR
	coresight: Fix loss of connection info when a module is unloaded
	mfd: rt5033: Drop rt5033-battery sub-device
	media: venus: helpers: Fix ALIGN() of non power of two
	media: atomisp: gmin_platform: fix out_len in gmin_get_config_dsm_var()
	KVM: s390: fix KVM_S390_GET_CMMA_BITS for GFNs in memslot holes
	usb: dwc3: qcom: Release the correct resources in dwc3_qcom_remove()
	usb: dwc3: qcom: Fix an error handling path in dwc3_qcom_probe()
	usb: common: usb-conn-gpio: Set last role to unknown before initial detection
	usb: dwc3-meson-g12a: Fix an error handling path in dwc3_meson_g12a_probe()
	mfd: intel-lpss: Add missing check for platform_get_resource
	Revert "usb: common: usb-conn-gpio: Set last role to unknown before initial detection"
	serial: 8250_omap: Use force_suspend and resume for system suspend
	test_firmware: return ENOMEM instead of ENOSPC on failed memory allocation
	mfd: stmfx: Fix error path in stmfx_chip_init
	mfd: stmfx: Nullify stmfx->vdd in case of error
	KVM: s390: vsie: fix the length of APCB bitmap
	mfd: stmpe: Only disable the regulators if they are enabled
	phy: tegra: xusb: check return value of devm_kzalloc()
	pwm: imx-tpm: force 'real_period' to be zero in suspend
	pwm: sysfs: Do not apply state to already disabled PWMs
	rtc: st-lpc: Release some resources in st_rtc_probe() in case of error
	media: cec: i2c: ch7322: also select REGMAP
	sctp: fix potential deadlock on &net->sctp.addr_wq_lock
	Add MODULE_FIRMWARE() for FIRMWARE_TG357766.
	net: dsa: vsc73xx: fix MTU configuration
	spi: bcm-qspi: return error if neither hif_mspi nor mspi is available
	mailbox: ti-msgmgr: Fill non-message tx data fields with 0x0
	f2fs: fix error path handling in truncate_dnode()
	octeontx2-af: Fix mapping for NIX block from CGX connection
	powerpc: allow PPC_EARLY_DEBUG_CPM only when SERIAL_CPM=y
	net: bridge: keep ports without IFF_UNICAST_FLT in BR_PROMISC mode
	tcp: annotate data races in __tcp_oow_rate_limited()
	xsk: Honor SO_BINDTODEVICE on bind
	net/sched: act_pedit: Add size check for TCA_PEDIT_PARMS_EX
	pptp: Fix fib lookup calls.
	net: dsa: tag_sja1105: fix MAC DA patching from meta frames
	s390/qeth: Fix vipa deletion
	sh: dma: Fix DMA channel offset calculation
	apparmor: fix missing error check for rhashtable_insert_fast
	i2c: xiic: Defer xiic_wakeup() and __xiic_start_xfer() in xiic_process()
	i2c: xiic: Don't try to handle more interrupt events after error
	ALSA: jack: Fix mutex call in snd_jack_report()
	i2c: qup: Add missing unwind goto in qup_i2c_probe()
	NFSD: add encoding of op_recall flag for write delegation
	io_uring: wait interruptibly for request completions on exit
	mmc: core: disable TRIM on Kingston EMMC04G-M627
	mmc: core: disable TRIM on Micron MTFC4GACAJCN-1M
	mmc: mmci: Set PROBE_PREFER_ASYNCHRONOUS
	mmc: sdhci: fix DMA configure compatibility issue when 64bit DMA mode is used.
	bcache: fixup btree_cache_wait list damage
	bcache: Remove unnecessary NULL point check in node allocations
	bcache: Fix __bch_btree_node_alloc to make the failure behavior consistent
	um: Use HOST_DIR for mrproper
	integrity: Fix possible multiple allocation in integrity_inode_get()
	autofs: use flexible array in ioctl structure
	shmem: use ramfs_kill_sb() for kill_sb method of ramfs-based tmpfs
	jffs2: reduce stack usage in jffs2_build_xattr_subsystem()
	fs: avoid empty option when generating legacy mount string
	ext4: Remove ext4 locking of moved directory
	Revert "f2fs: fix potential corruption when moving a directory"
	fs: Establish locking order for unrelated directories
	fs: Lock moved directories
	btrfs: add handling for RAID1C23/DUP to btrfs_reduce_alloc_profile
	btrfs: fix race when deleting quota root from the dirty cow roots list
	ASoC: mediatek: mt8173: Fix irq error path
	ASoC: mediatek: mt8173: Fix snd_soc_component_initialize error path
	ARM: orion5x: fix d2net gpio initialization
	leds: trigger: netdev: Recheck NETDEV_LED_MODE_LINKUP on dev rename
	fs: no need to check source
	fanotify: disallow mount/sb marks on kernel internal pseudo fs
	tpm, tpm_tis: Claim locality in interrupt handler
	selftests/bpf: Add verifier test for PTR_TO_MEM spill
	block: add overflow checks for Amiga partition support
	sh: pgtable-3level: Fix cast to pointer from integer of different size
	netfilter: nf_tables: use net_generic infra for transaction data
	netfilter: nf_tables: add rescheduling points during loop detection walks
	netfilter: nf_tables: incorrect error path handling with NFT_MSG_NEWRULE
	netfilter: nf_tables: fix chain binding transaction logic
	netfilter: nf_tables: add NFT_TRANS_PREPARE_ERROR to deal with bound set/chain
	netfilter: nf_tables: reject unbound anonymous set before commit phase
	netfilter: nf_tables: reject unbound chain set before commit phase
	netfilter: nftables: rename set element data activation/deactivation functions
	netfilter: nf_tables: drop map element references from preparation phase
	netfilter: nf_tables: unbind non-anonymous set if rule construction fails
	netfilter: nf_tables: fix scheduling-while-atomic splat
	netfilter: conntrack: Avoid nf_ct_helper_hash uses after free
	netfilter: nf_tables: do not ignore genmask when looking up chain by id
	netfilter: nf_tables: prevent OOB access in nft_byteorder_eval
	wireguard: queueing: use saner cpu selection wrapping
	wireguard: netlink: send staged packets when setting initial private key
	tty: serial: fsl_lpuart: add earlycon for imx8ulp platform
	rcu-tasks: Mark ->trc_reader_nesting data races
	rcu-tasks: Mark ->trc_reader_special.b.need_qs data races
	rcu-tasks: Simplify trc_read_check_handler() atomic operations
	block/partition: fix signedness issue for Amiga partitions
	io_uring: Use io_schedule* in cqring wait
	io_uring: add reschedule point to handle_tw_list()
	net: lan743x: Don't sleep in atomic context
	workqueue: clean up WORK_* constant types, clarify masking
	drm/panel: simple: Add connector_type for innolux_at043tn24
	drm/panel: simple: Add Powertip PH800480T013 drm_display_mode flags
	igc: Remove delay during TX ring configuration
	net/mlx5e: fix double free in mlx5e_destroy_flow_table
	net/mlx5e: Check for NOT_READY flag state after locking
	igc: set TP bit in 'supported' and 'advertising' fields of ethtool_link_ksettings
	scsi: qla2xxx: Fix error code in qla2x00_start_sp()
	net: mvneta: fix txq_map in case of txq_number==1
	net/sched: cls_fw: Fix improper refcount update leads to use-after-free
	gve: Set default duplex configuration to full
	ionic: remove WARN_ON to prevent panic_on_warn
	net: bgmac: postpone turning IRQs off to avoid SoC hangs
	net: prevent skb corruption on frag list segmentation
	icmp6: Fix null-ptr-deref of ip6_null_entry->rt6i_idev in icmp6_dev().
	udp6: fix udp6_ehashfn() typo
	ntb: idt: Fix error handling in idt_pci_driver_init()
	NTB: amd: Fix error handling in amd_ntb_pci_driver_init()
	ntb: intel: Fix error handling in intel_ntb_pci_driver_init()
	NTB: ntb_transport: fix possible memory leak while device_register() fails
	NTB: ntb_tool: Add check for devm_kcalloc
	ipv6/addrconf: fix a potential refcount underflow for idev
	platform/x86: wmi: remove unnecessary argument
	platform/x86: wmi: use guid_t and guid_equal()
	platform/x86: wmi: move variables
	platform/x86: wmi: Break possible infinite loop when parsing GUID
	igc: Fix launchtime before start of cycle
	igc: Fix inserting of empty frame for launchtime
	riscv: bpf: Move bpf_jit_alloc_exec() and bpf_jit_free_exec() to core
	riscv: bpf: Avoid breaking W^X
	bpf, riscv: Support riscv jit to provide bpf_line_info
	riscv, bpf: Fix inconsistent JIT image generation
	erofs: avoid infinite loop in z_erofs_do_read_page() when reading beyond EOF
	wifi: airo: avoid uninitialized warning in airo_get_rate()
	net/sched: flower: Ensure both minimum and maximum ports are specified
	netdevsim: fix uninitialized data in nsim_dev_trap_fa_cookie_write()
	net/sched: make psched_mtu() RTNL-less safe
	net/sched: sch_qfq: refactor parsing of netlink parameters
	net/sched: sch_qfq: account for stab overhead in qfq_enqueue
	nvme-pci: fix DMA direction of unmapping integrity data
	f2fs: fix to avoid NULL pointer dereference f2fs_write_end_io()
	pinctrl: amd: Fix mistake in handling clearing pins at startup
	pinctrl: amd: Detect internal GPIO0 debounce handling
	pinctrl: amd: Only use special debounce behavior for GPIO 0
	tpm: tpm_vtpm_proxy: fix a race condition in /dev/vtpmx creation
	mtd: rawnand: meson: fix unaligned DMA buffers handling
	net: bcmgenet: Ensure MDIO unregistration has clocks enabled
	powerpc: Fail build if using recordmcount with binutils v2.37
	misc: fastrpc: Create fastrpc scalar with correct buffer count
	erofs: fix compact 4B support for 16k block size
	MIPS: Loongson: Fix cpu_probe_loongson() again
	ext4: Fix reusing stale buffer heads from last failed mounting
	ext4: fix wrong unit use in ext4_mb_clear_bb
	ext4: get block from bh in ext4_free_blocks for fast commit replay
	ext4: fix wrong unit use in ext4_mb_new_blocks
	ext4: only update i_reserved_data_blocks on successful block allocation
	jfs: jfs_dmap: Validate db_l2nbperpage while mounting
	hwrng: imx-rngc - fix the timeout for init and self check
	PCI/PM: Avoid putting EloPOS E2/S2/H2 PCIe Ports in D3cold
	PCI: Add function 1 DMA alias quirk for Marvell 88SE9235
	PCI: qcom: Disable write access to read only registers for IP v2.3.3
	PCI: rockchip: Assert PCI Configuration Enable bit after probe
	PCI: rockchip: Write PCI Device ID to correct register
	PCI: rockchip: Add poll and timeout to wait for PHY PLLs to be locked
	PCI: rockchip: Fix legacy IRQ generation for RK3399 PCIe endpoint core
	PCI: rockchip: Use u32 variable to access 32-bit registers
	PCI: rockchip: Set address alignment for endpoint mode
	misc: pci_endpoint_test: Free IRQs before removing the device
	misc: pci_endpoint_test: Re-init completion for every test
	md/raid0: add discard support for the 'original' layout
	fs: dlm: return positive pid value for F_GETLK
	drm/atomic: Allow vblank-enabled + self-refresh "disable"
	drm/rockchip: vop: Leave vblank enabled in self-refresh
	drm/amd/display: Correct `DMUB_FW_VERSION` macro
	serial: atmel: don't enable IRQs prematurely
	tty: serial: samsung_tty: Fix a memory leak in s3c24xx_serial_getclk() in case of error
	tty: serial: samsung_tty: Fix a memory leak in s3c24xx_serial_getclk() when iterating clk
	firmware: stratix10-svc: Fix a potential resource leak in svc_create_memory_pool()
	ceph: don't let check_caps skip sending responses for revoke msgs
	xhci: Fix resume issue of some ZHAOXIN hosts
	xhci: Fix TRB prefetch issue of ZHAOXIN hosts
	xhci: Show ZHAOXIN xHCI root hub speed correctly
	meson saradc: fix clock divider mask length
	Revert "8250: add support for ASIX devices with a FIFO bug"
	s390/decompressor: fix misaligned symbol build error
	tracing/histograms: Add histograms to hist_vars if they have referenced variables
	samples: ftrace: Save required argument registers in sample trampolines
	net: ena: fix shift-out-of-bounds in exponential backoff
	ring-buffer: Fix deadloop issue on reading trace_pipe
	xtensa: ISS: fix call to split_if_spec
	tracing: Fix null pointer dereference in tracing_err_log_open()
	tracing/probes: Fix not to count error code to total length
	scsi: qla2xxx: Wait for io return on terminate rport
	scsi: qla2xxx: Array index may go out of bound
	scsi: qla2xxx: Fix buffer overrun
	scsi: qla2xxx: Fix potential NULL pointer dereference
	scsi: qla2xxx: Check valid rport returned by fc_bsg_to_rport()
	scsi: qla2xxx: Correct the index of array
	scsi: qla2xxx: Pointer may be dereferenced
	scsi: qla2xxx: Remove unused nvme_ls_waitq wait queue
	net/sched: sch_qfq: reintroduce lmax bound check for MTU
	RDMA/cma: Ensure rdma_addr_cancel() happens before issuing more requests
	drm/atomic: Fix potential use-after-free in nonblocking commits
	ALSA: hda/realtek - remove 3k pull low procedure
	ALSA: hda/realtek: Enable Mute LED on HP Laptop 15s-eq2xxx
	keys: Fix linking a duplicate key to a keyring's assoc_array
	perf probe: Add test for regression introduced by switch to die_get_decl_file()
	btrfs: fix warning when putting transaction with qgroups enabled after abort
	fuse: revalidate: don't invalidate if interrupted
	selftests: tc: set timeout to 15 minutes
	selftests: tc: add 'ct' action kconfig dep
	regmap: Drop initial version of maximum transfer length fixes
	regmap: Account for register length in SMBus I/O limits
	can: bcm: Fix UAF in bcm_proc_show()
	drm/client: Fix memory leak in drm_client_target_cloned
	drm/client: Fix memory leak in drm_client_modeset_probe
	ASoC: fsl_sai: Disable bit clock with transmitter
	ext4: correct inline offset when handling xattrs in inode body
	debugobjects: Recheck debug_objects_enabled before reporting
	nbd: Add the maximum limit of allocated index in nbd_dev_add
	md: fix data corruption for raid456 when reshape restart while grow up
	md/raid10: prevent soft lockup while flush writes
	posix-timers: Ensure timer ID search-loop limit is valid
	btrfs: add xxhash to fast checksum implementations
	ACPI: button: Add lid disable DMI quirk for Nextbook Ares 8A
	ACPI: video: Add backlight=native DMI quirk for Apple iMac11,3
	ACPI: video: Add backlight=native DMI quirk for Lenovo ThinkPad X131e (3371 AMD version)
	arm64: set __exception_irq_entry with __irq_entry as a default
	arm64: mm: fix VA-range sanity check
	sched/fair: Don't balance task to its current running CPU
	wifi: ath11k: fix registration of 6Ghz-only phy without the full channel range
	bpf: Address KCSAN report on bpf_lru_list
	devlink: report devlink_port_type_warn source device
	wifi: wext-core: Fix -Wstringop-overflow warning in ioctl_standard_iw_point()
	wifi: iwlwifi: mvm: avoid baid size integer overflow
	igb: Fix igb_down hung on surprise removal
	spi: bcm63xx: fix max prepend length
	fbdev: imxfb: warn about invalid left/right margin
	pinctrl: amd: Use amd_pinconf_set() for all config options
	net: ethernet: ti: cpsw_ale: Fix cpsw_ale_get_field()/cpsw_ale_set_field()
	bridge: Add extack warning when enabling STP in netns.
	iavf: Fix use-after-free in free_netdev
	iavf: Fix out-of-bounds when setting channels on remove
	security: keys: Modify mismatched function name
	octeontx2-pf: Dont allocate BPIDs for LBK interfaces
	tcp: annotate data-races around tcp_rsk(req)->ts_recent
	net: ipv4: Use kfree_sensitive instead of kfree
	net:ipv6: check return value of pskb_trim()
	Revert "tcp: avoid the lookup process failing to get sk in ehash table"
	fbdev: au1200fb: Fix missing IRQ check in au1200fb_drv_probe
	llc: Don't drop packet from non-root netns.
	netfilter: nf_tables: fix spurious set element insertion failure
	netfilter: nf_tables: can't schedule in nft_chain_validate
	netfilter: nft_set_pipapo: fix improper element removal
	netfilter: nf_tables: skip bound chain in netns release path
	netfilter: nf_tables: skip bound chain on rule flush
	tcp: annotate data-races around tp->tcp_tx_delay
	tcp: annotate data-races around tp->keepalive_time
	tcp: annotate data-races around tp->keepalive_intvl
	tcp: annotate data-races around tp->keepalive_probes
	net: Introduce net.ipv4.tcp_migrate_req.
	tcp: Fix data-races around sysctl_tcp_syn(ack)?_retries.
	tcp: annotate data-races around icsk->icsk_syn_retries
	tcp: annotate data-races around tp->linger2
	tcp: annotate data-races around rskq_defer_accept
	tcp: annotate data-races around tp->notsent_lowat
	tcp: annotate data-races around icsk->icsk_user_timeout
	tcp: annotate data-races around fastopenq.max_qlen
	net: phy: prevent stale pointer dereference in phy_init()
	tracing/histograms: Return an error if we fail to add histogram to hist_vars list
	tracing: Fix memory leak of iter->temp when reading trace_pipe
	ftrace: Store the order of pages allocated in ftrace_page
	ftrace: Fix possible warning on checking all pages used in ftrace_process_locs()
	Linux 5.10.188

Change-Id: Ibcc1adc43df5b8f649b12078eedd5d4f57de4578
Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
2023-08-03 11:23:27 +00:00

3038 lines
79 KiB
C

// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* Implementation of the Transmission Control Protocol(TCP).
*
* IPv4 specific functions
*
* code split from:
* linux/ipv4/tcp.c
* linux/ipv4/tcp_input.c
* linux/ipv4/tcp_output.c
*
* See tcp.c for author information
*/
/*
* Changes:
* David S. Miller : New socket lookup architecture.
* This code is dedicated to John Dyson.
* David S. Miller : Change semantics of established hash,
* half is devoted to TIME_WAIT sockets
* and the rest go in the other half.
* Andi Kleen : Add support for syncookies and fixed
* some bugs: ip options weren't passed to
* the TCP layer, missed a check for an
* ACK bit.
* Andi Kleen : Implemented fast path mtu discovery.
* Fixed many serious bugs in the
* request_sock handling and moved
* most of it into the af independent code.
* Added tail drop and some other bugfixes.
* Added new listen semantics.
* Mike McLagan : Routing by source
* Juan Jose Ciarlante: ip_dynaddr bits
* Andi Kleen: various fixes.
* Vitaly E. Lavrov : Transparent proxy revived after year
* coma.
* Andi Kleen : Fix new listen.
* Andi Kleen : Fix accept error reporting.
* YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
* Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
* a single port at the same time.
*/
#define pr_fmt(fmt) "TCP: " fmt
#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>
#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>
#include <crypto/hash.h>
#include <linux/scatterlist.h>
#include <trace/events/tcp.h>
#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
__be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
return secure_tcp_seq(ip_hdr(skb)->daddr,
ip_hdr(skb)->saddr,
tcp_hdr(skb)->dest,
tcp_hdr(skb)->source);
}
static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
const struct inet_timewait_sock *tw = inet_twsk(sktw);
const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
struct tcp_sock *tp = tcp_sk(sk);
if (reuse == 2) {
/* Still does not detect *everything* that goes through
* lo, since we require a loopback src or dst address
* or direct binding to 'lo' interface.
*/
bool loopback = false;
if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == AF_INET6) {
if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
loopback = true;
} else
#endif
{
if (ipv4_is_loopback(tw->tw_daddr) ||
ipv4_is_loopback(tw->tw_rcv_saddr))
loopback = true;
}
if (!loopback)
reuse = 0;
}
/* With PAWS, it is safe from the viewpoint
of data integrity. Even without PAWS it is safe provided sequence
spaces do not overlap i.e. at data rates <= 80Mbit/sec.
Actually, the idea is close to VJ's one, only timestamp cache is
held not per host, but per port pair and TW bucket is used as state
holder.
If TW bucket has been already destroyed we fall back to VJ's scheme
and use initial timestamp retrieved from peer table.
*/
if (tcptw->tw_ts_recent_stamp &&
(!twp || (reuse && time_after32(ktime_get_seconds(),
tcptw->tw_ts_recent_stamp)))) {
/* In case of repair and re-using TIME-WAIT sockets we still
* want to be sure that it is safe as above but honor the
* sequence numbers and time stamps set as part of the repair
* process.
*
* Without this check re-using a TIME-WAIT socket with TCP
* repair would accumulate a -1 on the repair assigned
* sequence number. The first time it is reused the sequence
* is -1, the second time -2, etc. This fixes that issue
* without appearing to create any others.
*/
if (likely(!tp->repair)) {
u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
if (!seq)
seq = 1;
WRITE_ONCE(tp->write_seq, seq);
tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
}
sock_hold(sktw);
return 1;
}
return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
int addr_len)
{
/* This check is replicated from tcp_v4_connect() and intended to
* prevent BPF program called below from accessing bytes that are out
* of the bound specified by user in addr_len.
*/
if (addr_len < sizeof(struct sockaddr_in))
return -EINVAL;
sock_owned_by_me(sk);
return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
__be16 orig_sport, orig_dport;
__be32 daddr, nexthop;
struct flowi4 *fl4;
struct rtable *rt;
int err;
struct ip_options_rcu *inet_opt;
struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
if (addr_len < sizeof(struct sockaddr_in))
return -EINVAL;
if (usin->sin_family != AF_INET)
return -EAFNOSUPPORT;
nexthop = daddr = usin->sin_addr.s_addr;
inet_opt = rcu_dereference_protected(inet->inet_opt,
lockdep_sock_is_held(sk));
if (inet_opt && inet_opt->opt.srr) {
if (!daddr)
return -EINVAL;
nexthop = inet_opt->opt.faddr;
}
orig_sport = inet->inet_sport;
orig_dport = usin->sin_port;
fl4 = &inet->cork.fl.u.ip4;
rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
IPPROTO_TCP,
orig_sport, orig_dport, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
if (err == -ENETUNREACH)
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
return err;
}
if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
ip_rt_put(rt);
return -ENETUNREACH;
}
if (!inet_opt || !inet_opt->opt.srr)
daddr = fl4->daddr;
if (!inet->inet_saddr)
inet->inet_saddr = fl4->saddr;
sk_rcv_saddr_set(sk, inet->inet_saddr);
if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
/* Reset inherited state */
tp->rx_opt.ts_recent = 0;
tp->rx_opt.ts_recent_stamp = 0;
if (likely(!tp->repair))
WRITE_ONCE(tp->write_seq, 0);
}
inet->inet_dport = usin->sin_port;
sk_daddr_set(sk, daddr);
inet_csk(sk)->icsk_ext_hdr_len = 0;
if (inet_opt)
inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
/* Socket identity is still unknown (sport may be zero).
* However we set state to SYN-SENT and not releasing socket
* lock select source port, enter ourselves into the hash tables and
* complete initialization after this.
*/
tcp_set_state(sk, TCP_SYN_SENT);
err = inet_hash_connect(tcp_death_row, sk);
if (err)
goto failure;
sk_set_txhash(sk);
rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
inet->inet_sport, inet->inet_dport, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
rt = NULL;
goto failure;
}
/* OK, now commit destination to socket. */
sk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(sk, &rt->dst);
rt = NULL;
if (likely(!tp->repair)) {
if (!tp->write_seq)
WRITE_ONCE(tp->write_seq,
secure_tcp_seq(inet->inet_saddr,
inet->inet_daddr,
inet->inet_sport,
usin->sin_port));
tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
inet->inet_saddr,
inet->inet_daddr);
}
inet->inet_id = prandom_u32();
if (tcp_fastopen_defer_connect(sk, &err))
return err;
if (err)
goto failure;
err = tcp_connect(sk);
if (err)
goto failure;
return 0;
failure:
/*
* This unhashes the socket and releases the local port,
* if necessary.
*/
tcp_set_state(sk, TCP_CLOSE);
if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
inet_reset_saddr(sk);
ip_rt_put(rt);
sk->sk_route_caps = 0;
inet->inet_dport = 0;
return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
/*
* This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
* It can be called through tcp_release_cb() if socket was owned by user
* at the time tcp_v4_err() was called to handle ICMP message.
*/
void tcp_v4_mtu_reduced(struct sock *sk)
{
struct inet_sock *inet = inet_sk(sk);
struct dst_entry *dst;
u32 mtu;
if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
return;
mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
dst = inet_csk_update_pmtu(sk, mtu);
if (!dst)
return;
/* Something is about to be wrong... Remember soft error
* for the case, if this connection will not able to recover.
*/
if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
sk->sk_err_soft = EMSGSIZE;
mtu = dst_mtu(dst);
if (inet->pmtudisc != IP_PMTUDISC_DONT &&
ip_sk_accept_pmtu(sk) &&
inet_csk(sk)->icsk_pmtu_cookie > mtu) {
tcp_sync_mss(sk, mtu);
/* Resend the TCP packet because it's
* clear that the old packet has been
* dropped. This is the new "fast" path mtu
* discovery.
*/
tcp_simple_retransmit(sk);
} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
struct dst_entry *dst = __sk_dst_check(sk, 0);
if (dst)
dst->ops->redirect(dst, sk, skb);
}
/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
struct request_sock *req = inet_reqsk(sk);
struct net *net = sock_net(sk);
/* ICMPs are not backlogged, hence we cannot get
* an established socket here.
*/
if (seq != tcp_rsk(req)->snt_isn) {
__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
} else if (abort) {
/*
* Still in SYN_RECV, just remove it silently.
* There is no good way to pass the error to the newly
* created socket, and POSIX does not want network
* errors returned from accept().
*/
inet_csk_reqsk_queue_drop(req->rsk_listener, req);
tcp_listendrop(req->rsk_listener);
}
reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
s32 remaining;
u32 delta_us;
if (sock_owned_by_user(sk))
return;
if (seq != tp->snd_una || !icsk->icsk_retransmits ||
!icsk->icsk_backoff)
return;
skb = tcp_rtx_queue_head(sk);
if (WARN_ON_ONCE(!skb))
return;
icsk->icsk_backoff--;
icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
tcp_mstamp_refresh(tp);
delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
if (remaining > 0) {
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
remaining, TCP_RTO_MAX);
} else {
/* RTO revert clocked out retransmission.
* Will retransmit now.
*/
tcp_retransmit_timer(sk);
}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);
/*
* This routine is called by the ICMP module when it gets some
* sort of error condition. If err < 0 then the socket should
* be closed and the error returned to the user. If err > 0
* it's just the icmp type << 8 | icmp code. After adjustment
* header points to the first 8 bytes of the tcp header. We need
* to find the appropriate port.
*
* The locking strategy used here is very "optimistic". When
* someone else accesses the socket the ICMP is just dropped
* and for some paths there is no check at all.
* A more general error queue to queue errors for later handling
* is probably better.
*
*/
int tcp_v4_err(struct sk_buff *skb, u32 info)
{
const struct iphdr *iph = (const struct iphdr *)skb->data;
struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
struct tcp_sock *tp;
struct inet_sock *inet;
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
struct sock *sk;
struct request_sock *fastopen;
u32 seq, snd_una;
int err;
struct net *net = dev_net(skb->dev);
sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
th->dest, iph->saddr, ntohs(th->source),
inet_iif(skb), 0);
if (!sk) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
return -ENOENT;
}
if (sk->sk_state == TCP_TIME_WAIT) {
inet_twsk_put(inet_twsk(sk));
return 0;
}
seq = ntohl(th->seq);
if (sk->sk_state == TCP_NEW_SYN_RECV) {
tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
type == ICMP_TIME_EXCEEDED ||
(type == ICMP_DEST_UNREACH &&
(code == ICMP_NET_UNREACH ||
code == ICMP_HOST_UNREACH)));
return 0;
}
bh_lock_sock(sk);
/* If too many ICMPs get dropped on busy
* servers this needs to be solved differently.
* We do take care of PMTU discovery (RFC1191) special case :
* we can receive locally generated ICMP messages while socket is held.
*/
if (sock_owned_by_user(sk)) {
if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
}
if (sk->sk_state == TCP_CLOSE)
goto out;
if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto out;
}
tp = tcp_sk(sk);
/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
fastopen = rcu_dereference(tp->fastopen_rsk);
snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
if (sk->sk_state != TCP_LISTEN &&
!between(seq, snd_una, tp->snd_nxt)) {
__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
goto out;
}
switch (type) {
case ICMP_REDIRECT:
if (!sock_owned_by_user(sk))
do_redirect(skb, sk);
goto out;
case ICMP_SOURCE_QUENCH:
/* Just silently ignore these. */
goto out;
case ICMP_PARAMETERPROB:
err = EPROTO;
break;
case ICMP_DEST_UNREACH:
if (code > NR_ICMP_UNREACH)
goto out;
if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
/* We are not interested in TCP_LISTEN and open_requests
* (SYN-ACKs send out by Linux are always <576bytes so
* they should go through unfragmented).
*/
if (sk->sk_state == TCP_LISTEN)
goto out;
WRITE_ONCE(tp->mtu_info, info);
if (!sock_owned_by_user(sk)) {
tcp_v4_mtu_reduced(sk);
} else {
if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
goto out;
}
err = icmp_err_convert[code].errno;
/* check if this ICMP message allows revert of backoff.
* (see RFC 6069)
*/
if (!fastopen &&
(code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
tcp_ld_RTO_revert(sk, seq);
break;
case ICMP_TIME_EXCEEDED:
err = EHOSTUNREACH;
break;
default:
goto out;
}
switch (sk->sk_state) {
case TCP_SYN_SENT:
case TCP_SYN_RECV:
/* Only in fast or simultaneous open. If a fast open socket is
* already accepted it is treated as a connected one below.
*/
if (fastopen && !fastopen->sk)
break;
ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
if (!sock_owned_by_user(sk)) {
sk->sk_err = err;
sk->sk_error_report(sk);
tcp_done(sk);
} else {
sk->sk_err_soft = err;
}
goto out;
}
/* If we've already connected we will keep trying
* until we time out, or the user gives up.
*
* rfc1122 4.2.3.9 allows to consider as hard errors
* only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
* but it is obsoleted by pmtu discovery).
*
* Note, that in modern internet, where routing is unreliable
* and in each dark corner broken firewalls sit, sending random
* errors ordered by their masters even this two messages finally lose
* their original sense (even Linux sends invalid PORT_UNREACHs)
*
* Now we are in compliance with RFCs.
* --ANK (980905)
*/
inet = inet_sk(sk);
if (!sock_owned_by_user(sk) && inet->recverr) {
sk->sk_err = err;
sk->sk_error_report(sk);
} else { /* Only an error on timeout */
sk->sk_err_soft = err;
}
out:
bh_unlock_sock(sk);
sock_put(sk);
return 0;
}
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
struct tcphdr *th = tcp_hdr(skb);
th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
skb->csum_start = skb_transport_header(skb) - skb->head;
skb->csum_offset = offsetof(struct tcphdr, check);
}
/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
const struct inet_sock *inet = inet_sk(sk);
__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
/*
* This routine will send an RST to the other tcp.
*
* Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
* for reset.
* Answer: if a packet caused RST, it is not for a socket
* existing in our system, if it is matched to a socket,
* it is just duplicate segment or bug in other side's TCP.
* So that we build reply only basing on parameters
* arrived with segment.
* Exception: precedence violation. We do not implement it in any case.
*/
static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
const struct tcphdr *th = tcp_hdr(skb);
struct {
struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
} rep;
struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *key = NULL;
const __u8 *hash_location = NULL;
unsigned char newhash[16];
int genhash;
struct sock *sk1 = NULL;
#endif
u64 transmit_time = 0;
struct sock *ctl_sk;
struct net *net;
/* Never send a reset in response to a reset. */
if (th->rst)
return;
/* If sk not NULL, it means we did a successful lookup and incoming
* route had to be correct. prequeue might have dropped our dst.
*/
if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
return;
/* Swap the send and the receive. */
memset(&rep, 0, sizeof(rep));
rep.th.dest = th->source;
rep.th.source = th->dest;
rep.th.doff = sizeof(struct tcphdr) / 4;
rep.th.rst = 1;
if (th->ack) {
rep.th.seq = th->ack_seq;
} else {
rep.th.ack = 1;
rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
skb->len - (th->doff << 2));
}
memset(&arg, 0, sizeof(arg));
arg.iov[0].iov_base = (unsigned char *)&rep;
arg.iov[0].iov_len = sizeof(rep.th);
net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
rcu_read_lock();
hash_location = tcp_parse_md5sig_option(th);
if (sk && sk_fullsock(sk)) {
const union tcp_md5_addr *addr;
int l3index;
/* sdif set, means packet ingressed via a device
* in an L3 domain and inet_iif is set to it.
*/
l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
} else if (hash_location) {
const union tcp_md5_addr *addr;
int sdif = tcp_v4_sdif(skb);
int dif = inet_iif(skb);
int l3index;
/*
* active side is lost. Try to find listening socket through
* source port, and then find md5 key through listening socket.
* we are not loose security here:
* Incoming packet is checked with md5 hash with finding key,
* no RST generated if md5 hash doesn't match.
*/
sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
ip_hdr(skb)->saddr,
th->source, ip_hdr(skb)->daddr,
ntohs(th->source), dif, sdif);
/* don't send rst if it can't find key */
if (!sk1)
goto out;
/* sdif set, means packet ingressed via a device
* in an L3 domain and dif is set to it.
*/
l3index = sdif ? dif : 0;
addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
if (!key)
goto out;
genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
if (genhash || memcmp(hash_location, newhash, 16) != 0)
goto out;
}
if (key) {
rep.opt[0] = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_MD5SIG << 8) |
TCPOLEN_MD5SIG);
/* Update length and the length the header thinks exists */
arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
rep.th.doff = arg.iov[0].iov_len / 4;
tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
key, ip_hdr(skb)->saddr,
ip_hdr(skb)->daddr, &rep.th);
}
#endif
arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
ip_hdr(skb)->saddr, /* XXX */
arg.iov[0].iov_len, IPPROTO_TCP, 0);
arg.csumoffset = offsetof(struct tcphdr, check) / 2;
arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
/* When socket is gone, all binding information is lost.
* routing might fail in this case. No choice here, if we choose to force
* input interface, we will misroute in case of asymmetric route.
*/
if (sk) {
arg.bound_dev_if = sk->sk_bound_dev_if;
if (sk_fullsock(sk))
trace_tcp_send_reset(sk, skb);
}
BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
offsetof(struct inet_timewait_sock, tw_bound_dev_if));
arg.tos = ip_hdr(skb)->tos;
arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
if (sk) {
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_mark : sk->sk_mark;
ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_priority : sk->sk_priority;
transmit_time = tcp_transmit_time(sk);
}
ip_send_unicast_reply(ctl_sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
&arg, arg.iov[0].iov_len,
transmit_time);
ctl_sk->sk_mark = 0;
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
local_bh_enable();
#ifdef CONFIG_TCP_MD5SIG
out:
rcu_read_unlock();
#endif
}
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
outside socket context is ugly, certainly. What can I do?
*/
static void tcp_v4_send_ack(const struct sock *sk,
struct sk_buff *skb, u32 seq, u32 ack,
u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key,
int reply_flags, u8 tos)
{
const struct tcphdr *th = tcp_hdr(skb);
struct {
struct tcphdr th;
__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
+ (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
];
} rep;
struct net *net = sock_net(sk);
struct ip_reply_arg arg;
struct sock *ctl_sk;
u64 transmit_time;
memset(&rep.th, 0, sizeof(struct tcphdr));
memset(&arg, 0, sizeof(arg));
arg.iov[0].iov_base = (unsigned char *)&rep;
arg.iov[0].iov_len = sizeof(rep.th);
if (tsecr) {
rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
(TCPOPT_TIMESTAMP << 8) |
TCPOLEN_TIMESTAMP);
rep.opt[1] = htonl(tsval);
rep.opt[2] = htonl(tsecr);
arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
}
/* Swap the send and the receive. */
rep.th.dest = th->source;
rep.th.source = th->dest;
rep.th.doff = arg.iov[0].iov_len / 4;
rep.th.seq = htonl(seq);
rep.th.ack_seq = htonl(ack);
rep.th.ack = 1;
rep.th.window = htons(win);
#ifdef CONFIG_TCP_MD5SIG
if (key) {
int offset = (tsecr) ? 3 : 0;
rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_MD5SIG << 8) |
TCPOLEN_MD5SIG);
arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
rep.th.doff = arg.iov[0].iov_len/4;
tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
key, ip_hdr(skb)->saddr,
ip_hdr(skb)->daddr, &rep.th);
}
#endif
arg.flags = reply_flags;
arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
ip_hdr(skb)->saddr, /* XXX */
arg.iov[0].iov_len, IPPROTO_TCP, 0);
arg.csumoffset = offsetof(struct tcphdr, check) / 2;
if (oif)
arg.bound_dev_if = oif;
arg.tos = tos;
arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_mark : sk->sk_mark;
ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_priority : sk->sk_priority;
transmit_time = tcp_transmit_time(sk);
ip_send_unicast_reply(ctl_sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
&arg, arg.iov[0].iov_len,
transmit_time);
ctl_sk->sk_mark = 0;
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
local_bh_enable();
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
struct inet_timewait_sock *tw = inet_twsk(sk);
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
tcp_v4_send_ack(sk, skb,
tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
tcp_time_stamp_raw() + tcptw->tw_ts_offset,
tcptw->tw_ts_recent,
tw->tw_bound_dev_if,
tcp_twsk_md5_key(tcptw),
tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
tw->tw_tos
);
inet_twsk_put(tw);
}
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req)
{
const union tcp_md5_addr *addr;
int l3index;
/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
* sk->sk_state == TCP_SYN_RECV -> for Fast Open.
*/
u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
tcp_sk(sk)->snd_nxt;
/* RFC 7323 2.3
* The window field (SEG.WND) of every outgoing segment, with the
* exception of <SYN> segments, MUST be right-shifted by
* Rcv.Wind.Shift bits:
*/
addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
tcp_v4_send_ack(sk, skb, seq,
tcp_rsk(req)->rcv_nxt,
req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
READ_ONCE(req->ts_recent),
0,
tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
ip_hdr(skb)->tos);
}
/*
* Send a SYN-ACK after having received a SYN.
* This still operates on a request_sock only, not on a big
* socket.
*/
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
enum tcp_synack_type synack_type,
struct sk_buff *syn_skb)
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct flowi4 fl4;
int err = -1;
struct sk_buff *skb;
u8 tos;
/* First, grab a route. */
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
return -1;
skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
if (skb) {
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
(inet_sk(sk)->tos & INET_ECN_MASK) :
inet_sk(sk)->tos;
if (!INET_ECN_is_capable(tos) &&
tcp_bpf_ca_needs_ecn((struct sock *)req))
tos |= INET_ECN_ECT_0;
rcu_read_lock();
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
ireq->ir_rmt_addr,
rcu_dereference(ireq->ireq_opt),
tos);
rcu_read_unlock();
err = net_xmit_eval(err);
}
return err;
}
/*
* IPv4 request_sock destructor.
*/
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}
#ifdef CONFIG_TCP_MD5SIG
/*
* RFC2385 MD5 checksumming requires a mapping of
* IP address->MD5 Key.
* We need to maintain these in the sk structure.
*/
DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);
static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
if (!old)
return true;
/* l3index always overrides non-l3index */
if (old->l3index && new->l3index == 0)
return false;
if (old->l3index == 0 && new->l3index)
return true;
return old->prefixlen < new->prefixlen;
}
/* Find the Key structure for an address. */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
const union tcp_md5_addr *addr,
int family)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
const struct tcp_md5sig_info *md5sig;
__be32 mask;
struct tcp_md5sig_key *best_match = NULL;
bool match;
/* caller either holds rcu_read_lock() or socket lock */
md5sig = rcu_dereference_check(tp->md5sig_info,
lockdep_sock_is_held(sk));
if (!md5sig)
return NULL;
hlist_for_each_entry_rcu(key, &md5sig->head, node,
lockdep_sock_is_held(sk)) {
if (key->family != family)
continue;
if (key->l3index && key->l3index != l3index)
continue;
if (family == AF_INET) {
mask = inet_make_mask(key->prefixlen);
match = (key->addr.a4.s_addr & mask) ==
(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
} else if (family == AF_INET6) {
match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
key->prefixlen);
#endif
} else {
match = false;
}
if (match && better_md5_match(best_match, key))
best_match = key;
}
return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
const union tcp_md5_addr *addr,
int family, u8 prefixlen,
int l3index)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
unsigned int size = sizeof(struct in_addr);
const struct tcp_md5sig_info *md5sig;
/* caller either holds rcu_read_lock() or socket lock */
md5sig = rcu_dereference_check(tp->md5sig_info,
lockdep_sock_is_held(sk));
if (!md5sig)
return NULL;
#if IS_ENABLED(CONFIG_IPV6)
if (family == AF_INET6)
size = sizeof(struct in6_addr);
#endif
hlist_for_each_entry_rcu(key, &md5sig->head, node,
lockdep_sock_is_held(sk)) {
if (key->family != family)
continue;
if (key->l3index != l3index)
continue;
if (!memcmp(&key->addr, addr, size) &&
key->prefixlen == prefixlen)
return key;
}
return NULL;
}
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
const struct sock *addr_sk)
{
const union tcp_md5_addr *addr;
int l3index;
l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
addr_sk->sk_bound_dev_if);
addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);
/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
int family, u8 prefixlen, int l3index,
const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
/* Add Key to the list */
struct tcp_md5sig_key *key;
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_info *md5sig;
key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
if (key) {
/* Pre-existing entry - just update that one.
* Note that the key might be used concurrently.
* data_race() is telling kcsan that we do not care of
* key mismatches, since changing MD5 key on live flows
* can lead to packet drops.
*/
data_race(memcpy(key->key, newkey, newkeylen));
/* Pairs with READ_ONCE() in tcp_md5_hash_key().
* Also note that a reader could catch new key->keylen value
* but old key->key[], this is the reason we use __GFP_ZERO
* at sock_kmalloc() time below these lines.
*/
WRITE_ONCE(key->keylen, newkeylen);
return 0;
}
md5sig = rcu_dereference_protected(tp->md5sig_info,
lockdep_sock_is_held(sk));
if (!md5sig) {
md5sig = kmalloc(sizeof(*md5sig), gfp);
if (!md5sig)
return -ENOMEM;
sk_nocaps_add(sk, NETIF_F_GSO_MASK);
INIT_HLIST_HEAD(&md5sig->head);
rcu_assign_pointer(tp->md5sig_info, md5sig);
}
key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
if (!key)
return -ENOMEM;
if (!tcp_alloc_md5sig_pool()) {
sock_kfree_s(sk, key, sizeof(*key));
return -ENOMEM;
}
memcpy(key->key, newkey, newkeylen);
key->keylen = newkeylen;
key->family = family;
key->prefixlen = prefixlen;
key->l3index = l3index;
memcpy(&key->addr, addr,
(family == AF_INET6) ? sizeof(struct in6_addr) :
sizeof(struct in_addr));
hlist_add_head_rcu(&key->node, &md5sig->head);
return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
u8 prefixlen, int l3index)
{
struct tcp_md5sig_key *key;
key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
if (!key)
return -ENOENT;
hlist_del_rcu(&key->node);
atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
kfree_rcu(key, rcu);
return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);
static void tcp_clear_md5_list(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
struct hlist_node *n;
struct tcp_md5sig_info *md5sig;
md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
hlist_del_rcu(&key->node);
atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
kfree_rcu(key, rcu);
}
}
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
sockptr_t optval, int optlen)
{
struct tcp_md5sig cmd;
struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
const union tcp_md5_addr *addr;
u8 prefixlen = 32;
int l3index = 0;
if (optlen < sizeof(cmd))
return -EINVAL;
if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
return -EFAULT;
if (sin->sin_family != AF_INET)
return -EINVAL;
if (optname == TCP_MD5SIG_EXT &&
cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
prefixlen = cmd.tcpm_prefixlen;
if (prefixlen > 32)
return -EINVAL;
}
if (optname == TCP_MD5SIG_EXT &&
cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
struct net_device *dev;
rcu_read_lock();
dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
if (dev && netif_is_l3_master(dev))
l3index = dev->ifindex;
rcu_read_unlock();
/* ok to reference set/not set outside of rcu;
* right now device MUST be an L3 master
*/
if (!dev || !l3index)
return -EINVAL;
}
addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
if (!cmd.tcpm_keylen)
return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
return -EINVAL;
return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
__be32 daddr, __be32 saddr,
const struct tcphdr *th, int nbytes)
{
struct tcp4_pseudohdr *bp;
struct scatterlist sg;
struct tcphdr *_th;
bp = hp->scratch;
bp->saddr = saddr;
bp->daddr = daddr;
bp->pad = 0;
bp->protocol = IPPROTO_TCP;
bp->len = cpu_to_be16(nbytes);
_th = (struct tcphdr *)(bp + 1);
memcpy(_th, th, sizeof(*th));
_th->check = 0;
sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
ahash_request_set_crypt(hp->md5_req, &sg, NULL,
sizeof(*bp) + sizeof(*th));
return crypto_ahash_update(hp->md5_req);
}
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
__be32 daddr, __be32 saddr, const struct tcphdr *th)
{
struct tcp_md5sig_pool *hp;
struct ahash_request *req;
hp = tcp_get_md5sig_pool();
if (!hp)
goto clear_hash_noput;
req = hp->md5_req;
if (crypto_ahash_init(req))
goto clear_hash;
if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
goto clear_hash;
if (tcp_md5_hash_key(hp, key))
goto clear_hash;
ahash_request_set_crypt(req, NULL, md5_hash, 0);
if (crypto_ahash_final(req))
goto clear_hash;
tcp_put_md5sig_pool();
return 0;
clear_hash:
tcp_put_md5sig_pool();
clear_hash_noput:
memset(md5_hash, 0, 16);
return 1;
}
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
const struct sock *sk,
const struct sk_buff *skb)
{
struct tcp_md5sig_pool *hp;
struct ahash_request *req;
const struct tcphdr *th = tcp_hdr(skb);
__be32 saddr, daddr;
if (sk) { /* valid for establish/request sockets */
saddr = sk->sk_rcv_saddr;
daddr = sk->sk_daddr;
} else {
const struct iphdr *iph = ip_hdr(skb);
saddr = iph->saddr;
daddr = iph->daddr;
}
hp = tcp_get_md5sig_pool();
if (!hp)
goto clear_hash_noput;
req = hp->md5_req;
if (crypto_ahash_init(req))
goto clear_hash;
if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
goto clear_hash;
if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
goto clear_hash;
if (tcp_md5_hash_key(hp, key))
goto clear_hash;
ahash_request_set_crypt(req, NULL, md5_hash, 0);
if (crypto_ahash_final(req))
goto clear_hash;
tcp_put_md5sig_pool();
return 0;
clear_hash:
tcp_put_md5sig_pool();
clear_hash_noput:
memset(md5_hash, 0, 16);
return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
#endif
/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
const struct sk_buff *skb,
int dif, int sdif)
{
#ifdef CONFIG_TCP_MD5SIG
/*
* This gets called for each TCP segment that arrives
* so we want to be efficient.
* We have 3 drop cases:
* o No MD5 hash and one expected.
* o MD5 hash and we're not expecting one.
* o MD5 hash and its wrong.
*/
const __u8 *hash_location = NULL;
struct tcp_md5sig_key *hash_expected;
const struct iphdr *iph = ip_hdr(skb);
const struct tcphdr *th = tcp_hdr(skb);
const union tcp_md5_addr *addr;
unsigned char newhash[16];
int genhash, l3index;
/* sdif set, means packet ingressed via a device
* in an L3 domain and dif is set to the l3mdev
*/
l3index = sdif ? dif : 0;
addr = (union tcp_md5_addr *)&iph->saddr;
hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
hash_location = tcp_parse_md5sig_option(th);
/* We've parsed the options - do we have a hash? */
if (!hash_expected && !hash_location)
return false;
if (hash_expected && !hash_location) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
return true;
}
if (!hash_expected && hash_location) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
return true;
}
/* Okay, so this is hash_expected and hash_location -
* so we need to calculate the checksum.
*/
genhash = tcp_v4_md5_hash_skb(newhash,
hash_expected,
NULL, skb);
if (genhash || memcmp(hash_location, newhash, 16) != 0) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
&iph->saddr, ntohs(th->source),
&iph->daddr, ntohs(th->dest),
genhash ? " tcp_v4_calc_md5_hash failed"
: "", l3index);
return true;
}
return false;
#endif
return false;
}
static void tcp_v4_init_req(struct request_sock *req,
const struct sock *sk_listener,
struct sk_buff *skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
struct net *net = sock_net(sk_listener);
sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
struct flowi *fl,
const struct request_sock *req)
{
return inet_csk_route_req(sk, &fl->u.ip4, req);
}
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
.family = PF_INET,
.obj_size = sizeof(struct tcp_request_sock),
.rtx_syn_ack = tcp_rtx_synack,
.send_ack = tcp_v4_reqsk_send_ack,
.destructor = tcp_v4_reqsk_destructor,
.send_reset = tcp_v4_send_reset,
.syn_ack_timeout = tcp_syn_ack_timeout,
};
const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
.mss_clamp = TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
.req_md5_lookup = tcp_v4_md5_lookup,
.calc_md5_hash = tcp_v4_md5_hash_skb,
#endif
.init_req = tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
.cookie_init_seq = cookie_v4_init_sequence,
#endif
.route_req = tcp_v4_route_req,
.init_seq = tcp_v4_init_seq,
.init_ts_off = tcp_v4_init_ts_off,
.send_synack = tcp_v4_send_synack,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
/* Never answer to SYNs send to broadcast or multicast */
if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
goto drop;
return tcp_conn_request(&tcp_request_sock_ops,
&tcp_request_sock_ipv4_ops, sk, skb);
drop:
tcp_listendrop(sk);
return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
/*
* The three way handshake has completed - we got a valid synack -
* now create the new socket.
*/
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst,
struct request_sock *req_unhash,
bool *own_req)
{
struct inet_request_sock *ireq;
bool found_dup_sk = false;
struct inet_sock *newinet;
struct tcp_sock *newtp;
struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
const union tcp_md5_addr *addr;
struct tcp_md5sig_key *key;
int l3index;
#endif
struct ip_options_rcu *inet_opt;
if (sk_acceptq_is_full(sk))
goto exit_overflow;
newsk = tcp_create_openreq_child(sk, req, skb);
if (!newsk)
goto exit_nonewsk;
newsk->sk_gso_type = SKB_GSO_TCPV4;
inet_sk_rx_dst_set(newsk, skb);
newtp = tcp_sk(newsk);
newinet = inet_sk(newsk);
ireq = inet_rsk(req);
sk_daddr_set(newsk, ireq->ir_rmt_addr);
sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
newsk->sk_bound_dev_if = ireq->ir_iif;
newinet->inet_saddr = ireq->ir_loc_addr;
inet_opt = rcu_dereference(ireq->ireq_opt);
RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
newinet->mc_index = inet_iif(skb);
newinet->mc_ttl = ip_hdr(skb)->ttl;
newinet->rcv_tos = ip_hdr(skb)->tos;
inet_csk(newsk)->icsk_ext_hdr_len = 0;
if (inet_opt)
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
newinet->inet_id = prandom_u32();
/* Set ToS of the new socket based upon the value of incoming SYN.
* ECT bits are set later in tcp_init_transfer().
*/
if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
if (!dst) {
dst = inet_csk_route_child_sock(sk, newsk, req);
if (!dst)
goto put_and_exit;
} else {
/* syncookie case : see end of cookie_v4_check() */
}
sk_setup_caps(newsk, dst);
tcp_ca_openreq_child(newsk, dst);
tcp_sync_mss(newsk, dst_mtu(dst));
newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
tcp_initialize_rcv_mss(newsk);
#ifdef CONFIG_TCP_MD5SIG
l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
/* Copy over the MD5 key from the original socket */
addr = (union tcp_md5_addr *)&newinet->inet_daddr;
key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
if (key) {
/*
* We're using one, so create a matching key
* on the newsk structure. If we fail to get
* memory, then we end up not copying the key
* across. Shucks.
*/
tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
key->key, key->keylen, GFP_ATOMIC);
sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
}
#endif
if (__inet_inherit_port(sk, newsk) < 0)
goto put_and_exit;
*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
&found_dup_sk);
if (likely(*own_req)) {
tcp_move_syn(newtp, req);
ireq->ireq_opt = NULL;
} else {
newinet->inet_opt = NULL;
if (!req_unhash && found_dup_sk) {
/* This code path should only be executed in the
* syncookie case only
*/
bh_unlock_sock(newsk);
sock_put(newsk);
newsk = NULL;
}
}
return newsk;
exit_overflow:
NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
dst_release(dst);
exit:
tcp_listendrop(sk);
return NULL;
put_and_exit:
newinet->inet_opt = NULL;
inet_csk_prepare_forced_close(newsk);
tcp_done(newsk);
goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
const struct tcphdr *th = tcp_hdr(skb);
if (!th->syn)
sk = cookie_v4_check(sk, skb);
#endif
return sk;
}
u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
struct tcphdr *th, u32 *cookie)
{
u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
&tcp_request_sock_ipv4_ops, sk, th);
if (mss) {
*cookie = __cookie_v4_init_sequence(iph, th, &mss);
tcp_synq_overflow(sk);
}
#endif
return mss;
}
/* The socket must have it's spinlock held when we get
* here, unless it is a TCP_LISTEN socket.
*
* We have a potential double-lock case here, so even when
* doing backlog processing we use the BH locking scheme.
* This is because we cannot sleep with the original spinlock
* held.
*/
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
struct sock *rsk;
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
struct dst_entry *dst;
dst = rcu_dereference_protected(sk->sk_rx_dst,
lockdep_sock_is_held(sk));
sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb);
if (dst) {
if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
!dst->ops->check(dst, 0)) {
RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
dst_release(dst);
}
}
tcp_rcv_established(sk, skb);
return 0;
}
if (tcp_checksum_complete(skb))
goto csum_err;
if (sk->sk_state == TCP_LISTEN) {
struct sock *nsk = tcp_v4_cookie_check(sk, skb);
if (!nsk)
goto discard;
if (nsk != sk) {
if (tcp_child_process(sk, nsk, skb)) {
rsk = nsk;
goto reset;
}
return 0;
}
} else
sock_rps_save_rxhash(sk, skb);
if (tcp_rcv_state_process(sk, skb)) {
rsk = sk;
goto reset;
}
return 0;
reset:
tcp_v4_send_reset(rsk, skb);
discard:
kfree_skb(skb);
/* Be careful here. If this function gets more complicated and
* gcc suffers from register pressure on the x86, sk (in %ebx)
* might be destroyed here. This current version compiles correctly,
* but you have been warned.
*/
return 0;
csum_err:
TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
int tcp_v4_early_demux(struct sk_buff *skb)
{
const struct iphdr *iph;
const struct tcphdr *th;
struct sock *sk;
if (skb->pkt_type != PACKET_HOST)
return 0;
if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
return 0;
iph = ip_hdr(skb);
th = tcp_hdr(skb);
if (th->doff < sizeof(struct tcphdr) / 4)
return 0;
sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
iph->saddr, th->source,
iph->daddr, ntohs(th->dest),
skb->skb_iif, inet_sdif(skb));
if (sk) {
skb->sk = sk;
skb->destructor = sock_edemux;
if (sk_fullsock(sk)) {
struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
if (dst)
dst = dst_check(dst, 0);
if (dst &&
inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
skb_dst_set_noref(skb, dst);
}
}
return 0;
}
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
u32 limit, tail_gso_size, tail_gso_segs;
struct skb_shared_info *shinfo;
const struct tcphdr *th;
struct tcphdr *thtail;
struct sk_buff *tail;
unsigned int hdrlen;
bool fragstolen;
u32 gso_segs;
u32 gso_size;
int delta;
/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
* we can fix skb->truesize to its real value to avoid future drops.
* This is valid because skb is not yet charged to the socket.
* It has been noticed pure SACK packets were sometimes dropped
* (if cooked by drivers without copybreak feature).
*/
skb_condense(skb);
skb_dst_drop(skb);
if (unlikely(tcp_checksum_complete(skb))) {
bh_unlock_sock(sk);
__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
return true;
}
/* Attempt coalescing to last skb in backlog, even if we are
* above the limits.
* This is okay because skb capacity is limited to MAX_SKB_FRAGS.
*/
th = (const struct tcphdr *)skb->data;
hdrlen = th->doff * 4;
tail = sk->sk_backlog.tail;
if (!tail)
goto no_coalesce;
thtail = (struct tcphdr *)tail->data;
if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
((TCP_SKB_CB(tail)->tcp_flags |
TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
!((TCP_SKB_CB(tail)->tcp_flags &
TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
((TCP_SKB_CB(tail)->tcp_flags ^
TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
tail->decrypted != skb->decrypted ||
#endif
thtail->doff != th->doff ||
memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
goto no_coalesce;
__skb_pull(skb, hdrlen);
shinfo = skb_shinfo(skb);
gso_size = shinfo->gso_size ?: skb->len;
gso_segs = shinfo->gso_segs ?: 1;
shinfo = skb_shinfo(tail);
tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
tail_gso_segs = shinfo->gso_segs ?: 1;
if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
thtail->window = th->window;
}
/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
* thtail->fin, so that the fast path in tcp_rcv_established()
* is not entered if we append a packet with a FIN.
* SYN, RST, URG are not present.
* ACK is set on both packets.
* PSH : we do not really care in TCP stack,
* at least for 'GRO' packets.
*/
thtail->fin |= th->fin;
TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
if (TCP_SKB_CB(skb)->has_rxtstamp) {
TCP_SKB_CB(tail)->has_rxtstamp = true;
tail->tstamp = skb->tstamp;
skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
}
/* Not as strict as GRO. We only need to carry mss max value */
shinfo->gso_size = max(gso_size, tail_gso_size);
shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
sk->sk_backlog.len += delta;
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPBACKLOGCOALESCE);
kfree_skb_partial(skb, fragstolen);
return false;
}
__skb_push(skb, hdrlen);
no_coalesce:
limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
/* Only socket owner can try to collapse/prune rx queues
* to reduce memory overhead, so add a little headroom here.
* Few sockets backlog are possibly concurrently non empty.
*/
limit += 64 * 1024;
if (unlikely(sk_add_backlog(sk, skb, limit))) {
bh_unlock_sock(sk);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
return true;
}
return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
struct tcphdr *th = (struct tcphdr *)skb->data;
return sk_filter_trim_cap(sk, skb, th->doff * 4);
}
EXPORT_SYMBOL(tcp_filter);
static void tcp_v4_restore_cb(struct sk_buff *skb)
{
memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
sizeof(struct inet_skb_parm));
}
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
const struct tcphdr *th)
{
/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
* barrier() makes sure compiler wont play fool^Waliasing games.
*/
memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
sizeof(struct inet_skb_parm));
barrier();
TCP_SKB_CB(skb)->seq = ntohl(th->seq);
TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
skb->len - th->doff * 4);
TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
TCP_SKB_CB(skb)->tcp_tw_isn = 0;
TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
TCP_SKB_CB(skb)->sacked = 0;
TCP_SKB_CB(skb)->has_rxtstamp =
skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
/*
* From tcp_input.c
*/
int tcp_v4_rcv(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
struct sk_buff *skb_to_free;
int sdif = inet_sdif(skb);
int dif = inet_iif(skb);
const struct iphdr *iph;
const struct tcphdr *th;
bool refcounted;
struct sock *sk;
int ret;
if (skb->pkt_type != PACKET_HOST)
goto discard_it;
/* Count it even if it's bad */
__TCP_INC_STATS(net, TCP_MIB_INSEGS);
if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
goto discard_it;
th = (const struct tcphdr *)skb->data;
if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
goto bad_packet;
if (!pskb_may_pull(skb, th->doff * 4))
goto discard_it;
/* An explanation is required here, I think.
* Packet length and doff are validated by header prediction,
* provided case of th->doff==0 is eliminated.
* So, we defer the checks. */
if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
goto csum_error;
th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
lookup:
sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
th->dest, sdif, &refcounted);
if (!sk)
goto no_tcp_socket;
process:
if (sk->sk_state == TCP_TIME_WAIT)
goto do_time_wait;
if (sk->sk_state == TCP_NEW_SYN_RECV) {
struct request_sock *req = inet_reqsk(sk);
bool req_stolen = false;
struct sock *nsk;
sk = req->rsk_listener;
if (unlikely(!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb) ||
tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
sk_drops_add(sk, skb);
reqsk_put(req);
goto discard_it;
}
if (tcp_checksum_complete(skb)) {
reqsk_put(req);
goto csum_error;
}
if (unlikely(sk->sk_state != TCP_LISTEN)) {
inet_csk_reqsk_queue_drop_and_put(sk, req);
goto lookup;
}
/* We own a reference on the listener, increase it again
* as we might lose it too soon.
*/
sock_hold(sk);
refcounted = true;
nsk = NULL;
if (!tcp_filter(sk, skb)) {
th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
tcp_v4_fill_cb(skb, iph, th);
nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
}
if (!nsk) {
reqsk_put(req);
if (req_stolen) {
/* Another cpu got exclusive access to req
* and created a full blown socket.
* Try to feed this packet to this socket
* instead of discarding it.
*/
tcp_v4_restore_cb(skb);
sock_put(sk);
goto lookup;
}
goto discard_and_relse;
}
nf_reset_ct(skb);
if (nsk == sk) {
reqsk_put(req);
tcp_v4_restore_cb(skb);
} else if (tcp_child_process(sk, nsk, skb)) {
tcp_v4_send_reset(nsk, skb);
goto discard_and_relse;
} else {
sock_put(sk);
return 0;
}
}
if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse;
}
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
goto discard_and_relse;
nf_reset_ct(skb);
if (tcp_filter(sk, skb))
goto discard_and_relse;
th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
tcp_v4_fill_cb(skb, iph, th);
skb->dev = NULL;
if (sk->sk_state == TCP_LISTEN) {
ret = tcp_v4_do_rcv(sk, skb);
goto put_and_return;
}
sk_incoming_cpu_update(sk);
bh_lock_sock_nested(sk);
tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
if (!sock_owned_by_user(sk)) {
skb_to_free = sk->sk_rx_skb_cache;
sk->sk_rx_skb_cache = NULL;
ret = tcp_v4_do_rcv(sk, skb);
} else {
if (tcp_add_backlog(sk, skb))
goto discard_and_relse;
skb_to_free = NULL;
}
bh_unlock_sock(sk);
if (skb_to_free)
__kfree_skb(skb_to_free);
put_and_return:
if (refcounted)
sock_put(sk);
return ret;
no_tcp_socket:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard_it;
tcp_v4_fill_cb(skb, iph, th);
if (tcp_checksum_complete(skb)) {
csum_error:
__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
tcp_v4_send_reset(NULL, skb);
}
discard_it:
/* Discard frame. */
kfree_skb(skb);
return 0;
discard_and_relse:
sk_drops_add(sk, skb);
if (refcounted)
sock_put(sk);
goto discard_it;
do_time_wait:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
inet_twsk_put(inet_twsk(sk));
goto discard_it;
}
tcp_v4_fill_cb(skb, iph, th);
if (tcp_checksum_complete(skb)) {
inet_twsk_put(inet_twsk(sk));
goto csum_error;
}
switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
case TCP_TW_SYN: {
struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
&tcp_hashinfo, skb,
__tcp_hdrlen(th),
iph->saddr, th->source,
iph->daddr, th->dest,
inet_iif(skb),
sdif);
if (sk2) {
inet_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
tcp_v4_restore_cb(skb);
refcounted = false;
goto process;
}
}
/* to ACK */
fallthrough;
case TCP_TW_ACK:
tcp_v4_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
tcp_v4_send_reset(sk, skb);
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;
case TCP_TW_SUCCESS:;
}
goto discard_it;
}
static struct timewait_sock_ops tcp_timewait_sock_ops = {
.twsk_obj_size = sizeof(struct tcp_timewait_sock),
.twsk_unique = tcp_twsk_unique,
.twsk_destructor= tcp_twsk_destructor,
};
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
if (dst && dst_hold_safe(dst)) {
rcu_assign_pointer(sk->sk_rx_dst, dst);
inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);
const struct inet_connection_sock_af_ops ipv4_specific = {
.queue_xmit = ip_queue_xmit,
.send_check = tcp_v4_send_check,
.rebuild_header = inet_sk_rebuild_header,
.sk_rx_dst_set = inet_sk_rx_dst_set,
.conn_request = tcp_v4_conn_request,
.syn_recv_sock = tcp_v4_syn_recv_sock,
.net_header_len = sizeof(struct iphdr),
.setsockopt = ip_setsockopt,
.getsockopt = ip_getsockopt,
.addr2sockaddr = inet_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in),
.mtu_reduced = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
.md5_lookup = tcp_v4_md5_lookup,
.calc_md5_hash = tcp_v4_md5_hash_skb,
.md5_parse = tcp_v4_parse_md5_keys,
};
#endif
/* NOTE: A lot of things set to zero explicitly by call to
* sk_alloc() so need not be done here.
*/
static int tcp_v4_init_sock(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
tcp_init_sock(sk);
icsk->icsk_af_ops = &ipv4_specific;
#ifdef CONFIG_TCP_MD5SIG
tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif
return 0;
}
void tcp_v4_destroy_sock(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
trace_tcp_destroy_sock(sk);
tcp_clear_xmit_timers(sk);
tcp_cleanup_congestion_control(sk);
tcp_cleanup_ulp(sk);
/* Cleanup up the write buffer. */
tcp_write_queue_purge(sk);
/* Check if we want to disable active TFO */
tcp_fastopen_active_disable_ofo_check(sk);
/* Cleans up our, hopefully empty, out_of_order_queue. */
skb_rbtree_purge(&tp->out_of_order_queue);
#ifdef CONFIG_TCP_MD5SIG
/* Clean up the MD5 key list, if any */
if (tp->md5sig_info) {
tcp_clear_md5_list(sk);
kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
tp->md5sig_info = NULL;
}
#endif
/* Clean up a referenced TCP bind bucket. */
if (inet_csk(sk)->icsk_bind_hash)
inet_put_port(sk);
BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
/* If socket is aborted during connect operation */
tcp_free_fastopen_req(tp);
tcp_fastopen_destroy_cipher(sk);
tcp_saved_syn_free(tp);
sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */
/*
* Get next listener socket follow cur. If cur is NULL, get first socket
* starting from bucket given in st->bucket; when st->bucket is zero the
* very first socket in the hash table is returned.
*/
static void *listening_get_next(struct seq_file *seq, void *cur)
{
struct tcp_seq_afinfo *afinfo;
struct tcp_iter_state *st = seq->private;
struct net *net = seq_file_net(seq);
struct inet_listen_hashbucket *ilb;
struct hlist_nulls_node *node;
struct sock *sk = cur;
if (st->bpf_seq_afinfo)
afinfo = st->bpf_seq_afinfo;
else
afinfo = PDE_DATA(file_inode(seq->file));
if (!sk) {
get_head:
ilb = &tcp_hashinfo.listening_hash[st->bucket];
spin_lock(&ilb->lock);
sk = sk_nulls_head(&ilb->nulls_head);
st->offset = 0;
goto get_sk;
}
ilb = &tcp_hashinfo.listening_hash[st->bucket];
++st->num;
++st->offset;
sk = sk_nulls_next(sk);
get_sk:
sk_nulls_for_each_from(sk, node) {
if (!net_eq(sock_net(sk), net))
continue;
if (afinfo->family == AF_UNSPEC ||
sk->sk_family == afinfo->family)
return sk;
}
spin_unlock(&ilb->lock);
st->offset = 0;
if (++st->bucket < INET_LHTABLE_SIZE)
goto get_head;
return NULL;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
struct tcp_iter_state *st = seq->private;
void *rc;
st->bucket = 0;
st->offset = 0;
rc = listening_get_next(seq, NULL);
while (rc && *pos) {
rc = listening_get_next(seq, rc);
--*pos;
}
return rc;
}
static inline bool empty_bucket(const struct tcp_iter_state *st)
{
return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}
/*
* Get first established socket starting from bucket given in st->bucket.
* If st->bucket is zero, the very first socket in the hash is returned.
*/
static void *established_get_first(struct seq_file *seq)
{
struct tcp_seq_afinfo *afinfo;
struct tcp_iter_state *st = seq->private;
struct net *net = seq_file_net(seq);
void *rc = NULL;
if (st->bpf_seq_afinfo)
afinfo = st->bpf_seq_afinfo;
else
afinfo = PDE_DATA(file_inode(seq->file));
st->offset = 0;
for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
struct sock *sk;
struct hlist_nulls_node *node;
spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
/* Lockless fast path for the common case of empty buckets */
if (empty_bucket(st))
continue;
spin_lock_bh(lock);
sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
if ((afinfo->family != AF_UNSPEC &&
sk->sk_family != afinfo->family) ||
!net_eq(sock_net(sk), net)) {
continue;
}
rc = sk;
goto out;
}
spin_unlock_bh(lock);
}
out:
return rc;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
struct tcp_seq_afinfo *afinfo;
struct sock *sk = cur;
struct hlist_nulls_node *node;
struct tcp_iter_state *st = seq->private;
struct net *net = seq_file_net(seq);
if (st->bpf_seq_afinfo)
afinfo = st->bpf_seq_afinfo;
else
afinfo = PDE_DATA(file_inode(seq->file));
++st->num;
++st->offset;
sk = sk_nulls_next(sk);
sk_nulls_for_each_from(sk, node) {
if ((afinfo->family == AF_UNSPEC ||
sk->sk_family == afinfo->family) &&
net_eq(sock_net(sk), net))
return sk;
}
spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
++st->bucket;
return established_get_first(seq);
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
struct tcp_iter_state *st = seq->private;
void *rc;
st->bucket = 0;
rc = established_get_first(seq);
while (rc && pos) {
rc = established_get_next(seq, rc);
--pos;
}
return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
void *rc;
struct tcp_iter_state *st = seq->private;
st->state = TCP_SEQ_STATE_LISTENING;
rc = listening_get_idx(seq, &pos);
if (!rc) {
st->state = TCP_SEQ_STATE_ESTABLISHED;
rc = established_get_idx(seq, pos);
}
return rc;
}
static void *tcp_seek_last_pos(struct seq_file *seq)
{
struct tcp_iter_state *st = seq->private;
int bucket = st->bucket;
int offset = st->offset;
int orig_num = st->num;
void *rc = NULL;
switch (st->state) {
case TCP_SEQ_STATE_LISTENING:
if (st->bucket >= INET_LHTABLE_SIZE)
break;
st->state = TCP_SEQ_STATE_LISTENING;
rc = listening_get_next(seq, NULL);
while (offset-- && rc && bucket == st->bucket)
rc = listening_get_next(seq, rc);
if (rc)
break;
st->bucket = 0;
st->state = TCP_SEQ_STATE_ESTABLISHED;
fallthrough;
case TCP_SEQ_STATE_ESTABLISHED:
if (st->bucket > tcp_hashinfo.ehash_mask)
break;
rc = established_get_first(seq);
while (offset-- && rc && bucket == st->bucket)
rc = established_get_next(seq, rc);
}
st->num = orig_num;
return rc;
}
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
struct tcp_iter_state *st = seq->private;
void *rc;
if (*pos && *pos == st->last_pos) {
rc = tcp_seek_last_pos(seq);
if (rc)
goto out;
}
st->state = TCP_SEQ_STATE_LISTENING;
st->num = 0;
st->bucket = 0;
st->offset = 0;
rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
out:
st->last_pos = *pos;
return rc;
}
EXPORT_SYMBOL(tcp_seq_start);
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct tcp_iter_state *st = seq->private;
void *rc = NULL;
if (v == SEQ_START_TOKEN) {
rc = tcp_get_idx(seq, 0);
goto out;
}
switch (st->state) {
case TCP_SEQ_STATE_LISTENING:
rc = listening_get_next(seq, v);
if (!rc) {
st->state = TCP_SEQ_STATE_ESTABLISHED;
st->bucket = 0;
st->offset = 0;
rc = established_get_first(seq);
}
break;
case TCP_SEQ_STATE_ESTABLISHED:
rc = established_get_next(seq, v);
break;
}
out:
++*pos;
st->last_pos = *pos;
return rc;
}
EXPORT_SYMBOL(tcp_seq_next);
void tcp_seq_stop(struct seq_file *seq, void *v)
{
struct tcp_iter_state *st = seq->private;
switch (st->state) {
case TCP_SEQ_STATE_LISTENING:
if (v != SEQ_START_TOKEN)
spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
break;
case TCP_SEQ_STATE_ESTABLISHED:
if (v)
spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
break;
}
}
EXPORT_SYMBOL(tcp_seq_stop);
static void get_openreq4(const struct request_sock *req,
struct seq_file *f, int i)
{
const struct inet_request_sock *ireq = inet_rsk(req);
long delta = req->rsk_timer.expires - jiffies;
seq_printf(f, "%4d: %08X:%04X %08X:%04X"
" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
i,
ireq->ir_loc_addr,
ireq->ir_num,
ireq->ir_rmt_addr,
ntohs(ireq->ir_rmt_port),
TCP_SYN_RECV,
0, 0, /* could print option size, but that is af dependent. */
1, /* timers active (only the expire timer) */
jiffies_delta_to_clock_t(delta),
req->num_timeout,
from_kuid_munged(seq_user_ns(f),
sock_i_uid(req->rsk_listener)),
0, /* non standard timer */
0, /* open_requests have no inode */
0,
req);
}
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
int timer_active;
unsigned long timer_expires;
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct inet_sock *inet = inet_sk(sk);
const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
__be32 dest = inet->inet_daddr;
__be32 src = inet->inet_rcv_saddr;
__u16 destp = ntohs(inet->inet_dport);
__u16 srcp = ntohs(inet->inet_sport);
int rx_queue;
int state;
if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1;
timer_expires = icsk->icsk_timeout;
} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
timer_active = 4;
timer_expires = icsk->icsk_timeout;
} else if (timer_pending(&sk->sk_timer)) {
timer_active = 2;
timer_expires = sk->sk_timer.expires;
} else {
timer_active = 0;
timer_expires = jiffies;
}
state = inet_sk_state_load(sk);
if (state == TCP_LISTEN)
rx_queue = READ_ONCE(sk->sk_ack_backlog);
else
/* Because we don't lock the socket,
* we might find a transient negative value.
*/
rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
READ_ONCE(tp->copied_seq), 0);
seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
i, src, srcp, dest, destp, state,
READ_ONCE(tp->write_seq) - tp->snd_una,
rx_queue,
timer_active,
jiffies_delta_to_clock_t(timer_expires - jiffies),
icsk->icsk_retransmits,
from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
icsk->icsk_probes_out,
sock_i_ino(sk),
refcount_read(&sk->sk_refcnt), sk,
jiffies_to_clock_t(icsk->icsk_rto),
jiffies_to_clock_t(icsk->icsk_ack.ato),
(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
tp->snd_cwnd,
state == TCP_LISTEN ?
fastopenq->max_qlen :
(tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
struct seq_file *f, int i)
{
long delta = tw->tw_timer.expires - jiffies;
__be32 dest, src;
__u16 destp, srcp;
dest = tw->tw_daddr;
src = tw->tw_rcv_saddr;
destp = ntohs(tw->tw_dport);
srcp = ntohs(tw->tw_sport);
seq_printf(f, "%4d: %08X:%04X %08X:%04X"
" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
refcount_read(&tw->tw_refcnt), tw);
}
#define TMPSZ 150
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
struct tcp_iter_state *st;
struct sock *sk = v;
seq_setwidth(seq, TMPSZ - 1);
if (v == SEQ_START_TOKEN) {
seq_puts(seq, " sl local_address rem_address st tx_queue "
"rx_queue tr tm->when retrnsmt uid timeout "
"inode");
goto out;
}
st = seq->private;
if (sk->sk_state == TCP_TIME_WAIT)
get_timewait4_sock(v, seq, st->num);
else if (sk->sk_state == TCP_NEW_SYN_RECV)
get_openreq4(v, seq, st->num);
else
get_tcp4_sock(v, seq, st->num);
out:
seq_pad(seq, '\n');
return 0;
}
#ifdef CONFIG_BPF_SYSCALL
struct bpf_iter__tcp {
__bpf_md_ptr(struct bpf_iter_meta *, meta);
__bpf_md_ptr(struct sock_common *, sk_common);
uid_t uid __aligned(8);
};
static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
struct sock_common *sk_common, uid_t uid)
{
struct bpf_iter__tcp ctx;
meta->seq_num--; /* skip SEQ_START_TOKEN */
ctx.meta = meta;
ctx.sk_common = sk_common;
ctx.uid = uid;
return bpf_iter_run_prog(prog, &ctx);
}
static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
struct bpf_iter_meta meta;
struct bpf_prog *prog;
struct sock *sk = v;
uid_t uid;
if (v == SEQ_START_TOKEN)
return 0;
if (sk->sk_state == TCP_TIME_WAIT) {
uid = 0;
} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
const struct request_sock *req = v;
uid = from_kuid_munged(seq_user_ns(seq),
sock_i_uid(req->rsk_listener));
} else {
uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
}
meta.seq = seq;
prog = bpf_iter_get_info(&meta, false);
return tcp_prog_seq_show(prog, &meta, v, uid);
}
static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
struct bpf_iter_meta meta;
struct bpf_prog *prog;
if (!v) {
meta.seq = seq;
prog = bpf_iter_get_info(&meta, true);
if (prog)
(void)tcp_prog_seq_show(prog, &meta, v, 0);
}
tcp_seq_stop(seq, v);
}
static const struct seq_operations bpf_iter_tcp_seq_ops = {
.show = bpf_iter_tcp_seq_show,
.start = tcp_seq_start,
.next = tcp_seq_next,
.stop = bpf_iter_tcp_seq_stop,
};
#endif
static const struct seq_operations tcp4_seq_ops = {
.show = tcp4_seq_show,
.start = tcp_seq_start,
.next = tcp_seq_next,
.stop = tcp_seq_stop,
};
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
.family = AF_INET,
};
static int __net_init tcp4_proc_init_net(struct net *net)
{
if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
return -ENOMEM;
return 0;
}
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
remove_proc_entry("tcp", net->proc_net);
}
static struct pernet_operations tcp4_net_ops = {
.init = tcp4_proc_init_net,
.exit = tcp4_proc_exit_net,
};
int __init tcp4_proc_init(void)
{
return register_pernet_subsys(&tcp4_net_ops);
}
void tcp4_proc_exit(void)
{
unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
.close = tcp_close,
.pre_connect = tcp_v4_pre_connect,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock,
.destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
.keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v4_do_rcv,
.release_cb = tcp_release_cb,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
.leave_memory_pressure = tcp_leave_memory_pressure,
.stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
.slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
.no_autobind = true,
.diag_destroy = tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
static void __net_exit tcp_sk_exit(struct net *net)
{
int cpu;
if (net->ipv4.tcp_congestion_control)
bpf_module_put(net->ipv4.tcp_congestion_control,
net->ipv4.tcp_congestion_control->owner);
for_each_possible_cpu(cpu)
inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
free_percpu(net->ipv4.tcp_sk);
}
static int __net_init tcp_sk_init(struct net *net)
{
int res, cpu, cnt;
net->ipv4.tcp_sk = alloc_percpu(struct sock *);
if (!net->ipv4.tcp_sk)
return -ENOMEM;
for_each_possible_cpu(cpu) {
struct sock *sk;
res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
IPPROTO_TCP, net);
if (res)
goto fail;
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
/* Please enforce IP_DF and IPID==0 for RST and
* ACK sent in SYN-RECV and TIME-WAIT state.
*/
inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
}
net->ipv4.sysctl_tcp_ecn = 2;
net->ipv4.sysctl_tcp_ecn_fallback = 1;
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
net->ipv4.sysctl_tcp_syncookies = 1;
net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
net->ipv4.sysctl_tcp_orphan_retries = 0;
net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
net->ipv4.sysctl_tcp_tw_reuse = 2;
net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
cnt = tcp_hashinfo.ehash_mask + 1;
net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
net->ipv4.sysctl_tcp_sack = 1;
net->ipv4.sysctl_tcp_window_scaling = 1;
net->ipv4.sysctl_tcp_timestamps = 1;
net->ipv4.sysctl_tcp_early_retrans = 3;
net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
net->ipv4.sysctl_tcp_retrans_collapse = 1;
net->ipv4.sysctl_tcp_max_reordering = 300;
net->ipv4.sysctl_tcp_dsack = 1;
net->ipv4.sysctl_tcp_app_win = 31;
net->ipv4.sysctl_tcp_adv_win_scale = 1;
net->ipv4.sysctl_tcp_frto = 2;
net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
/* This limits the percentage of the congestion window which we
* will allow a single TSO frame to consume. Building TSO frames
* which are too large can cause TCP streams to be bursty.
*/
net->ipv4.sysctl_tcp_tso_win_divisor = 3;
/* Default TSQ limit of 16 TSO segments */
net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
/* rfc5961 challenge ack rate limiting */
net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
net->ipv4.sysctl_tcp_min_tso_segs = 2;
net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
net->ipv4.sysctl_tcp_autocorking = 1;
net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
if (net != &init_net) {
memcpy(net->ipv4.sysctl_tcp_rmem,
init_net.ipv4.sysctl_tcp_rmem,
sizeof(init_net.ipv4.sysctl_tcp_rmem));
memcpy(net->ipv4.sysctl_tcp_wmem,
init_net.ipv4.sysctl_tcp_wmem,
sizeof(init_net.ipv4.sysctl_tcp_wmem));
}
net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
net->ipv4.sysctl_tcp_comp_sack_nr = 44;
net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
atomic_set(&net->ipv4.tfo_active_disable_times, 0);
/* Reno is always built in */
if (!net_eq(net, &init_net) &&
bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
init_net.ipv4.tcp_congestion_control->owner))
net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
else
net->ipv4.tcp_congestion_control = &tcp_reno;
return 0;
fail:
tcp_sk_exit(net);
return res;
}
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
struct net *net;
inet_twsk_purge(&tcp_hashinfo, AF_INET);
list_for_each_entry(net, net_exit_list, exit_list)
tcp_fastopen_ctx_destroy(net);
}
static struct pernet_operations __net_initdata tcp_sk_ops = {
.init = tcp_sk_init,
.exit = tcp_sk_exit,
.exit_batch = tcp_sk_exit_batch,
};
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
struct sock_common *sk_common, uid_t uid)
static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
struct tcp_iter_state *st = priv_data;
struct tcp_seq_afinfo *afinfo;
int ret;
afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
if (!afinfo)
return -ENOMEM;
afinfo->family = AF_UNSPEC;
st->bpf_seq_afinfo = afinfo;
ret = bpf_iter_init_seq_net(priv_data, aux);
if (ret)
kfree(afinfo);
return ret;
}
static void bpf_iter_fini_tcp(void *priv_data)
{
struct tcp_iter_state *st = priv_data;
kfree(st->bpf_seq_afinfo);
bpf_iter_fini_seq_net(priv_data);
}
static const struct bpf_iter_seq_info tcp_seq_info = {
.seq_ops = &bpf_iter_tcp_seq_ops,
.init_seq_private = bpf_iter_init_tcp,
.fini_seq_private = bpf_iter_fini_tcp,
.seq_priv_size = sizeof(struct tcp_iter_state),
};
static struct bpf_iter_reg tcp_reg_info = {
.target = "tcp",
.ctx_arg_info_size = 1,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__tcp, sk_common),
PTR_TO_BTF_ID_OR_NULL },
},
.seq_info = &tcp_seq_info,
};
static void __init bpf_iter_register(void)
{
tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
if (bpf_iter_reg_target(&tcp_reg_info))
pr_warn("Warning: could not register bpf iterator tcp\n");
}
#endif
void __init tcp_v4_init(void)
{
if (register_pernet_subsys(&tcp_sk_ops))
panic("Failed to create the TCP control socket.\n");
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
bpf_iter_register();
#endif
}