Merge tag 'v5.1' into next

Sync up with mainline to bring in the latest APIs.
Dmitry Torokhov
2019-05-10 11:43:46 -07:00
19305 changed files with 1063275 additions and 560062 deletions


@@ -72,8 +72,14 @@ ForEachMacros:
- 'apei_estatus_for_each_section'
- 'ata_for_each_dev'
- 'ata_for_each_link'
- '__ata_qc_for_each'
- 'ata_qc_for_each'
- 'ata_qc_for_each_raw'
- 'ata_qc_for_each_with_internal'
- 'ax25_for_each'
- 'ax25_uid_for_each'
- '__bio_for_each_bvec'
- 'bio_for_each_bvec'
- 'bio_for_each_integrity_vec'
- '__bio_for_each_segment'
- 'bio_for_each_segment'
@@ -85,6 +91,7 @@ ForEachMacros:
- 'blk_queue_for_each_rl'
- 'bond_for_each_slave'
- 'bond_for_each_slave_rcu'
- 'bpf_for_each_spilled_reg'
- 'btree_for_each_safe128'
- 'btree_for_each_safe32'
- 'btree_for_each_safe64'
@@ -103,6 +110,8 @@ ForEachMacros:
- 'drm_atomic_crtc_for_each_plane'
- 'drm_atomic_crtc_state_for_each_plane'
- 'drm_atomic_crtc_state_for_each_plane_state'
- 'drm_atomic_for_each_plane_damage'
- 'drm_connector_for_each_possible_encoder'
- 'drm_for_each_connector_iter'
- 'drm_for_each_crtc'
- 'drm_for_each_encoder'
@@ -111,21 +120,33 @@ ForEachMacros:
- 'drm_for_each_legacy_plane'
- 'drm_for_each_plane'
- 'drm_for_each_plane_mask'
- 'drm_for_each_privobj'
- 'drm_mm_for_each_hole'
- 'drm_mm_for_each_node'
- 'drm_mm_for_each_node_in_range'
- 'drm_mm_for_each_node_safe'
- 'flow_action_for_each'
- 'for_each_active_drhd_unit'
- 'for_each_active_iommu'
- 'for_each_available_child_of_node'
- 'for_each_bio'
- 'for_each_board_func_rsrc'
- 'for_each_bvec'
- 'for_each_card_components'
- 'for_each_card_links'
- 'for_each_card_links_safe'
- 'for_each_card_prelinks'
- 'for_each_card_rtds'
- 'for_each_card_rtds_safe'
- 'for_each_cgroup_storage_type'
- 'for_each_child_of_node'
- 'for_each_clear_bit'
- 'for_each_clear_bit_from'
- 'for_each_cmsghdr'
- 'for_each_compatible_node'
- 'for_each_component_dais'
- 'for_each_component_dais_safe'
- 'for_each_comp_order'
- 'for_each_console'
- 'for_each_cpu'
- 'for_each_cpu_and'
@@ -133,10 +154,17 @@ ForEachMacros:
- 'for_each_cpu_wrap'
- 'for_each_dev_addr'
- 'for_each_dma_cap_mask'
- 'for_each_dpcm_be'
- 'for_each_dpcm_be_rollback'
- 'for_each_dpcm_be_safe'
- 'for_each_dpcm_fe'
- 'for_each_drhd_unit'
- 'for_each_dss_dev'
- 'for_each_efi_memory_desc'
- 'for_each_efi_memory_desc_in_map'
- 'for_each_element'
- 'for_each_element_extid'
- 'for_each_element_id'
- 'for_each_endpoint_of_node'
- 'for_each_evictable_lru'
- 'for_each_fib6_node_rt_rcu'
@@ -149,6 +177,7 @@ ForEachMacros:
- 'for_each_iommu'
- 'for_each_ip_tunnel_rcu'
- 'for_each_irq_nr'
- 'for_each_link_codecs'
- 'for_each_lru'
- 'for_each_matching_node'
- 'for_each_matching_node_and_match'
@@ -160,6 +189,7 @@ ForEachMacros:
- 'for_each_mem_range_rev'
- 'for_each_migratetype_order'
- 'for_each_msi_entry'
- 'for_each_msi_entry_safe'
- 'for_each_net'
- 'for_each_netdev'
- 'for_each_netdev_continue'
@@ -172,6 +202,7 @@ ForEachMacros:
- 'for_each_net_rcu'
- 'for_each_new_connector_in_state'
- 'for_each_new_crtc_in_state'
- 'for_each_new_mst_mgr_in_state'
- 'for_each_new_plane_in_state'
- 'for_each_new_private_obj_in_state'
- 'for_each_node'
@@ -183,12 +214,16 @@ ForEachMacros:
- 'for_each_node_with_property'
- 'for_each_of_allnodes'
- 'for_each_of_allnodes_from'
- 'for_each_of_cpu_node'
- 'for_each_of_pci_range'
- 'for_each_old_connector_in_state'
- 'for_each_old_crtc_in_state'
- 'for_each_old_mst_mgr_in_state'
- 'for_each_oldnew_connector_in_state'
- 'for_each_oldnew_crtc_in_state'
- 'for_each_oldnew_mst_mgr_in_state'
- 'for_each_oldnew_plane_in_state'
- 'for_each_oldnew_plane_in_state_reverse'
- 'for_each_oldnew_private_obj_in_state'
- 'for_each_old_plane_in_state'
- 'for_each_old_private_obj_in_state'
@@ -206,14 +241,21 @@ ForEachMacros:
- 'for_each_process'
- 'for_each_process_thread'
- 'for_each_property_of_node'
- 'for_each_registered_fb'
- 'for_each_reserved_mem_region'
- 'for_each_resv_unavail_range'
- 'for_each_rtd_codec_dai'
- 'for_each_rtd_codec_dai_rollback'
- 'for_each_rtdcom'
- 'for_each_rtdcom_safe'
- 'for_each_set_bit'
- 'for_each_set_bit_from'
- 'for_each_sg'
- 'for_each_sg_dma_page'
- 'for_each_sg_page'
- 'for_each_sibling_event'
- 'for_each_subelement'
- 'for_each_subelement_extid'
- 'for_each_subelement_id'
- '__for_each_thread'
- 'for_each_thread'
- 'for_each_zone'
@@ -223,6 +265,8 @@ ForEachMacros:
- 'fwnode_for_each_child_node'
- 'fwnode_graph_for_each_endpoint'
- 'gadget_for_each_ep'
- 'genradix_for_each'
- 'genradix_for_each_from'
- 'hash_for_each'
- 'hash_for_each_possible'
- 'hash_for_each_possible_rcu'
@@ -251,6 +295,8 @@ ForEachMacros:
- 'hlist_nulls_for_each_entry_from'
- 'hlist_nulls_for_each_entry_rcu'
- 'hlist_nulls_for_each_entry_safe'
- 'i3c_bus_for_each_i2cdev'
- 'i3c_bus_for_each_i3cdev'
- 'ide_host_for_each_port'
- 'ide_port_for_each_dev'
- 'ide_port_for_each_present_dev'
@@ -259,19 +305,25 @@ ForEachMacros:
- 'idr_for_each_entry_ul'
- 'inet_bind_bucket_for_each'
- 'inet_lhash2_for_each_icsk_rcu'
- 'iov_for_each'
- 'key_for_each'
- 'key_for_each_safe'
- 'klp_for_each_func'
- 'klp_for_each_func_safe'
- 'klp_for_each_func_static'
- 'klp_for_each_object'
- 'klp_for_each_object_safe'
- 'klp_for_each_object_static'
- 'kvm_for_each_memslot'
- 'kvm_for_each_vcpu'
- 'list_for_each'
- 'list_for_each_codec'
- 'list_for_each_codec_safe'
- 'list_for_each_entry'
- 'list_for_each_entry_continue'
- 'list_for_each_entry_continue_rcu'
- 'list_for_each_entry_continue_reverse'
- 'list_for_each_entry_from'
- 'list_for_each_entry_from_rcu'
- 'list_for_each_entry_from_reverse'
- 'list_for_each_entry_lockless'
- 'list_for_each_entry_rcu'
@@ -291,6 +343,9 @@ ForEachMacros:
- 'media_device_for_each_intf'
- 'media_device_for_each_link'
- 'media_device_for_each_pad'
- 'mp_bvec_for_each_page'
- 'mp_bvec_for_each_segment'
- 'nanddev_io_for_each_page'
- 'netdev_for_each_lower_dev'
- 'netdev_for_each_lower_private'
- 'netdev_for_each_lower_private_rcu'
@@ -326,6 +381,7 @@ ForEachMacros:
- 'radix_tree_for_each_slot'
- 'radix_tree_for_each_tagged'
- 'rbtree_postorder_for_each_entry_safe'
- 'rdma_for_each_port'
- 'resource_list_for_each_entry'
- 'resource_list_for_each_entry_safe'
- 'rhl_for_each_entry_rcu'
@@ -340,6 +396,7 @@ ForEachMacros:
- 'rht_for_each_rcu'
- 'rht_for_each_rcu_continue'
- '__rq_for_each_bio'
- 'rq_for_each_bvec'
- 'rq_for_each_segment'
- 'scsi_for_each_prot_sg'
- 'scsi_for_each_sg'
@@ -357,12 +414,14 @@ ForEachMacros:
- 'sk_nulls_for_each'
- 'sk_nulls_for_each_from'
- 'sk_nulls_for_each_rcu'
- 'snd_array_for_each'
- 'snd_pcm_group_for_each_entry'
- 'snd_soc_dapm_widget_for_each_path'
- 'snd_soc_dapm_widget_for_each_path_safe'
- 'snd_soc_dapm_widget_for_each_sink_path'
- 'snd_soc_dapm_widget_for_each_source_path'
- 'tb_property_for_each'
- 'tcf_exts_for_each_action'
- 'udp_portaddr_for_each_entry'
- 'udp_portaddr_for_each_entry_rcu'
- 'usb_hub_for_each_child'
@@ -371,6 +430,13 @@ ForEachMacros:
- 'v4l2_m2m_for_each_dst_buf_safe'
- 'v4l2_m2m_for_each_src_buf'
- 'v4l2_m2m_for_each_src_buf_safe'
- 'virtio_device_for_each_vq'
- 'xa_for_each'
- 'xa_for_each_marked'
- 'xa_for_each_start'
- 'xas_for_each'
- 'xas_for_each_conflict'
- 'xas_for_each_marked'
- 'zorro_for_each_dev'
#IncludeBlocks: Preserve # Unknown to clang-format-5.0

.gitignore

@@ -15,6 +15,7 @@
*.bin
*.bz2
*.c.[012]*.*
*.dt.yaml
*.dtb
*.dtb.S
*.dwo


@@ -36,9 +36,10 @@ Bart Van Assche <bvanassche@acm.org> <bart.vanassche@sandisk.com>
Ben Gardner <bgardner@wabtec.com>
Ben M Cahill <ben.m.cahill@intel.com>
Björn Steinbrink <B.Steinbrink@gmx.de>
Boris Brezillon <boris.brezillon@bootlin.com> <boris.brezillon@free-electrons.com>
Boris Brezillon <boris.brezillon@bootlin.com> <b.brezillon.dev@gmail.com>
Boris Brezillon <boris.brezillon@bootlin.com> <b.brezillon@overkiz.com>
Boris Brezillon <bbrezillon@kernel.org> <boris.brezillon@bootlin.com>
Boris Brezillon <bbrezillon@kernel.org> <boris.brezillon@free-electrons.com>
Boris Brezillon <bbrezillon@kernel.org> <b.brezillon.dev@gmail.com>
Boris Brezillon <bbrezillon@kernel.org> <b.brezillon@overkiz.com>
Brian Avery <b.avery@hp.com>
Brian King <brking@us.ibm.com>
Christoph Hellwig <hch@lst.de>
@@ -47,7 +48,10 @@ Corey Minyard <minyard@acm.org>
Damian Hobson-Garcia <dhobsong@igel.co.jp>
David Brownell <david-b@pacbell.net>
David Woodhouse <dwmw2@shinybook.infradead.org>
Deng-Cheng Zhu <dengcheng.zhu@mips.com> <dengcheng.zhu@imgtec.com>
Dengcheng Zhu <dzhu@wavecomp.com> <dengcheng.zhu@mips.com>
Dengcheng Zhu <dzhu@wavecomp.com> <dengcheng.zhu@imgtec.com>
Dengcheng Zhu <dzhu@wavecomp.com> <dczhu@mips.com>
Dengcheng Zhu <dzhu@wavecomp.com> <dengcheng.zhu@gmail.com>
Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Domen Puncer <domen@coderock.org>
Douglas Gilbert <dougg@torque.net>
@@ -119,6 +123,7 @@ Mark Brown <broonie@sirena.org.uk>
Mark Yao <markyao0591@gmail.com> <mark.yao@rock-chips.com>
Martin Kepplinger <martink@posteo.de> <martin.kepplinger@theobroma-systems.com>
Martin Kepplinger <martink@posteo.de> <martin.kepplinger@ginzinger.com>
Mathieu Othacehe <m.othacehe@gmail.com>
Matthew Wilcox <willy@infradead.org> <matthew.r.wilcox@intel.com>
Matthew Wilcox <willy@infradead.org> <matthew@wil.cx>
Matthew Wilcox <willy@infradead.org> <mawilcox@linuxonhyperv.com>
@@ -151,6 +156,8 @@ Morten Welinder <welinder@darter.rentec.com>
Morten Welinder <welinder@troll.com>
Mythri P K <mythripk@ti.com>
Nguyen Anh Quynh <aquynh@gmail.com>
Nicolas Pitre <nico@fluxnic.net> <nicolas.pitre@linaro.org>
Nicolas Pitre <nico@fluxnic.net> <nico@linaro.org>
Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Patrick Mochel <mochel@digitalimplant.org>
Paul Burton <paul.burton@mips.com> <paul.burton@imgtec.com>
@@ -219,3 +226,5 @@ Yakir Yang <kuankuan.y@gmail.com> <ykk@rock-chips.com>
Yusuke Goda <goda.yusuke@renesas.com>
Gustavo Padovan <gustavo@las.ic.unicamp.br>
Gustavo Padovan <padovan@profusion.mobi>
Changbin Du <changbin.du@intel.com> <changbin.du@intel.com>
Changbin Du <changbin.du@intel.com> <changbin.du@gmail.com>

CREDITS

@@ -842,10 +842,9 @@ D: ax25-utils maintainer.
N: Helge Deller
E: deller@gmx.de
E: hdeller@redhat.de
D: PA-RISC Linux hacker, LASI-, ASP-, WAX-, LCD/LED-driver
S: Schimmelsrain 1
S: D-69231 Rauenberg
W: http://www.parisc-linux.org/
D: PA-RISC Linux architecture maintainer
D: LASI-, ASP-, WAX-, LCD/LED-driver
S: Germany
N: Jean Delvare
@@ -1222,7 +1221,7 @@ S: Brazil
N: Oded Gabbay
E: oded.gabbay@gmail.com
D: AMD KFD maintainer
D: HabanaLabs and AMD KFD maintainer
S: 12 Shraga Raphaeli
S: Petah-Tikva, 4906418
S: Israel
@@ -1361,7 +1360,7 @@ S: Stellenbosch, Western Cape
S: South Africa
N: Grant Grundler
E: grundler@parisc-linux.org
E: grantgrundler@gmail.com
W: http://obmouse.sourceforge.net/
W: http://www.parisc-linux.org/
D: obmouse - rewrote Olivier Florent's Omnibook 600 "pop-up" mouse driver
@@ -2208,6 +2207,12 @@ N: Christopher Li
E: sparse@chrisli.org
D: Sparse maintainer 2009 - 2018
N: Shaohua Li
D: Worked on many parts of the kernel, from core x86, ACPI, PCI, KVM, MM,
D: and much more. He was the maintainer of MD from 2016 to 2018. Shaohua
D: passed away late 2018, he will be greatly missed.
W: https://www.spinics.net/lists/raid/msg61993.html
N: Stephan Linz
E: linz@mazet.de
E: Stephan.Linz@gmx.de
@@ -2486,7 +2491,7 @@ S: Syracuse, New York 13206
S: USA
N: Kyle McMartin
E: kyle@parisc-linux.org
E: kyle@mcmartin.ca
D: Linux/PARISC hacker
D: AD1889 sound driver
S: Ottawa, Canada
@@ -3774,14 +3779,13 @@ S: 21513 Conradia Ct
S: Cupertino, CA 95014
S: USA
N: Thibaut Varene
E: T-Bone@parisc-linux.org
W: http://www.parisc-linux.org/~varenet/
P: 1024D/B7D2F063 E67C 0D43 A75E 12A5 BB1C FA2F 1E32 C3DA B7D2 F063
N: Thibaut Varène
E: hacks+kernel@slashdirt.org
W: http://hacks.slashdirt.org/
D: PA-RISC port minion, PDC and GSCPS2 drivers, debuglocks and other bits
D: Some ARM at91rm9200 bits, S1D13XXX FB driver, random patches here and there
D: AD1889 sound driver
S: Paris, France
S: France
N: Heikki Vatiainen
E: hessu@cs.tut.fi


@@ -0,0 +1,22 @@
What: /sys/class/dax/
Date: May, 2016
KernelVersion: v4.7
Contact: linux-nvdimm@lists.01.org
Description: Device DAX is the device-centric analogue of Filesystem
DAX (CONFIG_FS_DAX). It allows memory ranges to be
allocated and mapped without need of an intervening file
system. Device DAX is strict, precise and predictable.
Specifically this interface:
1/ Guarantees fault granularity with respect to a given
page size (pte, pmd, or pud) set at configuration time.
2/ Enforces deterministic behavior by being strict about
what fault scenarios are supported.
The /sys/class/dax/ interface enumerates all the
device-dax instances in the system. The ABI is
deprecated and will be removed after 2020. It is
replaced with the DAX bus interface /sys/bus/dax/ where
device-dax instances can be found under
/sys/bus/dax/devices/
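For illustration only (not part of this ABI file): a device-dax instance is consumed by mmap()ing its character device directly, which is what "allocated and mapped without need of an intervening file system" means in practice. A minimal userspace sketch, assuming a hypothetical /dev/dax0.0 instance whose fault granularity is 2 MiB:

	#include <fcntl.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		/* Hypothetical device-dax instance; alignment and size are device specific. */
		size_t len = 2UL << 20;			/* one 2 MiB fault-granularity unit */
		int fd = open("/dev/dax0.0", O_RDWR);
		char *p;

		if (fd < 0)
			return 1;
		p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (p == MAP_FAILED) {
			close(fd);
			return 1;
		}
		p[0] = 1;	/* direct load/store access to the mapped range */
		munmap(p, len);
		close(fd);
		return 0;
	}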


@@ -146,3 +146,36 @@ KernelVersion: 4.16
Contact: Stephen Hemminger <sthemmin@microsoft.com>
Description: Binary file created by uio_hv_generic for ring buffer
Users: Userspace drivers
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/intr_in_full
Date: February 2019
KernelVersion: 5.0
Contact: Michael Kelley <mikelley@microsoft.com>
Description: Number of guest to host interrupts caused by the inbound ring
buffer transitioning from full to not full while a packet is
waiting for buffer space to become available
Users: Debugging tools
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/intr_out_empty
Date: February 2019
KernelVersion: 5.0
Contact: Michael Kelley <mikelley@microsoft.com>
Description: Number of guest to host interrupts caused by the outbound ring
buffer transitioning from empty to not empty
Users: Debugging tools
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/out_full_first
Date: February 2019
KernelVersion: 5.0
Contact: Michael Kelley <mikelley@microsoft.com>
Description: Number of write operations that were the first to encounter an
outbound ring buffer full condition
Users: Debugging tools
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/out_full_total
Date: February 2019
KernelVersion: 5.0
Contact: Michael Kelley <mikelley@microsoft.com>
Description: Total number of write operations that encountered an outbound
ring buffer full condition
Users: Debugging tools


@@ -12,7 +12,6 @@ Description: This file shows ASIC health status. The possible values are:
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
cpld1_version
cpld2_version
Date: June 2018
KernelVersion: 4.19
Contact: Vadim Pasternak <vadimp@mellanox.com>
@@ -21,6 +20,40 @@ Description: These files show with which CPLD versions have been burned
The files are read only.
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
fan_dir
Date: December 2018
KernelVersion: 5.0
Contact: Vadim Pasternak <vadimp@mellanox.com>
Description: This file shows the system fans direction:
forward direction - relevant bit is set 0;
reversed direction - relevant bit is set 1.
The files are read only.
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
cpld3_version
Date: November 2018
KernelVersion: 5.0
Contact: Vadim Pasternak <vadimp@mellanox.com>
Description: These files show with which CPLD versions have been burned
on LED board.
The files are read only.
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
jtag_enable
Date: November 2018
KernelVersion: 5.0
Contact: Vadim Pasternak <vadimpmellanox.com>
Description: These files enable and disable the access to the JTAG domain.
By default access to the JTAG domain is disabled.
The file is read/write.
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/select_iio
Date: June 2018
KernelVersion: 4.19
@@ -76,3 +109,21 @@ Description: These files show the system reset cause, as following: power
reset cause.
The files are read only.
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
reset_comex_pwr_fail
reset_from_comex
reset_system
reset_voltmon_upgrade_fail
Date: November 2018
KernelVersion: 5.0
Contact: Vadim Pasternak <vadimp@mellanox.com>
Description: These files show the system reset cause, as follows: ComEx
power fail, reset from ComEx, system platform reset, reset
due to voltage monitor devices upgrade failure.
Value 1 in the file means this is the reset cause, 0 - otherwise.
Only one bit could be 1 at the same time, representing only
the last reset cause.
The files are read only.


@@ -0,0 +1,126 @@
What: /sys/kernel/debug/habanalabs/hl<n>/addr
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Sets the device address to be used for read or write through
PCI bar. The acceptable value is a string that starts with "0x"
What: /sys/kernel/debug/habanalabs/hl<n>/command_buffers
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Displays a list with information about the currently allocated
command buffers
What: /sys/kernel/debug/habanalabs/hl<n>/command_submission
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Displays a list with information about the currently active
command submissions
What: /sys/kernel/debug/habanalabs/hl<n>/command_submission_jobs
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Displays a list with detailed information about each JOB (CB) of
each active command submission
What: /sys/kernel/debug/habanalabs/hl<n>/data32
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Allows the root user to read or write directly through the
device's PCI bar. Writing to this file generates a write
transaction while reading from the file generates a read
transaction. This custom interface is needed (instead of using
the generic Linux user-space PCI mapping) because the DDR bar
is very small compared to the DDR memory and only the driver can
move the bar before and after the transaction
What: /sys/kernel/debug/habanalabs/hl<n>/device
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Enables the root user to set the device to specific state.
Valid values are "disable", "enable", "suspend", "resume".
User can read this property to see the valid values
What: /sys/kernel/debug/habanalabs/hl<n>/i2c_addr
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Sets I2C device address for I2C transaction that is generated
by the device's CPU
What: /sys/kernel/debug/habanalabs/hl<n>/i2c_bus
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Sets I2C bus address for I2C transaction that is generated by
the device's CPU
What: /sys/kernel/debug/habanalabs/hl<n>/i2c_data
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Triggers an I2C transaction that is generated by the device's
CPU. Writing to this file generates a write transaction while
reading from the file generates a read transaction
What: /sys/kernel/debug/habanalabs/hl<n>/i2c_reg
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Sets I2C register id for I2C transaction that is generated by
the device's CPU
What: /sys/kernel/debug/habanalabs/hl<n>/led0
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Sets the state of the first S/W led on the device
What: /sys/kernel/debug/habanalabs/hl<n>/led1
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Sets the state of the second S/W led on the device
What: /sys/kernel/debug/habanalabs/hl<n>/led2
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Sets the state of the third S/W led on the device
What: /sys/kernel/debug/habanalabs/hl<n>/mmu
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Displays the hop values and physical address for a given ASID
and virtual address. The user should write the ASID and VA into
the file and then read the file to get the result.
e.g. to display info about VA 0x1000 for ASID 1 you need to do:
echo "1 0x1000" > /sys/kernel/debug/habanalabs/hl0/mmu
What: /sys/kernel/debug/habanalabs/hl<n>/set_power_state
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Sets the PCI power state. Valid values are "1" for D0 and "2"
for D3Hot
What: /sys/kernel/debug/habanalabs/hl<n>/userptr
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Displays a list with information about the current user
pointers (user virtual addresses) that are pinned and mapped
to DMA addresses
What: /sys/kernel/debug/habanalabs/hl<n>/vm
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Displays a list with information about all the active virtual
address mappings per ASID


@@ -0,0 +1,23 @@
What: /sys/kernel/debug/wilco_ec/raw
Date: January 2019
KernelVersion: 5.1
Description:
Write and read raw mailbox commands to the EC.
For writing:
Bytes 0-1 indicate the message type:
00 F0 = Execute Legacy Command
00 F2 = Read/Write NVRAM Property
Byte 2 provides the command code
Bytes 3+ consist of the data passed in the request
At least three bytes are required, for the msg type and command,
with additional bytes optional for additional data.
Example:
// Request EC info type 3 (EC firmware build date)
$ echo 00 f0 38 00 03 00 > raw
// View the result. The decoded ASCII result "12/21/18" is
// included after the raw hex.
$ cat raw
00 31 32 2f 32 31 2f 31 38 00 38 00 01 00 2f 00 .12/21/18.8...


@@ -244,7 +244,7 @@ Description:
What: /sys/block/<disk>/queue/zoned
Date: September 2016
Contact: Damien Le Moal <damien.lemoal@hgst.com>
Contact: Damien Le Moal <damien.lemoal@wdc.com>
Description:
zoned indicates if the device is a zoned block device
and the zone model of the device if it is indeed zoned.
@@ -259,6 +259,14 @@ Description:
zone commands, they will be treated as regular block
devices and zoned will report "none".
What: /sys/block/<disk>/queue/nr_zones
Date: November 2018
Contact: Damien Le Moal <damien.lemoal@wdc.com>
Description:
nr_zones indicates the total number of zones of a zoned block
device ("host-aware" or "host-managed" zone model). For regular
block devices, the value is always 0.
What: /sys/block/<disk>/queue/chunk_sectors
Date: September 2016
Contact: Hannes Reinecke <hare@suse.com>
@@ -268,6 +276,15 @@ Description:
indicates the size in 512B sectors of the RAID volume
stripe segment. For a zoned block device, either
host-aware or host-managed, chunk_sectors indicates the
size of 512B sectors of the zones of the device, with
size in 512B sectors of the zones of the device, with
the eventual exception of the last zone of the device
which may be smaller.
What: /sys/block/<disk>/queue/io_timeout
Date: November 2018
Contact: Weiping Zhang <zhangweiping@didiglobal.com>
Description:
io_timeout is the request timeout in milliseconds. If a request
does not complete in this time then the block driver timeout
handler is invoked. That timeout handler can decide to retry
the request, to fail it or to start a device recovery strategy.


@@ -98,3 +98,42 @@ Description:
The backing_dev file is read-write and set up backing
device for zram to write incompressible pages.
For using, user should enable CONFIG_ZRAM_WRITEBACK.
What: /sys/block/zram<id>/idle
Date: November 2018
Contact: Minchan Kim <minchan@kernel.org>
Description:
The idle file is write-only and marks zram slots as idle.
If the system has debugfs mounted, the user can see which slots
are idle via /sys/kernel/debug/zram/zram<id>/block_state
What: /sys/block/zram<id>/writeback
Date: November 2018
Contact: Minchan Kim <minchan@kernel.org>
Description:
The writeback file is write-only and triggers idle and/or
huge page writeback to the backing device.
What: /sys/block/zram<id>/bd_stat
Date: November 2018
Contact: Minchan Kim <minchan@kernel.org>
Description:
The bd_stat file is read-only and represents backing device's
statistics (bd_count, bd_reads, bd_writes) in a format
similar to block layer statistics file format.
What: /sys/block/zram<id>/writeback_limit_enable
Date: November 2018
Contact: Minchan Kim <minchan@kernel.org>
Description:
The writeback_limit_enable file is read-write and enables or
disables the writeback_limit feature. "1" means enable the feature.
No limit ("0") is the initial state.
What: /sys/block/zram<id>/writeback_limit
Date: November 2018
Contact: Minchan Kim <minchan@kernel.org>
Description:
The writeback_limit file is read-write and specifies the maximum
amount of writeback ZRAM can do. The limit can be changed
at run time.


@@ -0,0 +1,146 @@
What: /sys/bus/i3c/devices/i3c-<bus-id>
KernelVersion: 5.0
Contact: linux-i3c@vger.kernel.org
Description:
An I3C bus. This directory will contain one sub-directory per
I3C device present on the bus.
What: /sys/bus/i3c/devices/i3c-<bus-id>/current_master
KernelVersion: 5.0
Contact: linux-i3c@vger.kernel.org
Description:
Expose the master that owns the bus (<bus-id>-<master-pid>) at
the time this file is read. Note that bus ownership can change
over time, so there's no guarantee that when the read() call
returns, the value returned is still valid.
What: /sys/bus/i3c/devices/i3c-<bus-id>/mode
KernelVersion: 5.0
Contact: linux-i3c@vger.kernel.org
Description:
I3C bus mode. Can be "pure", "mixed-fast" or "mixed-slow". See
the I3C specification for a detailed description of what each
of these modes implies.
What: /sys/bus/i3c/devices/i3c-<bus-id>/i3c_scl_frequency
KernelVersion: 5.0
Contact: linux-i3c@vger.kernel.org
Description:
The frequency (expressed in Hz) of the SCL signal when
operating in I3C SDR mode.
What: /sys/bus/i3c/devices/i3c-<bus-id>/i2c_scl_frequency
KernelVersion: 5.0
Contact: linux-i3c@vger.kernel.org
Description:
The frequency (expressed in Hz) of the SCL signal when
operating in I2C mode.
What: /sys/bus/i3c/devices/i3c-<bus-id>/dynamic_address
KernelVersion: 5.0
Contact: linux-i3c@vger.kernel.org
Description:
Dynamic address assigned to the master controller. This
address may change if the bus is re-initialized.
What: /sys/bus/i3c/devices/i3c-<bus-id>/bcr
KernelVersion: 5.0
Contact: linux-i3c@vger.kernel.org
Description:
BCR stands for Bus Characteristics Register and expresses the
device capabilities in terms of speed, maximum read/write
length, etc. See the I3C specification for more details.
This entry describes the BCR of the master controller driving
the bus.
What: /sys/bus/i3c/devices/i3c-<bus-id>/dcr
KernelVersion: 5.0
Contact: linux-i3c@vger.kernel.org
Description:
DCR stands for Device Characteristics Register and expresses the
device capabilities in terms of exposed features. See the I3C
specification for more details.
This entry describes the DCR of the master controller driving
the bus.
What: /sys/bus/i3c/devices/i3c-<bus-id>/pid
KernelVersion: 5.0
Contact: linux-i3c@vger.kernel.org
Description:
PID stands for Provisional ID and is used to uniquely identify
a device on a bus. This PID contains information about the
vendor, the part and an instance ID so that several devices of
the same type can be connected on the same bus.
See the I3C specification for more details.
This entry describes the PID of the master controller driving
the bus.
What: /sys/bus/i3c/devices/i3c-<bus-id>/hdrcap
KernelVersion: 5.0
Contact: linux-i3c@vger.kernel.org
Description:
Expose the HDR (High Data Rate) capabilities of a device.
Returns a list of supported HDR modes, each element separated
by a space. Modes can be "hdr-ddr", "hdr-tsp" and "hdr-tsl".
See the I3C specification for more details about these HDR
modes.
This entry describes the HDRCAP of the master controller
driving the bus.
What: /sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>
KernelVersion: 5.0
Contact: linux-i3c@vger.kernel.org
Description:
An I3C device present on I3C bus identified by <bus-id>. Note
that all devices are represented including the master driving
the bus.
What: /sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>/dynamic_address
KernelVersion: 5.0
Contact: linux-i3c@vger.kernel.org
Description:
Dynamic address assigned to device <bus-id>-<device-pid>. This
address may change if the bus is re-initialized.
What: /sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>/bcr
KernelVersion: 5.0
Contact: linux-i3c@vger.kernel.org
Description:
BCR stands for Bus Characteristics Register and expresses the
device capabilities in terms of speed, maximum read/write
length, etc. See the I3C specification for more details.
What: /sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>/dcr
KernelVersion: 5.0
Contact: linux-i3c@vger.kernel.org
Description:
DCR stands for Device Characteristics Register and expresses the
device capabilities in terms of exposed features. See the I3C
specification for more details.
What: /sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>/pid
KernelVersion: 5.0
Contact: linux-i3c@vger.kernel.org
Description:
PID stands for Provisional ID and is used to uniquely identify
a device on a bus. This PID contains information about the
vendor, the part and an instance ID so that several devices of
the same type can be connected on the same bus.
See the I3C specification for more details.
What: /sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>/hdrcap
KernelVersion: 5.0
Contact: linux-i3c@vger.kernel.org
Description:
Expose the HDR (High Data Rate) capabilities of a device.
Returns a list of supported HDR modes, each element separated
by a space. Modes can be "hdr-ddr", "hdr-tsp" and "hdr-tsl".
See the I3C specification for more details about these HDR
modes.
What: /sys/bus/i3c/devices/<bus-id>-<device-pid>
KernelVersion: 5.0
Contact: linux-i3c@vger.kernel.org
Description:
These directories are just symbolic links to
/sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>.


@@ -1554,6 +1554,10 @@ What: /sys/bus/iio/devices/iio:deviceX/in_concentration_raw
What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_raw
What: /sys/bus/iio/devices/iio:deviceX/in_concentration_co2_raw
What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_co2_raw
What: /sys/bus/iio/devices/iio:deviceX/in_concentration_ethanol_raw
What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_ethanol_raw
What: /sys/bus/iio/devices/iio:deviceX/in_concentration_h2_raw
What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_h2_raw
What: /sys/bus/iio/devices/iio:deviceX/in_concentration_voc_raw
What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_voc_raw
KernelVersion: 4.3
@@ -1685,3 +1689,18 @@ Contact: linux-iio@vger.kernel.org
Description:
Raw (unscaled) phase difference reading from channel Y
that can be processed to radians.
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm1_input
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm1_input
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm2p5_input
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm2p5_input
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm4_input
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm4_input
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm10_input
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm10_input
KernelVersion: 4.22
Contact: linux-iio@vger.kernel.org
Description:
Mass concentration reading of particulate matter in ug / m3.
pmX consists of particles with aerodynamic diameter less or
equal to X micrometers.


@@ -0,0 +1,28 @@
What: /sys/bus/iio/devices/iio:deviceX/start_cleaning
Date: December 2018
KernelVersion: 4.22
Contact: linux-iio@vger.kernel.org
Description:
Writing 1 starts sensor self cleaning. Internal fan accelerates
to its maximum speed and keeps spinning for about 10 seconds in
order to blow out accumulated dust.
What: /sys/bus/iio/devices/iio:deviceX/cleaning_period
Date: January 2019
KernelVersion: 5.1
Contact: linux-iio@vger.kernel.org
Description:
Sensor is capable of triggering self cleaning periodically.
Period can be changed by writing a new value here. Upon reading
the current one is returned. Units are seconds.
Writing 0 disables periodical self cleaning entirely.
What: /sys/bus/iio/devices/iio:deviceX/cleaning_period_available
Date: January 2019
KernelVersion: 5.1
Contact: linux-iio@vger.kernel.org
Description:
The range of available values in seconds represented as the
minimum value, the step and the maximum value, all enclosed in
square brackets.


@@ -3,11 +3,13 @@ Date: June 2015
KernelVersion: 4.3
Contact: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Description: (RW) Writes of 1 or 0 enable or disable trace output to this
output device. Reads return current status.
output device. Reads return current status. Requires that the
corresponding output port driver be loaded.
What: /sys/bus/intel_th/devices/<intel_th_id>-msc<msc-id>/port
Date: June 2015
KernelVersion: 4.3
Contact: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Description: (RO) Port number, corresponding to this output device on the
switch (GTH).
switch (GTH) or "unassigned" if the corresponding output
port driver is not loaded.


@@ -21,6 +21,15 @@ Description: Holds a comma separated list of device unique_ids that
If a device is authorized automatically during boot its
boot attribute is set to 1.
What: /sys/bus/thunderbolt/devices/.../domainX/iommu_dma_protection
Date: Mar 2019
KernelVersion: 4.21
Contact: thunderbolt-software@lists.01.org
Description: This attribute tells whether the system uses IOMMU
for DMA protection. Value of 1 means IOMMU is used, 0 means
it is not (DMA protection is solely based on Thunderbolt
security levels).
What: /sys/bus/thunderbolt/devices/.../domainX/security
Date: Sep 2017
KernelVersion: 4.13


@@ -186,7 +186,7 @@ Contact: Lan Tianyu <tianyu.lan@intel.com>
Description:
Some platforms provide usb port connect types through ACPI.
This attribute exposes this information to user space.
The file will read "hotplug", "wired" and "not used" if the
The file will read "hotplug", "hardwired" and "not used" if the
information is available, and "unknown" otherwise.
What: /sys/bus/usb/devices/.../(hub interface)/portX/location


@@ -0,0 +1,32 @@
What: /sys/class/chromeos/<ec-device-name>/flashinfo
Date: August 2015
KernelVersion: 4.2
Description:
Show the EC flash information.
What: /sys/class/chromeos/<ec-device-name>/kb_wake_angle
Date: March 2018
KernelVersion: 4.17
Description:
Control the keyboard wake lid angle. Values are between
0 and 360. This file will also show the keyboard wake lid
angle by querying the hardware.
What: /sys/class/chromeos/<ec-device-name>/reboot
Date: August 2015
KernelVersion: 4.2
Description:
Tell the EC to reboot in various ways. Options are:
"cancel": Cancel a pending reboot.
"ro": Jump to RO without rebooting.
"rw": Jump to RW without rebooting.
"cold": Cold reboot.
"disable-jump": Disable jump until next reboot.
"hibernate": Hibernate the EC.
"at-shutdown": Reboot after an AP shutdown.
What: /sys/class/chromeos/<ec-device-name>/version
Date: August 2015
KernelVersion: 4.2
Description:
Show the information about the EC software and hardware.


@@ -0,0 +1,74 @@
What: /sys/class/chromeos/<ec-device-name>/lightbar/brightness
Date: August 2015
KernelVersion: 4.2
Description:
Writing to this file adjusts the overall brightness of
the lightbar, separate from any color intensity. The
valid range is 0 (off) to 255 (maximum brightness).
What: /sys/class/chromeos/<ec-device-name>/lightbar/interval_msec
Date: August 2015
KernelVersion: 4.2
Description:
The lightbar is controlled by an embedded controller (EC),
which also manages the keyboard, battery charging, fans,
and other system hardware. To prevent unprivileged users
from interfering with the other EC functions, the rate at
which the lightbar control files can be read or written is
limited.
Reading this file will return the number of milliseconds
that must elapse between accessing any of the lightbar
functions through this interface. Going faster will simply
block until the necessary interval has lapsed. The interval
applies uniformly to all accesses of any kind by any user.
What: /sys/class/chromeos/<ec-device-name>/lightbar/led_rgb
Date: August 2015
KernelVersion: 4.2
Description:
This allows you to control each LED segment. If the
lightbar is already running one of the automatic
sequences, you probably won't see anything change because
your color setting will be almost immediately replaced.
To get useful results, you should stop the lightbar
sequence first.
The values written to this file are sets of four integers,
indicating LED, RED, GREEN, BLUE. The LED number is 0 to 3
to select a single segment, or 4 to set all four segments
to the same value at once. The RED, GREEN, and BLUE
numbers should be in the range 0 (off) to 255 (maximum).
You can update more than one segment at a time by writing
more than one set of four integers.
What: /sys/class/chromeos/<ec-device-name>/lightbar/program
Date: August 2015
KernelVersion: 4.2
Description:
This allows you to upload and run custom lightbar sequences.
What: /sys/class/chromeos/<ec-device-name>/lightbar/sequence
Date: August 2015
KernelVersion: 4.2
Description:
The Pixel lightbar has a number of built-in sequences
that it displays under various conditions, such as at
power on, shut down, or while running. Reading from this
file displays the current sequence that the lightbar is
displaying. Writing to this file allows you to change the
sequence.
What: /sys/class/chromeos/<ec-device-name>/lightbar/userspace_control
Date: August 2015
KernelVersion: 4.2
Description:
This allows you to take the control of the lightbar. This
prevents the kernel from going through its normal
sequences.
What: /sys/class/chromeos/<ec-device-name>/lightbar/version
Date: August 2015
KernelVersion: 4.2
Description:
Show the information about the lightbar version.


@@ -0,0 +1,6 @@
What: /sys/class/chromeos/<ec-device-name>/vbc/vboot_context
Date: October 2015
KernelVersion: 4.4
Description:
Read/write the verified boot context data included on a
small nvram space on some EC implementations.


@@ -7,55 +7,10 @@ Description:
timer. It can do gradual dimming and step change of brightness.
The pattern is given by a series of tuples, of brightness and
duration (ms). The LED is expected to traverse the series and
each brightness value for the specified duration. Duration of
0 means brightness should immediately change to new value, and
writing malformed pattern deactivates any active one.
duration (ms).
1. For gradual dimming, the dimming interval now is set as 50
milliseconds. So the tuple with duration less than dimming
interval (50ms) is treated as a step change of brightness,
i.e. the subsequent brightness will be applied without adding
intervening dimming intervals.
The gradual dimming format of the software pattern values should be:
"brightness_1 duration_1 brightness_2 duration_2 brightness_3
duration_3 ...". For example:
echo 0 1000 255 2000 > pattern
It will make the LED go gradually from zero-intensity to max (255)
intensity in 1000 milliseconds, then back to zero intensity in 2000
milliseconds:
LED brightness
^
255-| / \ / \ /
| / \ / \ /
| / \ / \ /
| / \ / \ /
0-| / \/ \/
+---0----1----2----3----4----5----6------------> time (s)
2. To make the LED go instantly from one brightness value to another,
we should use zero-time lengths (the brightness must be same as
the previous tuple's). So the format should be:
"brightness_1 duration_1 brightness_1 0 brightness_2 duration_2
brightness_2 0 ...". For example:
echo 0 1000 0 0 255 2000 255 0 > pattern
It will make the LED stay off for one second, then stay at max brightness
for two seconds:
LED brightness
^
255-| +---------+ +---------+
| | | | |
| | | | |
| | | | |
0-| -----+ +----+ +----
+---0----1----2----3----4----5----6------------> time (s)
The exact format is described in:
Documentation/devicetree/bindings/leds/leds-trigger-pattern.txt
What: /sys/class/leds/<led>/hw_pattern
Date: September 2018


@@ -49,3 +49,26 @@ Contact: Wim Van Sebroeck <wim@iguana.be>
Description:
It is a read only file. It is read to know about current
value of timeout programmed.
What: /sys/class/watchdog/watchdogn/pretimeout
Date: December 2016
Contact: Wim Van Sebroeck <wim@iguana.be>
Description:
It is a read only file. It specifies the time in seconds before
timeout when the pretimeout interrupt is delivered. Pretimeout
is an optional feature.
What: /sys/class/watchdog/watchdogn/pretimeout_available_governors
Date: February 2017
Contact: Wim Van Sebroeck <wim@iguana.be>
Description:
It is a read only file. It shows the pretimeout governors
available for this watchdog.
What: /sys/class/watchdog/watchdogn/pretimeout_governor
Date: February 2017
Contact: Wim Van Sebroeck <wim@iguana.be>
Description:
It is a read/write file. When read, the currently assigned
pretimeout governor is returned. When written, it sets
the pretimeout governor.


@@ -0,0 +1,10 @@
What: /sys/devices/.../software_node/
Date: January 2019
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Description:
This directory contains the details about the device that are
assigned in kernel (i.e. software), as opposed to the
firmware_node directory which contains the details that are
assigned for the device in firmware. The main attributes in the
directory will show the properties the device has, and the
relationship it has to some of the other devices.


@@ -145,6 +145,8 @@ What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/name
/sys/devices/system/cpu/cpuX/cpuidle/stateN/power
/sys/devices/system/cpu/cpuX/cpuidle/stateN/time
/sys/devices/system/cpu/cpuX/cpuidle/stateN/usage
/sys/devices/system/cpu/cpuX/cpuidle/stateN/above
/sys/devices/system/cpu/cpuX/cpuidle/stateN/below
Date: September 2007
KernelVersion: v2.6.24
Contact: Linux power management list <linux-pm@vger.kernel.org>
@@ -166,6 +168,11 @@ Description:
usage: (RO) Number of times this state was entered (a count).
above: (RO) Number of times this state was entered, but the
observed CPU idle duration was too short for it (a count).
below: (RO) Number of times this state was entered, but the
observed CPU idle duration was too long for it (a count).
What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/desc
Date: February 2008


@@ -0,0 +1,190 @@
What: /sys/class/habanalabs/hl<n>/armcp_kernel_ver
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Version of the Linux kernel running on the device's CPU
What: /sys/class/habanalabs/hl<n>/armcp_ver
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Version of the application running on the device's CPU
What: /sys/class/habanalabs/hl<n>/cpld_ver
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Version of the Device's CPLD F/W
What: /sys/class/habanalabs/hl<n>/device_type
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Displays the code name of the device according to its type.
The supported values are: "GOYA"
What: /sys/class/habanalabs/hl<n>/eeprom
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: A binary file attribute that contains the contents of the
on-board EEPROM
What: /sys/class/habanalabs/hl<n>/fuse_ver
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Displays the device's version from the eFuse
What: /sys/class/habanalabs/hl<n>/hard_reset
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Interface to trigger a hard-reset operation for the device.
Hard-reset will reset ALL internal components of the device
except for the PCI interface and the internal PLLs
What: /sys/class/habanalabs/hl<n>/hard_reset_cnt
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Displays how many times the device has undergone a hard-reset
operation since the driver was loaded
What: /sys/class/habanalabs/hl<n>/high_pll
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Allows the user to set the maximum clock frequency for MME, TPC
and IC when the power management profile is set to "automatic".
What: /sys/class/habanalabs/hl<n>/ic_clk
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Allows the user to set the maximum clock frequency of the
Interconnect fabric. Writes to this parameter affect the device
only when the power management profile is set to "manual" mode.
The device IC clock might be set to a lower value than the
maximum. The user should read the ic_clk_curr to see the actual
frequency value of the IC
What: /sys/class/habanalabs/hl<n>/ic_clk_curr
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Displays the current clock frequency of the Interconnect fabric
What: /sys/class/habanalabs/hl<n>/infineon_ver
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Version of the Device's power supply F/W code
What: /sys/class/habanalabs/hl<n>/max_power
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Allows the user to set the maximum power consumption of the
device in milliwatts.
What: /sys/class/habanalabs/hl<n>/mme_clk
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Allows the user to set the maximum clock frequency of the
MME compute engine. Writes to this parameter affect the device
only when the power management profile is set to "manual" mode.
The device MME clock might be set to a lower value than the
maximum. The user should read the mme_clk_curr to see the actual
frequency value of the MME
What: /sys/class/habanalabs/hl<n>/mme_clk_curr
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Displays the current clock frequency of the MME compute engine
What: /sys/class/habanalabs/hl<n>/pci_addr
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Displays the PCI address of the device. This is needed so the
user would be able to open a device based on its PCI address
What: /sys/class/habanalabs/hl<n>/pm_mng_profile
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Power management profile. Values are "auto", "manual". In "auto"
mode, the driver will set the maximum clock frequency to a high
value when a user-space process opens the device's file (unless
it was already opened by another process). The driver will set
the max clock frequency to a low value when there are no user
processes that are opened on the device's file. In "manual"
mode, the user sets the maximum clock frequency by writing to
ic_clk, mme_clk and tpc_clk
What: /sys/class/habanalabs/hl<n>/preboot_btl_ver
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Version of the device's preboot F/W code
What: /sys/class/habanalabs/hl<n>/soft_reset
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Interface to trigger a soft-reset operation for the device.
Soft-reset will reset only the compute and DMA engines of the
device
What: /sys/class/habanalabs/hl<n>/soft_reset_cnt
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Displays how many times the device has undergone a soft-reset
operation since the driver was loaded
What: /sys/class/habanalabs/hl<n>/status
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Status of the card: "Operational", "Malfunction", "In reset".
What: /sys/class/habanalabs/hl<n>/thermal_ver
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Version of the Device's thermal daemon
What: /sys/class/habanalabs/hl<n>/tpc_clk
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Allows the user to set the maximum clock frequency of the
TPC compute engines. Writes to this parameter affect the device
only when the power management profile is set to "manual" mode.
The device TPC clock might be set to a lower value than the
maximum. The user should read the tpc_clk_curr to see the actual
frequency value of the TPC
What: /sys/class/habanalabs/hl<n>/tpc_clk_curr
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Displays the current clock frequency of the TPC compute engines
What: /sys/class/habanalabs/hl<n>/uboot_ver
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Version of the u-boot running on the device's CPU
What: /sys/class/habanalabs/hl<n>/write_open_cnt
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Displays the total number of user processes that are currently
opened on the device's file


@@ -109,3 +109,10 @@ Description:
write operation (since a 4k random write might turn
into a much larger write due to the zeroout
operation).
What: /sys/fs/ext4/<disk>/journal_task
Date: February 2019
Contact: "Theodore Ts'o" <tytso@mit.edu>
Description:
This file is read-only and shows the pid of the journal thread in
the current pid namespace, or 0 if the task is unreachable.


@@ -86,12 +86,28 @@ Description:
The unit size is one block, now only support configuring in range
of [1, 512].
What: /sys/fs/f2fs/<disk>/umount_discard_timeout
Date: January 2019
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description:
Set timeout to issue discard commands during umount.
Default: 5 secs
What: /sys/fs/f2fs/<disk>/max_victim_search
Date: January 2014
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
Description:
Controls the number of trials to find a victim segment.
What: /sys/fs/f2fs/<disk>/migration_granularity
Date: October 2018
Contact: "Chao Yu" <yuchao0@huawei.com>
Description:
Controls the migration granularity of garbage collection on a large
section; it lets GC move partial segment(s) of one section
in one GC cycle, dispersing heavy-overhead GC across
multiple lightweight ones.
What: /sys/fs/f2fs/<disk>/dir_level
Date: March 2014
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>


@@ -33,18 +33,6 @@ Description:
An attribute which indicates whether the patch is currently in
transition.
What: /sys/kernel/livepatch/<patch>/signal
Date: Nov 2017
KernelVersion: 4.15.0
Contact: live-patching@vger.kernel.org
Description:
A writable attribute that allows administrator to affect the
course of an existing transition. Writing 1 sends a fake
signal to all remaining blocking tasks. The fake signal
means that no proper signal is delivered (there is no data in
signal pending structures). Tasks are interrupted or woken up,
and forced to change their patched state.
What: /sys/kernel/livepatch/<patch>/force
Date: Nov 2017
KernelVersion: 4.15.0


@@ -146,114 +146,75 @@ What about block I/O and networking buffers? The block I/O and
networking subsystems make sure that the buffers they use are valid
for you to DMA from/to.
DMA addressing limitations
DMA addressing capabilities
==========================
Does your device have any DMA addressing limitations? For example, is
your device only capable of driving the low order 24-bits of address?
If so, you need to inform the kernel of this fact.
By default, the kernel assumes that your device can address 32-bits of DMA
addressing. For a 64-bit capable device, this needs to be increased, and for
a device with limitations, it needs to be decreased.
By default, the kernel assumes that your device can address the full
32-bits. For a 64-bit capable device, this needs to be increased.
And for a device with limitations, as discussed in the previous
paragraph, it needs to be decreased.
Special note about PCI: PCI-X specification requires PCI-X devices to support
64-bit addressing (DAC) for all transactions. And at least one platform (SGI
SN2) requires 64-bit consistent allocations to operate correctly when the IO
bus is in PCI-X mode.
Special note about PCI: PCI-X specification requires PCI-X devices to
support 64-bit addressing (DAC) for all transactions. And at least
one platform (SGI SN2) requires 64-bit consistent allocations to
operate correctly when the IO bus is in PCI-X mode.
For correct operation, you must set the DMA mask to inform the kernel about
your device's DMA addressing capabilities.
For correct operation, you must interrogate the kernel in your device
probe routine to see if the DMA controller on the machine can properly
support the DMA addressing limitation your device has. It is good
style to do this even if your device holds the default setting,
because this shows that you did think about these issues wrt. your
device.
The query is performed via a call to dma_set_mask_and_coherent()::
This is performed via a call to dma_set_mask_and_coherent()::
int dma_set_mask_and_coherent(struct device *dev, u64 mask);
which will query the mask for both streaming and coherent APIs together.
If you have some special requirements, then the following two separate
queries can be used instead:
which will set the mask for both streaming and coherent APIs together. If you
have some special requirements, then the following two separate calls can be
used instead:
The query for streaming mappings is performed via a call to
The setup for streaming mappings is performed via a call to
dma_set_mask()::
int dma_set_mask(struct device *dev, u64 mask);
The query for consistent allocations is performed via a call
The setup for consistent allocations is performed via a call
to dma_set_coherent_mask()::
int dma_set_coherent_mask(struct device *dev, u64 mask);
Here, dev is a pointer to the device struct of your device, and mask
is a bit mask describing which bits of an address your device
supports. It returns zero if your card can perform DMA properly on
the machine given the address mask you provided. In general, the
device struct of your device is embedded in the bus-specific device
struct of your device. For example, &pdev->dev is a pointer to the
device struct of a PCI device (pdev is a pointer to the PCI device
struct of your device).
Here, dev is a pointer to the device struct of your device, and mask is a bit
mask describing which bits of an address your device supports. Often the
device struct of your device is embedded in the bus-specific device struct of
your device. For example, &pdev->dev is a pointer to the device struct of a
PCI device (pdev is a pointer to the PCI device struct of your device).
If it returns non-zero, your device cannot perform DMA properly on
this platform, and attempting to do so will result in undefined
behavior. You must either use a different mask, or not use DMA.
These calls usually return zero to indicate your device can perform DMA
properly on the machine given the address mask you provided, but they might
return an error if the mask is too small to be supportable on the given
system. If it returns non-zero, your device cannot perform DMA properly on
this platform, and attempting to do so will result in undefined behavior.
You must not use DMA on this device unless the dma_set_mask family of
functions has returned success.
This means that in the failure case, you have three options:
This means that in the failure case, you have two options:
1) Use another DMA mask, if possible (see below).
2) Use some non-DMA mode for data transfer, if possible.
3) Ignore this device and do not initialize it.
1) Use some non-DMA mode for data transfer, if possible.
2) Ignore this device and do not initialize it.
It is recommended that your driver print a kernel KERN_WARNING message
when you end up performing either #2 or #3. In this manner, if a user
of your driver reports that performance is bad or that the device is not
even detected, you can ask them for the kernel messages to find out
exactly why.
It is recommended that your driver print a kernel KERN_WARNING message when
setting the DMA mask fails. In this manner, if a user of your driver reports
that performance is bad or that the device is not even detected, you can ask
them for the kernel messages to find out exactly why.
The standard 64-bit addressing device would do something like this::

	if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64))) {
		dev_warn(dev, "mydev: No suitable DMA available\n");
		goto ignore_this_device;
	}

If the device only supports 32-bit addressing for descriptors in the
coherent allocations, but supports full 64-bits for streaming mappings
it would look like this::

	if (dma_set_mask(dev, DMA_BIT_MASK(64))) {
		dev_warn(dev, "mydev: No suitable DMA available\n");
		goto ignore_this_device;
	}

Another common scenario is a 64-bit capable device. The approach here
is to try for 64-bit addressing, but back down to a 32-bit mask that
should not fail. The kernel may fail the 64-bit mask not because the
platform is not capable of 64-bit addressing. Rather, it may fail in
this case simply because 32-bit addressing is done more efficiently
than 64-bit addressing. For example, Sparc64 PCI SAC addressing is
more efficient than DAC addressing.

Here is how you would handle a 64-bit capable device which can drive
all 64-bits when accessing streaming DMA::

	int using_dac;

	if (!dma_set_mask(dev, DMA_BIT_MASK(64))) {
		using_dac = 1;
	} else if (!dma_set_mask(dev, DMA_BIT_MASK(32))) {
		using_dac = 0;
	} else {
		dev_warn(dev, "mydev: No suitable DMA available\n");
		goto ignore_this_device;
	}

If a card is capable of using 64-bit consistent allocations as well,
the case would look like this::

	int using_dac, consistent_using_dac;

	if (!dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64))) {
		using_dac = 1;
		consistent_using_dac = 1;
	} else if (!dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32))) {
		using_dac = 0;
		consistent_using_dac = 0;
	} else {
		dev_warn(dev, "mydev: No suitable DMA available\n");
		goto ignore_this_device;
	}

View File

@@ -58,15 +58,6 @@ specify the ``GFP_`` flags (see kmalloc()) for the allocation (the
implementation may choose to ignore flags that affect the location of
the returned memory, like GFP_DMA).
::
void *
dma_zalloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t flag)
Wraps dma_alloc_coherent() and also zeroes the returned memory if the
allocation attempt succeeded.
::
void
@@ -204,6 +195,14 @@ Requesting the required mask does not alter the current mask. If you
wish to take advantage of it, you should issue a dma_set_mask()
call to set the mask to the value returned.
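A minimal sketch of that pattern (illustrative only, not taken from the
original text; how the failure is handled is up to the driver)::

	/*
	 * Adopt the mask that the platform reports as required
	 * instead of keeping the current one.
	 */
	u64 required_mask = dma_get_required_mask(dev);

	if (dma_set_mask(dev, required_mask))
		dev_warn(dev, "cannot set the required DMA mask\n");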
::
size_t
dma_direct_max_mapping_size(struct device *dev);
Returns the maximum size of a mapping for the device. The size parameter
of the mapping functions like dma_map_single(), dma_map_page() and
others should not be larger than the returned value.
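For instance, a driver might clamp its transfer length to this limit before
mapping a buffer. The sketch below is illustrative only; it assumes the
driver-facing wrapper dma_max_mapping_size() and hypothetical buf/len
variables::

	dma_addr_t addr;
	size_t limit = dma_max_mapping_size(dev);

	/* Never ask for a single mapping larger than the reported limit. */
	if (len > limit)
		len = limit;

	addr = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
	if (dma_mapping_error(dev, addr))
		goto map_failed;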
Part Id - Streaming DMA mappings
--------------------------------
@@ -539,8 +538,8 @@ that simply cannot make consistent memory.
dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
dma_addr_t dma_handle, unsigned long attrs)
Free memory allocated by the dma_alloc_attrs(). All common
parameters must be identical to those otherwise passed to dma_free_coherent,
and the attrs argument must be identical to the attrs passed to
dma_alloc_attrs().
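A sketch of the expected pairing (illustrative only;
DMA_ATTR_WRITE_COMBINE is just an example attribute)::

	dma_addr_t dma_handle;
	void *cpu_addr;

	cpu_addr = dma_alloc_attrs(dev, size, &dma_handle, GFP_KERNEL,
				   DMA_ATTR_WRITE_COMBINE);
	if (!cpu_addr)
		return -ENOMEM;

	/* ... use the buffer ... */

	/* Same dev, size, addresses and attrs as at allocation time. */
	dma_free_attrs(dev, size, cpu_addr, dma_handle,
		       DMA_ATTR_WRITE_COMBINE);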
@@ -575,8 +574,7 @@ boundaries when doing this.
int
dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
			    dma_addr_t device_addr, size_t size);
Declare region of memory to be handed out by dma_alloc_coherent() when
it's asked for coherent memory for this device.
@@ -590,12 +588,6 @@ dma_addr_t in dma_alloc_coherent()).
size is the size of the area (must be multiples of PAGE_SIZE).
As a simplification for the platforms, only *one* such region of
memory may be declared per device.
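For example (a sketch only; the addresses and size below are made up, not
taken from any platform)::

	ret = dma_declare_coherent_memory(dev,
					  0x90000000,	/* CPU physical address */
					  0x00000000,	/* address seen by the device */
					  0x100000);	/* 1 MB, a multiple of PAGE_SIZE */
	if (ret)
		dev_err(dev, "could not declare coherent memory region\n");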
@@ -614,23 +606,6 @@ unconditionally having removed all the required structures. It is the
driver's job to ensure that no parts of this memory region are
currently in use.
::
void *
dma_mark_declared_memory_occupied(struct device *dev,
dma_addr_t device_addr, size_t size)
This is used to occupy specific regions of the declared space
(dma_alloc_coherent() will hand out the first free region it finds).
device_addr is the *device* address of the region requested.
size is the size (and should be a page-sized multiple).
The return value will be either a pointer to the processor virtual
address of the memory, or an error (via PTR_ERR()) if any part of the
region is occupied.
Part III - Debug drivers use of the DMA-API
-------------------------------------------
@@ -705,6 +680,9 @@ dma-api/disabled This read-only file contains the character 'Y'
happen when it runs out of memory or if it was
disabled at boot time
dma-api/dump This read-only file contains current DMA
mappings.
dma-api/error_count This file is read-only and shows the total
numbers of errors found.
@@ -717,13 +695,16 @@ dma-api/num_errors The number in this file shows how many
dma-api/min_free_entries This read-only file can be read to get the
minimum number of free dma_debug_entries the
allocator has ever seen. If this value goes
down to zero the code will attempt to increase
nr_total_entries to compensate.
dma-api/num_free_entries The current number of free dma_debug_entries
in the allocator.
dma-api/nr_total_entries The total number of dma_debug_entries in the
allocator, both free and used.
dma-api/driver_filter You can write a name of a driver into this file
to limit the debug output to requests from that
particular driver. Write an empty string to
that file to disable the filter and see
@@ -742,10 +723,15 @@ driver filter at boot time. The debug code will only print errors for that
driver afterwards. This filter can be disabled or changed later using debugfs.
When the code disables itself at runtime this is most likely because it ran
out of dma_debug_entries and was unable to allocate more on-demand. 65536
entries are preallocated at boot - if this is too low for you boot with
'dma_debug_entries=<your_desired_number>' to overwrite the default. Note
that the code allocates entries in batches, so the exact number of
preallocated entries may be greater than the actual number requested. The
code will print to the kernel log each time it has dynamically allocated
as many entries as were initially preallocated. This is to indicate that a
larger preallocation size may be appropriate, or if it happens continually
that a driver may be leaking mappings.
::

View File

@@ -52,8 +52,8 @@ Address translation
-------------------
To translate the virtual address to a bus address, use the normal DMA
API. Do _not_ use isa_virt_to_bus() even though it does the same
thing. The reason for this is that the function isa_virt_to_bus()
will require a Kconfig dependency to ISA, not just ISA_DMA_API which
is really all you need. Remember that even though the DMA controller
has its origins in ISA it is used elsewhere.
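As a purely illustrative sketch (nothing here comes from the original text;
dev is the ISA device's struct device and buf a kmalloc()ed buffer)::

	dma_addr_t bus_addr;

	bus_addr = dma_map_single(dev, buf, size, DMA_TO_DEVICE);
	if (dma_mapping_error(dev, bus_addr))
		return -EIO;

	/*
	 * Program the ISA DMA controller with bus_addr, run the
	 * transfer, then unmap once it has completed.
	 */
	dma_unmap_single(dev, bus_addr, size, DMA_TO_DEVICE);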

View File

@@ -31,14 +31,13 @@
#define YBLANK 38
#define XOFFSET 8
#define XPULSE 144
#define YOFFSET (63+3)
#define YPULSE (63+6)
#define YOFFSET 3
#define YPULSE 6
#define DPI 72
#define VFREQ 60 /* Hz */
#define TIMING_NAME "Linux XGA"
#define ESTABLISHED_TIMING2_BITS 0x08 /* Bit 3 -> 1024x768 @60 Hz */
#define HSYNC_POL 0
#define VSYNC_POL 0
#define CRC 0x55
#include "edid.S"

View File

@@ -31,14 +31,13 @@
#define YBLANK 42
#define XOFFSET 48
#define XPULSE 112
#define YOFFSET (63+1)
#define YPULSE (63+3)
#define YOFFSET 1
#define YPULSE 3
#define DPI 72
#define VFREQ 60 /* Hz */
#define TIMING_NAME "Linux SXGA"
/* No ESTABLISHED_TIMINGx_BITS */
#define HSYNC_POL 1
#define VSYNC_POL 1
#define CRC 0xa0
#include "edid.S"

View File

@@ -31,14 +31,13 @@
#define YBLANK 50
#define XOFFSET 64
#define XPULSE 192
#define YOFFSET (63+1)
#define YPULSE (63+3)
#define YOFFSET 1
#define YPULSE 3
#define DPI 72
#define VFREQ 60 /* Hz */
#define TIMING_NAME "Linux UXGA"
/* No ESTABLISHED_TIMINGx_BITS */
#define HSYNC_POL 1
#define VSYNC_POL 1
#define CRC 0x9d
#include "edid.S"

View File

@@ -31,14 +31,13 @@
#define YBLANK 39
#define XOFFSET 104
#define XPULSE 176
#define YOFFSET (63+3)
#define YPULSE (63+6)
#define YOFFSET 3
#define YPULSE 6
#define DPI 96
#define VFREQ 60 /* Hz */
#define TIMING_NAME "Linux WSXGA"
/* No ESTABLISHED_TIMINGx_BITS */
#define HSYNC_POL 1
#define VSYNC_POL 1
#define CRC 0x26
#include "edid.S"

View File

@@ -31,14 +31,13 @@
#define YBLANK 45
#define XOFFSET 88
#define XPULSE 44
#define YOFFSET (63+4)
#define YPULSE (63+5)
#define YOFFSET 4
#define YPULSE 5
#define DPI 96
#define VFREQ 60 /* Hz */
#define TIMING_NAME "Linux FHD"
/* No ESTABLISHED_TIMINGx_BITS */
#define HSYNC_POL 1
#define VSYNC_POL 1
#define CRC 0x05
#include "edid.S"

View File

@@ -28,14 +28,13 @@
#define YBLANK 28
#define XOFFSET 40
#define XPULSE 128
#define YOFFSET (63+1)
#define YPULSE (63+4)
#define YOFFSET 1
#define YPULSE 4
#define DPI 72
#define VFREQ 60 /* Hz */
#define TIMING_NAME "Linux SVGA"
#define ESTABLISHED_TIMING1_BITS 0x01 /* Bit 0: 800x600 @ 60Hz */
#define HSYNC_POL 1
#define VSYNC_POL 1
#define CRC 0xc2
#include "edid.S"

View File

@@ -45,14 +45,5 @@ EDID:
#define YPIX vdisp
#define YBLANK vtotal-vdisp
#define YOFFSET (63+(vsyncstart-vdisp))
#define YPULSE (63+(vsyncend-vsyncstart))
The CRC value in the last line
#define CRC 0x55
also is a bit tricky. After a first version of the binary data set is
created, it must be checked with the "edid-decode" utility which will
most probably complain about a wrong CRC. Fortunately, the utility also
displays the correct CRC which must then be inserted into the source
file. After the make procedure is repeated, the EDID data set is ready
to be used.
#define YOFFSET vsyncstart-vdisp
#define YPULSE vsyncend-vsyncstart

View File

@@ -15,10 +15,21 @@ clean:
%.o: %.S
	@cc -c $^

%.bin.nocrc: %.o
	@objcopy -Obinary $^ $@

%.crc: %.bin.nocrc
	@list=$$(for i in `seq 1 127`; do head -c$$i $^ | tail -c1 \
	| hexdump -v -e '/1 "%02X+"'; done); \
	echo "ibase=16;100-($${list%?})%100" | bc >$@

%.p: %.crc %.S
	@cc -c -DCRC="$$(cat $*.crc)" -o $@ $*.S

%.bin: %.p
	@objcopy -Obinary $^ $@

%.bin.ihex: %.p
	@objcopy -Oihex $^ $@
	@dos2unix $@ 2>/dev/null

View File

@@ -47,9 +47,11 @@
#define mfgname2id(v1,v2,v3) \
((((v1-'@')&0x1f)<<10)+(((v2-'@')&0x1f)<<5)+((v3-'@')&0x1f))
#define swap16(v1) ((v1>>8)+((v1&0xff)<<8))
#define lsbs2(v1,v2) (((v1&0x0f)<<4)+(v2&0x0f))
#define msbs2(v1,v2) ((((v1>>8)&0x0f)<<4)+((v2>>8)&0x0f))
#define msbs4(v1,v2,v3,v4) \
	((((v1>>8)&0x03)<<6)+(((v2>>8)&0x03)<<4)+\
	(((v3>>4)&0x03)<<2)+((v4>>4)&0x03))
#define pixdpi2mm(pix,dpi) ((pix*25)/dpi)
#define xsize pixdpi2mm(XPIX,DPI)
#define ysize pixdpi2mm(YPIX,DPI)
@@ -200,9 +202,9 @@ y_msbs: .byte msbs2(YPIX,YBLANK)
x_snc_off_lsb: .byte XOFFSET&0xff
/* Horizontal sync pulse width pixels 8 lsbits (0-1023) */
x_snc_pls_lsb: .byte XPULSE&0xff
/* Bits 7-4 Vertical sync offset lines 4 lsbits (0-63)
   Bits 3-0 Vertical sync pulse width lines 4 lsbits (0-63) */
y_snc_lsb:	.byte lsbs2(YOFFSET, YPULSE)
/* Bits 7-6 Horizontal sync offset pixels 2 msbits
Bits 5-4 Horizontal sync pulse width pixels 2 msbits
Bits 3-2 Vertical sync offset lines 2 msbits

View File

@@ -2,7 +2,7 @@
# Makefile for Sphinx documentation
#
subdir-y := devicetree/bindings/
# You can set these variables from the command line.
SPHINXBUILD = sphinx-build

View File

@@ -1,499 +0,0 @@
[SVG markup of the deleted figure omitted. The image (generated from
BigTreeClassicRCUBH.fig) showed a struct rcu_state containing a root
rcu_node with two leaf rcu_node structures and per-CPU rcu_data structures
below them, duplicated once for the rcu_sched flavor and once for the
rcu_bh flavor.]


View File

@@ -1,695 +0,0 @@
[SVG markup of the deleted figure omitted. The image (generated from
BigTreeClassicRCUBHdyntick.fig) showed the same rcu_state/rcu_node/rcu_data
layout for the rcu_sched and rcu_bh flavors, with per-CPU rcu_dynticks
structures shared by each CPU's rcu_data structures.]


View File

@@ -1,741 +0,0 @@
[SVG markup of the deleted figure omitted. The image (generated from
BigTreePreemptRCUBHdyntick.fig) showed the rcu_preempt, rcu_sched and
rcu_bh flavors of the rcu_state/rcu_node/rcu_data layout, again sharing
per-CPU rcu_dynticks structures.]


File diff suppressed because it is too large


View File

@@ -23,8 +23,6 @@ to each other.
The <tt>rcu_segcblist</tt> Structure</a>
<li> <a href="#The rcu_data Structure">
The <tt>rcu_data</tt> Structure</a>
<li> <a href="#The rcu_dynticks Structure">
The <tt>rcu_dynticks</tt> Structure</a>
<li> <a href="#The rcu_head Structure">
The <tt>rcu_head</tt> Structure</a>
<li> <a href="#RCU-Specific Fields in the task_struct Structure">
@@ -127,9 +125,11 @@ CPUs, RCU would configure the <tt>rcu_node</tt> tree as follows:
</p><p>RCU currently permits up to a four-level tree, which on a 64-bit system
accommodates up to 4,194,304 CPUs, though only a mere 524,288 CPUs for
32-bit systems.
On the other hand, you can set both <tt>CONFIG_RCU_FANOUT</tt> and
<tt>CONFIG_RCU_FANOUT_LEAF</tt> to be as small as 2, which would result
in a 16-CPU test using a 4-level tree.
This can be useful for testing large-system capabilities on small test
machines.
</p><p>This multi-level combining tree allows us to get most of the
performance and scalability
@@ -154,44 +154,9 @@ on that root <tt>rcu_node</tt> structure remains acceptably low.
keeping lock contention under control at all tree levels regardless
of the level of loading on the system.
</p><p>The Linux kernel actually supports multiple flavors of RCU
running concurrently, so RCU builds separate data structures for each
flavor.
For example, for <tt>CONFIG_TREE_RCU=y</tt> kernels, RCU provides
rcu_sched and rcu_bh, as shown below:
</p><p><img src="BigTreeClassicRCUBH.svg" alt="BigTreeClassicRCUBH.svg" width="33%">
</p><p>Energy efficiency is increasingly important, and for that
reason the Linux kernel provides <tt>CONFIG_NO_HZ_IDLE</tt>, which
turns off the scheduling-clock interrupts on idle CPUs, which in
turn allows those CPUs to attain deeper sleep states and to consume
less energy.
CPUs whose scheduling-clock interrupts have been turned off are
said to be in <i>dyntick-idle mode</i>.
RCU must handle dyntick-idle CPUs specially
because RCU would otherwise wake up each CPU on every grace period,
which would defeat the whole purpose of <tt>CONFIG_NO_HZ_IDLE</tt>.
RCU uses the <tt>rcu_dynticks</tt> structure to track
which CPUs are in dyntick idle mode, as shown below:
</p><p><img src="BigTreeClassicRCUBHdyntick.svg" alt="BigTreeClassicRCUBHdyntick.svg" width="33%">
</p><p>However, if a CPU is in dyntick-idle mode, it is in that mode
for all flavors of RCU.
Therefore, a single <tt>rcu_dynticks</tt> structure is allocated per
CPU, and all of a given CPU's <tt>rcu_data</tt> structures share
that <tt>rcu_dynticks</tt>, as shown in the figure.
</p><p>Kernels built with <tt>CONFIG_PREEMPT_RCU</tt> support
rcu_preempt in addition to rcu_sched and rcu_bh, as shown below:
</p><p><img src="BigTreePreemptRCUBHdyntick.svg" alt="BigTreePreemptRCUBHdyntick.svg" width="35%">
</p><p>RCU updaters wait for normal grace periods by registering
RCU callbacks, either directly via <tt>call_rcu()</tt> and
friends (namely <tt>call_rcu_bh()</tt> and <tt>call_rcu_sched()</tt>,
there being a separate interface per flavor of RCU)
or indirectly via <tt>synchronize_rcu()</tt> and friends.
RCU callbacks are represented by <tt>rcu_head</tt> structures,
which are queued on <tt>rcu_data</tt> structures while they are
@@ -214,9 +179,6 @@ its own synchronization:
<li> Each <tt>rcu_node</tt> structure has a spinlock.
<li> The fields in <tt>rcu_data</tt> are private to the corresponding
CPU, although a few can be read and written by other CPUs.
<li> Similarly, the fields in <tt>rcu_dynticks</tt> are private
to the corresponding CPU, although a few can be read by
other CPUs.
</ol>
<p>It is important to note that different data structures can have
@@ -272,11 +234,6 @@ follows:
access to this information from the corresponding CPU.
Finally, this structure records past dyntick-idle state
for the corresponding CPU and also tracks statistics.
<li> <tt>rcu_dynticks</tt>:
This per-CPU structure tracks the current dyntick-idle
state for the corresponding CPU.
Unlike the other three structures, the <tt>rcu_dynticks</tt>
structure is not replicated per RCU flavor.
<li> <tt>rcu_head</tt>:
This structure represents RCU callbacks, and is the
only structure allocated and managed by RCU users.
@@ -287,14 +244,14 @@ follows:
<p>If all you wanted from this article was a general notion of how
RCU's data structures are related, you are done.
Otherwise, each of the following sections gives more details on
the <tt>rcu_state</tt>, <tt>rcu_node</tt>, <tt>rcu_data</tt>,
and <tt>rcu_dynticks</tt> data structures.
the <tt>rcu_state</tt>, <tt>rcu_node</tt> and <tt>rcu_data</tt> data
structures.
<h3><a name="The rcu_state Structure">
The <tt>rcu_state</tt> Structure</a></h3>
<p>The <tt>rcu_state</tt> structure is the base structure that
represents a flavor of RCU.
represents the state of RCU in the system.
This structure forms the interconnection between the
<tt>rcu_node</tt> and <tt>rcu_data</tt> structures,
tracks grace periods, contains the lock used to
@@ -389,7 +346,7 @@ sequence number.
The bottom two bits are the state of the current grace period,
which can be zero for not yet started or one for in progress.
In other words, if the bottom two bits of <tt>-&gt;gp_seq</tt> are
zero, the corresponding flavor of RCU is idle.
zero, then RCU is idle.
Any other value in the bottom two bits indicates that something is broken.
This field is protected by the root <tt>rcu_node</tt> structure's
<tt>-&gt;lock</tt> field.
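<p>For illustration only (these helpers are a sketch, not RCU's
internal API, whose names and details may differ), the encoding just
described can be unpacked as follows:
<pre>
#define GP_SEQ_STATE_BITS  2
#define GP_SEQ_STATE_MASK  ((1UL &lt;&lt; GP_SEQ_STATE_BITS) - 1)

/* Bottom two bits: 0 means idle, 1 means a grace period is in progress. */
static unsigned long gp_seq_state(unsigned long gp_seq)
{
	return gp_seq &amp; GP_SEQ_STATE_MASK;
}

/* Remaining bits: the grace-period sequence number itself. */
static unsigned long gp_seq_ctr(unsigned long gp_seq)
{
	return gp_seq &gt;&gt; GP_SEQ_STATE_BITS;
}
</pre>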
@@ -419,10 +376,10 @@ as follows:
grace period in jiffies.
It is protected by the root <tt>rcu_node</tt>'s <tt>-&gt;lock</tt>.
<p>The <tt>-&gt;name</tt> field points to the name of the RCU flavor
(for example, &ldquo;rcu_sched&rdquo;), and is constant.
The <tt>-&gt;abbr</tt> field contains a one-character abbreviation,
for example, &ldquo;s&rdquo; for RCU-sched.
<p>The <tt>-&gt;name</tt> and <tt>-&gt;abbr</tt> fields distinguish
between preemptible RCU (&ldquo;rcu_preempt&rdquo; and &ldquo;p&rdquo;)
and non-preemptible RCU (&ldquo;rcu_sched&rdquo; and &ldquo;s&rdquo;).
These fields are used for diagnostic and tracing purposes.
<h3><a name="The rcu_node Structure">
The <tt>rcu_node</tt> Structure</a></h3>
@@ -971,25 +928,31 @@ this <tt>rcu_segcblist</tt> structure, <i>not</i> the <tt>-&gt;head</tt>
pointer.
The reason for this is that all the ready-to-invoke callbacks
(that is, those in the <tt>RCU_DONE_TAIL</tt> segment) are extracted
all at once at callback-invocation time.
all at once at callback-invocation time (<tt>rcu_do_batch()</tt>), at
which point <tt>-&gt;head</tt> may be set to NULL if no not-yet-done
callbacks remain in the <tt>rcu_segcblist</tt>.
If callback invocation must be postponed, for example, because a
high-priority process just woke up on this CPU, then the remaining
callbacks are placed back on the <tt>RCU_DONE_TAIL</tt> segment.
Either way, the <tt>-&gt;len</tt> and <tt>-&gt;len_lazy</tt> counts
are adjusted after the corresponding callbacks have been invoked, and so
again it is the <tt>-&gt;len</tt> count that accurately reflects whether
or not there are callbacks associated with this <tt>rcu_segcblist</tt>
structure.
callbacks are placed back on the <tt>RCU_DONE_TAIL</tt> segment and
<tt>-&gt;head</tt> once again points to the start of the segment.
In short, the head field can briefly be <tt>NULL</tt> even though the
CPU has callbacks present the entire time.
Therefore, it is not appropriate to test the <tt>-&gt;head</tt> pointer
for <tt>NULL</tt>.
<p>In contrast, the <tt>-&gt;len</tt> and <tt>-&gt;len_lazy</tt> counts
are adjusted only after the corresponding callbacks have been invoked.
This means that the <tt>-&gt;len</tt> count is zero only if
the <tt>rcu_segcblist</tt> structure really is devoid of callbacks.
Of course, off-CPU sampling of the <tt>-&gt;len</tt> count requires
the use of appropriate synchronization, for example, memory barriers.
careful use of appropriate synchronization, for example, memory barriers.
This synchronization can be a bit subtle, particularly in the case
of <tt>rcu_barrier()</tt>.
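<p>A minimal sketch of the resulting rule of thumb (the structure and
function names here are illustrative, not the kernel's actual
<tt>rcu_segcblist</tt> API): test <tt>-&gt;len</tt>, never
<tt>-&gt;head</tt>.
<pre>
struct cblist {
	struct rcu_head *head;	/* may be NULL while callbacks are running */
	long len;		/* nonzero iff callbacks are queued */
};

static bool cblist_has_callbacks(struct cblist *clp)
{
	/*
	 * ->len is decremented only after callbacks are invoked, so it
	 * reliably indicates callback presence; off-CPU samplers would
	 * additionally need memory barriers, as noted above.
	 */
	return READ_ONCE(clp-&gt;len) != 0;
}
</pre>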
<h3><a name="The rcu_data Structure">
The <tt>rcu_data</tt> Structure</a></h3>
<p>The <tt>rcu_data</tt> maintains the per-CPU state for the
corresponding flavor of RCU.
<p>The <tt>rcu_data</tt> maintains the per-CPU state for the RCU subsystem.
The fields in this structure may be accessed only from the corresponding
CPU (and from tracing) unless otherwise stated.
This structure is the
@@ -1015,30 +978,19 @@ as follows:
<pre>
1 int cpu;
2 struct rcu_state *rsp;
3 struct rcu_node *mynode;
4 struct rcu_dynticks *dynticks;
5 unsigned long grpmask;
6 bool beenonline;
2 struct rcu_node *mynode;
3 unsigned long grpmask;
4 bool beenonline;
</pre>
<p>The <tt>-&gt;cpu</tt> field contains the number of the
corresponding CPU, the <tt>-&gt;rsp</tt> pointer references
the corresponding <tt>rcu_state</tt> structure (and is most frequently
used to locate the name of the corresponding flavor of RCU for tracing),
and the <tt>-&gt;mynode</tt> field references the corresponding
<tt>rcu_node</tt> structure.
corresponding CPU and the <tt>-&gt;mynode</tt> field references the
corresponding <tt>rcu_node</tt> structure.
The <tt>-&gt;mynode</tt> is used to propagate quiescent states
up the combining tree.
<p>The <tt>-&gt;dynticks</tt> pointer references the
<tt>rcu_dynticks</tt> structure corresponding to this
CPU.
Recall that a single per-CPU instance of the <tt>rcu_dynticks</tt>
structure is shared among all flavors of RCU.
These first four fields are constant and therefore require no
synchronization.
These two fields are constant and therefore do not require synchronization.
</p><p>The <tt>-&gt;grpmask</tt> field indicates the bit in
<p>The <tt>-&gt;grpmask</tt> field indicates the bit in
the <tt>-&gt;mynode-&gt;qsmask</tt> corresponding to this
<tt>rcu_data</tt> structure, and is also used when propagating
quiescent states.
@@ -1057,12 +1009,12 @@ as follows:
3 bool cpu_no_qs;
4 bool core_needs_qs;
5 bool gpwrap;
6 unsigned long rcu_qs_ctr_snap;
</pre>
<p>The <tt>-&gt;gp_seq</tt> and <tt>-&gt;gp_seq_needed</tt>
fields are the counterparts of the fields of the same name
in the <tt>rcu_state</tt> and <tt>rcu_node</tt> structures.
<p>The <tt>-&gt;gp_seq</tt> field is the counterpart of the field of the same
name in the <tt>rcu_state</tt> and <tt>rcu_node</tt> structures. The
<tt>-&gt;gp_seq_needed</tt> field is the counterpart of the field of the same
name in the <tt>rcu_node</tt> structure.
They may each lag up to one behind their <tt>rcu_node</tt>
counterparts, but in <tt>CONFIG_NO_HZ_IDLE</tt> and
<tt>CONFIG_NO_HZ_FULL</tt> kernels can lag
@@ -1103,10 +1055,6 @@ CPU has remained idle for so long that the
<tt>gp_seq</tt> counter is in danger of overflow, which
will cause the CPU to disregard the values of its counters on
its next exit from idle.
Finally, the <tt>rcu_qs_ctr_snap</tt> field is used to detect
cases where a given operation has resulted in a quiescent state
for all flavors of RCU, for example, <tt>cond_resched()</tt>
when RCU has indicated a need for quiescent states.
<h5>RCU Callback Handling</h5>
@@ -1179,26 +1127,22 @@ Finally, the <tt>-&gt;dynticks_fqs</tt> field is used to
count the number of times this CPU is determined to be in
dyntick-idle state, and is used for tracing and debugging purposes.
<h3><a name="The rcu_dynticks Structure">
The <tt>rcu_dynticks</tt> Structure</a></h3>
<p>The <tt>rcu_dynticks</tt> maintains the per-CPU dyntick-idle state
for the corresponding CPU.
Unlike the other structures, <tt>rcu_dynticks</tt> is not
replicated over the different flavors of RCU.
The fields in this structure may be accessed only from the corresponding
CPU (and from tracing) unless otherwise stated.
Its fields are as follows:
<p>
This portion of the rcu_data structure is declared as follows:
<pre>
1 long dynticks_nesting;
2 long dynticks_nmi_nesting;
3 atomic_t dynticks;
4 bool rcu_need_heavy_qs;
5 unsigned long rcu_qs_ctr;
6 bool rcu_urgent_qs;
5 bool rcu_urgent_qs;
</pre>
<p>These fields in the rcu_data structure maintain the per-CPU dyntick-idle
state for the corresponding CPU.
The fields may be accessed only from the corresponding CPU (and from tracing)
unless otherwise stated.
<p>The <tt>-&gt;dynticks_nesting</tt> field counts the
nesting depth of process execution, so that in normal circumstances
this counter has value zero or one.
@@ -1240,19 +1184,12 @@ it is willing to call for heavy-weight dyntick-counter operations.
This flag is checked by RCU's context-switch and <tt>cond_resched()</tt>
code, which provide a momentary idle sojourn in response.
</p><p>The <tt>-&gt;rcu_qs_ctr</tt> field is used to record
quiescent states from <tt>cond_resched()</tt>.
Because <tt>cond_resched()</tt> can execute quite frequently, this
must be quite lightweight, as in a non-atomic increment of this
per-CPU field.
</p><p>Finally, the <tt>-&gt;rcu_urgent_qs</tt> field is used to record
the fact that the RCU core code would really like to see a quiescent
state from the corresponding CPU, with the various other fields indicating
just how badly RCU wants this quiescent state.
This flag is checked by RCU's context-switch and <tt>cond_resched()</tt>
code, which, if nothing else, non-atomically increment <tt>-&gt;rcu_qs_ctr</tt>
in response.
the fact that the RCU core code would really like to see a quiescent state from
the corresponding CPU, with the various other fields indicating just how badly
RCU wants this quiescent state.
This flag is checked by RCU's context-switch path
(<tt>rcu_note_context_switch()</tt>) and the <tt>cond_resched()</tt> code.
<table>
<tr><th>&nbsp;</th></tr>
@@ -1425,11 +1362,11 @@ the last part of the array, thus traversing only the leaf
<h3><a name="Summary">
Summary</a></h3>
So each flavor of RCU is represented by an <tt>rcu_state</tt> structure,
So the state of RCU is represented by an <tt>rcu_state</tt> structure,
which contains a combining tree of <tt>rcu_node</tt> and
<tt>rcu_data</tt> structures.
Finally, in <tt>CONFIG_NO_HZ_IDLE</tt> kernels, each CPU's dyntick-idle
state is tracked by an <tt>rcu_dynticks</tt> structure.
state is tracked by dynticks-related fields in the <tt>rcu_data</tt> structure.
If you made it this far, you are well prepared to read the code
walkthroughs in the other articles in this series.

View File

@@ -328,13 +328,13 @@
inkscape:window-height="1148"
id="namedview90"
showgrid="true"
inkscape:zoom="0.80021373"
inkscape:cx="462.49289"
inkscape:cy="473.6718"
inkscape:zoom="0.69092787"
inkscape:cx="476.34085"
inkscape:cy="712.80957"
inkscape:window-x="770"
inkscape:window-y="24"
inkscape:window-maximized="0"
inkscape:current-layer="g4114-9-3-9"
inkscape:current-layer="g4"
inkscape:snap-grids="false"
fit-margin-top="5"
fit-margin-right="5"
@@ -813,14 +813,18 @@
<text
sodipodi:linespacing="125%"
id="text4110-5-7-6-2-4-0"
y="841.88086"
y="670.74316"
x="1460.1007"
style="font-size:267.24359131px;font-style:normal;font-weight:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
xml:space="preserve"><tspan
y="841.88086"
y="670.74316"
x="1460.1007"
sodipodi:role="line"
id="tspan4925-1-2-4-5">reched_cpu()</tspan></text>
id="tspan4925-1-2-4-5">Request</tspan><tspan
y="1004.7976"
x="1460.1007"
sodipodi:role="line"
id="tspan3100">context switch</tspan></text>
</g>
</g>
</svg>

View File

@@ -72,10 +72,10 @@ will ignore it because idle and offline CPUs are already residing
in quiescent states.
Otherwise, the expedited grace period will use
<tt>smp_call_function_single()</tt> to send the CPU an IPI, which
is handled by <tt>sync_rcu_exp_handler()</tt>.
is handled by <tt>rcu_exp_handler()</tt>.
<p>
However, because this is preemptible RCU, <tt>sync_rcu_exp_handler()</tt>
However, because this is preemptible RCU, <tt>rcu_exp_handler()</tt>
can check to see if the CPU is currently running in an RCU read-side
critical section.
If not, the handler can immediately report a quiescent state.
@@ -145,24 +145,23 @@ expedited grace period is shown in the following diagram:
<p><img src="ExpSchedFlow.svg" alt="ExpSchedFlow.svg" width="55%">
<p>
As with RCU-preempt's <tt>synchronize_rcu_expedited()</tt>,
As with RCU-preempt, RCU-sched's
<tt>synchronize_sched_expedited()</tt> ignores offline and
idle CPUs, again because they are in remotely detectable
quiescent states.
However, the <tt>synchronize_rcu_expedited()</tt> handler
is <tt>sync_sched_exp_handler()</tt>, and because the
However, because the
<tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt>
leave no trace of their invocation, in general it is not possible to tell
whether or not the current CPU is in an RCU read-side critical section.
The best that <tt>sync_sched_exp_handler()</tt> can do is to check
The best that RCU-sched's <tt>rcu_exp_handler()</tt> can do is to check
for idle, on the off-chance that the CPU went idle while the IPI
was in flight.
If the CPU is idle, then <tt>sync_sched_exp_handler()</tt> reports
If the CPU is idle, then <tt>rcu_exp_handler()</tt> reports
the quiescent state.
<p>
Otherwise, the handler invokes <tt>resched_cpu()</tt>, which forces
a future context switch.
<p> Otherwise, the handler forces a future context switch by setting the
NEED_RESCHED flag in the current task's thread-info flags and the
corresponding bit in the CPU's preempt counter.
At the time of the context switch, the CPU reports the quiescent state.
Should the CPU go offline first, it will report the quiescent state
at that time.
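<p>A sketch of that forcing step (the helpers shown are the scheduler's
own and are believed current as of this writing; this is not a verbatim
copy of <tt>rcu_exp_handler()</tt>):
<pre>
static void force_context_switch_soon(void)
{
	set_tsk_need_resched(current);	/* set TIF_NEED_RESCHED */
	set_preempt_need_resched();	/* mirror it in the preempt counter */
}
</pre>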
@@ -298,19 +297,18 @@ Instead, the task pushing the grace period forward will include the
idle CPUs in the mask passed to <tt>rcu_report_exp_cpu_mult()</tt>.
<p>
For RCU-sched, there is an additional check for idle in the IPI
handler, <tt>sync_sched_exp_handler()</tt>.
For RCU-sched, there is an additional check:
If the IPI has interrupted the idle loop, then
<tt>sync_sched_exp_handler()</tt> invokes <tt>rcu_report_exp_rdp()</tt>
<tt>rcu_exp_handler()</tt> invokes <tt>rcu_report_exp_rdp()</tt>
to report the corresponding quiescent state.
<p>
For RCU-preempt, there is no specific check for idle in the
IPI handler (<tt>sync_rcu_exp_handler()</tt>), but because
IPI handler (<tt>rcu_exp_handler()</tt>), but because
RCU read-side critical sections are not permitted within the
idle loop, if <tt>sync_rcu_exp_handler()</tt> sees that the CPU is within
idle loop, if <tt>rcu_exp_handler()</tt> sees that the CPU is within
RCU read-side critical section, the CPU cannot possibly be idle.
Otherwise, <tt>sync_rcu_exp_handler()</tt> invokes
Otherwise, <tt>rcu_exp_handler()</tt> invokes
<tt>rcu_report_exp_rdp()</tt> to report the corresponding quiescent
state, regardless of whether or not that quiescent state was due to
the CPU being idle.
@@ -625,6 +623,8 @@ checks, but only during the mid-boot dead zone.
<p>
With this refinement, synchronous grace periods can now be used from
task context pretty much any time during the life of the kernel.
That is, aside from some points in the suspend, hibernate, or shutdown
code path.
<h3><a name="Summary">
Summary</a></h3>

View File

@@ -77,7 +77,7 @@ The key point is that the lock-acquisition functions, including
<tt>smp_mb__after_unlock_lock()</tt> immediately after successful
acquisition of the lock.
<p>Therefore, for any given <tt>rcu_node</tt> struction, any access
<p>Therefore, for any given <tt>rcu_node</tt> structure, any access
happening before one of the above lock-release functions will be seen
by all CPUs as happening before any access happening after a later
one of the above lock-acquisition functions.
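<p>For example, a hedged sketch of the acquisition side of this pattern
(<tt>rnp</tt> and <tt>flags</tt> are assumed locals; the kernel wraps
this pairing in its own helper):
<pre>
raw_spin_lock_irqsave(&amp;rnp-&gt;lock, flags);
smp_mb__after_unlock_lock();	/* order against the preceding release */
</pre>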
@@ -485,13 +485,13 @@ section that the grace period must wait on.
noted by <tt>rcu_note_context_switch()</tt> on the left.
On the other hand, if the CPU takes a scheduler-clock interrupt
while executing in usermode, a quiescent state will be noted by
<tt>rcu_check_callbacks()</tt> on the right.
<tt>rcu_sched_clock_irq()</tt> on the right.
Either way, the passage through a quiescent state will be noted
in a per-CPU variable.
<p>The next time an <tt>RCU_SOFTIRQ</tt> handler executes on
this CPU (for example, after the next scheduler-clock
interrupt), <tt>__rcu_process_callbacks()</tt> will invoke
interrupt), <tt>rcu_core()</tt> will invoke
<tt>rcu_check_quiescent_state()</tt>, which will notice the
recorded quiescent state, and invoke
<tt>rcu_report_qs_rdp()</tt>.
@@ -651,7 +651,7 @@ to end.
These callbacks are identified by <tt>rcu_advance_cbs()</tt>,
which is usually invoked by <tt>__note_gp_changes()</tt>.
As shown in the diagram below, this invocation can be triggered by
the scheduling-clock interrupt (<tt>rcu_check_callbacks()</tt> on
the scheduling-clock interrupt (<tt>rcu_sched_clock_irq()</tt> on
the left) or by idle entry (<tt>rcu_cleanup_after_idle()</tt> on
the right, but only for kernels built with
<tt>CONFIG_RCU_FAST_NO_HZ=y</tt>).

View File

@@ -349,7 +349,7 @@
font-weight="bold"
font-size="192"
id="text202-7-5"
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_check_callbacks()</text>
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_sched_clock_irq()</text>
<rect
x="7069.6187"
y="5087.4678"


View File

@@ -3902,7 +3902,7 @@
font-style="normal"
y="-4418.6582"
x="3745.7725"
xml:space="preserve">rcu_check_callbacks()</text>
xml:space="preserve">rcu_sched_clock_irq()</text>
</g>
<g
transform="translate(-850.30204,55463.106)"
@@ -3924,7 +3924,7 @@
font-style="normal"
y="-4418.6582"
x="3745.7725"
xml:space="preserve">rcu_process_callbacks()</text>
xml:space="preserve">rcu_core()</text>
<text
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"
id="text202-7-5-3-27-0"
@@ -3933,7 +3933,7 @@
font-style="normal"
y="-4165.7954"
x="3745.7725"
xml:space="preserve">rcu_check_quiescent_state())</text>
xml:space="preserve">rcu_check_quiescent_state()</text>
<text
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"
id="text202-7-5-3-27-0-9"
@@ -4968,7 +4968,7 @@
font-weight="bold"
font-size="192"
id="text202-7-5-19"
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_check_callbacks()</text>
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_sched_clock_irq()</text>
<rect
x="5314.2671"
y="82817.688"


View File

@@ -775,7 +775,7 @@
font-style="normal"
y="-4418.6582"
x="3745.7725"
xml:space="preserve">rcu_check_callbacks()</text>
xml:space="preserve">rcu_sched_clock_irq()</text>
</g>
<g
transform="translate(399.7744,828.86448)"
@@ -797,7 +797,7 @@
font-style="normal"
y="-4418.6582"
x="3745.7725"
xml:space="preserve">rcu_process_callbacks()</text>
xml:space="preserve">rcu_core()</text>
<text
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"
id="text202-7-5-3-27-0"
@@ -806,7 +806,7 @@
font-style="normal"
y="-4165.7954"
x="3745.7725"
xml:space="preserve">rcu_check_quiescent_state())</text>
xml:space="preserve">rcu_check_quiescent_state()</text>
<text
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"
id="text202-7-5-3-27-0-9"


View File

@@ -900,8 +900,6 @@ Except where otherwise noted, these non-guarantees were premeditated.
Grace Periods Don't Partition Read-Side Critical Sections</a>
<li> <a href="#Read-Side Critical Sections Don't Partition Grace Periods">
Read-Side Critical Sections Don't Partition Grace Periods</a>
<li> <a href="#Disabling Preemption Does Not Block Grace Periods">
Disabling Preemption Does Not Block Grace Periods</a>
</ol>
<h3><a name="Readers Impose Minimal Ordering">Readers Impose Minimal Ordering</a></h3>
@@ -1259,54 +1257,6 @@ of RCU grace periods.
<tr><td>&nbsp;</td></tr>
</table>
<h3><a name="Disabling Preemption Does Not Block Grace Periods">
Disabling Preemption Does Not Block Grace Periods</a></h3>
<p>
There was a time when disabling preemption on any given CPU would block
subsequent grace periods.
However, this was an accident of implementation and is not a requirement.
And in the current Linux-kernel implementation, disabling preemption
on a given CPU in fact does not block grace periods, as Oleg Nesterov
<a href="https://lkml.kernel.org/g/20150614193825.GA19582@redhat.com">demonstrated</a>.
<p>
If you need a preempt-disable region to block grace periods, you need to add
<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>, for example
as follows:
<blockquote>
<pre>
1 preempt_disable();
2 rcu_read_lock();
3 do_something();
4 rcu_read_unlock();
5 preempt_enable();
6
7 /* Spinlocks implicitly disable preemption. */
8 spin_lock(&amp;mylock);
9 rcu_read_lock();
10 do_something();
11 rcu_read_unlock();
12 spin_unlock(&amp;mylock);
</pre>
</blockquote>
<p>
In theory, you could enter the RCU read-side critical section first,
but it is more efficient to keep the entire RCU read-side critical
section contained in the preempt-disable region as shown above.
Of course, RCU read-side critical sections that extend outside of
preempt-disable regions will work correctly, but such critical sections
can be preempted, which forces <tt>rcu_read_unlock()</tt> to do
more work.
And no, this is <i>not</i> an invitation to enclose all of your RCU
read-side critical sections within preempt-disable regions, because
doing so would degrade real-time response.
<p>
This non-requirement appeared with preemptible RCU.
<h2><a name="Parallelism Facts of Life">Parallelism Facts of Life</a></h2>
<p>
@@ -1381,6 +1331,7 @@ Classes of quality-of-implementation requirements are as follows:
<ol>
<li> <a href="#Specialization">Specialization</a>
<li> <a href="#Performance and Scalability">Performance and Scalability</a>
<li> <a href="#Forward Progress">Forward Progress</a>
<li> <a href="#Composability">Composability</a>
<li> <a href="#Corner Cases">Corner Cases</a>
</ol>
@@ -1645,7 +1596,7 @@ used in place of <tt>synchronize_rcu()</tt> as follows:
16 struct foo *p;
17
18 spin_lock(&amp;gp_lock);
19 p = rcu_dereference(gp);
19 p = rcu_access_pointer(gp);
20 if (!p) {
21 spin_unlock(&amp;gp_lock);
22 return false;
@@ -1822,6 +1773,106 @@ so it is too early to tell whether they will stand the test of time.
RCU thus provides a range of tools to allow updaters to strike the
required tradeoff between latency, flexibility and CPU overhead.
<h3><a name="Forward Progress">Forward Progress</a></h3>
<p>
In theory, delaying grace-period completion and callback invocation
is harmless.
In practice, not only are memory sizes finite but also callbacks sometimes
do wakeups, and sufficiently deferred wakeups can be difficult
to distinguish from system hangs.
Therefore, RCU must provide a number of mechanisms to promote forward
progress.
<p>
These mechanisms are not foolproof, nor can they be.
For one simple example, an infinite loop in an RCU read-side critical
section must by definition prevent later grace periods from ever completing.
For a more involved example, consider a 64-CPU system built with
<tt>CONFIG_RCU_NOCB_CPU=y</tt> and booted with <tt>rcu_nocbs=1-63</tt>,
where CPUs&nbsp;1 through&nbsp;63 spin in tight loops that invoke
<tt>call_rcu()</tt>.
Even if these tight loops also contain calls to <tt>cond_resched()</tt>
(thus allowing grace periods to complete), CPU&nbsp;0 simply will
not be able to invoke callbacks as fast as the other 63 CPUs can
register them, at least not until the system runs out of memory.
In both of these examples, the Spiderman principle applies: With great
power comes great responsibility.
However, short of this level of abuse, RCU is required to
ensure timely completion of grace periods and timely invocation of
callbacks.
<p>
RCU takes the following steps to encourage timely completion of
grace periods:
<ol>
<li> If a grace period fails to complete within 100&nbsp;milliseconds,
RCU causes future invocations of <tt>cond_resched()</tt> on
the holdout CPUs to provide an RCU quiescent state.
RCU also causes those CPUs' <tt>need_resched()</tt> invocations
to return <tt>true</tt>, but only after the corresponding CPU's
next scheduling-clock interrupt.
<li> CPUs mentioned in the <tt>nohz_full</tt> kernel boot parameter
can run indefinitely in the kernel without scheduling-clock
interrupts, which defeats the above <tt>need_resched()</tt>
stratagem.
RCU will therefore invoke <tt>resched_cpu()</tt> on any
<tt>nohz_full</tt> CPUs still holding out after
109&nbsp;milliseconds.
<li> In kernels built with <tt>CONFIG_RCU_BOOST=y</tt>, if a given
task that has been preempted within an RCU read-side critical
section is holding out for more than 500&nbsp;milliseconds,
RCU will resort to priority boosting.
<li> If a CPU is still holding out 10&nbsp;seconds into the grace
period, RCU will invoke <tt>resched_cpu()</tt> on it regardless
of its <tt>nohz_full</tt> state.
</ol>
<p>
The above values are defaults for systems running with <tt>HZ=1000</tt>.
They will vary as the value of <tt>HZ</tt> varies, and can also be
changed using the relevant Kconfig options and kernel boot parameters.
RCU currently does not do much sanity checking of these
parameters, so please use caution when changing them.
Note that these forward-progress measures are provided only for RCU,
not for
<a href="#Sleepable RCU">SRCU</a> or
<a href="#Tasks RCU">Tasks RCU</a>.
<p>
RCU takes the following steps in <tt>call_rcu()</tt> to encourage timely
invocation of callbacks when any given non-<tt>rcu_nocbs</tt> CPU has
10,000 callbacks, or has 10,000 more callbacks than it had the last time
encouragement was provided:
<ol>
<li> Starts a grace period, if one is not already in progress.
<li> Forces immediate checking for quiescent states, rather than
waiting for three milliseconds to have elapsed since the
beginning of the grace period.
<li> Immediately tags the CPU's callbacks with their grace period
completion numbers, rather than waiting for the <tt>RCU_SOFTIRQ</tt>
handler to get around to it.
<li> Lifts callback-execution batch limits, which speeds up callback
invocation at the expense of degrading realtime response.
</ol>
<p>
Again, these are default values when running at <tt>HZ=1000</tt>,
and can be overridden.
Again, these forward-progress measures are provided only for RCU,
not for
<a href="#Sleepable RCU">SRCU</a> or
<a href="#Tasks RCU">Tasks RCU</a>.
Even for RCU, callback-invocation forward progress for <tt>rcu_nocbs</tt>
CPUs is much less well-developed, in part because workloads benefiting
from <tt>rcu_nocbs</tt> CPUs tend to invoke <tt>call_rcu()</tt>
relatively infrequently.
If workloads emerge that need both <tt>rcu_nocbs</tt> CPUs and high
<tt>call_rcu()</tt> invocation rates, then additional forward-progress
work will be required.
<h3><a name="Composability">Composability</a></h3>
<p>
@@ -2272,7 +2323,7 @@ that meets this requirement.
Furthermore, NMI handlers can be interrupted by what appear to RCU
to be normal interrupts.
One way that this can happen is for code that directly invokes
<tt>rcu_irq_enter()</tt> and </tt>rcu_irq_exit()</tt> to be called
<tt>rcu_irq_enter()</tt> and <tt>rcu_irq_exit()</tt> to be called
from an NMI handler.
This astonishing fact of life prompted the current code structure,
which has <tt>rcu_irq_enter()</tt> invoking <tt>rcu_nmi_enter()</tt>
@@ -2294,7 +2345,7 @@ via <tt>del_timer_sync()</tt> or similar.
<p>
Unfortunately, there is no way to cancel an RCU callback;
once you invoke <tt>call_rcu()</tt>, the callback function is
going to eventually be invoked, unless the system goes down first.
eventually going to be invoked, unless the system goes down first.
Because it is normally considered socially irresponsible to crash the system
in response to a module unload request, we need some other way
to deal with in-flight RCU callbacks.
@@ -2424,23 +2475,37 @@ for context-switch-heavy <tt>CONFIG_NO_HZ_FULL=y</tt> workloads,
but there is room for further improvement.
<p>
In the past, it was forbidden to disable interrupts across an
<tt>rcu_read_unlock()</tt> unless that interrupt-disabled region
of code also included the matching <tt>rcu_read_lock()</tt>.
Violating this restriction could result in deadlocks involving the
scheduler's runqueue and priority-inheritance spinlocks.
This restriction was lifted when interrupt-disabled calls to
<tt>rcu_read_unlock()</tt> started deferring the reporting of
the resulting RCU-preempt quiescent state until the end of that
It is forbidden to hold any of scheduler's runqueue or priority-inheritance
spinlocks across an <tt>rcu_read_unlock()</tt> unless interrupts have been
disabled across the entire RCU read-side critical section, that is,
up to and including the matching <tt>rcu_read_lock()</tt>.
Violating this restriction can result in deadlocks involving these
scheduler spinlocks.
There was hope that this restriction might be lifted when interrupt-disabled
calls to <tt>rcu_read_unlock()</tt> started deferring the reporting of
the resulting RCU-preempt quiescent state until the end of the corresponding
interrupts-disabled region.
This deferred reporting means that the scheduler's runqueue and
priority-inheritance locks cannot be held while reporting an RCU-preempt
quiescent state, which lifts the earlier restriction, at least from
a deadlock perspective.
Unfortunately, real-time systems using RCU priority boosting may
Unfortunately, timely reporting of the corresponding quiescent state
to expedited grace periods requires a call to <tt>raise_softirq()</tt>,
which can acquire these scheduler spinlocks.
In addition, real-time systems using RCU priority boosting
need this restriction to remain in effect because deferred
quiescent-state reporting also defers deboosting, which in turn
degrades real-time latencies.
quiescent-state reporting would also defer deboosting, which in turn
would degrade real-time latencies.
<p>
In theory, if a given RCU read-side critical section could be
guaranteed to be less than one second in duration, holding a scheduler
spinlock across that critical section's <tt>rcu_read_unlock()</tt>
would require only that preemption be disabled across the entire
RCU read-side critical section, not interrupts.
Unfortunately, given the possibility of vCPU preemption, long-running
interrupts, and so on, it is not possible in practice to guarantee
that a given RCU read-side critical section will complete in less than
one second.
Therefore, as noted above, if scheduler spinlocks are held across
a given call to <tt>rcu_read_unlock()</tt>, interrupts must be
disabled across the entire RCU read-side critical section.
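<p>A sketch of the only permitted shape of such code (the lock name
below is purely illustrative):
<blockquote>
<pre>
unsigned long flags;

local_irq_save(flags);		/* before the matching rcu_read_lock() */
rcu_read_lock();
raw_spin_lock(&amp;some_scheduler_or_pi_lock);
rcu_read_unlock();		/* legal only because irqs are already off */
raw_spin_unlock(&amp;some_scheduler_or_pi_lock);
local_irq_restore(flags);
</pre>
</blockquote>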
<h3><a name="Tracing and RCU">Tracing and RCU</a></h3>
@@ -3034,7 +3099,7 @@ If you block forever in one of a given domain's SRCU read-side critical
sections, then that domain's grace periods will also be blocked forever.
Of course, one good way to block forever is to deadlock, which can
happen if any operation in a given domain's SRCU read-side critical
section can block waiting, either directly or indirectly, for that domain's
section can wait, either directly or indirectly, for that domain's
grace period to elapse.
For example, this results in a self-deadlock:
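<p>One hedged sketch of such a self-deadlock (the <tt>srcu_struct</tt>
name is assumed, and the document's own example may differ in detail):
<blockquote>
<pre>
idx = srcu_read_lock(&amp;my_srcu);
synchronize_srcu(&amp;my_srcu);	/* waits on the reader we are inside */
srcu_read_unlock(&amp;my_srcu, idx);
</pre>
</blockquote>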
@@ -3074,12 +3139,18 @@ API, which, in combination with <tt>srcu_read_unlock()</tt>,
guarantees a full memory barrier.
<p>
Also unlike other RCU flavors, SRCU's callbacks-wait function
<tt>srcu_barrier()</tt> may be invoked from CPU-hotplug notifiers,
though this is not necessarily a good idea.
The reason that this is possible is that SRCU is insensitive
to whether or not a CPU is online, which means that <tt>srcu_barrier()</tt>
need not exclude CPU-hotplug operations.
Also unlike other RCU flavors, <tt>synchronize_srcu()</tt> may <b>not</b>
be invoked from CPU-hotplug notifiers, due to the fact that SRCU grace
periods make use of timers and the possibility of timers being temporarily
&ldquo;stranded&rdquo; on the outgoing CPU.
This stranding of timers means that timers posted to the outgoing CPU
will not fire until late in the CPU-hotplug process.
The problem is that if a notifier is waiting on an SRCU grace period,
that grace period is waiting on a timer, and that timer is stranded on the
outgoing CPU, then the notifier will never be awakened, in other words,
deadlock has occurred.
This same situation of course also prohibits <tt>srcu_barrier()</tt>
from being invoked from CPU-hotplug notifiers.
<p>
SRCU also differs from other RCU flavors in that SRCU's expedited and
@@ -3233,6 +3304,11 @@ For example, RCU callback overhead might be charged back to the
originating <tt>call_rcu()</tt> instance, though probably not
in production kernels.
<p>
Additional work may be required to provide reasonable forward-progress
guarantees under heavy load for grace periods and for callback
invocation.
<h2><a name="Summary">Summary</a></h2>
<p>

View File

@@ -63,7 +63,7 @@ over a rather long period of time, but improvements are always welcome!
pointer must be covered by rcu_read_lock(), rcu_read_lock_bh(),
rcu_read_lock_sched(), or by the appropriate update-side lock.
Disabling of preemption can serve as rcu_read_lock_sched(), but
is less readable.
is less readable and prevents lockdep from detecting locking issues.
Letting RCU-protected pointers "leak" out of an RCU read-side
critical section is every bit as bad as letting them leak out
@@ -285,11 +285,7 @@ over a rather long period of time, but improvements are always welcome!
here is that superuser already has lots of ways to crash
the machine.
d. Use call_rcu_bh() rather than call_rcu(), in order to take
advantage of call_rcu_bh()'s faster grace periods. (This
is only a partial solution, though.)
e. Periodically invoke synchronize_rcu(), permitting a limited
d. Periodically invoke synchronize_rcu(), permitting a limited
number of updates per grace period.
The same cautions apply to call_rcu_bh(), call_rcu_sched(),
@@ -324,37 +320,14 @@ over a rather long period of time, but improvements are always welcome!
will break Alpha, cause aggressive compilers to generate bad code,
and confuse people trying to read your code.
11. Note that synchronize_rcu() -only- guarantees to wait until
all currently executing rcu_read_lock()-protected RCU read-side
critical sections complete. It does -not- necessarily guarantee
that all currently running interrupts, NMIs, preempt_disable()
code, or idle loops will complete. Therefore, if your
read-side critical sections are protected by something other
than rcu_read_lock(), do -not- use synchronize_rcu().
Similarly, disabling preemption is not an acceptable substitute
for rcu_read_lock(). Code that attempts to use preemption
disabling where it should be using rcu_read_lock() will break
in CONFIG_PREEMPT=y kernel builds.
If you want to wait for interrupt handlers, NMI handlers, and
code under the influence of preempt_disable(), you instead
need to use synchronize_irq() or synchronize_sched().
This same limitation also applies to synchronize_rcu_bh()
and synchronize_srcu(), as well as to the asynchronous and
expedited forms of the three primitives, namely call_rcu(),
call_rcu_bh(), call_srcu(), synchronize_rcu_expedited(),
synchronize_rcu_bh_expedited(), and synchronize_srcu_expedited().
12. Any lock acquired by an RCU callback must be acquired elsewhere
11. Any lock acquired by an RCU callback must be acquired elsewhere
with softirq disabled, e.g., via spin_lock_irqsave(),
spin_lock_bh(), etc. Failing to disable irq on a given
acquisition of that lock will result in deadlock as soon as
the RCU softirq handler happens to run your RCU callback while
interrupting that acquisition's critical section.
13. RCU callbacks can be and are executed in parallel. In many cases,
12. RCU callbacks can be and are executed in parallel. In many cases,
the callback code simply wrappers around kfree(), so that this
is not an issue (or, more accurately, to the extent that it is
an issue, the memory-allocator locking handles it). However,
@@ -370,7 +343,7 @@ over a rather long period of time, but improvements are always welcome!
not the case, a self-spawning RCU callback would prevent the
victim CPU from ever going offline.)
14. Unlike other forms of RCU, it -is- permissible to block in an
13. Unlike other forms of RCU, it -is- permissible to block in an
SRCU read-side critical section (demarked by srcu_read_lock()
and srcu_read_unlock()), hence the "SRCU": "sleepable RCU".
Please note that if you don't need to sleep in read-side critical
@@ -414,7 +387,7 @@ over a rather long period of time, but improvements are always welcome!
Note that rcu_dereference() and rcu_assign_pointer() relate to
SRCU just as they do to other forms of RCU.
15. The whole point of call_rcu(), synchronize_rcu(), and friends
14. The whole point of call_rcu(), synchronize_rcu(), and friends
is to wait until all pre-existing readers have finished before
carrying out some otherwise-destructive operation. It is
therefore critically important to -first- remove any path
@@ -426,13 +399,13 @@ over a rather long period of time, but improvements are always welcome!
is the caller's responsibility to guarantee that any subsequent
readers will execute safely.
16. The various RCU read-side primitives do -not- necessarily contain
15. The various RCU read-side primitives do -not- necessarily contain
memory barriers. You should therefore plan for the CPU
and the compiler to freely reorder code into and out of RCU
read-side critical sections. It is the responsibility of the
RCU update-side primitives to deal with this.
17. Use CONFIG_PROVE_LOCKING, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the
16. Use CONFIG_PROVE_LOCKING, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the
__rcu sparse checks to validate your RCU code. These can help
find problems as follows:
@@ -455,7 +428,7 @@ over a rather long period of time, but improvements are always welcome!
These debugging aids can help you find problems that are
otherwise extremely difficult to spot.
18. If you register a callback using call_rcu(), call_rcu_bh(),
17. If you register a callback using call_rcu(), call_rcu_bh(),
call_rcu_sched(), or call_srcu(), and pass in a function defined
within a loadable module, then it is necessary to wait for
all pending callbacks to be invoked after the last invocation
@@ -469,8 +442,8 @@ over a rather long period of time, but improvements are always welcome!
You instead need to use one of the barrier functions:
o call_rcu() -> rcu_barrier()
o call_rcu_bh() -> rcu_barrier_bh()
o call_rcu_sched() -> rcu_barrier_sched()
o call_rcu_bh() -> rcu_barrier()
o call_rcu_sched() -> rcu_barrier()
o call_srcu() -> srcu_barrier()
However, these barrier functions are absolutely -not- guaranteed

View File

@@ -14,9 +14,9 @@ being the real world and all that.
So let's look at an example RCU lockdep splat from 3.0-rc5, one that
has long since been fixed:
===============================
[ INFO: suspicious RCU usage. ]
-------------------------------
=============================
WARNING: suspicious RCU usage
-----------------------------
block/cfq-iosched.c:2776 suspicious rcu_dereference_protected() usage!
other info that might help us debug this:
@@ -24,11 +24,11 @@ other info that might help us debug this:
rcu_scheduler_active = 1, debug_locks = 0
3 locks held by scsi_scan_6/1552:
#0: (&shost->scan_mutex){+.+.+.}, at: [<ffffffff8145efca>]
#0: (&shost->scan_mutex){+.+.}, at: [<ffffffff8145efca>]
scsi_scan_host_selected+0x5a/0x150
#1: (&eq->sysfs_lock){+.+...}, at: [<ffffffff812a5032>]
#1: (&eq->sysfs_lock){+.+.}, at: [<ffffffff812a5032>]
elevator_exit+0x22/0x60
#2: (&(&q->__queue_lock)->rlock){-.-...}, at: [<ffffffff812b6233>]
#2: (&(&q->__queue_lock)->rlock){-.-.}, at: [<ffffffff812b6233>]
cfq_exit_queue+0x43/0x190
stack backtrace:

View File

@@ -176,9 +176,8 @@ causing stalls, and that the stall was affecting RCU-sched. This message
will normally be followed by stack dumps for each CPU. Please note that
PREEMPT_RCU builds can be stalled by tasks as well as by CPUs, and that
the tasks will be indicated by PID, for example, "P3421". It is even
possible for a rcu_preempt_state stall to be caused by both CPUs -and-
tasks, in which case the offending CPUs and tasks will all be called
out in the list.
possible for an rcu_state stall to be caused by both CPUs -and- tasks,
in which case the offending CPUs and tasks will all be called out in the list.
CPU 2's "(3 GPs behind)" indicates that this CPU has not interacted with
the RCU core for the past three grace periods. In contrast, CPU 16's "(0
@@ -206,7 +205,7 @@ handlers are no longer able to execute on this CPU. This can happen if
the stalled CPU is spinning with interrupts disabled, or, in -rt
kernels, if a high-priority process is starving RCU's softirq handler.
The "fps=" shows the number of force-quiescent-state idle/offline
The "fqs=" shows the number of force-quiescent-state idle/offline
detection passes that the grace-period kthread has made across this
CPU since the last time that this CPU noted the beginning of a grace
period.
@@ -220,17 +219,18 @@ an estimate of the total number of RCU callbacks queued across all CPUs
In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed
for each CPU:
0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 nonlazy_posted: 25 .D
0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 Nonlazy posted: ..D
The "last_accelerate:" prints the low-order 16 bits (in hex) of the
jiffies counter when this CPU last invoked rcu_try_advance_all_cbs()
from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from
rcu_prepare_for_idle(). The "nonlazy_posted:" prints the number
of non-lazy callbacks posted since the last call to rcu_needs_cpu().
Finally, an "L" indicates that there are currently no non-lazy callbacks
("." is printed otherwise, as shown above) and "D" indicates that
dyntick-idle processing is enabled ("." is printed otherwise, for example,
if disabled via the "nohz=" kernel boot parameter).
rcu_prepare_for_idle(). The "Nonlazy posted:" indicates lazy-callback
status, so that an "l" indicates that all callbacks were lazy at the start
of the last idle period and an "L" indicates that there are currently
no non-lazy callbacks (in both cases, "." is printed otherwise, as
shown above) and "D" indicates that dyntick-idle processing is enabled
("." is printed otherwise, for example, if disabled via the "nohz="
kernel boot parameter).
If the grace period ends just as the stall warning starts printing,
there will be a spurious stall-warning message, which will include

View File

@@ -10,173 +10,8 @@ status messages via printk(), which can be examined via the dmesg
command (perhaps grepping for "torture"). The test is started
when the module is loaded, and stops when the module is unloaded.
MODULE PARAMETERS
This module has the following parameters:
fqs_duration Duration (in microseconds) of artificially induced bursts
of force_quiescent_state() invocations. In RCU
implementations having force_quiescent_state(), these
bursts help force races between forcing a given grace
period and that grace period ending on its own.
fqs_holdoff Holdoff time (in microseconds) between consecutive calls
to force_quiescent_state() within a burst.
fqs_stutter Wait time (in seconds) between consecutive bursts
of calls to force_quiescent_state().
gp_normal Make the fake writers use normal synchronous grace-period
primitives.
gp_exp Make the fake writers use expedited synchronous grace-period
primitives. If both gp_normal and gp_exp are set, or
if neither gp_normal nor gp_exp are set, then randomly
choose the primitive so that about 50% are normal and
50% expedited. By default, neither are set, which
gives best overall test coverage.
irqreader Says to invoke RCU readers from irq level. This is currently
done via timers. Defaults to "1" for variants of RCU that
permit this. (Or, more accurately, variants of RCU that do
-not- permit this know to ignore this variable.)
n_barrier_cbs If this is nonzero, RCU barrier testing will be conducted,
in which case n_barrier_cbs specifies the number of
RCU callbacks (and corresponding kthreads) to use for
this testing. The value cannot be negative. If you
specify this to be non-zero when torture_type indicates a
synchronous RCU implementation (one for which a member of
the synchronize_rcu() rather than the call_rcu() family is
used -- see the documentation for torture_type below), an
error will be reported and no testing will be carried out.
nfakewriters This is the number of RCU fake writer threads to run. Fake
writer threads repeatedly use the synchronous "wait for
current readers" function of the interface selected by
torture_type, with a delay between calls to allow for various
different numbers of writers running in parallel.
nfakewriters defaults to 4, which provides enough parallelism
to trigger special cases caused by multiple writers, such as
the synchronize_srcu() early return optimization.
nreaders This is the number of RCU reading threads supported.
The default is twice the number of CPUs. Why twice?
To properly exercise RCU implementations with preemptible
read-side critical sections.
onoff_interval
The number of seconds between each attempt to execute a
randomly selected CPU-hotplug operation. Defaults to
zero, which disables CPU hotplugging. In HOTPLUG_CPU=n
kernels, rcutorture will silently refuse to do any
CPU-hotplug operations regardless of what value is
specified for onoff_interval.
onoff_holdoff The number of seconds to wait until starting CPU-hotplug
operations. This would normally only be used when
rcutorture was built into the kernel and started
automatically at boot time, in which case it is useful
in order to avoid confusing boot-time code with CPUs
coming and going.
shuffle_interval
The number of seconds to keep the test threads affinitied
to a particular subset of the CPUs, defaults to 3 seconds.
Used in conjunction with test_no_idle_hz.
shutdown_secs The number of seconds to run the test before terminating
the test and powering off the system. The default is
zero, which disables test termination and system shutdown.
This capability is useful for automated testing.
stall_cpu The number of seconds that a CPU should be stalled while
within both an rcu_read_lock() and a preempt_disable().
This stall happens only once per rcutorture run.
If you need multiple stalls, use modprobe and rmmod to
repeatedly run rcutorture. The default for stall_cpu
is zero, which prevents rcutorture from stalling a CPU.
Note that attempts to rmmod rcutorture while the stall
is ongoing will hang, so be careful what value you
choose for this module parameter! In addition, too-large
values for stall_cpu might well induce failures and
warnings in other parts of the kernel. You have been
warned!
stall_cpu_holdoff
The number of seconds to wait after rcutorture starts
before stalling a CPU. Defaults to 10 seconds.
stat_interval The number of seconds between output of torture
statistics (via printk()). Regardless of the interval,
statistics are printed when the module is unloaded.
Setting the interval to zero causes the statistics to
be printed -only- when the module is unloaded, and this
is the default.
stutter The length of time to run the test before pausing for this
same period of time. Defaults to "stutter=5", so as
to run and pause for (roughly) five-second intervals.
Specifying "stutter=0" causes the test to run continuously
without pausing, which is the old default behavior.
test_boost Whether or not to test the ability of RCU to do priority
boosting. Defaults to "test_boost=1", which performs
RCU priority-inversion testing only if the selected
RCU implementation supports priority boosting. Specifying
"test_boost=0" never performs RCU priority-inversion
testing. Specifying "test_boost=2" performs RCU
priority-inversion testing even if the selected RCU
implementation does not support RCU priority boosting,
which can be used to test rcutorture's ability to
carry out RCU priority-inversion testing.
test_boost_interval
The number of seconds in an RCU priority-inversion test
cycle. Defaults to "test_boost_interval=7". It is
usually wise for this value to be relatively prime to
the value selected for "stutter".
test_boost_duration
The number of seconds to do RCU priority-inversion testing
within any given "test_boost_interval". Defaults to
"test_boost_duration=4".
test_no_idle_hz Whether or not to test the ability of RCU to operate in
a kernel that disables the scheduling-clock interrupt to
idle CPUs. Boolean parameter, "1" to test, "0" otherwise.
Defaults to omitting this test.
torture_type The type of RCU to test, with string values as follows:
"rcu": rcu_read_lock(), rcu_read_unlock() and call_rcu(),
along with expedited, synchronous, and polling
variants.
"rcu_bh": rcu_read_lock_bh(), rcu_read_unlock_bh(), and
call_rcu_bh(), along with expedited and synchronous
variants.
"rcu_busted": This tests an intentionally incorrect version
of RCU in order to help test rcutorture itself.
"srcu": srcu_read_lock(), srcu_read_unlock() and
call_srcu(), along with expedited and
synchronous variants.
"sched": preempt_disable(), preempt_enable(), and
call_rcu_sched(), along with expedited,
synchronous, and polling variants.
"tasks": voluntary context switch and call_rcu_tasks(),
along with expedited and synchronous variants.
Defaults to "rcu".
verbose Enable debug printk()s. Default is disabled.
Module parameters are prefixed by "rcutorture." in
Documentation/admin-guide/kernel-parameters.txt.
OUTPUT

View File

@@ -266,7 +266,7 @@ rcu_dereference()
unnecessary overhead on Alpha CPUs.
Note that the value returned by rcu_dereference() is valid
only within the enclosing RCU read-side critical section.
only within the enclosing RCU read-side critical section [1].
For example, the following is -not- legal:
rcu_read_lock();
@@ -292,6 +292,19 @@ rcu_dereference()
typically used indirectly, via the _rcu list-manipulation
primitives, such as list_for_each_entry_rcu().
[1] The variant rcu_dereference_protected() can be used outside
of an RCU read-side critical section as long as the usage is
protected by locks acquired by the update-side code. This variant
avoids the lockdep warning that would happen when using (for
example) rcu_dereference() without rcu_read_lock() protection.
Using rcu_dereference_protected() also has the advantage
of permitting compiler optimizations that rcu_dereference()
must prohibit. The rcu_dereference_protected() variant takes
a lockdep expression to indicate which locks must be acquired
by the caller. If the indicated protection is not provided,
a lockdep splat is emitted. See RCU/Design/Requirements/Requirements.html
and the API's code comments for more details and example usage.
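For example (a sketch only; "gp" and "gp_lock" are illustrative names
for an RCU-protected pointer and the update-side lock guarding it):

	p = rcu_dereference_protected(gp, lockdep_is_held(&gp_lock));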
The following diagram shows how each API communicates among the
reader, updater, and reclaimer.
@@ -322,28 +335,27 @@ to their callers and (2) call_rcu() callbacks may be invoked. Efficient
implementations of the RCU infrastructure make heavy use of batching in
order to amortize their overhead over many uses of the corresponding APIs.
There are no fewer than three RCU mechanisms in the Linux kernel; the
diagram above shows the first one, which is by far the most commonly used.
The rcu_dereference() and rcu_assign_pointer() primitives are used for
all three mechanisms, but different defer and protect primitives are
used as follows:
There are at least three flavors of RCU usage in the Linux kernel. The diagram
above shows the most common one. On the updater side, the rcu_assign_pointer(),
synchronize_rcu() and call_rcu() primitives used are the same for all three
flavors. However, for protection (on the reader side), the primitives used vary
depending on the flavor:
Defer Protect
a. rcu_read_lock() / rcu_read_unlock()
rcu_dereference()
a. synchronize_rcu() rcu_read_lock() / rcu_read_unlock()
call_rcu() rcu_dereference()
b. rcu_read_lock_bh() / rcu_read_unlock_bh()
local_bh_disable() / local_bh_enable()
rcu_dereference_bh()
b. synchronize_rcu_bh() rcu_read_lock_bh() / rcu_read_unlock_bh()
call_rcu_bh() rcu_dereference_bh()
c. synchronize_sched() rcu_read_lock_sched() / rcu_read_unlock_sched()
call_rcu_sched() preempt_disable() / preempt_enable()
c. rcu_read_lock_sched() / rcu_read_unlock_sched()
preempt_disable() / preempt_enable()
local_irq_save() / local_irq_restore()
hardirq enter / hardirq exit
NMI enter / NMI exit
rcu_dereference_sched()
These three mechanisms are used as follows:
These three flavors are used as follows:
a. RCU applied to normal data structures.
@@ -548,7 +560,7 @@ presents two such "toy" implementations of RCU, one that is implemented
in terms of familiar locking primitives, and another that more closely
resembles "classic" RCU. Both are way too simple for real-world use,
lacking both functionality and performance. However, they are useful
in getting a feel for how RCU works. See kernel/rcupdate.c for a
in getting a feel for how RCU works. See kernel/rcu/update.c for a
production-quality implementation, and see:
http://www.rdrop.com/users/paulmck/RCU
@@ -867,18 +879,20 @@ RCU: Critical sections Grace period Barrier
bh: Critical sections Grace period Barrier
rcu_read_lock_bh call_rcu_bh rcu_barrier_bh
rcu_read_unlock_bh synchronize_rcu_bh
rcu_dereference_bh synchronize_rcu_bh_expedited
rcu_read_lock_bh call_rcu rcu_barrier
rcu_read_unlock_bh synchronize_rcu
[local_bh_disable] synchronize_rcu_expedited
[and friends]
rcu_dereference_bh
rcu_dereference_bh_check
rcu_dereference_bh_protected
rcu_read_lock_bh_held
sched: Critical sections Grace period Barrier
rcu_read_lock_sched synchronize_sched rcu_barrier_sched
rcu_read_unlock_sched call_rcu_sched
[preempt_disable] synchronize_sched_expedited
rcu_read_lock_sched call_rcu rcu_barrier
rcu_read_unlock_sched synchronize_rcu
[preempt_disable] synchronize_rcu_expedited
[and friends]
rcu_read_lock_sched_notrace
rcu_read_unlock_sched_notrace
@@ -890,8 +904,8 @@ sched: Critical sections Grace period Barrier
SRCU: Critical sections Grace period Barrier
srcu_read_lock synchronize_srcu srcu_barrier
srcu_read_unlock call_srcu
srcu_read_lock call_srcu srcu_barrier
srcu_read_unlock synchronize_srcu
srcu_dereference synchronize_srcu_expedited
srcu_dereference_check
srcu_read_lock_held
@@ -1034,7 +1048,7 @@ Answer: Just as PREEMPT_RT permits preemption of spinlock
spinlocks blocking while in RCU read-side critical
sections.
Why the apparent inconsistency? Because it is it
Why the apparent inconsistency? Because it is
possible to use priority boosting to keep the RCU
grace periods short if need be (for example, if running
short of memory). In contrast, if blocking waiting

View File

@@ -56,12 +56,12 @@ situation from a state where some tasks are stalled but the CPU is
still doing productive work. As such, time spent in this subset of the
stall state is tracked separately and exported in the "full" averages.
The ratios are tracked as recent trends over ten, sixty, and three
hundred second windows, which gives insight into short term events as
well as medium and long term trends. The total absolute stall time is
tracked and exported as well, to allow detection of latency spikes
which wouldn't necessarily make a dent in the time averages, or to
average trends over custom time frames.
The ratios (in %) are tracked as recent trends over ten, sixty, and
three hundred second windows, which gives insight into short term events
as well as medium and long term trends. The total absolute stall time
(in us) is tracked and exported as well, to allow detection of latency
spikes which wouldn't necessarily make a dent in the time averages,
or to average trends over custom time frames.
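For illustration only, the exported averages and totals can be read from the
pressure files under /proc/pressure/; a trivial sketch::
#include <stdio.h>

/* Minimal sketch: dump the memory pressure averages and totals. */
int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/pressure/memory", "r");

	if (!f) {
		perror("/proc/pressure/memory");
		return 1;
	}
	/* Lines look like:
	 * some avg10=0.00 avg60=0.00 avg300=0.00 total=0
	 * full avg10=0.00 avg60=0.00 avg300=0.00 total=0
	 */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}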
Cgroup2 interface
=================

View File

@@ -23,7 +23,7 @@ kernel.
The resultant userspace tool binary is then located at:
tools/acpi/power/acpi/acpidbg/acpidbg
tools/power/acpi/acpidbg
It can be installed to system directories by running "make install" (as a
sufficiently privileged user).
@@ -35,7 +35,7 @@ kernel.
# mount -t debugfs none /sys/kernel/debug
# modprobe acpi_dbg
# tools/acpi/power/acpi/acpidbg/acpidbg
# tools/power/acpi/acpidbg
That spawns the interactive AML debugger environment where you can execute
debugger commands.

View File

@@ -14,6 +14,10 @@ upgrade the ACPI execution environment that is defined by the ACPI tables
via upgrading the ACPI tables provided by the BIOS with an instrumented,
modified, more recent version, or installing brand new ACPI tables.
When the initrd is built into the kernel as a single image, the
ACPI_TABLE_OVERRIDE_VIA_BUILTIN_INITRD option should also be enabled for
this feature to work.
For a full list of ACPI tables that can be upgraded/installed, take a look
at the char *table_sigs[MAX_ACPI_SIGNATURE]; definition in
drivers/acpi/tables.c.

View File

@@ -6,7 +6,7 @@ If you want to use SELinux, chances are you will want
to use the distro-provided policies, or install the
latest reference policy release from
http://oss.tresys.com/projects/refpolicy
https://github.com/SELinuxProject/refpolicy
However, if you want to install a dummy policy for
testing, you can do so using ``mdp`` provided under

View File

@@ -0,0 +1,107 @@
=========
SafeSetID
=========
SafeSetID is an LSM module that gates the setid family of syscalls to restrict
UID/GID transitions from a given UID/GID to only those approved by a
system-wide whitelist. These restrictions also prohibit the given UIDs/GIDs
from obtaining auxiliary privileges associated with CAP_SET{U/G}ID, such as
allowing a user to set up user namespace UID mappings.
Background
==========
In the absence of file capabilities, processes spawned on a Linux system that need
to switch to a different user must be spawned with CAP_SETUID privileges.
CAP_SETUID is granted to programs running as root or those running as a non-root
user that have been explicitly given the CAP_SETUID runtime capability. It is
often preferable to use Linux runtime capabilities rather than file
capabilities, since using file capabilities to run a program with elevated
privileges opens up possible security holes, as any user with access to the
file can exec() that program to gain the elevated privileges.
While it is possible to implement a tree of processes by giving full
CAP_SET{U/G}ID capabilities, this is often at odds with the goals of running a
tree of processes under non-root user(s) in the first place. Specifically,
since CAP_SETUID allows changing to any user on the system, including the root
user, it is an overpowered capability for what is needed in this scenario,
especially since programs often only call setuid() to drop privileges to a
lesser-privileged user -- not elevate privileges. Unfortunately, there is no
generally feasible way in Linux to restrict the potential UIDs that a user can
switch to through setuid() beyond allowing a switch to any user on the system.
This SafeSetID LSM seeks to provide a solution for restricting setid
capabilities in such a way.
The main use case for this LSM is to allow a non-root program to transition to
other untrusted uids without full blown CAP_SETUID capabilities. The non-root
program would still need CAP_SETUID to do any kind of transition, but the
additional restrictions imposed by this LSM would mean it is a "safer" version
of CAP_SETUID since the non-root program cannot take advantage of CAP_SETUID to
do any unapproved actions (e.g. setuid to uid 0 or create/enter new user
namespace). The higher level goal is to allow for uid-based sandboxing of system
services without having to give out CAP_SETUID all over the place just so that
non-root programs can drop to even-lesser-privileged uids. This is especially
relevant when one non-root daemon on the system should be allowed to spawn other
processes as different uids, but it's undesirable to give the daemon a
basically-root-equivalent CAP_SETUID.
Other Approaches Considered
===========================
Solve this problem in userspace
-------------------------------
For candidate applications that would like to have restricted setid capabilities
as implemented in this LSM, an alternative option would be to simply take away
setid capabilities from the application completely and refactor the process
spawning semantics in the application (e.g. by using a privileged helper program
to do process spawning and UID/GID transitions). Unfortunately, there are a
number of semantics around process spawning that would be affected by this, such
as fork() calls where the program doesn't immediately call exec() after the
fork(), parent processes specifying custom environment variables or command line
args for spawned child processes, or inheritance of file handles across a
fork()/exec(). Because of this, a solution that uses a privileged helper in
userspace would likely be less appealing to incorporate into existing projects
that rely on certain process-spawning semantics in Linux.
Use user namespaces
-------------------
Another possible approach would be to run a given process tree in its own user
namespace and give programs in the tree setid capabilities. In this way,
programs in the tree could change to any desired UID/GID in the context of their
own user namespace, and only approved UIDs/GIDs could be mapped back to the
initial system user namespace, effectively preventing privilege escalation.
Unfortunately, it is not generally feasible to use user namespaces in isolation,
without pairing them with other namespace types, which is not always an option.
Linux checks for capabilities based off of the user namespace that "owns" some
entity. For example, Linux has the notion that network namespaces are owned by
the user namespace in which they were created. A consequence of this is that
capability checks for access to a given network namespace are done by checking
whether a task has the given capability in the context of the user namespace
that owns the network namespace -- not necessarily the user namespace under
which the given task runs. Therefore spawning a process in a new user namespace
effectively prevents it from accessing the network namespace owned by the
initial namespace. This is a deal-breaker for any application that expects to
retain the CAP_NET_ADMIN capability for the purpose of adjusting network
configurations. Using user namespaces in isolation causes problems regarding
other system interactions, including use of pid namespaces and device creation.
Use an existing LSM
-------------------
None of the other in-tree LSMs have the capability to gate setid transitions, or
even employ the security_task_fix_setuid hook at all. SELinux says of that hook:
"Since setuid only affects the current process, and since the SELinux controls
are not based on the Linux identity attributes, SELinux does not need to control
this operation."
Directions for use
==================
This LSM hooks the setid syscalls to make sure transitions are allowed if an
applicable restriction policy is in place. Policies are configured through
securityfs by writing to the safesetid/add_whitelist_policy and
safesetid/flush_whitelist_policies files at the location where securityfs is
mounted. The format for adding a policy is '<UID>:<UID>', using literal
numbers, such as '123:456'. To flush the policies, any write to the file is
sufficient. Again, configuring a policy for a UID will prevent that UID from
obtaining auxiliary setid privileges, such as allowing a user to set up user
namespace UID mappings.
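As a sketch of the above (assuming securityfs is mounted at
/sys/kernel/security; the UID values are only examples), a policy allowing
UID 123 to transition to UID 456 could be installed programmatically::
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Path assumes securityfs mounted at /sys/kernel/security. */
	const char *path =
		"/sys/kernel/security/safesetid/add_whitelist_policy";
	const char *policy = "123:456";	/* allow transitions from 123 to 456 */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return 1;
	}
	if (write(fd, policy, strlen(policy)) < 0)
		perror("write");
	close(fd);
	return 0;
}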

View File

@@ -818,6 +818,10 @@ Smack supports some mount options:
specifies a label to which all labels set on the
filesystem must have read access. Not yet enforced.
smackfstransmute=label:
behaves exactly like smackfsroot except that it also
sets the transmute flag on the root of the mount
These mount options apply to all file system types.
Smack auditing

View File

@@ -17,9 +17,8 @@ MAC extensions, other extensions can be built using the LSM to provide
specific changes to system operation when these tweaks are not available
in the core functionality of Linux itself.
Without a specific LSM built into the kernel, the default LSM will be the
Linux capabilities system. Most LSMs choose to extend the capabilities
system, building their checks on top of the defined capability hooks.
The Linux capabilities module will always be included. This may be
followed by any number of "minor" modules and at most one "major" module.
For more details on capabilities, see ``capabilities(7)`` in the Linux
man-pages project.
@@ -30,6 +29,14 @@ order in which checks are made. The capability module will always
be first, followed by any "minor" modules (e.g. Yama) and then
the one "major" module (e.g. SELinux) if there is one configured.
Process attributes associated with "major" security modules should
be accessed and maintained using the special files in ``/proc/.../attr``.
A security module may maintain a module specific subdirectory there,
named after the module. ``/proc/.../attr/smack`` is provided by the Smack
security module and contains all its special files. The files directly
in ``/proc/.../attr`` remain as legacy interfaces for modules that provide
subdirectories.
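For instance (a sketch only, assuming Smack is the active major module and
provides a ``current`` file in its subdirectory), a process could read its own
label from the module-specific path and fall back to the legacy file::
#include <stdio.h>

static void dump(const char *path)
{
	char label[256];
	FILE *f = fopen(path, "r");

	if (!f)
		return;
	if (fgets(label, sizeof(label), f))
		printf("%s: %s\n", path, label);
	fclose(f);
}

int main(void)
{
	dump("/proc/self/attr/smack/current");	/* module-specific interface */
	dump("/proc/self/attr/current");	/* legacy interface */
	return 0;
}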
.. toctree::
:maxdepth: 1
@@ -39,3 +46,4 @@ the one "major" module (e.g. SELinux) if there is one configured.
Smack
tomoyo
Yama
SafeSetID

View File

@@ -1,9 +1,9 @@
.. _readme:
Linux kernel release 4.x <http://kernel.org/>
Linux kernel release 5.x <http://kernel.org/>
=============================================
These are the release notes for Linux version 4. Read them carefully,
These are the release notes for Linux version 5. Read them carefully,
as they tell you what this is all about, explain how to install the
kernel, and what to do if something goes wrong.
@@ -63,7 +63,7 @@ Installing the kernel source
directory where you have permissions (e.g. your home directory) and
unpack it::
xz -cd linux-4.X.tar.xz | tar xvf -
xz -cd linux-5.x.tar.xz | tar xvf -
Replace "X" with the version number of the latest kernel.
@@ -72,26 +72,26 @@ Installing the kernel source
files. They should match the library, and not get messed up by
whatever the kernel-du-jour happens to be.
- You can also upgrade between 4.x releases by patching. Patches are
- You can also upgrade between 5.x releases by patching. Patches are
distributed in the xz format. To install by patching, get all the
newer patch files, enter the top level directory of the kernel source
(linux-4.X) and execute::
(linux-5.x) and execute::
xz -cd ../patch-4.x.xz | patch -p1
xz -cd ../patch-5.x.xz | patch -p1
Replace "x" for all versions bigger than the version "X" of your current
Replace "x" for all versions bigger than the version "x" of your current
source tree, **in_order**, and you should be ok. You may want to remove
the backup files (some-file-name~ or some-file-name.orig), and make sure
that there are no failed patches (some-file-name# or some-file-name.rej).
If there are, either you or I have made a mistake.
Unlike patches for the 4.x kernels, patches for the 4.x.y kernels
Unlike patches for the 5.x kernels, patches for the 5.x.y kernels
(also known as the -stable kernels) are not incremental but instead apply
directly to the base 4.x kernel. For example, if your base kernel is 4.0
and you want to apply the 4.0.3 patch, you must not first apply the 4.0.1
and 4.0.2 patches. Similarly, if you are running kernel version 4.0.2 and
want to jump to 4.0.3, you must first reverse the 4.0.2 patch (that is,
patch -R) **before** applying the 4.0.3 patch. You can read more on this in
directly to the base 5.x kernel. For example, if your base kernel is 5.0
and you want to apply the 5.0.3 patch, you must not first apply the 5.0.1
and 5.0.2 patches. Similarly, if you are running kernel version 5.0.2 and
want to jump to 5.0.3, you must first reverse the 5.0.2 patch (that is,
patch -R) **before** applying the 5.0.3 patch. You can read more on this in
:ref:`Documentation/process/applying-patches.rst <applying_patches>`.
Alternatively, the script patch-kernel can be used to automate this
@@ -114,7 +114,7 @@ Installing the kernel source
Software requirements
---------------------
Compiling and running the 4.x kernels requires up-to-date
Compiling and running the 5.x kernels requires up-to-date
versions of various software packages. Consult
:ref:`Documentation/process/changes.rst <changes>` for the minimum version numbers
required and how to get updates for these packages. Beware that using
@@ -132,12 +132,12 @@ Build directory for the kernel
place for the output files (including .config).
Example::
kernel source code: /usr/src/linux-4.X
kernel source code: /usr/src/linux-5.x
build directory: /home/name/build/kernel
To configure and build the kernel, use::
cd /usr/src/linux-4.X
cd /usr/src/linux-5.x
make O=/home/name/build/kernel menuconfig
make O=/home/name/build/kernel
sudo make O=/home/name/build/kernel modules_install install
@@ -251,7 +251,7 @@ Configuring the kernel
Compiling the kernel
--------------------
- Make sure you have at least gcc 3.2 available.
- Make sure you have at least gcc 4.6 available.
For more information, refer to :ref:`Documentation/process/changes.rst <changes>`.
Please note that you can still run a.out user programs with this kernel.

View File

@@ -56,11 +56,13 @@ v1 is available under Documentation/cgroup-v1/.
5-3-3-2. IO Latency Interface Files
5-4. PID
5-4-1. PID Interface Files
5-5. Device
5-6. RDMA
5-6-1. RDMA Interface Files
5-7. Misc
5-7-1. perf_event
5-5. Cpuset
5-5-1. Cpuset Interface Files
5-6. Device
5-7. RDMA
5-7-1. RDMA Interface Files
5-8. Misc
5-8-1. perf_event
5-N. Non-normative information
5-N-1. CPU controller root cgroup process behaviour
5-N-2. IO controller root cgroup process behaviour
@@ -1187,6 +1189,10 @@ PAGE_SIZE multiple when read back.
Amount of cached filesystem data that was modified and
is currently being written back to disk
anon_thp
Amount of memory used in anonymous mappings backed by
transparent hugepages
inactive_anon, active_anon, inactive_file, active_file, unevictable
Amount of memory, swap-backed and filesystem-backed,
on the internal memory management lists used by the
@@ -1246,6 +1252,18 @@ PAGE_SIZE multiple when read back.
Amount of reclaimed lazyfree pages
thp_fault_alloc
Number of transparent hugepages which were allocated to satisfy
a page fault, including COW faults. This counter is not present
when CONFIG_TRANSPARENT_HUGEPAGE is not set.
thp_collapse_alloc
Number of transparent hugepages which were allocated to allow
collapsing an existing range of pages. This counter is not
present when CONFIG_TRANSPARENT_HUGEPAGE is not set.
memory.swap.current
A read-only single value file which exists on non-root
cgroups.
@@ -1501,7 +1519,7 @@ protected workload.
The limits are only applied at the peer level in the hierarchy. This means that
in the diagram below, only groups A, B, and C will influence each other, and
groups D and F will influence each other. Group G will influence nobody.
groups D and F will influence each other. Group G will influence nobody::
[root]
/ | \
@@ -1610,6 +1628,176 @@ through fork() or clone(). These will return -EAGAIN if the creation
of a new process would cause a cgroup policy to be violated.
Cpuset
------
The "cpuset" controller provides a mechanism for constraining
the CPU and memory node placement of tasks to only the resources
specified in the cpuset interface files in a task's current cgroup.
This is especially valuable on large NUMA systems where placing jobs
on properly sized subsets of the systems with careful processor and
memory placement to reduce cross-node memory access and contention
can improve overall system performance.
The "cpuset" controller is hierarchical. That means the controller
cannot use CPUs or memory nodes not allowed in its parent.
Cpuset Interface Files
~~~~~~~~~~~~~~~~~~~~~~
cpuset.cpus
A read-write multiple values file which exists on non-root
cpuset-enabled cgroups.
It lists the requested CPUs to be used by tasks within this
cgroup. The actual list of CPUs to be granted, however, is
subjected to constraints imposed by its parent and can differ
from the requested CPUs.
The CPU numbers are comma-separated numbers or ranges.
For example:
# cat cpuset.cpus
0-4,6,8-10
An empty value indicates that the cgroup is using the same
setting as the nearest cgroup ancestor with a non-empty
"cpuset.cpus" or all the available CPUs if none is found.
The value of "cpuset.cpus" stays constant until the next update
and won't be affected by any CPU hotplug events.
cpuset.cpus.effective
A read-only multiple values file which exists on all
cpuset-enabled cgroups.
It lists the onlined CPUs that are actually granted to this
cgroup by its parent. These CPUs are allowed to be used by
tasks within the current cgroup.
If "cpuset.cpus" is empty, the "cpuset.cpus.effective" file shows
all the CPUs from the parent cgroup that can be available to
be used by this cgroup. Otherwise, it should be a subset of
"cpuset.cpus" unless none of the CPUs listed in "cpuset.cpus"
can be granted. In this case, it will be treated just like an
empty "cpuset.cpus".
Its value will be affected by CPU hotplug events.
cpuset.mems
A read-write multiple values file which exists on non-root
cpuset-enabled cgroups.
It lists the requested memory nodes to be used by tasks within
this cgroup. The actual list of memory nodes granted, however,
is subjected to constraints imposed by its parent and can differ
from the requested memory nodes.
The memory node numbers are comma-separated numbers or ranges.
For example:
# cat cpuset.mems
0-1,3
An empty value indicates that the cgroup is using the same
setting as the nearest cgroup ancestor with a non-empty
"cpuset.mems" or all the available memory nodes if none
is found.
The value of "cpuset.mems" stays constant until the next update
and won't be affected by any memory nodes hotplug events.
cpuset.mems.effective
A read-only multiple values file which exists on all
cpuset-enabled cgroups.
It lists the onlined memory nodes that are actually granted to
this cgroup by its parent. These memory nodes are allowed to
be used by tasks within the current cgroup.
If "cpuset.mems" is empty, it shows all the memory nodes from the
parent cgroup that will be available to be used by this cgroup.
Otherwise, it should be a subset of "cpuset.mems" unless none of
the memory nodes listed in "cpuset.mems" can be granted. In this
case, it will be treated just like an empty "cpuset.mems".
Its value will be affected by memory nodes hotplug events.
cpuset.cpus.partition
A read-write single value file which exists on non-root
cpuset-enabled cgroups. This flag is owned by the parent cgroup
and is not delegatable.
It accepts only the following input values when written to.
"root" - a paritition root
"member" - a non-root member of a partition
When set to be a partition root, the current cgroup is the
root of a new partition or scheduling domain that comprises
itself and all its descendants except those that are separate
partition roots themselves and their descendants. The root
cgroup is always a partition root.
There are constraints on where a partition root can be set.
It can only be set in a cgroup if all the following conditions
are true.
1) The "cpuset.cpus" is not empty and the list of CPUs are
exclusive, i.e. they are not shared by any of its siblings.
2) The parent cgroup is a partition root.
3) The "cpuset.cpus" is also a proper subset of the parent's
"cpuset.cpus.effective".
4) There are no child cgroups with cpuset enabled. This is for
eliminating corner cases that have to be handled if such a
condition is allowed.
Setting it to partition root will take the CPUs away from the
effective CPUs of the parent cgroup. Once it is set, this
file cannot be reverted back to "member" if there are any child
cgroups with cpuset enabled.
A parent partition cannot distribute all its CPUs to its
child partitions. There must be at least one cpu left in the
parent partition.
Once becoming a partition root, changes to "cpuset.cpus" are
generally allowed as long as the first condition above is true,
the change will not take away all the CPUs from the parent
partition and the new "cpuset.cpus" value is a superset of its
children's "cpuset.cpus" values.
Sometimes, external factors like changes to ancestors'
"cpuset.cpus" or cpu hotplug can cause the state of the partition
root to change. On read, the "cpuset.cpus.partition" file
can show the following values.
"member" Non-root member of a partition
"root" Partition root
"root invalid" Invalid partition root
It is a partition root if the first 2 partition root conditions
above are true and at least one CPU from "cpuset.cpus" is
granted by the parent cgroup.
A partition root can become invalid if none of CPUs requested
in "cpuset.cpus" can be granted by the parent cgroup or the
parent cgroup is no longer a partition root itself. In this
case, it is not a real partition even though the restriction
of the first partition root condition above will still apply.
The cpu affinity of all the tasks in the cgroup will then be
associated with CPUs in the nearest ancestor partition.
An invalid partition root can be transitioned back to a
real partition root if at least one of the requested CPUs
can now be granted by its parent. In this case, the cpu
affinity of all the tasks in the formerly invalid partition
will be associated to the CPUs of the newly formed partition.
Changing the partition state of an invalid partition root to
"member" is always allowed even if child cpusets are present.
Device controller
-----------------
@@ -1879,8 +2067,10 @@ following two functions.
wbc_init_bio(@wbc, @bio)
Should be called for each bio carrying writeback data and
associates the bio with the inode's owner cgroup. Can be
called anytime between bio allocation and submission.
associates the bio with the inode's owner cgroup and the
corresponding request queue. This must be called after
a queue (device) has been associated with the bio and
before submission.
wbc_account_io(@wbc, @page, @bytes)
Should be called for each data segment being written out.
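A sketch of how a filesystem's writeback path might honor that ordering
for wbc_init_bio() and wbc_account_io() (the surrounding bio setup is
schematic and error handling is omitted)::
static void example_writepage_io(struct writeback_control *wbc,
				 struct page *page,
				 struct block_device *bdev, sector_t sector)
{
	struct bio *bio = bio_alloc(GFP_NOFS, 1);

	bio_set_dev(bio, bdev);		/* associate the queue first ... */
	bio->bi_iter.bi_sector = sector;
	bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
	bio_add_page(bio, page, PAGE_SIZE, 0);

	wbc_init_bio(wbc, bio);		/* ... then tag the bio ... */
	wbc_account_io(wbc, page, PAGE_SIZE);

	submit_bio(bio);		/* ... and finally submit */
}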
@@ -1899,7 +2089,7 @@ the configuration, the bio may be executed at a lower priority and if
the writeback session is holding shared resources, e.g. a journal
entry, may lead to priority inversion. There is no one easy solution
for the problem. Filesystems can try to work around specific problem
cases by skipping wbc_init_bio() or using bio_associate_blkcg()
cases by skipping wbc_init_bio() and using bio_associate_blkg()
directly.

View File

@@ -1,3 +1,4 @@
.. _admin_devices:
Linux allocated devices (4.x+ version)
======================================

View File

@@ -110,8 +110,8 @@ If your query set is big, you can batch them too::
~# cat query-batch-file > <debugfs>/dynamic_debug/control
A another way is to use wildcard. The match rule support ``*`` (matches
zero or more characters) and ``?`` (matches exactly one character).For
Another way is to use wildcards. The match rule supports ``*`` (matches
zero or more characters) and ``?`` (matches exactly one character). For
example, you can match all usb drivers::
~# echo "file drivers/usb/* +p" > <debugfs>/dynamic_debug/control
@@ -258,7 +258,7 @@ this boot parameter for debugging purposes.
If ``foo`` module is not built-in, ``foo.dyndbg`` will still be processed at
boot time, without effect, but will be reprocessed when module is
loaded later. ``dyndbg_query=`` and bare ``dyndbg=`` are only processed at
loaded later. ``ddebug_query=`` and bare ``dyndbg=`` are only processed at
boot.
@@ -301,7 +301,7 @@ The ``dyndbg`` option is a "fake" module parameter, which means:
For ``CONFIG_DYNAMIC_DEBUG`` kernels, any settings given at boot-time (or
enabled by ``-DDEBUG`` flag during compilation) can be disabled later via
the sysfs interface if the debug messages are no longer needed::
the debugfs interface if the debug messages are no longer needed::
echo "module module_name -p" > <debugfs>/dynamic_debug/control

View File

@@ -76,6 +76,7 @@ configure specific aspects of kernel behavior to your liking.
thunderbolt
LSM/index
mm/index
perf-security
.. only:: subproject and html

View File

@@ -331,7 +331,7 @@
APC and your system crashes randomly.
apic= [APIC,X86] Advanced Programmable Interrupt Controller
Change the output verbosity whilst booting
Change the output verbosity while booting
Format: { quiet (default) | verbose | debug }
Change the amount of debugging information output
when initialising the APIC and IO-APIC components.
@@ -461,6 +461,11 @@
possible to determine what the correct size should be.
This option provides an override for these situations.
carrier_timeout=
[NET] Specifies amount of time (in seconds) that
the kernel should wait for a network carrier. By default
it waits 120 seconds.
ca_keys= [KEYS] This parameter identifies a specific key(s) on
the system trusted keyring to be used for certificate
trust validation.
@@ -486,10 +491,14 @@
cut the overhead, others just disable the usage. So
only cgroup_disable=memory is actually worthy}
cgroup_no_v1= [KNL] Disable one, multiple, all cgroup controllers in v1
Format: { controller[,controller...] | "all" }
cgroup_no_v1= [KNL] Disable cgroup controllers and named hierarchies in v1
Format: { { controller | "all" | "named" }
[,{ controller | "all" | "named" }...] }
Like cgroup_disable, but only applies to cgroup v1;
the blacklisted controllers remain available in cgroup2.
"all" blacklists all controllers and "named" disables
named mounts. Specifying both "all" and "named" disables
all v1 hierarchies.
cgroup.memory= [KNL] Pass options to the cgroup memory controller.
Format: <string>
@@ -674,6 +683,9 @@
cpuidle.off=1 [CPU_IDLE]
disable the cpuidle sub-system
cpuidle.governor=
[CPU_IDLE] Name of the cpuidle governor to use.
cpufreq.off=1 [CPU_FREQ]
disable the cpufreq sub-system
@@ -903,6 +915,10 @@
The filter can be disabled or changed to another
driver later using sysfs.
driver_async_probe= [KNL]
List of driver names to be probed asynchronously.
Format: <driver_name1>,<driver_name2>...
drm.edid_firmware=[<connector>:]<file>[,[<connector>:]<file>]
Broken monitors, graphic adapters, KVMs and EDIDless
panels may send no or incorrect EDID data sets.
@@ -1021,6 +1037,12 @@
specified address. The serial port must already be
setup and configured. Options are not yet supported.
rda,<addr>
Start an early, polled-mode console on a serial port
of an RDA Micro SoC, such as RDA8810PL, at the
specified address. The serial port must already be
setup and configured. Options are not yet supported.
smh Use ARM semihosting calls for early console.
s3c2410,<addr>
@@ -1060,9 +1082,15 @@
specified address. The serial port must already be
setup and configured. Options are not yet supported.
efifb,[options]
Start an early, unaccelerated console on the EFI
memory mapped framebuffer (if available). On cache
coherent non-x86 systems that use system memory for
the framebuffer, pass the 'ram' option so that it is
mapped with the correct attributes.
earlyprintk= [X86,SH,ARM,M68k,S390]
earlyprintk=vga
earlyprintk=efi
earlyprintk=sclp
earlyprintk=xen
earlyprintk=serial[,ttySn[,baudrate]]
@@ -1169,9 +1197,10 @@
arch/x86/kernel/cpu/cpufreq/elanfreq.c.
elevator= [IOSCHED]
Format: {"cfq" | "deadline" | "noop"}
See Documentation/block/cfq-iosched.txt and
Documentation/block/deadline-iosched.txt for details.
Format: { "mq-deadline" | "kyber" | "bfq" }
See Documentation/block/deadline-iosched.txt,
Documentation/block/kyber-iosched.txt and
Documentation/block/bfq-iosched.txt for details.
elfcorehdr=[size[KMG]@]offset[KMG] [IA64,PPC,SH,X86,S390]
Specifies physical address of start of kernel core
@@ -1683,12 +1712,11 @@
By default, super page will be supported if Intel IOMMU
has the capability. With this option, super page will
not be supported.
ecs_off [Default Off]
By default, extended context tables will be supported if
the hardware advertises that it has support both for the
extended tables themselves, and also PASID support. With
this option set, extended tables will not be used even
on hardware which claims to support them.
sm_on [Default Off]
By default, scalable mode will be disabled even if the
hardware advertises that it has support for the scalable
mode translation. With this option set, scalable mode
will be used on hardware which claims to support it.
tboot_noforce [Default Off]
Do not force the Intel IOMMU enabled under tboot.
By default, tboot will force Intel IOMMU on, which
@@ -1818,6 +1846,11 @@
to let secondary kernels in charge of setting up
LPIs.
irqchip.gicv3_pseudo_nmi= [ARM64]
Enables support for pseudo-NMIs in the kernel. This
requires the kernel to be built with
CONFIG_ARM64_PSEUDO_NMI.
irqfixup [HW]
When an interrupt is not handled search all handlers
for it. Intended to get systems with badly broken
@@ -1969,6 +2002,12 @@
Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y,
the default is off.
kpti= [ARM64] Control page table isolation of user
and kernel address spaces.
Default: enabled on cores which need mitigation.
0: force disabled
1: force enabled
kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
Default is 0 (don't ignore, but inject #GP)
@@ -2096,6 +2135,9 @@
off
Disables hypervisor mitigations and doesn't
emit any warnings.
It also drops the swap size and available
RAM limit restriction on both hypervisor and
bare metal.
Default is 'flush'.
@@ -2303,6 +2345,10 @@
lsm.debug [SECURITY] Enable LSM initialization debugging output.
lsm=lsm1,...,lsmN
[SECURITY] Choose order of LSM initialization. This
overrides CONFIG_LSM, and the "security=" parameter.
machvec= [IA-64] Force the use of a particular machine-vector
(machvec) in a generic kernel.
Example: machvec=hpzx1_swiotlb
@@ -2827,7 +2873,7 @@
check bypass). With this option data leaks are possible
in the system.
nospectre_v2 [X86] Disable all mitigations for the Spectre variant 2
nospectre_v2 [X86,PPC_FSL_BOOK3E] Disable all mitigations for the Spectre variant 2
(indirect branch prediction) vulnerability. System may
allow data leaks with this option, which is equivalent
to spectre_v2=off.
@@ -3082,6 +3128,14 @@
timeout < 0: reboot immediately
Format: <timeout>
panic_print= Bitmask for printing system info when panic happens.
User can choose a combination of the following bits:
bit 0: print all tasks info
bit 1: print system memory info
bit 2: print timer info
bit 3: print locks info if CONFIG_LOCKDEP is on
bit 4: print ftrace buffer
panic_on_warn panic() instead of WARN(). Useful to cause kdump
on a WARN().
@@ -3630,19 +3684,6 @@
latencies, which will choose a value aligned
with the appropriate hardware boundaries.
rcutree.jiffies_till_sched_qs= [KNL]
Set required age in jiffies for a
given grace period before RCU starts
soliciting quiescent-state help from
rcu_note_context_switch(). If not specified, the
kernel will calculate a value based on the most
recent settings of rcutree.jiffies_till_first_fqs
and rcutree.jiffies_till_next_fqs.
This calculated value may be viewed in
rcutree.jiffies_to_sched_qs. Any attempt to
set rcutree.jiffies_to_sched_qs will be
cheerfully overwritten.
rcutree.jiffies_till_first_fqs= [KNL]
Set delay from grace-period initialization to
first attempt to force quiescent states.
@@ -3654,6 +3695,20 @@
quiescent states. Units are jiffies, minimum
value is one, and maximum value is HZ.
rcutree.jiffies_till_sched_qs= [KNL]
Set required age in jiffies for a
given grace period before RCU starts
soliciting quiescent-state help from
rcu_note_context_switch() and cond_resched().
If not specified, the kernel will calculate
a value based on the most recent settings
of rcutree.jiffies_till_first_fqs
and rcutree.jiffies_till_next_fqs.
This calculated value may be viewed in
rcutree.jiffies_to_sched_qs. Any attempt to set
rcutree.jiffies_to_sched_qs will be cheerfully
overwritten.
rcutree.kthread_prio= [KNL,BOOT]
Set the SCHED_FIFO priority of the RCU per-CPU
kthreads (rcuc/N). This value is also used for
@@ -3697,6 +3752,11 @@
This wake_up() will be accompanied by a
WARN_ONCE() splat and an ftrace_dump().
rcutree.sysrq_rcu= [KNL]
Commandeer a sysrq key to dump out Tree RCU's
rcu_node tree with an eye towards determining
why a new grace period has not yet started.
rcuperf.gp_async= [KNL]
Measure performance of asynchronous
grace-period primitives such as call_rcu().
@@ -3748,24 +3808,6 @@
in microseconds. The default of zero says
no holdoff.
rcutorture.cbflood_inter_holdoff= [KNL]
Set holdoff time (jiffies) between successive
callback-flood tests.
rcutorture.cbflood_intra_holdoff= [KNL]
Set holdoff time (jiffies) between successive
bursts of callbacks within a given callback-flood
test.
rcutorture.cbflood_n_burst= [KNL]
Set the number of bursts making up a given
callback-flood test. Set this to zero to
disable callback-flood testing.
rcutorture.cbflood_n_per_burst= [KNL]
Set the number of callbacks to be registered
in a given burst of a callback-flood test.
rcutorture.fqs_duration= [KNL]
Set duration of force_quiescent_state bursts
in microseconds.
@@ -3778,6 +3820,23 @@
Set wait time between force_quiescent_state bursts
in seconds.
rcutorture.fwd_progress= [KNL]
Enable RCU grace-period forward-progress testing
for the types of RCU supporting this notion.
rcutorture.fwd_progress_div= [KNL]
Specify the fraction of a CPU-stall-warning
period to do tight-loop forward-progress testing.
rcutorture.fwd_progress_holdoff= [KNL]
Number of seconds to wait between successive
forward-progress tests.
rcutorture.fwd_progress_need_resched= [KNL]
Enclose cond_resched() calls within checks for
need_resched() during tight-loop forward-progress
testing.
rcutorture.gp_cond= [KNL]
Use conditional/asynchronous update-side
primitives, if available.
@@ -4067,11 +4126,9 @@
Note: increases power consumption, thus should only be
enabled if running jitter sensitive (HPC/RT) workloads.
security= [SECURITY] Choose a security module to enable at boot.
If this boot parameter is not specified, only the first
security module asking for security registration will be
loaded. An invalid security module name will be treated
as if no module has been chosen.
security= [SECURITY] Choose a legacy "major" security module to
enable at boot. This has been deprecated by the
"lsm=" parameter.
selinux= [SELINUX] Disable or enable SELinux at boot time.
Format: { "0" | "1" }
@@ -4675,7 +4732,8 @@
usbcore.authorized_default=
[USB] Default USB device authorization:
(default -1 = authorized except for wireless USB,
0 = not authorized, 1 = authorized)
0 = not authorized, 1 = authorized, 2 = authorized
if device connected to internal port)
usbcore.autosuspend=
[USB] The autosuspend time delay (in seconds) used
@@ -5020,6 +5078,14 @@
or other driver-specific files in the
Documentation/watchdog/ directory.
watchdog_thresh=
[KNL]
Set the hard lockup detector stall duration
threshold in seconds. The soft lockup detector
threshold is set to twice the value. A value of 0
disables both lockup detectors. Default is 10
seconds.
workqueue.watchdog_thresh=
If CONFIG_WQ_WATCHDOG is configured, workqueue can
warn stall conditions and dump internal state to

View File

@@ -405,6 +405,9 @@ time with the option "l1tf=". The valid arguments for this option are:
off Disables hypervisor mitigations and doesn't emit any
warnings.
It also drops the swap size and available RAM limit restrictions
on both hypervisor and bare metal.
============ =============================================================
The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`.
@@ -576,7 +579,8 @@ Default mitigations
The kernel default mitigations for vulnerable processors are:
- PTE inversion to protect against malicious user space. This is done
unconditionally and cannot be controlled.
unconditionally and cannot be controlled. The swap storage is limited
to ~16TB.
- L1D conditional flushing on VMENTER when EPT is enabled for
a guest.

View File

@@ -756,3 +756,6 @@ These currently include:
The cache mode for raid5. raid5 could include an extra disk for
caching. The mode can be "write-through" or "write-back". The
default is "write-through".
ppl_write_hint
NVMe stream ID to be set for each PPL write request.

View File

@@ -4,13 +4,13 @@
Concepts overview
=================
The memory management in Linux is complex system that evolved over the
years and included more and more functionality to support variety of
The memory management in Linux is a complex system that evolved over the
years and included more and more functionality to support a variety of
systems from MMU-less microcontrollers to supercomputers. The memory
management for systems without MMU is called ``nommu`` and it
management for systems without an MMU is called ``nommu`` and it
definitely deserves a dedicated document, which hopefully will be
eventually written. Yet, although some of the concepts are the same,
here we assume that MMU is available and CPU can translate a virtual
here we assume that an MMU is available and a CPU can translate a virtual
address to a physical address.
.. contents:: :local:
@@ -21,10 +21,10 @@ Virtual Memory Primer
The physical memory in a computer system is a limited resource and
even for systems that support memory hotplug there is a hard limit on
the amount of memory that can be installed. The physical memory is not
necessary contiguous, it might be accessible as a set of distinct
necessarily contiguous; it might be accessible as a set of distinct
address ranges. Besides, different CPU architectures, and even
different implementations of the same architecture have different view
how these address ranges defined.
different implementations of the same architecture have different views
of how these address ranges are defined.
All this makes dealing directly with physical memory quite complex and
to avoid this complexity a concept of virtual memory was developed.
@@ -48,8 +48,8 @@ appropriate kernel configuration option.
Each physical memory page can be mapped as one or more virtual
pages. These mappings are described by page tables that allow
translation from virtual address used by programs to real address in
the physical memory. The page tables organized hierarchically.
translation from a virtual address used by programs to the physical
memory address. The page tables are organized hierarchically.
The tables at the lowest level of the hierarchy contain physical
addresses of actual pages used by the software. The tables at higher
@@ -121,8 +121,8 @@ Nodes
Many multi-processor machines are NUMA - Non-Uniform Memory Access -
systems. In such systems the memory is arranged into banks that have
different access latency depending on the "distance" from the
processor. Each bank is referred as `node` and for each node Linux
constructs an independent memory management subsystem. A node has it's
processor. Each bank is referred to as a `node` and for each node Linux
constructs an independent memory management subsystem. A node has its
own set of zones, lists of free and used pages and various statistics
counters. You can find more details about NUMA in
:ref:`Documentation/vm/numa.rst <numa>` and in
@@ -149,9 +149,9 @@ for program's stack and heap or by explicit calls to mmap(2) system
call. Usually, the anonymous mappings only define virtual memory areas
that the program is allowed to access. The read accesses will result
in creation of a page table entry that references a special physical
page filled with zeroes. When the program performs a write, regular
page filled with zeroes. When the program performs a write, a regular
physical page will be allocated to hold the written data. The page
will be marked dirty and if the kernel will decide to repurpose it,
will be marked dirty and if the kernel decides to repurpose it,
the dirty page will be swapped out.
Reclaim
@@ -181,8 +181,8 @@ pressure.
The process of freeing the reclaimable physical memory pages and
repurposing them is called (surprise!) `reclaim`. Linux can reclaim
pages either asynchronously or synchronously, depending on the state
of the system. When system is not loaded, most of the memory is free
and allocation request will be satisfied immediately from the free
of the system. When the system is not loaded, most of the memory is free
and allocation requests will be satisfied immediately from the free
pages supply. As the load increases, the amount of the free pages goes
down and when it reaches a certain threshold (high watermark), an
allocation request will awaken the ``kswapd`` daemon. It will
@@ -190,7 +190,7 @@ asynchronously scan memory pages and either just free them if the data
they contain is available elsewhere, or evict to the backing storage
device (remember those dirty pages?). As memory usage increases even
more and reaches another threshold - min watermark - an allocation
will trigger the `direct reclaim`. In this case allocation is stalled
will trigger `direct reclaim`. In this case allocation is stalled
until enough memory pages are reclaimed to satisfy the request.
Compaction
@@ -200,7 +200,7 @@ As the system runs, tasks allocate and free the memory and it becomes
fragmented. Although with virtual memory it is possible to present
scattered physical pages as virtually contiguous range, sometimes it is
necessary to allocate large physically contiguous memory areas. Such
need may arise, for instance, when a device driver requires large
need may arise, for instance, when a device driver requires a large
buffer for DMA, or when THP allocates a huge page. Memory `compaction`
addresses the fragmentation issue. This mechanism moves occupied pages
from the lower part of a memory zone to free pages in the upper part
@@ -208,15 +208,16 @@ of the zone. When a compaction scan is finished free pages are grouped
together at the beginning of the zone and allocations of large
physically contiguous areas become possible.
Like reclaim, the compaction may happen asynchronously in ``kcompactd``
daemon or synchronously as a result of memory allocation request.
Like reclaim, the compaction may happen asynchronously in the ``kcompactd``
daemon or synchronously as a result of a memory allocation request.
OOM killer
==========
It may happen, that on a loaded machine memory will be exhausted. When
the kernel detects that the system runs out of memory (OOM) it invokes
`OOM killer`. Its mission is simple: all it has to do is to select a
task to sacrifice for the sake of the overall system health. The
selected task is killed in a hope that after it exits enough memory
will be freed to continue normal operation.
It is possible that on a loaded machine memory will be exhausted and the
kernel will be unable to reclaim enough memory to continue to operate. In
order to save the rest of the system, it invokes the `OOM killer`.
The `OOM killer` selects a task to sacrifice for the sake of the overall
system health. The selected task is killed in a hope that after it exits
enough memory will be freed to continue normal operation.

View File

@@ -75,9 +75,10 @@ number of times a page is mapped.
20. NOPAGE
21. KSM
22. THP
23. BALLOON
23. OFFLINE
24. ZERO_PAGE
25. IDLE
26. PGTABLE
* ``/proc/kpagecgroup``. This file contains a 64-bit inode number of the
memory cgroup each page is charged to, indexed by PFN. Only available when
@@ -118,8 +119,8 @@ Short descriptions to the page flags
identical memory pages dynamically shared between one or more processes
22 - THP
contiguous pages which construct transparent hugepages
23 - BALLOON
balloon compaction page
23 - OFFLINE
page is logically offline
24 - ZERO_PAGE
zero page for pfn_zero or huge_zero page
25 - IDLE
@@ -128,6 +129,8 @@ Short descriptions to the page flags
Note that this flag may be stale in case the page was accessed via
a PTE. To make sure the flag is up-to-date one has to read
``/sys/kernel/mm/page_idle/bitmap`` first.
26 - PGTABLE
page is in use as a page table
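For illustration, a sketch that reads the 64-bit flag word for a given PFN
from ``/proc/kpageflags`` and tests the OFFLINE (23) and PGTABLE (26) bits
(requires root; the PFN is taken from the command line)::
#include <fcntl.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	uint64_t pfn, flags;
	int fd;

	if (argc != 2)
		return 1;
	pfn = strtoull(argv[1], NULL, 0);

	fd = open("/proc/kpageflags", O_RDONLY);
	if (fd < 0 || pread(fd, &flags, sizeof(flags), pfn * 8) != sizeof(flags)) {
		perror("/proc/kpageflags");
		return 1;
	}
	printf("pfn %" PRIu64 ": flags %#" PRIx64 " offline=%d pgtable=%d\n",
	       pfn, flags,
	       !!(flags & (1ULL << 23)), !!(flags & (1ULL << 26)));
	close(fd);
	return 0;
}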
IO related page flags
---------------------

View File

@@ -0,0 +1,230 @@
.. _perf_security:
Perf Events and tool security
=============================
Overview
--------
Usage of Performance Counters for Linux (perf_events) [1]_ , [2]_ , [3]_
can impose a considerable risk of leaking sensitive data accessed by
monitored processes. The data leakage is possible both in scenarios of
direct usage of perf_events system call API [2]_ and over data files
generated by Perf tool user mode utility (Perf) [3]_ , [4]_ . The risk
depends on the nature of data that perf_events performance monitoring
units (PMU) [2]_ and Perf collect and expose for performance analysis.
Collected system and performance data may be split into several
categories:
1. System hardware and software configuration data, for example: a CPU
model and its cache configuration, an amount of available memory and
its topology, used kernel and Perf versions, performance monitoring
setup including experiment time, events configuration, Perf command
line parameters, etc.
2. User and kernel module paths and their load addresses with sizes,
process and thread names with their PIDs and TIDs, timestamps for
captured hardware and software events.
3. Content of kernel software counters (e.g., for context switches, page
faults, CPU migrations), architectural hardware performance counters
(PMC) [8]_ and machine specific registers (MSR) [9]_ that provide
execution metrics for various monitored parts of the system (e.g.,
memory controller (IMC), interconnect (QPI/UPI) or peripheral (PCIe)
uncore counters) without direct attribution to any execution context
state.
4. Content of architectural execution context registers (e.g., RIP, RSP,
RBP on x86_64), process user and kernel space memory addresses and
data, content of various architectural MSRs that capture data from
this category.
Data that belong to the fourth category can potentially contain
sensitive process data. If PMUs in some monitoring modes capture values
of execution context registers or data from process memory, then access
to such monitoring capabilities must be ordered and secured properly.
So, perf_events/Perf performance monitoring is subject to security
access control management [5]_ .
perf_events/Perf access control
-------------------------------
To perform security checks, the Linux implementation splits processes
into two categories [6]_ : a) privileged processes (whose effective user
ID is 0, referred to as superuser or root), and b) unprivileged
processes (whose effective UID is nonzero). Privileged processes bypass
all kernel security permission checks so perf_events performance
monitoring is fully available to privileged processes without access,
scope and resource restrictions.
Unprivileged processes are subject to a full security permission check
based on the process's credentials [5]_ (usually: effective UID,
effective GID, and supplementary group list).
Linux divides the privileges traditionally associated with superuser
into distinct units, known as capabilities [6]_ , which can be
independently enabled and disabled on per-thread basis for processes and
files of unprivileged users.
Unprivileged processes with enabled CAP_SYS_ADMIN capability are treated
as privileged processes with respect to perf_events performance
monitoring and bypass *scope* permissions checks in the kernel.
Unprivileged processes using the perf_events system call API are also subject
to the PTRACE_MODE_READ_REALCREDS ptrace access mode check [7]_ , whose
outcome determines whether monitoring is permitted. So unprivileged
processes provided with CAP_SYS_PTRACE capability are effectively
permitted to pass the check.
Other capabilities being granted to unprivileged processes can
effectively enable capturing of additional data required for later
performance analysis of monitored processes or a system. For example,
CAP_SYSLOG capability permits reading kernel space memory addresses from
/proc/kallsyms file.
perf_events/Perf privileged users
---------------------------------
Mechanisms of capabilities, privileged capability-dumb files [6]_ and
file system ACLs [10]_ can be used to create a dedicated group of
perf_events/Perf privileged users who are permitted to execute
performance monitoring without scope limits. The following steps can be
taken to create such a group of privileged Perf users.
1. Create perf_users group of privileged Perf users, assign perf_users
group to Perf tool executable and limit access to the executable for
other users in the system who are not in the perf_users group:
::
# groupadd perf_users
# ls -alhF
-rwxr-xr-x 2 root root 11M Oct 19 15:12 perf
# chgrp perf_users perf
# ls -alhF
-rwxr-xr-x 2 root perf_users 11M Oct 19 15:12 perf
# chmod o-rwx perf
# ls -alhF
-rwxr-x--- 2 root perf_users 11M Oct 19 15:12 perf
2. Assign the required capabilities to the Perf tool executable file and
enable members of perf_users group with performance monitoring
privileges [6]_ :
::
# setcap "cap_sys_admin,cap_sys_ptrace,cap_syslog=ep" perf
# setcap -v "cap_sys_admin,cap_sys_ptrace,cap_syslog=ep" perf
perf: OK
# getcap perf
perf = cap_sys_ptrace,cap_sys_admin,cap_syslog+ep
As a result, members of perf_users group are capable of conducting
performance monitoring by using functionality of the configured Perf
tool executable that, when executed, passes perf_events subsystem scope
checks.
This specific access control management is only available to superuser
or root running processes with CAP_SETPCAP, CAP_SETFCAP [6]_
capabilities.
perf_events/Perf unprivileged users
-----------------------------------
perf_events/Perf *scope* and *access* control for unprivileged processes
is governed by perf_event_paranoid [2]_ setting:
-1:
Impose no *scope* and *access* restrictions on using perf_events
performance monitoring. Per-user per-cpu perf_event_mlock_kb [2]_
locking limit is ignored when allocating memory buffers for storing
performance data. This is the least secure mode since allowed
monitored *scope* is maximized and no perf_events specific limits
are imposed on *resources* allocated for performance monitoring.
>=0:
*scope* includes per-process and system wide performance monitoring
but excludes raw tracepoints and ftrace function tracepoints
monitoring. CPU and system events that happen when executing either in
user or in kernel space can be monitored and captured for later
analysis. Per-user per-cpu perf_event_mlock_kb locking limit is
imposed but ignored for unprivileged processes with CAP_IPC_LOCK
[6]_ capability.
>=1:
*scope* includes per-process performance monitoring only and
excludes system wide performance monitoring. CPU and system events
that happen when executing either in user or in kernel space can be
monitored and captured for later analysis. Per-user per-cpu
perf_event_mlock_kb locking limit is imposed but ignored for
unprivileged processes with CAP_IPC_LOCK capability.
>=2:
*scope* includes per-process performance monitoring only. CPU and
system events that happen when executing in user space only can be
monitored and captured for later analysis. Per-user per-cpu
perf_event_mlock_kb locking limit is imposed but ignored for
unprivileged processes with CAP_IPC_LOCK capability.
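For illustration, the kind of per-process, user-space-only monitoring that
remains available under the stricter settings above can be requested
directly through the system call; this is a minimal sketch along the lines
of the perf_event_open(2) man page example, with an illustrative event
choice and no real error handling::
#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	long long count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.disabled = 1;
	attr.exclude_kernel = 1;	/* user space only */
	attr.exclude_hv = 1;

	/* pid = 0, cpu = -1: monitor the calling thread on any CPU. */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	/* ... workload under measurement ... */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("instructions: %lld\n", count);
	close(fd);
	return 0;
}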
perf_events/Perf resource control
---------------------------------
Open file descriptors
+++++++++++++++++++++
The perf_events system call API [2]_ allocates file descriptors for
every configured PMU event. Open file descriptors are a per-process
accountable resource governed by the RLIMIT_NOFILE [11]_ limit
(ulimit -n), which is usually derived from the login shell process. When
configuring Perf collection for a long list of events on a large server
system, this limit can easily be hit, preventing the required monitoring
configuration. The RLIMIT_NOFILE limit can be increased on a per-user basis
by modifying the limits.conf file [12]_ . Ordinarily, a Perf
sampling session (perf record) requires an amount of open perf_event
file descriptors that is not less than the number of monitored events
multiplied by the number of monitored CPUs.
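A sketch of checking and raising the soft open-file limit up to the hard
limit from within a monitoring tool (raising the hard limit itself is still
done through limits.conf)::
#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_NOFILE, &rl)) {
		perror("getrlimit");
		return 1;
	}
	printf("RLIMIT_NOFILE: soft=%llu hard=%llu\n",
	       (unsigned long long)rl.rlim_cur,
	       (unsigned long long)rl.rlim_max);

	rl.rlim_cur = rl.rlim_max;	/* unprivileged raise up to hard limit */
	if (setrlimit(RLIMIT_NOFILE, &rl))
		perror("setrlimit");
	return 0;
}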
Memory allocation
+++++++++++++++++
The amount of memory available to user processes for capturing
performance monitoring data is governed by the perf_event_mlock_kb [2]_
setting. This perf_event specific resource setting defines overall
per-cpu limits of memory allowed for mapping by the user processes to
execute performance monitoring. The setting essentially extends the
RLIMIT_MEMLOCK [11]_ limit, but only for memory regions mapped
specifically for capturing monitored performance events and related data.
For example, if a machine has eight cores and perf_event_mlock_kb limit
is set to 516 KiB, then a user process is provided with 516 KiB * 8 =
4128 KiB of memory above the RLIMIT_MEMLOCK limit (ulimit -l) for
perf_event mmap buffers. In particular, this means that, if the user
wants to start two or more performance monitoring processes, the user is
required to manually distribute the available 4128 KiB between the
monitoring processes, for example, using the --mmap-pages Perf record
mode option. Otherwise, the first started performance monitoring process
allocates all available 4128 KiB and the other processes will fail to
proceed due to the lack of memory.
RLIMIT_MEMLOCK and perf_event_mlock_kb resource constraints are ignored
for processes with the CAP_IPC_LOCK capability. Thus, perf_events/Perf
privileged users can be provided with memory above the constraints for
perf_events/Perf performance monitoring purpose by providing the Perf
executable with CAP_IPC_LOCK capability.
Bibliography
------------
.. [1] `<https://lwn.net/Articles/337493/>`_
.. [2] `<http://man7.org/linux/man-pages/man2/perf_event_open.2.html>`_
.. [3] `<http://web.eece.maine.edu/~vweaver/projects/perf_events/>`_
.. [4] `<https://perf.wiki.kernel.org/index.php/Main_Page>`_
.. [5] `<https://www.kernel.org/doc/html/latest/security/credentials.html>`_
.. [6] `<http://man7.org/linux/man-pages/man7/capabilities.7.html>`_
.. [7] `<http://man7.org/linux/man-pages/man2/ptrace.2.html>`_
.. [8] `<https://en.wikipedia.org/wiki/Hardware_performance_counter>`_
.. [9] `<https://en.wikipedia.org/wiki/Model-specific_register>`_
.. [10] `<http://man7.org/linux/man-pages/man5/acl.5.html>`_
.. [11] `<http://man7.org/linux/man-pages/man2/getrlimit.2.html>`_
.. [12] `<http://man7.org/linux/man-pages/man5/limits.conf.5.html>`_

View File

@@ -0,0 +1,719 @@
.. |struct cpuidle_state| replace:: :c:type:`struct cpuidle_state <cpuidle_state>`
.. |cpufreq| replace:: :doc:`CPU Performance Scaling <cpufreq>`
========================
CPU Idle Time Management
========================
::
Copyright (c) 2018 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Concepts
========
Modern processors are generally able to enter states in which the execution of
a program is suspended and instructions belonging to it are not fetched from
memory or executed. Those states are the *idle* states of the processor.
Since part of the processor hardware is not used in idle states, entering them
generally allows power drawn by the processor to be reduced and, in consequence,
it is an opportunity to save energy.
CPU idle time management is an energy-efficiency feature concerned with using
the idle states of processors for this purpose.
Logical CPUs
------------
CPU idle time management operates on CPUs as seen by the *CPU scheduler* (that
is the part of the kernel responsible for the distribution of computational
work in the system). In its view, CPUs are *logical* units. That is, they need
not be separate physical entities and may just be interfaces appearing to
software as individual single-core processors. In other words, a CPU is an
entity which appears to be fetching instructions that belong to one sequence
(program) from memory and executing them, but it need not work this way
physically. Generally, three different cases can be considered here.
First, if the whole processor can only follow one sequence of instructions (one
program) at a time, it is a CPU. In that case, if the hardware is asked to
enter an idle state, that applies to the processor as a whole.
Second, if the processor is multi-core, each core in it is able to follow at
least one program at a time. The cores need not be entirely independent of each
other (for example, they may share caches), but still most of the time they
work physically in parallel with each other, so if each of them executes only
one program, those programs run mostly independently of each other at the same
time. The entire cores are CPUs in that case and if the hardware is asked to
enter an idle state, that applies to the core that asked for it in the first
place, but it also may apply to a larger unit (say a "package" or a "cluster")
that the core belongs to (in fact, it may apply to an entire hierarchy of larger
units containing the core). Namely, if all of the cores in the larger unit
except for one have been put into idle states at the "core level" and the
remaining core asks the processor to enter an idle state, that may trigger it
to put the whole larger unit into an idle state which also will affect the
other cores in that unit.
Finally, each core in a multi-core processor may be able to follow more than one
program in the same time frame (that is, each core may be able to fetch
instructions from multiple locations in memory and execute them in the same time
frame, but not necessarily entirely in parallel with each other). In that case
the cores present themselves to software as "bundles" each consisting of
multiple individual single-core "processors", referred to as *hardware threads*
(or hyper-threads specifically on Intel hardware), that each can follow one
sequence of instructions. Then, the hardware threads are CPUs from the CPU idle
time management perspective and if the processor is asked to enter an idle state
by one of them, the hardware thread (or CPU) that asked for it is stopped, but
nothing more happens, unless all of the other hardware threads within the same
core also have asked the processor to enter an idle state. In that situation,
the core may be put into an idle state individually or a larger unit containing
it may be put into an idle state as a whole (if the other cores within the
larger unit are in idle states already).
Idle CPUs
---------
Logical CPUs, simply referred to as "CPUs" in what follows, are regarded as
*idle* by the Linux kernel when there are no tasks to run on them except for the
special "idle" task.
Tasks are the CPU scheduler's representation of work. Each task consists of a
sequence of instructions to execute, or code, data to be manipulated while
running that code, and some context information that needs to be loaded into the
processor every time the task's code is run by a CPU. The CPU scheduler
distributes work by assigning tasks to run to the CPUs present in the system.
Tasks can be in various states. In particular, they are *runnable* if there are
no specific conditions preventing their code from being run by a CPU as long as
there is a CPU available for that (for example, they are not waiting for any
events to occur or similar). When a task becomes runnable, the CPU scheduler
assigns it to one of the available CPUs to run and if there are no more runnable
tasks assigned to it, the CPU will load the given task's context and run its
code (from the instruction following the last one executed so far, possibly by
another CPU). [If there are multiple runnable tasks assigned to one CPU
simultaneously, they will be subject to prioritization and time sharing in order
to allow them to make some progress over time.]
The special "idle" task becomes runnable if there are no other runnable tasks
assigned to the given CPU and the CPU is then regarded as idle. In other words,
in Linux idle CPUs run the code of the "idle" task called *the idle loop*. That
code may cause the processor to be put into one of its idle states, if they are
supported, in order to save energy, but if the processor does not support any
idle states, or there is not enough time to spend in an idle state before the
next wakeup event, or there are strict latency constraints preventing any of the
available idle states from being used, the CPU will simply execute more or less
useless instructions in a loop until it is assigned a new task to run.
.. _idle-loop:
The Idle Loop
=============
The idle loop code takes two major steps in every iteration of it. First, it
calls into a code module referred to as the *governor* that belongs to the CPU
idle time management subsystem called ``CPUIdle`` to select an idle state for
the CPU to ask the hardware to enter. Second, it invokes another code module
from the ``CPUIdle`` subsystem, called the *driver*, to actually ask the
processor hardware to enter the idle state selected by the governor.
The role of the governor is to find an idle state most suitable for the
conditions at hand. For this purpose, idle states that the hardware can be
asked to enter by logical CPUs are represented in an abstract way independent of
the platform or the processor architecture and organized in a one-dimensional
(linear) array. That array has to be prepared and supplied by the ``CPUIdle``
driver matching the platform the kernel is running on at the initialization
time. This allows ``CPUIdle`` governors to be independent of the underlying
hardware and to work with any platforms that the Linux kernel can run on.
Each idle state present in that array is characterized by two parameters to be
taken into account by the governor, the *target residency* and the (worst-case)
*exit latency*. The target residency is the minimum time the hardware must
spend in the given state, including the time needed to enter it (which may be
substantial), in order to save more energy than it would save by entering one of
the shallower idle states instead. [The "depth" of an idle state roughly
corresponds to the power drawn by the processor in that state.] The exit
latency, in turn, is the maximum time it will take a CPU asking the processor
hardware to enter an idle state to start executing the first instruction after a
wakeup from that state. Note that in general the exit latency also must cover
the time needed to enter the given state in case the wakeup occurs when the
hardware is entering it and it must be entered completely to be exited in an
ordered manner.
There are two types of information that can influence the governor's decisions.
First of all, the governor knows the time until the closest timer event. That
time is known exactly, because the kernel programs timers and it knows exactly
when they will trigger, and it is the maximum time the hardware that the given
CPU depends on can spend in an idle state, including the time necessary to enter
and exit it. However, the CPU may be woken up by a non-timer event at any time
(in particular, before the closest timer triggers) and it generally is not known
when that may happen. The governor can only see how much time the CPU actually
was idle after it has been woken up (that time will be referred to as the *idle
duration* from now on) and it can use that information somehow along with the
time until the closest timer to estimate the idle duration in future. How the
governor uses that information depends on what algorithm is implemented by it
and that is the primary reason for having more than one governor in the
``CPUIdle`` subsystem.
There are three ``CPUIdle`` governors available, ``menu``, `TEO <teo-gov_>`_
and ``ladder``. Which of them is used by default depends on the configuration
of the kernel and in particular on whether or not the scheduler tick can be
`stopped by the idle loop <idle-cpus-and-tick_>`_. It is possible to change the
governor at run time if the ``cpuidle_sysfs_switch`` command line parameter has
been passed to the kernel, but that is not safe in general, so it should not be
done on production systems (that may change in the future, though). The name of
the ``CPUIdle`` governor currently used by the kernel can be read from the
:file:`current_governor_ro` (or :file:`current_governor` if
``cpuidle_sysfs_switch`` is present in the kernel command line) file under
:file:`/sys/devices/system/cpu/cpuidle/` in ``sysfs``.
Which ``CPUIdle`` driver is used, on the other hand, usually depends on the
platform the kernel is running on, but there are platforms with more than one
matching driver. For example, there are two drivers that can work with the
majority of Intel platforms, ``intel_idle`` and ``acpi_idle``, one with
hardcoded idle states information and the other able to read that information
from the system's ACPI tables, respectively. Still, even in those cases, the
driver chosen at the system initialization time cannot be replaced later, so the
decision on which one of them to use has to be made early (on Intel platforms
the ``acpi_idle`` driver will be used if ``intel_idle`` is disabled for some
reason or if it does not recognize the processor). The name of the ``CPUIdle``
driver currently used by the kernel can be read from the :file:`current_driver`
file under :file:`/sys/devices/system/cpu/cpuidle/` in ``sysfs``.
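For instance, the governor and driver currently in use can be checked like
this::
cat /sys/devices/system/cpu/cpuidle/current_governor_ro
cat /sys/devices/system/cpu/cpuidle/current_driver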
.. _idle-cpus-and-tick:
Idle CPUs and The Scheduler Tick
================================
The scheduler tick is a timer that triggers periodically in order to implement
the time sharing strategy of the CPU scheduler. Of course, if there are
multiple runnable tasks assigned to one CPU at the same time, the only way to
allow them to make reasonable progress in a given time frame is to make them
share the available CPU time. Namely, in rough approximation, each task is
given a slice of the CPU time to run its code, subject to the scheduling class,
prioritization and so on and when that time slice is used up, the CPU should be
switched over to running (the code of) another task. The currently running task
may not want to give the CPU away voluntarily, however, and the scheduler tick
is there to make the switch happen regardless. That is not the only role of the
tick, but it is the primary reason for using it.
The scheduler tick is problematic from the CPU idle time management perspective,
because it triggers periodically and relatively often (depending on the kernel
configuration, the length of the tick period is between 1 ms and 10 ms).
Thus, if the tick is allowed to trigger on idle CPUs, it will not make sense
for them to ask the hardware to enter idle states with target residencies above
the tick period length. Moreover, in that case the idle duration of any CPU
will never exceed the tick period length and the energy used for entering and
exiting idle states due to the tick wakeups on idle CPUs will be wasted.
Fortunately, it is not really necessary to allow the tick to trigger on idle
CPUs, because (by definition) they have no tasks to run except for the special
"idle" one. In other words, from the CPU scheduler perspective, the only user
of the CPU time on them is the idle loop. Since the time of an idle CPU need
not be shared between multiple runnable tasks, the primary reason for using the
tick goes away if the given CPU is idle. Consequently, it is possible to stop
the scheduler tick entirely on idle CPUs in principle, even though that may not
always be worth the effort.
Whether or not it makes sense to stop the scheduler tick in the idle loop
depends on what is expected by the governor. First, if there is another
(non-tick) timer due to trigger within the tick range, stopping the tick clearly
would be a waste of time, even though the timer hardware may not need to be
reprogrammed in that case. Second, if the governor is expecting a non-timer
wakeup within the tick range, stopping the tick is not necessary and it may even
be harmful. Namely, in that case the governor will select an idle state with
the target residency within the time until the expected wakeup, so that state is
going to be relatively shallow. The governor really cannot select a deep idle
state then, as that would contradict its own expectation of a wakeup in short
order. Now, if the wakeup really occurs shortly, stopping the tick would be a
waste of time and in this case the timer hardware would need to be reprogrammed,
which is expensive. On the other hand, if the tick is stopped and the wakeup
does not occur any time soon, the hardware may spend an indefinite amount of time
in the shallow idle state selected by the governor, which will be a waste of
energy. Hence, if the governor is expecting a wakeup of any kind within the
tick range, it is better to allow the tick to trigger. Otherwise, however, the
governor will select a relatively deep idle state, so the tick should be stopped
so that it does not wake up the CPU too early.
In any case, the governor knows what it is expecting and the decision on whether
or not to stop the scheduler tick belongs to it. Still, if the tick has been
stopped already (in one of the previous iterations of the loop), it is better
to leave it as is and the governor needs to take that into account.
The kernel can be configured to disable stopping the scheduler tick in the idle
loop altogether. That can be done through the build-time configuration of it
(by unsetting the ``CONFIG_NO_HZ_IDLE`` configuration option) or by passing
``nohz=off`` to it in the command line. In both cases, as the stopping of the
scheduler tick is disabled, the governor's decisions regarding it are simply
ignored by the idle loop code and the tick is never stopped.
The systems that run kernels configured to allow the scheduler tick to be
stopped on idle CPUs are referred to as *tickless* systems and they are
generally regarded as more energy-efficient than the systems running kernels in
which the tick cannot be stopped. If the given system is tickless, it will use
the ``menu`` governor by default and if it is not tickless, the default
``CPUIdle`` governor on it will be ``ladder``.
.. _menu-gov:
The ``menu`` Governor
=====================
The ``menu`` governor is the default ``CPUIdle`` governor for tickless systems.
It is quite complex, but the basic principle of its design is straightforward.
Namely, when invoked to select an idle state for a CPU (i.e. an idle state that
the CPU will ask the processor hardware to enter), it attempts to predict the
idle duration and uses the predicted value for idle state selection.
It first obtains the time until the closest timer event with the assumption
that the scheduler tick will be stopped. That time, referred to as the *sleep
length* in what follows, is the upper bound on the time before the next CPU
wakeup. It is used to determine the sleep length range, which in turn is needed
to get the sleep length correction factor.
The ``menu`` governor maintains two arrays of sleep length correction factors.
One of them is used when tasks previously running on the given CPU are waiting
for some I/O operations to complete and the other one is used when that is not
the case. Each array contains several correction factor values that correspond
to different sleep length ranges organized so that each range represented in the
array is approximately 10 times wider than the previous one.
The correction factor for the given sleep length range (determined before
selecting the idle state for the CPU) is updated after the CPU has been woken
up and the closer the sleep length is to the observed idle duration, the closer
to 1 the correction factor becomes (it must fall between 0 and 1 inclusive).
The sleep length is multiplied by the correction factor for the range that it
falls into to obtain the first approximation of the predicted idle duration.
Next, the governor uses a simple pattern recognition algorithm to refine its
idle duration prediction. Namely, it saves the last 8 observed idle duration
values and, when predicting the idle duration next time, it computes the average
and variance of them. If the variance is small (smaller than 400 square
milliseconds) or it is small relative to the average (the average is greater
than 6 times the standard deviation), the average is regarded as the "typical
interval" value. Otherwise, the longest of the saved observed idle duration
values is discarded and the computation is repeated for the remaining ones.
Again, if the variance of them is small (in the above sense), the average is
taken as the "typical interval" value and so on, until either the "typical
interval" is determined or too many data points are disregarded, in which case
the "typical interval" is assumed to equal "infinity" (the maximum unsigned
integer value). The "typical interval" computed this way is compared with the
sleep length multiplied by the correction factor and the minimum of the two is
taken as the predicted idle duration.
Then, the governor computes an extra latency limit to help "interactive"
workloads. It uses the observation that if the exit latency of the selected
idle state is comparable with the predicted idle duration, the total time spent
in that state probably will be very short and the amount of energy to save by
entering it will be relatively small, so likely it is better to avoid the
overhead related to entering that state and exiting it. Thus selecting a
shallower state is likely to be a better option then. The first approximation
of the extra latency limit is the predicted idle duration itself which
additionally is divided by a value depending on the number of tasks that
previously ran on the given CPU and now they are waiting for I/O operations to
complete. The result of that division is compared with the latency limit coming
from the power management quality of service, or `PM QoS <cpu-pm-qos_>`_,
framework and the minimum of the two is taken as the limit for the idle states'
exit latency.
Now, the governor is ready to walk the list of idle states and choose one of
them. For this purpose, it compares the target residency of each state with
the predicted idle duration and the exit latency of it with the computed latency
limit. It selects the state with the target residency closest to the predicted
idle duration, but still below it, and exit latency that does not exceed the
limit.
In the final step the governor may still need to refine the idle state selection
if it has not decided to `stop the scheduler tick <idle-cpus-and-tick_>`_. That
happens if the idle duration predicted by it is less than the tick period and
the tick has not been stopped already (in a previous iteration of the idle
loop). Then, the sleep length used in the previous computations may not reflect
the real time until the closest timer event and if it really is greater than
that time, the governor may need to select a shallower state with a suitable
target residency.
.. _teo-gov:
The Timer Events Oriented (TEO) Governor
========================================
The timer events oriented (TEO) governor is an alternative ``CPUIdle`` governor
for tickless systems. It follows the same basic strategy as the ``menu`` `one
<menu-gov_>`_: it always tries to find the deepest idle state suitable for the
given conditions. However, it applies a different approach to that problem.
First, it does not use sleep length correction factors, but instead it attempts
to correlate the observed idle duration values with the available idle states
and use that information to pick up the idle state that is most likely to
"match" the upcoming CPU idle interval. Second, it does not take the tasks
that were running on the given CPU in the past and are waiting on some I/O
operations to complete into account at all (there is no guarantee that they will run on
the same CPU when they become runnable again) and the pattern detection code in
it avoids taking timer wakeups into account. It also only uses idle duration
values less than the current time till the closest timer (with the scheduler
tick excluded) for that purpose.
Like in the ``menu`` governor `case <menu-gov_>`_, the first step is to obtain
the *sleep length*, which is the time until the closest timer event with the
assumption that the scheduler tick will be stopped (that also is the upper bound
on the time until the next CPU wakeup). That value is then used to preselect an
idle state on the basis of three metrics maintained for each idle state provided
by the ``CPUIdle`` driver: ``hits``, ``misses`` and ``early_hits``.
The ``hits`` and ``misses`` metrics measure the likelihood that a given idle
state will "match" the observed (post-wakeup) idle duration if it "matches" the
sleep length. They both are subject to decay (after a CPU wakeup) every time
the target residency of the idle state corresponding to them is less than or
equal to the sleep length and the target residency of the next idle state is
greater than the sleep length (that is, when the idle state corresponding to
them "matches" the sleep length). The ``hits`` metric is increased if the
former condition is satisfied and the target residency of the given idle state
is less than or equal to the observed idle duration and the target residency of
the next idle state is greater than the observed idle duration at the same time
(that is, it is increased when the given idle state "matches" both the sleep
length and the observed idle duration). In turn, the ``misses`` metric is
increased when the given idle state "matches" the sleep length only and the
observed idle duration is too short for its target residency.
The ``early_hits`` metric measures the likelihood that a given idle state will
"match" the observed (post-wakeup) idle duration if it does not "match" the
sleep length. It is subject to decay on every CPU wakeup and it is increased
when the idle state corresponding to it "matches" the observed (post-wakeup)
idle duration and the target residency of the next idle state is less than or
equal to the sleep length (i.e. the idle state "matching" the sleep length is
deeper than the given one).
The governor walks the list of idle states provided by the ``CPUIdle`` driver
and finds the last (deepest) one with the target residency less than or equal
to the sleep length. Then, the ``hits`` and ``misses`` metrics of that idle
state are compared with each other and it is preselected if the ``hits`` one is
greater (which means that that idle state is likely to "match" the observed idle
duration after CPU wakeup). If the ``misses`` one is greater, the governor
preselects the shallower idle state with the maximum ``early_hits`` metric
(or if there are multiple shallower idle states with equal ``early_hits``
metric which also is the maximum, the shallowest of them will be preselected).
[If there is a wakeup latency constraint coming from the `PM QoS framework
<cpu-pm-qos_>`_ which is hit before reaching the deepest idle state with the
target residency within the sleep length, the deepest idle state with the exit
latency within the constraint is preselected without consulting the ``hits``,
``misses`` and ``early_hits`` metrics.]
Next, the governor takes several idle duration values observed most recently
into consideration and if at least a half of them are greater than or equal to
the target residency of the preselected idle state, that idle state becomes the
final candidate to ask for. Otherwise, the average of the most recent idle
duration values below the target residency of the preselected idle state is
computed and the governor walks the idle states shallower than the preselected
one and finds the deepest of them with the target residency within that average.
That idle state is then taken as the final candidate to ask for.
Still, at this point the governor may need to refine the idle state selection if
it has not decided to `stop the scheduler tick <idle-cpus-and-tick_>`_. That
generally happens if the target residency of the idle state selected so far is
less than the tick period and the tick has not been stopped already (in a
previous iteration of the idle loop). Then, like in the ``menu`` governor
`case <menu-gov_>`_, the sleep length used in the previous computations may not
reflect the real time until the closest timer event and if it really is greater
than that time, a shallower state with a suitable target residency may need to
be selected.
.. _idle-states-representation:
Representation of Idle States
=============================
For the CPU idle time management purposes all of the physical idle states
supported by the processor have to be represented as a one-dimensional array of
|struct cpuidle_state| objects each allowing an individual (logical) CPU to ask
the processor hardware to enter an idle state of certain properties. If there
is a hierarchy of units in the processor, one |struct cpuidle_state| object can
cover a combination of idle states supported by the units at different levels of
the hierarchy. In that case, the `target residency and exit latency parameters
of it <idle-loop_>`_ must reflect the properties of the idle state at the
deepest level (i.e. the idle state of the unit containing all of the other
units).
For example, take a processor with two cores in a larger unit referred to as
a "module" and suppose that asking the hardware to enter a specific idle state
(say "X") at the "core" level by one core will trigger the module to try to
enter a specific idle state of its own (say "MX") if the other core is in idle
state "X" already. In other words, asking for idle state "X" at the "core"
level gives the hardware a license to go as deep as to idle state "MX" at the
"module" level, but there is no guarantee that this is going to happen (the core
asking for idle state "X" may just end up in that state by itself instead).
Then, the target residency of the |struct cpuidle_state| object representing
idle state "X" must reflect the minimum time to spend in idle state "MX" of
the module (including the time needed to enter it), because that is the minimum
time the CPU needs to be idle to save any energy in case the hardware enters
that state. Analogously, the exit latency parameter of that object must cover
the exit time of idle state "MX" of the module (and usually its entry time too),
because that is the maximum delay between a wakeup signal and the time the CPU
will start to execute the first new instruction (assuming that both cores in the
module will always be ready to execute instructions as soon as the module
becomes operational as a whole).
There are processors without direct coordination between different levels of the
hierarchy of units inside them, however. In those cases asking for an idle
state at the "core" level does not automatically affect the "module" level, for
example, in any way and the ``CPUIdle`` driver is responsible for the entire
handling of the hierarchy. Then, the definition of the idle state objects is
entirely up to the driver, but still the physical properties of the idle state
that the processor hardware finally goes into must always follow the parameters
used by the governor for idle state selection (for instance, the actual exit
latency of that idle state must not exceed the exit latency parameter of the
idle state object selected by the governor).
In addition to the target residency and exit latency idle state parameters
discussed above, the objects representing idle states each contain a few other
parameters describing the idle state and a pointer to the function to run in
order to ask the hardware to enter that state. Also, for each
|struct cpuidle_state| object, there is a corresponding
:c:type:`struct cpuidle_state_usage <cpuidle_state_usage>` one containing usage
statistics of the given idle state. That information is exposed by the kernel
via ``sysfs``.
For each CPU in the system, there is a :file:`/sys/devices/system/cpu/cpu<N>/cpuidle/`
directory in ``sysfs``, where the number ``<N>`` is assigned to the given
CPU at the initialization time. That directory contains a set of subdirectories
called :file:`state0`, :file:`state1` and so on, up to the number of idle state
objects defined for the given CPU minus one. Each of these directories
corresponds to one idle state object and the larger the number in its name, the
deeper the (effective) idle state represented by it. Each of them contains
a number of files (attributes) representing the properties of the idle state
object corresponding to it, as follows:
``above``
Total number of times this idle state had been asked for, but the
observed idle duration was certainly too short to match its target
residency.
``below``
Total number of times this idle state had been asked for, but certainly
a deeper idle state would have been a better match for the observed idle
duration.
``desc``
Description of the idle state.
``disable``
Whether or not this idle state is disabled.
``latency``
Exit latency of the idle state in microseconds.
``name``
Name of the idle state.
``power``
Power drawn by hardware in this idle state in milliwatts (if specified,
0 otherwise).
``residency``
Target residency of the idle state in microseconds.
``time``
Total time spent in this idle state by the given CPU (as measured by the
kernel) in microseconds.
``usage``
Total number of times the hardware has been asked by the given CPU to
enter this idle state.
The :file:`desc` and :file:`name` files both contain strings. The difference
between them is that the name is expected to be more concise, while the
description may be longer and it may contain white space or special characters.
The other files listed above contain integer numbers.
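For example, the attributes of the idle state object :file:`state1` of CPU 0
(both numbers are arbitrary) can be inspected like this::
ls /sys/devices/system/cpu/cpu0/cpuidle/state1/
cat /sys/devices/system/cpu/cpu0/cpuidle/state1/name
cat /sys/devices/system/cpu/cpu0/cpuidle/state1/latency
cat /sys/devices/system/cpu/cpu0/cpuidle/state1/residency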
The :file:`disable` attribute is the only writeable one. If it contains 1, the
given idle state is disabled for this particular CPU, which means that the
governor will never select it for this particular CPU and the ``CPUIdle``
driver will never ask the hardware to enter it for that CPU as a result.
However, disabling an idle state for one CPU does not prevent it from being
asked for by the other CPUs, so it must be disabled for all of them in order to
never be asked for by any of them. [Note that, due to the way the ``ladder``
governor is implemented, disabling an idle state prevents that governor from
selecting any idle states deeper than the disabled one too.]
If the :file:`disable` attribute contains 0, the given idle state is enabled for
this particular CPU, but it still may be disabled for some or all of the other
CPUs in the system at the same time. Writing 1 to it causes the idle state to
be disabled for this particular CPU and writing 0 to it allows the governor to
take it into consideration for the given CPU and the driver to ask for it,
unless that state was disabled globally in the driver (in which case it cannot
be used at all).
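For example, a sketch of disabling the idle state object :file:`state2` (an
arbitrary choice) for all CPUs in the system and enabling it again later::
for f in /sys/devices/system/cpu/cpu*/cpuidle/state2/disable; do echo 1 > "$f"; done
for f in /sys/devices/system/cpu/cpu*/cpuidle/state2/disable; do echo 0 > "$f"; done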
The :file:`power` attribute is not defined very well, especially for idle state
objects representing combinations of idle states at different levels of the
hierarchy of units in the processor, and it generally is hard to obtain idle
state power numbers for complex hardware, so :file:`power` often contains 0 (not
available) and if it contains a nonzero number, that number may not be very
accurate and it should not be relied on for anything meaningful.
The number in the :file:`time` file generally may be greater than the total time
really spent by the given CPU in the given idle state, because it is measured by
the kernel and it may not cover the cases in which the hardware refused to enter
this idle state and entered a shallower one instead of it (or even it did not
enter any idle state at all). The kernel can only measure the time span between
asking the hardware to enter an idle state and the subsequent wakeup of the CPU
and it cannot say what really happened in the meantime at the hardware level.
Moreover, if the idle state object in question represents a combination of idle
states at different levels of the hierarchy of units in the processor,
the kernel can never say how deep the hardware went down the hierarchy in any
particular case. For these reasons, the only reliable way to find out how
much time has been spent by the hardware in different idle states supported by
it is to use idle state residency counters in the hardware, if available.
.. _cpu-pm-qos:
Power Management Quality of Service for CPUs
============================================
The power management quality of service (PM QoS) framework in the Linux kernel
allows kernel code and user space processes to set constraints on various
energy-efficiency features of the kernel to prevent performance from dropping
below a required level. The PM QoS constraints can be set globally, in
predefined categories referred to as PM QoS classes, or against individual
devices.
CPU idle time management can be affected by PM QoS in two ways, through the
global constraint in the ``PM_QOS_CPU_DMA_LATENCY`` class and through the
resume latency constraints for individual CPUs. Kernel code (e.g. device
drivers) can set both of them with the help of special internal interfaces
provided by the PM QoS framework. User space can modify the former by opening
the :file:`cpu_dma_latency` special device file under :file:`/dev/` and writing
a binary value (interpreted as a signed 32-bit integer) to it. In turn, the
resume latency constraint for a CPU can be modified by user space by writing a
string (representing a signed 32-bit integer) to the
:file:`power/pm_qos_resume_latency_us` file under
:file:`/sys/devices/system/cpu/cpu<N>/` in ``sysfs``, where the CPU number
``<N>`` is allocated at the system initialization time. Negative values
will be rejected in both cases and, also in both cases, the written integer
number will be interpreted as a requested PM QoS constraint in microseconds.
The requested value is not automatically applied as a new constraint, however,
as it may be less restrictive (greater in this particular case) than another
constraint previously requested by someone else. For this reason, the PM QoS
framework maintains a list of requests that have been made so far in each
global class and for each device, aggregates them and applies the effective
(minimum in this particular case) value as the new constraint.
In fact, opening the :file:`cpu_dma_latency` special device file causes a new
PM QoS request to be created and added to the priority list of requests in the
``PM_QOS_CPU_DMA_LATENCY`` class and the file descriptor coming from the
"open" operation represents that request. If that file descriptor is then
used for writing, the number written to it will be associated with the PM QoS
request represented by it as a new requested constraint value. Next, the
priority list mechanism will be used to determine the new effective value of
the entire list of requests and that effective value will be set as a new
constraint. Thus setting a new requested constraint value will only change the
real constraint if the effective "list" value is affected by it. In particular,
for the ``PM_QOS_CPU_DMA_LATENCY`` class it only affects the real constraint if
it is the minimum of the requested constraints in the list. The process holding
a file descriptor obtained by opening the :file:`cpu_dma_latency` special device
file controls the PM QoS request associated with that file descriptor, but it
controls this particular PM QoS request only.
Closing the :file:`cpu_dma_latency` special device file or, more precisely, the
file descriptor obtained while opening it, causes the PM QoS request associated
with that file descriptor to be removed from the ``PM_QOS_CPU_DMA_LATENCY``
class priority list and destroyed. If that happens, the priority list mechanism
will be used, again, to determine the new effective value for the whole list
and that value will become the new real constraint.
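As a sketch only (it assumes a shell that can hold a file descriptor open, a
little-endian system so that the four raw bytes below encode the signed 32-bit
value 20, and a hypothetical workload command), a process could hold a
``PM_QOS_CPU_DMA_LATENCY`` request for the duration of a run like this::
exec 3> /dev/cpu_dma_latency          # opening the file creates the request
printf '\x14\x00\x00\x00' >&3         # request a 20 us constraint
./latency-sensitive-workload          # hypothetical command
exec 3>&-                             # closing the descriptor removes the request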
In turn, for each CPU there is only one resume latency PM QoS request
associated with the :file:`power/pm_qos_resume_latency_us` file under
:file:`/sys/devices/system/cpu/cpu<N>/` in ``sysfs`` and writing to it causes
this single PM QoS request to be updated regardless of which user space
process does that. In other words, this PM QoS request is shared by the entire
user space, so access to the file associated with it needs to be arbitrated
to avoid confusion. [Arguably, the only legitimate use of this mechanism in
practice is to pin a process to the CPU in question and let it use the
``sysfs`` interface to control the resume latency constraint for it.] It
still only is a request, however. It is a member of a priority list used to
determine the effective value to be set as the resume latency constraint for the
CPU in question every time the list of requests is updated this way or another
(there may be other requests coming from kernel code in that list).
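For instance, requesting a resume latency constraint of 20 microseconds for
CPU 0 (the value is arbitrary and writing requires sufficient privileges) might
look like this::
echo 20 > /sys/devices/system/cpu/cpu0/power/pm_qos_resume_latency_us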
CPU idle time governors are expected to regard the minimum of the global
effective ``PM_QOS_CPU_DMA_LATENCY`` class constraint and the effective
resume latency constraint for the given CPU as the upper limit for the exit
latency of the idle states they can select for that CPU. They should never
select any idle states with exit latency beyond that limit.
Idle States Control Via Kernel Command Line
===========================================
In addition to the ``sysfs`` interface allowing individual idle states to be
`disabled for individual CPUs <idle-states-representation_>`_, there are kernel
command line parameters affecting CPU idle time management.
The ``cpuidle.off=1`` kernel command line option can be used to disable the
CPU idle time management entirely. It does not prevent the idle loop from
running on idle CPUs, but it prevents the CPU idle time governors and drivers
from being invoked. If it is added to the kernel command line, the idle loop
will ask the hardware to enter idle states on idle CPUs via the CPU architecture
support code that is expected to provide a default mechanism for this purpose.
That default mechanism usually is the least common denominator for all of the
processors implementing the architecture (i.e. CPU instruction set) in question,
however, so it is rather crude and not very energy-efficient. For this reason,
it is not recommended for production use.
The ``cpuidle.governor=`` kernel command line switch allows the ``CPUIdle``
governor to use to be specified. It has to be appended with a string matching
the name of an available governor (e.g. ``cpuidle.governor=menu``) and that
governor will be used instead of the default one. It is possible to force
the ``menu`` governor to be used on the systems that use the ``ladder`` governor
by default this way, for example.
The other kernel command line parameters controlling CPU idle time management
described below are only relevant for the *x86* architecture and some of
them affect Intel processors only.
The *x86* architecture support code recognizes three kernel command line
options related to CPU idle time management: ``idle=poll``, ``idle=halt``,
and ``idle=nomwait``. The first two of them disable the ``acpi_idle`` and
``intel_idle`` drivers altogether, which effectively causes the entire
``CPUIdle`` subsystem to be disabled and makes the idle loop invoke the
architecture support code to deal with idle CPUs. How it does that depends on
which of the two parameters is added to the kernel command line. In the
``idle=halt`` case, the architecture support code will use the ``HLT``
instruction of the CPUs (which, as a rule, suspends the execution of the program
and causes the hardware to attempt to enter the shallowest available idle state)
for this purpose, and if ``idle=poll`` is used, idle CPUs will execute a
more or less "lightweight" sequence of instructions in a tight loop. [Note
that using ``idle=poll`` is somewhat drastic in many cases, as preventing idle
CPUs from saving almost any energy at all may not be the only effect of it.
For example, on Intel hardware it effectively prevents CPUs from using
P-states (see |cpufreq|) that require any number of CPUs in a package to be
idle, so it very well may hurt single-thread computations performance as well as
energy-efficiency. Thus using it for performance reasons may not be a good idea
at all.]
The ``idle=nomwait`` option disables the ``intel_idle`` driver and causes
``acpi_idle`` to be used (as long as all of the information needed by it is
there in the system's ACPI tables), but it is not allowed to use the
``MWAIT`` instruction of the CPUs to ask the hardware to enter idle states.
In addition to the architecture-level kernel command line options affecting CPU
idle time management, there are parameters affecting individual ``CPUIdle``
drivers that can be passed to them via the kernel command line. Specifically,
the ``intel_idle.max_cstate=<n>`` and ``processor.max_cstate=<n>`` parameters,
where ``<n>`` is an idle state index also used in the name of the given
state's directory in ``sysfs`` (see
`Representation of Idle States <idle-states-representation_>`_), cause the
``intel_idle`` and ``acpi_idle`` drivers, respectively, to discard all of the
idle states deeper than idle state ``<n>``. In that case, they will never ask
for any of those idle states or expose them to the governor. [The behavior of
the two drivers is different for ``<n>`` equal to ``0``. Adding
``intel_idle.max_cstate=0`` to the kernel command line disables the
``intel_idle`` driver and allows ``acpi_idle`` to be used, whereas
``processor.max_cstate=0`` is equivalent to ``processor.max_cstate=1``.
Also, the ``acpi_idle`` driver is part of the ``processor`` kernel module that
can be loaded separately and ``max_cstate=<n>`` can be passed to it as a module
parameter when it is loaded.]
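For illustration, a hypothetical kernel command line fragment combining some of
the parameters described above, which forces the ``menu`` governor to be used
and makes ``intel_idle`` discard all idle states deeper than idle state ``2``,
might look like this::
cpuidle.governor=menu intel_idle.max_cstate=2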

View File

@@ -495,7 +495,15 @@ on the following rules, regardless of the current operation mode of the driver:
2. Each individual CPU is affected by its own per-policy limits (that is, it
cannot be requested to run faster than its own per-policy maximum and it
cannot be requested to run slower than its own per-policy minimum).
cannot be requested to run slower than its own per-policy minimum). The
effective performance depends on whether the platform supports per-core
P-states, whether hyper-threading is enabled, and on the current performance
requests from other CPUs. When the platform doesn't support per-core P-states,
the effective performance can be higher than the policy limits set on a CPU if
other CPUs are requesting higher performance at that moment. Even with per-core
P-states support, when hyper-threading is enabled, if the sibling CPU is
requesting higher performance, the other siblings will get higher performance
than their policy limits.
3. The global and per-policy limits can be set independently.

View File

@@ -5,5 +5,6 @@ Working-State Power Management
.. toctree::
:maxdepth: 2
cpuidle
cpufreq
intel_pstate

View File

@@ -54,7 +54,7 @@ those errors are correctable.
Types of errors
---------------
Most mechanisms used on modern systems use use technologies like Hamming
Most mechanisms used on modern systems use technologies like Hamming
Codes that allow error correction when the number of errors on a bit packet
is below a threshold. If the number of errors is above, those mechanisms
can indicate with a high degree of confidence that an error happened, but

View File

@@ -67,7 +67,7 @@ If you can't figure out which subsystem caused the issue, you should file
a bug in kernel.org bugzilla and send email to
linux-kernel@vger.kernel.org, referencing the bugzilla URL. (For more
information on the linux-kernel mailing list see
http://www.tux.org/lkml/).
http://vger.kernel.org/lkml/).
Tips for reporting bugs

View File

@@ -44,7 +44,7 @@ only valid reason for deferring the publication of a fix is to accommodate
the logistics of QA and large scale rollouts which require release
coordination.
Whilst embargoed information may be shared with trusted individuals in
While embargoed information may be shared with trusted individuals in
order to develop a fix, such information will not be published alongside
the fix or on any other disclosure channel without the permission of the
reporter. This includes but is not limited to the original bug report

View File

@@ -1,59 +1,164 @@
Tainted kernels
---------------
Some oops reports contain the string **'Tainted: '** after the program
counter. This indicates that the kernel has been tainted by some
mechanism. The string is followed by a series of position-sensitive
characters, each representing a particular tainted value.
The kernel will mark itself as 'tainted' when something occurs that might be
relevant later when investigating problems. Don't worry too much about this,
most of the time it's not a problem to run a tainted kernel; the information is
mainly of interest once someone wants to investigate some problem, as its real
cause might be the event that got the kernel tainted. That's why bug reports
from tainted kernels will often be ignored by developers, hence try to reproduce
problems with an untainted kernel.
1) ``G`` if all modules loaded have a GPL or compatible license, ``P`` if
Note the kernel will remain tainted even after you undo what caused the taint
(i.e. unload a proprietary kernel module), to indicate the kernel remains not
trustworthy. That's also why the kernel will print the tainted state when it
notices an internal problem (a 'kernel bug'), a recoverable error
('kernel oops') or a non-recoverable error ('kernel panic') and writes debug
information about this to the logs ``dmesg`` outputs. It's also possible to
check the tainted state at runtime through a file in ``/proc/``.
Tainted flag in bugs, oops or panics messages
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
You find the tainted state near the top in a line starting with 'CPU:'; if or
why the kernel was tainted is shown after the Process ID ('PID:') and a shortened
name of the command ('Comm:') that triggered the event::
BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
Oops: 0002 [#1] SMP PTI
CPU: 0 PID: 4424 Comm: insmod Tainted: P W O 4.20.0-0.rc6.fc30 #1
Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
RIP: 0010:my_oops_init+0x13/0x1000 [kpanic]
[...]
You'll find a 'Not tainted: ' there if the kernel was not tainted at the
time of the event; if it was, then it will print 'Tainted: ' followed by a
series of characters, either letters or blanks. In the above example it looks
like this::
Tainted: P W O
The meaning of those characters is explained in the table below. In this case
the kernel got tainted earlier because a proprietary module (``P``) was loaded,
a warning occurred (``W``), and an externally-built module was loaded (``O``).
To decode other letters use the table below.
Decoding tainted state at runtime
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
At runtime, you can query the tainted state by reading
``cat /proc/sys/kernel/tainted``. If that returns ``0``, the kernel is not
tainted; any other number indicates the reasons why it is. The easiest way to
decode that number is the script ``tools/debugging/kernel-chktaint``, which your
distribution might ship as part of a package called ``linux-tools`` or
``kernel-tools``; if it doesn't you can download the script from
`git.kernel.org <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/tools/debugging/kernel-chktaint>`_
and execute it with ``sh kernel-chktaint``, which would print something like
this on the machine that had the statements in the logs that were quoted earlier::
Kernel is Tainted for following reasons:
* Proprietary module was loaded (#0)
* Kernel issued warning (#9)
* Externally-built ('out-of-tree') module was loaded (#12)
See Documentation/admin-guide/tainted-kernels.rst in the Linux kernel or
https://www.kernel.org/doc/html/latest/admin-guide/tainted-kernels.html for
a more detailed explanation of the various taint flags.
Raw taint value as int/string: 4609/'P W O '
You can try to decode the number yourself. That's easy if there was only one
reason that got your kernel tainted, as in this case you can find the number
with the table below. If there were multiple reasons you need to decode the
number, as it is a bitfield, where each bit indicates the absence or presence of
a particular type of taint. It's best to leave that to the aforementioned
script, but if you need something quick you can use this shell command to check
which bits are set::
$ for i in $(seq 18); do echo $(($i-1)) $(($(cat /proc/sys/kernel/tainted)>>($i-1)&1));done
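For example, the raw value 4609 quoted above decodes as 4096 + 512 + 1, i.e.
bits 12, 9 and 0 are set, which the table below maps to 'O', 'W' and 'P'.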
Table for decoding tainted state
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
=== === ====== ========================================================
Bit Log Number Reason that got the kernel tainted
=== === ====== ========================================================
0 G/P 1 proprietary module was loaded
1 _/F 2 module was force loaded
2 _/S 4 SMP kernel oops on an officially SMP incapable processor
3 _/R 8 module was force unloaded
4 _/M 16 processor reported a Machine Check Exception (MCE)
5 _/B 32 bad page referenced or some unexpected page flags
6 _/U 64 taint requested by userspace application
7 _/D 128 kernel died recently, i.e. there was an OOPS or BUG
8 _/A 256 ACPI table overridden by user
9 _/W 512 kernel issued warning
10 _/C 1024 staging driver was loaded
11 _/I 2048 workaround for bug in platform firmware applied
12 _/O 4096 externally-built ("out-of-tree") module was loaded
13 _/E 8192 unsigned module was loaded
14 _/L 16384 soft lockup occurred
15 _/K 32768 kernel has been live patched
16 _/X 65536 auxiliary taint, defined for and used by distros
17 _/T 131072 kernel was built with the struct randomization plugin
=== === ====== ========================================================
Note: The character ``_`` represents a blank in this table to make reading
easier.
More detailed explanation for tainting
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
0) ``G`` if all modules loaded have a GPL or compatible license, ``P`` if
any proprietary module has been loaded. Modules without a
MODULE_LICENSE or with a MODULE_LICENSE that is not recognised by
insmod as GPL compatible are assumed to be proprietary.
2) ``F`` if any module was force loaded by ``insmod -f``, ``' '`` if all
1) ``F`` if any module was force loaded by ``insmod -f``, ``' '`` if all
modules were loaded normally.
3) ``S`` if the oops occurred on an SMP kernel running on hardware that
2) ``S`` if the oops occurred on an SMP kernel running on hardware that
hasn't been certified as safe to run multiprocessor.
Currently this occurs only on various Athlons that are not
SMP capable.
4) ``R`` if a module was force unloaded by ``rmmod -f``, ``' '`` if all
3) ``R`` if a module was force unloaded by ``rmmod -f``, ``' '`` if all
modules were unloaded normally.
5) ``M`` if any processor has reported a Machine Check Exception,
4) ``M`` if any processor has reported a Machine Check Exception,
``' '`` if no Machine Check Exceptions have occurred.
6) ``B`` if a page-release function has found a bad page reference or
some unexpected page flags.
5) ``B`` If a page-release function has found a bad page reference or some
unexpected page flags. This indicates a hardware problem or a kernel bug;
there should be other information in the log indicating why this tainting
occurred.
7) ``U`` if a user or user application specifically requested that the
6) ``U`` if a user or user application specifically requested that the
Tainted flag be set, ``' '`` otherwise.
8) ``D`` if the kernel has died recently, i.e. there was an OOPS or BUG.
7) ``D`` if the kernel has died recently, i.e. there was an OOPS or BUG.
9) ``A`` if the ACPI table has been overridden.
8) ``A`` if an ACPI table has been overridden.
10) ``W`` if a warning has previously been issued by the kernel.
9) ``W`` if a warning has previously been issued by the kernel.
(Though some warnings may set more specific taint flags.)
11) ``C`` if a staging driver has been loaded.
10) ``C`` if a staging driver has been loaded.
12) ``I`` if the kernel is working around a severe bug in the platform
11) ``I`` if the kernel is working around a severe bug in the platform
firmware (BIOS or similar).
13) ``O`` if an externally-built ("out-of-tree") module has been loaded.
12) ``O`` if an externally-built ("out-of-tree") module has been loaded.
14) ``E`` if an unsigned module has been loaded in a kernel supporting
13) ``E`` if an unsigned module has been loaded in a kernel supporting
module signature.
15) ``L`` if a soft lockup has previously occurred on the system.
14) ``L`` if a soft lockup has previously occurred on the system.
16) ``K`` if the kernel has been live patched.
15) ``K`` if the kernel has been live patched.
The primary reason for the **'Tainted: '** string is to tell kernel
debuggers if this is a clean kernel or if anything unusual has
occurred. Tainting is permanent: even if an offending module is
unloaded, the tainted value remains to indicate that the kernel is not
trustworthy.
16) ``X`` Auxiliary taint, defined for and used by Linux distributors.
17) ``T`` Kernel was built with the randstruct plugin, which can intentionally
produce extremely unusual kernel structure layouts (even performance
pathological ones), which is important to know when debugging. Set at
build time.

View File

@@ -133,6 +133,26 @@ If the user still wants to connect the device they can either approve
the device without a key or write a new key and write 1 to the
``authorized`` file to get the new key stored on the device NVM.
DMA protection utilizing IOMMU
------------------------------
Recent systems from 2018 and forward with Thunderbolt ports may natively
support IOMMU. This means that Thunderbolt security is handled by an IOMMU
so connected devices cannot access memory regions outside of what is
allocated for them by drivers. When Linux is running on such a system, it
automatically enables IOMMU if not enabled by the user already. These
systems can be identified by reading ``1`` from
``/sys/bus/thunderbolt/devices/domainX/iommu_dma_protection`` attribute.
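For example (the domain number is system-dependent; ``domain0`` below is only
an assumption)::
cat /sys/bus/thunderbolt/devices/domain0/iommu_dma_protection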
The driver does not do anything special in this case but because DMA
protection is handled by the IOMMU, security levels (if set) are
redundant. For this reason some systems ship with security level set to
``none``. Other systems have security level set to ``user`` in order to
support downgrade to older OS, so users who want to automatically
authorize devices when IOMMU DMA protection is enabled can use the
following ``udev`` rule::
ACTION=="add", SUBSYSTEM=="thunderbolt", ATTRS{iommu_dma_protection}=="1", ATTR{authorized}=="0", ATTR{authorized}="1"
Upgrading NVM on Thunderbolt device or host
-------------------------------------------
Since most of the functionality is handled in firmware running on a

View File

@@ -126,7 +126,7 @@ tagged list.
The boot loader must pass at a minimum the size and location of the
system memory, and the root filesystem location. The dtb must be
placed in a region of memory where the kernel decompressor will not
overwrite it, whilst remaining within the region which will be covered
overwrite it, while remaining within the region which will be covered
by the kernel's low-memory mapping.
A safe location is just above the 128MiB boundary from start of RAM.

View File

@@ -55,7 +55,7 @@ out s3c2410 API, then here are some notes on the process.
as they have the same arguments, and can either take the pin specific
values, or the more generic special-function-number arguments.
3) s3c2410_gpio_pullup() changes have the problem that whilst the
3) s3c2410_gpio_pullup() changes have the problem that while the
s3c2410_gpio_pullup(x, 1) can be easily translated to the
s3c_gpio_setpull(x, S3C_GPIO_PULL_NONE), the s3c2410_gpio_pullup(x, 0)
are not so easy.

View File

@@ -17,7 +17,7 @@ Introduction
versions.
The S3C2416 and S3C2450 devices are very similar and S3C2450 support is
included under the arch/arm/mach-s3c2416 directory. Note, whilst core
included under the arch/arm/mach-s3c2416 directory. Note, while core
support for these SoCs is in, work on some of the extra peripherals
and extra interrupts is still ongoing.

View File

@@ -87,7 +87,7 @@ Debugging
suspending, which means that use of printascii() or similar direct
access to the UARTs will cause the debug to stop.
2) Whilst the pm code itself will attempt to re-enable the UART clocks,
2) While the pm code itself will attempt to re-enable the UART clocks,
care should be taken that any external clock sources that the UARTs
rely on are still enabled at that point.

View File

@@ -6,7 +6,7 @@ TL;DR summary
* Use only NEON instructions, or VFP instructions that don't rely on support
code
* Isolate your NEON code in a separate compilation unit, and compile it with
'-mfpu=neon -mfloat-abi=softfp'
'-march=armv7-a -mfpu=neon -mfloat-abi=softfp'
* Put kernel_neon_begin() and kernel_neon_end() calls around the calls into your
NEON code
* Don't sleep in your NEON code, and be aware that it will be executed with
@@ -87,7 +87,7 @@ instructions appearing in unexpected places if no special care is taken.
Therefore, the recommended and only supported way of using NEON/VFP in the
kernel is by adhering to the following rules:
* isolate the NEON code in a separate compilation unit and compile it with
'-mfpu=neon -mfloat-abi=softfp';
'-march=armv7-a -mfpu=neon -mfloat-abi=softfp';
* issue the calls to kernel_neon_begin(), kernel_neon_end() as well as the calls
into the unit containing the NEON code from a compilation unit which is *not*
built with the GCC flag '-mfpu=neon' set.
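As a hedged sketch of these rules (the helper name, file split and operation
are invented for illustration), a caller built without '-mfpu=neon' might
wrap the NEON unit like this::

	/* caller.c - compiled WITHOUT -mfpu=neon */
	#include <linux/types.h>
	#include <asm/neon.h>	/* kernel_neon_begin(), kernel_neon_end() */

	/* Hypothetical helper defined in a separate compilation unit built
	 * with '-march=armv7-a -mfpu=neon -mfloat-abi=softfp'. */
	void do_neon_xor(u8 *dst, const u8 *src, int len);

	static void process_buffer(u8 *dst, const u8 *src, int len)
	{
		kernel_neon_begin();	/* no sleeping until kernel_neon_end() */
		do_neon_xor(dst, src, len);
		kernel_neon_end();
	}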

View File

@@ -188,6 +188,11 @@ Before jumping into the kernel, the following conditions must be met:
the kernel image will be entered must be initialised by software at a
higher exception level to prevent execution in an UNKNOWN state.
- SCR_EL3.FIQ must have the same value across all CPUs the kernel is
executing on.
- The value of SCR_EL3.FIQ must be the same as the one present at boot
time whenever the kernel is executing.
For systems with a GICv3 interrupt controller to be used in v3 mode:
- If EL3 is present:
ICC_SRE_EL3.Enable (bit 3) must be initialised to 0b1.
@@ -205,6 +210,14 @@ Before jumping into the kernel, the following conditions must be met:
ICC_SRE_EL2.SRE (bit 0) must be initialised to 0b0.
- The DT or ACPI tables must describe a GICv2 interrupt controller.
For CPUs with pointer authentication functionality:
- If EL3 is present:
SCR_EL3.APK (bit 16) must be initialised to 0b1
SCR_EL3.API (bit 17) must be initialised to 0b1
- If the kernel is entered at EL1:
HCR_EL2.APK (bit 40) must be initialised to 0b1
HCR_EL2.API (bit 41) must be initialised to 0b1
The requirements described above for CPU mode, caches, MMUs, architected
timers, coherency and system registers apply to all CPUs. All CPUs must
enter the kernel in the same exception level.

View File

@@ -184,12 +184,20 @@ infrastructure:
x--------------------------------------------------x
| Name | bits | visible |
|--------------------------------------------------|
| GPI | [31-28] | y |
|--------------------------------------------------|
| GPA | [27-24] | y |
|--------------------------------------------------|
| LRCPC | [23-20] | y |
|--------------------------------------------------|
| FCMA | [19-16] | y |
|--------------------------------------------------|
| JSCVT | [15-12] | y |
|--------------------------------------------------|
| API | [11-8] | y |
|--------------------------------------------------|
| APA | [7-4] | y |
|--------------------------------------------------|
| DPB | [3-0] | y |
x--------------------------------------------------x
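As an illustrative sketch (not part of the table above, and assuming a
toolchain that accepts the named system register), userspace can read the
sanitised register through the kernel's MRS emulation and extract a field,
e.g. APA in bits [7:4]::

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t isar1;

		/* Trapped and emulated by the kernel; only fields marked
		 * visible are returned as-is. */
		asm("mrs %0, ID_AA64ISAR1_EL1" : "=r" (isar1));
		printf("APA = %u\n", (unsigned int)((isar1 >> 4) & 0xf));
		return 0;
	}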

View File

@@ -182,3 +182,15 @@ HWCAP_FLAGM
HWCAP_SSBS
Functionality implied by ID_AA64PFR1_EL1.SSBS == 0b0010.
HWCAP_PACA
Functionality implied by ID_AA64ISAR1_EL1.APA == 0b0001 or
ID_AA64ISAR1_EL1.API == 0b0001, as described by
Documentation/arm64/pointer-authentication.txt.
HWCAP_PACG
Functionality implied by ID_AA64ISAR1_EL1.GPA == 0b0001 or
ID_AA64ISAR1_EL1.GPI == 0b0001, as described by
Documentation/arm64/pointer-authentication.txt.
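As a minimal sketch (assuming kernel uapi headers that define these HWCAP
bits), userspace can test for the capabilities via the auxiliary vector::

	#include <stdio.h>
	#include <sys/auxv.h>
	#include <asm/hwcap.h>	/* HWCAP_PACA, HWCAP_PACG */

	int main(void)
	{
		unsigned long hwcaps = getauxval(AT_HWCAP);

		if (hwcaps & HWCAP_PACA)
			printf("address authentication supported\n");
		if (hwcaps & HWCAP_PACG)
			printf("generic authentication supported\n");
		return 0;
	}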

View File

@@ -0,0 +1,93 @@
Pointer authentication in AArch64 Linux
=======================================
Author: Mark Rutland <mark.rutland@arm.com>
Date: 2017-07-19
This document briefly describes the provision of pointer authentication
functionality in AArch64 Linux.
Architecture overview
---------------------
The ARMv8.3 Pointer Authentication extension adds primitives that can be
used to mitigate certain classes of attack where an attacker can corrupt
the contents of some memory (e.g. the stack).
The extension uses a Pointer Authentication Code (PAC) to determine
whether pointers have been modified unexpectedly. A PAC is derived from
a pointer, another value (such as the stack pointer), and a secret key
held in system registers.
The extension adds instructions to insert a valid PAC into a pointer,
and to verify/remove the PAC from a pointer. The PAC occupies a number
of high-order bits of the pointer, which varies depending on the
configured virtual address size and whether pointer tagging is in use.
A subset of these instructions have been allocated from the HINT
encoding space. In the absence of the extension (or when disabled),
these instructions behave as NOPs. Applications and libraries using
these instructions operate correctly regardless of the presence of the
extension.
The extension provides five separate keys to generate PACs - two for
instruction addresses (APIAKey, APIBKey), two for data addresses
(APDAKey, APDBKey), and one for generic authentication (APGAKey).
Basic support
-------------
When CONFIG_ARM64_PTR_AUTH is selected, and relevant HW support is
present, the kernel will assign random key values to each process at
exec*() time. The keys are shared by all threads within the process, and
are preserved across fork().
Presence of address authentication functionality is advertised via
HWCAP_PACA, and generic authentication functionality via HWCAP_PACG.
The number of bits that the PAC occupies in a pointer is 55 minus the
virtual address size configured by the kernel. For example, with a
virtual address size of 48, the PAC is 7 bits wide.
Recent versions of GCC can compile code with APIAKey-based return
address protection when passed the -msign-return-address option. This
uses instructions in the HINT space (unless -march=armv8.3-a or higher
is also passed), and such code can run on systems without the pointer
authentication extension.
In addition to exec(), keys can also be reinitialized to random values
using the PR_PAC_RESET_KEYS prctl. A bitmask of PR_PAC_APIAKEY,
PR_PAC_APIBKEY, PR_PAC_APDAKEY, PR_PAC_APDBKEY and PR_PAC_APGAKEY
specifies which keys are to be reinitialized; specifying 0 means "all
keys".
Debugging
---------
When CONFIG_ARM64_PTR_AUTH is selected, and HW support for address
authentication is present, the kernel will expose the position of TTBR0
PAC bits in the NT_ARM_PAC_MASK regset (struct user_pac_mask), which
userspace can acquire via PTRACE_GETREGSET.
The regset is exposed only when HWCAP_PACA is set. Separate masks are
exposed for data pointers and instruction pointers, as the set of PAC
bits can vary between the two. Note that the masks apply to TTBR0
addresses, and are not valid to apply to TTBR1 addresses (e.g. kernel
pointers).
Additionally, when CONFIG_CHECKPOINT_RESTORE is also set, the kernel
will expose the NT_ARM_PACA_KEYS and NT_ARM_PACG_KEYS regsets (struct
user_pac_address_keys and struct user_pac_generic_keys). These can be
used to get and set the keys for a thread.
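As an illustrative debugger-side sketch (the helper below is invented here,
not part of this document), the PAC masks can be fetched from a stopped
tracee with PTRACE_GETREGSET::

	#include <sys/ptrace.h>
	#include <sys/types.h>
	#include <sys/uio.h>
	#include <linux/elf.h>	/* NT_ARM_PAC_MASK */
	#include <asm/ptrace.h>	/* struct user_pac_mask */

	/* Returns 0 on success; 'pid' must be a ptrace-stopped tracee. */
	static long read_pac_masks(pid_t pid, struct user_pac_mask *masks)
	{
		struct iovec iov = {
			.iov_base = masks,
			.iov_len  = sizeof(*masks),
		};

		return ptrace(PTRACE_GETREGSET, pid,
			      (void *)(unsigned long)NT_ARM_PAC_MASK, &iov);
	}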
Virtualization
--------------
Pointer authentication is not currently supported in KVM guests. KVM
will mask the feature bits from ID_AA64ISAR1_EL1, and attempted use of
the feature will result in an UNDEFINED exception being injected into
the guest.

View File

@@ -44,6 +44,8 @@ stable kernels.
| Implementor | Component | Erratum ID | Kconfig |
+----------------+-----------------+-----------------+-----------------------------+
| Allwinner | A64/R18 | UNKNOWN1 | SUN50I_ERRATUM_UNKNOWN1 |
| | | | |
| ARM | Cortex-A53 | #826319 | ARM64_ERRATUM_826319 |
| ARM | Cortex-A53 | #827319 | ARM64_ERRATUM_827319 |
| ARM | Cortex-A53 | #824069 | ARM64_ERRATUM_824069 |
@@ -57,6 +59,7 @@ stable kernels.
| ARM | Cortex-A73 | #858921 | ARM64_ERRATUM_858921 |
| ARM | Cortex-A55 | #1024718 | ARM64_ERRATUM_1024718 |
| ARM | Cortex-A76 | #1188873 | ARM64_ERRATUM_1188873 |
| ARM | Cortex-A76 | #1165522 | ARM64_ERRATUM_1165522 |
| ARM | Cortex-A76 | #1286807 | ARM64_ERRATUM_1286807 |
| ARM | MMU-500 | #841119,#826419 | N/A |
| | | | |
@@ -79,3 +82,4 @@ stable kernels.
| Qualcomm Tech. | Falkor v1 | E1009 | QCOM_FALKOR_ERRATUM_1009 |
| Qualcomm Tech. | QDF2400 ITS | E0065 | QCOM_QDF2400_ERRATUM_0065 |
| Qualcomm Tech. | Falkor v{1,2} | E1041 | QCOM_FALKOR_ERRATUM_1041 |
| Fujitsu | A64FX | E#010001 | FUJITSU_ERRATUM_010001 |

View File

@@ -357,6 +357,13 @@ video playing/streaming, a very low drop rate may be more important
than maximum throughput. In these cases, consider setting the
strict_guarantees parameter.
slice_idle_us
-------------
Controls the same tuning parameter as slice_idle, but in microseconds.
Either tunable can be used to set idling behavior. Afterwards, the
other tunable will reflect the newly set value in sysfs.
strict_guarantees
-----------------

View File

@@ -65,7 +65,6 @@ Description of Contents:
3.2.3 I/O completion
3.2.4 Implications for drivers that do not interpret bios (don't handle
multiple segments)
3.2.5 Request command tagging
3.3 I/O submission
4. The I/O scheduler
5. Scalability related changes
@@ -708,93 +707,6 @@ is crossed on completion of a transfer. (The end*request* functions should
be used only if the request has come down from the block/bio path, not for
direct access requests which only specify rq->buffer without a valid rq->bio)
3.2.5 Generic request command tagging
3.2.5.1 Tag helpers
Block now offers some simple generic functionality to help support command
queueing (typically known as tagged command queueing), i.e. manage more than
one outstanding command on a queue at any given time.
blk_queue_init_tags(struct request_queue *q, int depth)
Initialize internal command tagging structures for a maximum
depth of 'depth'.
blk_queue_free_tags(struct request_queue *q)
Teardown tag info associated with the queue. This will be done
automatically by block if blk_queue_cleanup() is called on a queue
that is using tagging.
The above are initialization and exit management, the main helpers during
normal operations are:
blk_queue_start_tag(struct request_queue *q, struct request *rq)
Start tagged operation for this request. A free tag number between
0 and 'depth' is assigned to the request (rq->tag holds this number),
and 'rq' is added to the internal tag management. If the maximum depth
for this queue is already achieved (or if the tag wasn't started for
some other reason), 1 is returned. Otherwise 0 is returned.
blk_queue_end_tag(struct request_queue *q, struct request *rq)
End tagged operation on this request. 'rq' is removed from the internal
book keeping structures.
To minimize struct request and queue overhead, the tag helpers utilize some
of the same request members that are used for normal request queue management.
This means that a request cannot both be an active tag and be on the queue
list at the same time. blk_queue_start_tag() will remove the request, but
the driver must remember to call blk_queue_end_tag() before signalling
completion of the request to the block layer. This means ending tag
operations before calling end_that_request_last()! For an example of a user
of these helpers, see the IDE tagged command queueing support.
3.2.5.2 Tag info
Some block functions exist to query current tag status or to go from a
tag number to the associated request. These are, in no particular order:
blk_queue_tagged(q)
Returns 1 if the queue 'q' is using tagging, 0 if not.
blk_queue_tag_request(q, tag)
Returns a pointer to the request associated with tag 'tag'.
blk_queue_tag_depth(q)
Return current queue depth.
blk_queue_tag_queue(q)
Returns 1 if the queue can accept a new queued command, 0 if we are
at the maximum depth already.
blk_queue_rq_tagged(rq)
Returns 1 if the request 'rq' is tagged.
3.2.5.3 Internal structure
Internally, block manages tags in the blk_queue_tag structure:
struct blk_queue_tag {
struct request **tag_index; /* array of pointers to rq */
unsigned long *tag_map; /* bitmap of free tags */
struct list_head busy_list; /* fifo list of busy tags */
int busy; /* queue depth */
int max_depth; /* max queue depth */
};
Most of the above is simple and straightforward; however, busy_list may need
a bit of explaining. Normally we don't care too much about request ordering,
but in the event of any barrier requests in the tag queue we need to ensure
that requests are restarted in the order they were queued.
3.3 I/O Submission
The routine submit_bio() is used to submit a single io. Higher level i/o

View File

@@ -117,3 +117,28 @@ Other implications:
size limitations and the limitations of the underlying devices. Thus
there's no need to define ->merge_bvec_fn() callbacks for individual block
drivers.
Usage of helpers:
=================
* The following helpers, whose names have the suffix "_all", can only be used
on non-BIO_CLONED bios. They are usually used by filesystem code. Drivers
shouldn't use them because the bio may have been split before it reaches the
driver.
bio_for_each_segment_all()
bio_first_bvec_all()
bio_first_page_all()
bio_last_bvec_all()
* The following helpers iterate over single-page segments. The passed 'struct
bio_vec' will contain a single-page IO vector during the iteration:
bio_for_each_segment()
bio_for_each_segment_all()
* The following helpers iterate over multi-page bvecs. The passed 'struct
bio_vec' will contain a multi-page IO vector during the iteration:
bio_for_each_bvec()
rq_for_each_bvec()
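As a hedged driver-side sketch (the helper name is invented here), the
single-page iterator can be used to walk a bio's payload::

	#include <linux/bio.h>

	/* Illustrative only: sum the bytes of every single-page segment. */
	static unsigned int bio_payload_bytes(struct bio *bio)
	{
		struct bio_vec bvec;
		struct bvec_iter iter;
		unsigned int bytes = 0;

		bio_for_each_segment(bvec, bio, iter)
			bytes += bvec.bv_len;

		return bytes;
	}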

View File

@@ -1,291 +0,0 @@
CFQ (Complete Fairness Queueing)
===============================
The main aim of the CFQ scheduler is to provide a fair allocation of the
disk I/O bandwidth to all processes that request I/O operations.
CFQ maintains a per-process queue for processes that issue synchronous
requests. In case of asynchronous requests, all the requests from all
the processes are batched together according to their process's I/O
priority.
CFQ ioscheduler tunables
========================
slice_idle
----------
This specifies how long CFQ should idle for the next request on certain cfq
queues (for sequential workloads) and service trees (for random workloads)
before the queue is expired and CFQ selects the next queue to dispatch from.
By default slice_idle is a non-zero value. That means by default we idle on
queues/service trees. This can be very helpful on highly seeky media like
single-spindle SATA/SAS disks, where we can cut down on the overall number
of seeks and see improved throughput.
Setting slice_idle to 0 will remove all idling at the queue/service tree
level, and one should see overall improved throughput on faster storage
devices like multiple SATA/SAS disks in a hardware RAID configuration. The
downside is that the isolation provided from WRITES also goes down and the
notion of IO priority becomes weaker.
So depending on storage and workload, it might be useful to set slice_idle=0.
In general I think for SATA/SAS disks and software RAID of SATA/SAS disks
keeping slice_idle enabled should be useful. For any configuration where
there are multiple spindles behind a single LUN (host-based hardware RAID
controller or storage arrays), setting slice_idle=0 might result in better
throughput and acceptable latencies.
back_seek_max
-------------
This specifies, given in Kbytes, the maximum "distance" for backward seeking.
The distance is the amount of space from the current head location to the
sectors that are backward in terms of distance.
This parameter allows the scheduler to anticipate requests in the "backward"
direction and consider them as being the "next" if they are within this
distance from the current head location.
back_seek_penalty
-----------------
This parameter is used to compute the cost of backward seeking. If the
backward distance of a request is just 1/back_seek_penalty of the distance
of a "front" request, then the seeking cost of the two requests is considered
equivalent. The scheduler will then not bias toward one or the other request
(otherwise the scheduler will bias toward the front request). The default
value of back_seek_penalty is 2.
fifo_expire_async
-----------------
This parameter is used to set the timeout of asynchronous requests. Default
value of this is 248ms.
fifo_expire_sync
----------------
This parameter is used to set the timeout of synchronous requests. The
default value is 124ms. To favor synchronous requests over asynchronous ones,
this value should be decreased relative to fifo_expire_async.
group_idle
-----------
This parameter forces idling at the CFQ group level instead of the CFQ
queue level. This was introduced after a bottleneck was observed in
higher-end storage due to idling on a sequential queue while allowing
dispatch from only a single queue. The idea with this parameter is that it
can be used with slice_idle=0 and group_idle=8, so that idling does not
happen on individual queues in the group but happens overall on the group,
and thus still keeps the IO controller working.
Not idling on individual queues in the group will dispatch requests from
multiple queues in the group at the same time and achieve higher throughput
on higher end storage.
Default value for this parameter is 8ms.
low_latency
-----------
This parameter is used to enable/disable the low latency mode of the CFQ
scheduler. If enabled, CFQ tries to recompute the slice time for each process
based on the target_latency set for the system. This favors fairness over
throughput. Disabling low latency (setting it to 0) ignores target latency,
allowing each process in the system to get a full time slice.
By default low latency mode is enabled.
target_latency
--------------
This parameter is used to calculate the time slice for a process if cfq's
latency mode is enabled. It will ensure that sync requests have an estimated
latency. But if the sequential workload is higher (e.g. sequential reads),
then to meet the latency constraints, throughput may decrease because each
process has less time to issue I/O requests before the cfq queue is switched.
Though this can be overcome by disabling the latency mode, it may increase
the read latency for some applications. This parameter allows changing
target_latency through the sysfs interface, which can provide a balance
between throughput and read latency.
Default value for target_latency is 300ms.
slice_async
-----------
This parameter is the same as slice_sync but for the asynchronous queue. The
default value is 40ms.
slice_async_rq
--------------
This parameter is used to limit the dispatching of asynchronous requests to
the device request queue during the queue's slice time. The maximum number
of requests that are allowed to be dispatched also depends upon the io
priority. The default value for this is 2.
slice_sync
----------
When a queue is selected for execution, the queue's IO requests are only
executed for a certain amount of time (time_slice) before switching to
another queue. This parameter is used to calculate the time slice of the
synchronous queue.
time_slice is computed using the equation:
time_slice = slice_sync + (slice_sync/5 * (4 - prio)). To increase the
time_slice of the synchronous queue, increase the value of slice_sync. The
default value is 100ms.
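For example, with the default slice_sync of 100ms, a queue at priority 4
gets 100 + (100/5 * 0) = 100ms, while a queue at priority 0 gets
100 + (100/5 * 4) = 180ms.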
quantum
-------
This specifies the number of requests dispatched to the device queue. In a
queue's time slice, a request will not be dispatched if the number of
requests in the device exceeds this parameter. This parameter is used for
synchronous requests.
In case of storage with several disks, this setting can limit the parallel
processing of requests. Therefore, increasing the value can improve
performance, although this can cause the latency of some I/O to increase due
to a larger number of outstanding requests.
CFQ Group scheduling
====================
CFQ supports blkio cgroup and has "blkio." prefixed files in each
blkio cgroup directory. It is weight-based and there are four knobs
for configuration - weight[_device] and leaf_weight[_device].
Internal cgroup nodes (the ones with children) can also have tasks in
them, so the former two configure how much proportion the cgroup as a
whole is entitled to at its parent's level while the latter two
configure how much proportion the tasks in the cgroup have compared to
its direct children.
Another way to think about it is assuming that each internal node has
an implicit leaf child node which hosts all the tasks whose weight is
configured by leaf_weight[_device]. Let's assume a blkio hierarchy
composed of five cgroups - root, A, B, AA and AB - with the following
weights where the names represent the hierarchy.
weight leaf_weight
root : 125 125
A : 500 750
B : 250 500
AA : 500 500
AB : 1000 500
root never has a parent, making its weight meaningless. For backward
compatibility, weight is always kept in sync with leaf_weight. B, AA
and AB have no children and thus their tasks have no child cgroups to
compete with. They always get 100% of what the cgroup won at the
parent level. Considering only the weights which matter, the hierarchy
looks like the following.
root
/ | \
A B leaf
500 250 125
/ | \
AA AB leaf
500 1000 750
If all cgroups have active IOs and are competing with each other, disk
time will be distributed like the following.
Distribution below root. The total active weight at this level is
A:500 + B:250 + root-leaf:125 = 875.
root-leaf : 125 / 875 =~ 14%
A : 500 / 875 =~ 57%
B(-leaf) : 250 / 875 =~ 28%
A has children and further distributes its 57% among the children and
the implicit leaf node. The total active weight at this level is
AA:500 + AB:1000 + A-leaf:750 = 2250.
A-leaf : ( 750 / 2250) * A =~ 19%
AA(-leaf) : ( 500 / 2250) * A =~ 12%
AB(-leaf) : (1000 / 2250) * A =~ 25%
CFQ IOPS Mode for group scheduling
===================================
The basic CFQ design is to provide priority-based time slices. A higher
priority process gets a bigger time slice and a lower priority process gets
a smaller time slice. Measuring time becomes harder if storage is fast and
supports NCQ, where it would be better to dispatch multiple requests from
multiple cfq queues into the request queue at a time. In such a scenario,
it is not possible to measure the time consumed by a single queue accurately.
What is possible, though, is to measure the number of requests dispatched
from a single queue and also allow dispatch from multiple cfq queues at the
same time. This effectively becomes fairness in terms of IOPS (IO operations
per second).
If one sets slice_idle=0 and if storage supports NCQ, CFQ internally switches
to IOPS mode and starts providing fairness in terms of number of requests
dispatched. Note that this mode switching takes effect only for group
scheduling. For non-cgroup users nothing should change.
CFQ IO scheduler Idling Theory
===============================
Idling on a queue is primarily about waiting for the next request to come
on the same queue after completion of a request. In this process CFQ will
not dispatch requests from other cfq queues even if requests are pending
there.
The rationale behind idling is that it can cut down on the number of seeks
on rotational media. For example, if a process is doing dependent
sequential reads (the next read comes only after completion of the previous
one), then not dispatching requests from another queue should help, as we
do not move the disk head and keep dispatching sequential IO from
one queue.
CFQ has the following service trees, and various queues are put on these trees.
sync-idle sync-noidle async
All cfq queues doing synchronous sequential IO go onto the sync-idle tree.
On this tree we idle on each queue individually.
All synchronous non-sequential queues go on the sync-noidle tree. Also, any
synchronous write request which is not marked with REQ_IDLE goes on this
service tree. On this tree we do not idle on individual queues; instead we
idle on the whole group of queues or the tree. So if there are 4 queues
waiting for IO to dispatch, we will idle only once the last queue has
dispatched its IO and there is no more IO on this service tree.
All async writes go on async service tree. There is no idling on async
queues.
CFQ has some optimizations for SSDs: if it detects non-rotational
media which can support a higher queue depth (multiple requests in
flight at a time), then it cuts down on idling of individual queues and
all the queues move to the sync-noidle tree and only tree idling remains.
This tree idling provides isolation from buffered write queues on the async
tree.
FAQ
===
Q1. Why idle at all on queues not marked with REQ_IDLE?
A1. We only do tree idling (all queues on the sync-noidle tree) on queues not
marked with REQ_IDLE. This helps in providing isolation from all the
sync-idle queues. Otherwise, in the presence of many sequential readers,
other synchronous IO might not get a fair share of the disk.
For example, suppose there are 10 sequential readers doing IO and they get
100ms each. If a !REQ_IDLE request comes in, it will be scheduled
roughly after 1 second. If after completion of the !REQ_IDLE request we
do not idle, and after a couple of milliseconds another !REQ_IDLE
request comes in, it will again be scheduled after 1 second. Repeat this
and notice how a workload can lose its disk share and suffer due to
multiple sequential readers.
fsync can generate dependent IO where a bunch of data is written in the
context of fsync, and later some journaling data is written. Journaling
data comes in only after fsync has finished its IO (at least for ext4
that seemed to be the case). Now if one decides not to idle on the fsync
thread due to !REQ_IDLE, then the next journaling write will not get
scheduled for another second. A process doing small fsyncs will suffer
badly in the presence of multiple sequential readers.
Hence, doing tree idling on threads using the !REQ_IDLE flag on requests
provides isolation from multiple sequential readers, while at the same
time we do not idle on individual threads.
Q2. When to specify REQ_IDLE?
A2. I would think that whenever one is doing synchronous writes and expecting
more writes to be dispatched from the same context soon, one should be
able to specify REQ_IDLE on the writes, and that should probably work well
for most cases.

View File

@@ -88,7 +88,8 @@ shared_tags=[0/1]: Default: 0
zoned=[0/1]: Default: 0
0: Block device is exposed as a random-access block device.
1: Block device is exposed as a host-managed zoned block device.
1: Block device is exposed as a host-managed zoned block device. Requires
CONFIG_BLK_DEV_ZONED.
zone_size=[MB]: Default: 256
Per zone size when exposed as a zoned block device. Must be a power of two.

Some files were not shown because too many files have changed in this diff.