Merge tag 'v5.1' into next
Sync up with mainline to bring in the latest APIs.
@@ -72,8 +72,14 @@ ForEachMacros:
|
||||
- 'apei_estatus_for_each_section'
|
||||
- 'ata_for_each_dev'
|
||||
- 'ata_for_each_link'
|
||||
- '__ata_qc_for_each'
|
||||
- 'ata_qc_for_each'
|
||||
- 'ata_qc_for_each_raw'
|
||||
- 'ata_qc_for_each_with_internal'
|
||||
- 'ax25_for_each'
|
||||
- 'ax25_uid_for_each'
|
||||
- '__bio_for_each_bvec'
|
||||
- 'bio_for_each_bvec'
|
||||
- 'bio_for_each_integrity_vec'
|
||||
- '__bio_for_each_segment'
|
||||
- 'bio_for_each_segment'
|
||||
@@ -85,6 +91,7 @@ ForEachMacros:
|
||||
- 'blk_queue_for_each_rl'
|
||||
- 'bond_for_each_slave'
|
||||
- 'bond_for_each_slave_rcu'
|
||||
- 'bpf_for_each_spilled_reg'
|
||||
- 'btree_for_each_safe128'
|
||||
- 'btree_for_each_safe32'
|
||||
- 'btree_for_each_safe64'
|
||||
@@ -103,6 +110,8 @@ ForEachMacros:
|
||||
- 'drm_atomic_crtc_for_each_plane'
|
||||
- 'drm_atomic_crtc_state_for_each_plane'
|
||||
- 'drm_atomic_crtc_state_for_each_plane_state'
|
||||
- 'drm_atomic_for_each_plane_damage'
|
||||
- 'drm_connector_for_each_possible_encoder'
|
||||
- 'drm_for_each_connector_iter'
|
||||
- 'drm_for_each_crtc'
|
||||
- 'drm_for_each_encoder'
|
||||
@@ -111,21 +120,33 @@ ForEachMacros:
|
||||
- 'drm_for_each_legacy_plane'
|
||||
- 'drm_for_each_plane'
|
||||
- 'drm_for_each_plane_mask'
|
||||
- 'drm_for_each_privobj'
|
||||
- 'drm_mm_for_each_hole'
|
||||
- 'drm_mm_for_each_node'
|
||||
- 'drm_mm_for_each_node_in_range'
|
||||
- 'drm_mm_for_each_node_safe'
|
||||
- 'flow_action_for_each'
|
||||
- 'for_each_active_drhd_unit'
|
||||
- 'for_each_active_iommu'
|
||||
- 'for_each_available_child_of_node'
|
||||
- 'for_each_bio'
|
||||
- 'for_each_board_func_rsrc'
|
||||
- 'for_each_bvec'
|
||||
- 'for_each_card_components'
|
||||
- 'for_each_card_links'
|
||||
- 'for_each_card_links_safe'
|
||||
- 'for_each_card_prelinks'
|
||||
- 'for_each_card_rtds'
|
||||
- 'for_each_card_rtds_safe'
|
||||
- 'for_each_cgroup_storage_type'
|
||||
- 'for_each_child_of_node'
|
||||
- 'for_each_clear_bit'
|
||||
- 'for_each_clear_bit_from'
|
||||
- 'for_each_cmsghdr'
|
||||
- 'for_each_compatible_node'
|
||||
- 'for_each_component_dais'
|
||||
- 'for_each_component_dais_safe'
|
||||
- 'for_each_comp_order'
|
||||
- 'for_each_console'
|
||||
- 'for_each_cpu'
|
||||
- 'for_each_cpu_and'
|
||||
@@ -133,10 +154,17 @@ ForEachMacros:
|
||||
- 'for_each_cpu_wrap'
|
||||
- 'for_each_dev_addr'
|
||||
- 'for_each_dma_cap_mask'
|
||||
- 'for_each_dpcm_be'
|
||||
- 'for_each_dpcm_be_rollback'
|
||||
- 'for_each_dpcm_be_safe'
|
||||
- 'for_each_dpcm_fe'
|
||||
- 'for_each_drhd_unit'
|
||||
- 'for_each_dss_dev'
|
||||
- 'for_each_efi_memory_desc'
|
||||
- 'for_each_efi_memory_desc_in_map'
|
||||
- 'for_each_element'
|
||||
- 'for_each_element_extid'
|
||||
- 'for_each_element_id'
|
||||
- 'for_each_endpoint_of_node'
|
||||
- 'for_each_evictable_lru'
|
||||
- 'for_each_fib6_node_rt_rcu'
|
||||
@@ -149,6 +177,7 @@ ForEachMacros:
|
||||
- 'for_each_iommu'
|
||||
- 'for_each_ip_tunnel_rcu'
|
||||
- 'for_each_irq_nr'
|
||||
- 'for_each_link_codecs'
|
||||
- 'for_each_lru'
|
||||
- 'for_each_matching_node'
|
||||
- 'for_each_matching_node_and_match'
|
||||
@@ -160,6 +189,7 @@ ForEachMacros:
|
||||
- 'for_each_mem_range_rev'
|
||||
- 'for_each_migratetype_order'
|
||||
- 'for_each_msi_entry'
|
||||
- 'for_each_msi_entry_safe'
|
||||
- 'for_each_net'
|
||||
- 'for_each_netdev'
|
||||
- 'for_each_netdev_continue'
|
||||
@@ -172,6 +202,7 @@ ForEachMacros:
|
||||
- 'for_each_net_rcu'
|
||||
- 'for_each_new_connector_in_state'
|
||||
- 'for_each_new_crtc_in_state'
|
||||
- 'for_each_new_mst_mgr_in_state'
|
||||
- 'for_each_new_plane_in_state'
|
||||
- 'for_each_new_private_obj_in_state'
|
||||
- 'for_each_node'
|
||||
@@ -183,12 +214,16 @@ ForEachMacros:
|
||||
- 'for_each_node_with_property'
|
||||
- 'for_each_of_allnodes'
|
||||
- 'for_each_of_allnodes_from'
|
||||
- 'for_each_of_cpu_node'
|
||||
- 'for_each_of_pci_range'
|
||||
- 'for_each_old_connector_in_state'
|
||||
- 'for_each_old_crtc_in_state'
|
||||
- 'for_each_old_mst_mgr_in_state'
|
||||
- 'for_each_oldnew_connector_in_state'
|
||||
- 'for_each_oldnew_crtc_in_state'
|
||||
- 'for_each_oldnew_mst_mgr_in_state'
|
||||
- 'for_each_oldnew_plane_in_state'
|
||||
- 'for_each_oldnew_plane_in_state_reverse'
|
||||
- 'for_each_oldnew_private_obj_in_state'
|
||||
- 'for_each_old_plane_in_state'
|
||||
- 'for_each_old_private_obj_in_state'
|
||||
@@ -206,14 +241,21 @@ ForEachMacros:
|
||||
- 'for_each_process'
|
||||
- 'for_each_process_thread'
|
||||
- 'for_each_property_of_node'
|
||||
- 'for_each_registered_fb'
|
||||
- 'for_each_reserved_mem_region'
|
||||
- 'for_each_resv_unavail_range'
|
||||
- 'for_each_rtd_codec_dai'
|
||||
- 'for_each_rtd_codec_dai_rollback'
|
||||
- 'for_each_rtdcom'
|
||||
- 'for_each_rtdcom_safe'
|
||||
- 'for_each_set_bit'
|
||||
- 'for_each_set_bit_from'
|
||||
- 'for_each_sg'
|
||||
- 'for_each_sg_dma_page'
|
||||
- 'for_each_sg_page'
|
||||
- 'for_each_sibling_event'
|
||||
- 'for_each_subelement'
|
||||
- 'for_each_subelement_extid'
|
||||
- 'for_each_subelement_id'
|
||||
- '__for_each_thread'
|
||||
- 'for_each_thread'
|
||||
- 'for_each_zone'
|
||||
@@ -223,6 +265,8 @@ ForEachMacros:
|
||||
- 'fwnode_for_each_child_node'
|
||||
- 'fwnode_graph_for_each_endpoint'
|
||||
- 'gadget_for_each_ep'
|
||||
- 'genradix_for_each'
|
||||
- 'genradix_for_each_from'
|
||||
- 'hash_for_each'
|
||||
- 'hash_for_each_possible'
|
||||
- 'hash_for_each_possible_rcu'
|
||||
@@ -251,6 +295,8 @@ ForEachMacros:
|
||||
- 'hlist_nulls_for_each_entry_from'
|
||||
- 'hlist_nulls_for_each_entry_rcu'
|
||||
- 'hlist_nulls_for_each_entry_safe'
|
||||
- 'i3c_bus_for_each_i2cdev'
|
||||
- 'i3c_bus_for_each_i3cdev'
|
||||
- 'ide_host_for_each_port'
|
||||
- 'ide_port_for_each_dev'
|
||||
- 'ide_port_for_each_present_dev'
|
||||
@@ -259,19 +305,25 @@ ForEachMacros:
|
||||
- 'idr_for_each_entry_ul'
|
||||
- 'inet_bind_bucket_for_each'
|
||||
- 'inet_lhash2_for_each_icsk_rcu'
|
||||
- 'iov_for_each'
|
||||
- 'key_for_each'
|
||||
- 'key_for_each_safe'
|
||||
- 'klp_for_each_func'
|
||||
- 'klp_for_each_func_safe'
|
||||
- 'klp_for_each_func_static'
|
||||
- 'klp_for_each_object'
|
||||
- 'klp_for_each_object_safe'
|
||||
- 'klp_for_each_object_static'
|
||||
- 'kvm_for_each_memslot'
|
||||
- 'kvm_for_each_vcpu'
|
||||
- 'list_for_each'
|
||||
- 'list_for_each_codec'
|
||||
- 'list_for_each_codec_safe'
|
||||
- 'list_for_each_entry'
|
||||
- 'list_for_each_entry_continue'
|
||||
- 'list_for_each_entry_continue_rcu'
|
||||
- 'list_for_each_entry_continue_reverse'
|
||||
- 'list_for_each_entry_from'
|
||||
- 'list_for_each_entry_from_rcu'
|
||||
- 'list_for_each_entry_from_reverse'
|
||||
- 'list_for_each_entry_lockless'
|
||||
- 'list_for_each_entry_rcu'
|
||||
@@ -291,6 +343,9 @@ ForEachMacros:
|
||||
- 'media_device_for_each_intf'
|
||||
- 'media_device_for_each_link'
|
||||
- 'media_device_for_each_pad'
|
||||
- 'mp_bvec_for_each_page'
|
||||
- 'mp_bvec_for_each_segment'
|
||||
- 'nanddev_io_for_each_page'
|
||||
- 'netdev_for_each_lower_dev'
|
||||
- 'netdev_for_each_lower_private'
|
||||
- 'netdev_for_each_lower_private_rcu'
|
||||
@@ -326,6 +381,7 @@ ForEachMacros:
|
||||
- 'radix_tree_for_each_slot'
|
||||
- 'radix_tree_for_each_tagged'
|
||||
- 'rbtree_postorder_for_each_entry_safe'
|
||||
- 'rdma_for_each_port'
|
||||
- 'resource_list_for_each_entry'
|
||||
- 'resource_list_for_each_entry_safe'
|
||||
- 'rhl_for_each_entry_rcu'
|
||||
@@ -340,6 +396,7 @@ ForEachMacros:
|
||||
- 'rht_for_each_rcu'
|
||||
- 'rht_for_each_rcu_continue'
|
||||
- '__rq_for_each_bio'
|
||||
- 'rq_for_each_bvec'
|
||||
- 'rq_for_each_segment'
|
||||
- 'scsi_for_each_prot_sg'
|
||||
- 'scsi_for_each_sg'
|
||||
@@ -357,12 +414,14 @@ ForEachMacros:
|
||||
- 'sk_nulls_for_each'
|
||||
- 'sk_nulls_for_each_from'
|
||||
- 'sk_nulls_for_each_rcu'
|
||||
- 'snd_array_for_each'
|
||||
- 'snd_pcm_group_for_each_entry'
|
||||
- 'snd_soc_dapm_widget_for_each_path'
|
||||
- 'snd_soc_dapm_widget_for_each_path_safe'
|
||||
- 'snd_soc_dapm_widget_for_each_sink_path'
|
||||
- 'snd_soc_dapm_widget_for_each_source_path'
|
||||
- 'tb_property_for_each'
|
||||
- 'tcf_exts_for_each_action'
|
||||
- 'udp_portaddr_for_each_entry'
|
||||
- 'udp_portaddr_for_each_entry_rcu'
|
||||
- 'usb_hub_for_each_child'
|
||||
@@ -371,6 +430,13 @@ ForEachMacros:
|
||||
- 'v4l2_m2m_for_each_dst_buf_safe'
|
||||
- 'v4l2_m2m_for_each_src_buf'
|
||||
- 'v4l2_m2m_for_each_src_buf_safe'
|
||||
- 'virtio_device_for_each_vq'
|
||||
- 'xa_for_each'
|
||||
- 'xa_for_each_marked'
|
||||
- 'xa_for_each_start'
|
||||
- 'xas_for_each'
|
||||
- 'xas_for_each_conflict'
|
||||
- 'xas_for_each_marked'
|
||||
- 'zorro_for_each_dev'
|
||||
|
||||
#IncludeBlocks: Preserve # Unknown to clang-format-5.0
|
||||
|
1
.gitignore
vendored
@@ -15,6 +15,7 @@
|
||||
*.bin
|
||||
*.bz2
|
||||
*.c.[012]*.*
|
||||
*.dt.yaml
|
||||
*.dtb
|
||||
*.dtb.S
|
||||
*.dwo
|
||||
|
17
.mailmap
@@ -36,9 +36,10 @@ Bart Van Assche <bvanassche@acm.org> <bart.vanassche@sandisk.com>
|
||||
Ben Gardner <bgardner@wabtec.com>
|
||||
Ben M Cahill <ben.m.cahill@intel.com>
|
||||
Björn Steinbrink <B.Steinbrink@gmx.de>
|
||||
Boris Brezillon <boris.brezillon@bootlin.com> <boris.brezillon@free-electrons.com>
|
||||
Boris Brezillon <boris.brezillon@bootlin.com> <b.brezillon.dev@gmail.com>
|
||||
Boris Brezillon <boris.brezillon@bootlin.com> <b.brezillon@overkiz.com>
|
||||
Boris Brezillon <bbrezillon@kernel.org> <boris.brezillon@bootlin.com>
|
||||
Boris Brezillon <bbrezillon@kernel.org> <boris.brezillon@free-electrons.com>
|
||||
Boris Brezillon <bbrezillon@kernel.org> <b.brezillon.dev@gmail.com>
|
||||
Boris Brezillon <bbrezillon@kernel.org> <b.brezillon@overkiz.com>
|
||||
Brian Avery <b.avery@hp.com>
|
||||
Brian King <brking@us.ibm.com>
|
||||
Christoph Hellwig <hch@lst.de>
|
||||
@@ -47,7 +48,10 @@ Corey Minyard <minyard@acm.org>
|
||||
Damian Hobson-Garcia <dhobsong@igel.co.jp>
|
||||
David Brownell <david-b@pacbell.net>
|
||||
David Woodhouse <dwmw2@shinybook.infradead.org>
|
||||
Deng-Cheng Zhu <dengcheng.zhu@mips.com> <dengcheng.zhu@imgtec.com>
|
||||
Dengcheng Zhu <dzhu@wavecomp.com> <dengcheng.zhu@mips.com>
|
||||
Dengcheng Zhu <dzhu@wavecomp.com> <dengcheng.zhu@imgtec.com>
|
||||
Dengcheng Zhu <dzhu@wavecomp.com> <dczhu@mips.com>
|
||||
Dengcheng Zhu <dzhu@wavecomp.com> <dengcheng.zhu@gmail.com>
|
||||
Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
|
||||
Domen Puncer <domen@coderock.org>
|
||||
Douglas Gilbert <dougg@torque.net>
|
||||
@@ -119,6 +123,7 @@ Mark Brown <broonie@sirena.org.uk>
|
||||
Mark Yao <markyao0591@gmail.com> <mark.yao@rock-chips.com>
|
||||
Martin Kepplinger <martink@posteo.de> <martin.kepplinger@theobroma-systems.com>
|
||||
Martin Kepplinger <martink@posteo.de> <martin.kepplinger@ginzinger.com>
|
||||
Mathieu Othacehe <m.othacehe@gmail.com>
|
||||
Matthew Wilcox <willy@infradead.org> <matthew.r.wilcox@intel.com>
|
||||
Matthew Wilcox <willy@infradead.org> <matthew@wil.cx>
|
||||
Matthew Wilcox <willy@infradead.org> <mawilcox@linuxonhyperv.com>
|
||||
@@ -151,6 +156,8 @@ Morten Welinder <welinder@darter.rentec.com>
|
||||
Morten Welinder <welinder@troll.com>
|
||||
Mythri P K <mythripk@ti.com>
|
||||
Nguyen Anh Quynh <aquynh@gmail.com>
|
||||
Nicolas Pitre <nico@fluxnic.net> <nicolas.pitre@linaro.org>
|
||||
Nicolas Pitre <nico@fluxnic.net> <nico@linaro.org>
|
||||
Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
|
||||
Patrick Mochel <mochel@digitalimplant.org>
|
||||
Paul Burton <paul.burton@mips.com> <paul.burton@imgtec.com>
|
||||
@@ -219,3 +226,5 @@ Yakir Yang <kuankuan.y@gmail.com> <ykk@rock-chips.com>
|
||||
Yusuke Goda <goda.yusuke@renesas.com>
|
||||
Gustavo Padovan <gustavo@las.ic.unicamp.br>
|
||||
Gustavo Padovan <padovan@profusion.mobi>
|
||||
Changbin Du <changbin.du@intel.com> <changbin.du@intel.com>
|
||||
Changbin Du <changbin.du@intel.com> <changbin.du@gmail.com>
|
||||
|
28
CREDITS
@@ -842,10 +842,9 @@ D: ax25-utils maintainer.
|
||||
|
||||
N: Helge Deller
|
||||
E: deller@gmx.de
|
||||
E: hdeller@redhat.de
|
||||
D: PA-RISC Linux hacker, LASI-, ASP-, WAX-, LCD/LED-driver
|
||||
S: Schimmelsrain 1
|
||||
S: D-69231 Rauenberg
|
||||
W: http://www.parisc-linux.org/
|
||||
D: PA-RISC Linux architecture maintainer
|
||||
D: LASI-, ASP-, WAX-, LCD/LED-driver
|
||||
S: Germany
|
||||
|
||||
N: Jean Delvare
|
||||
@@ -1222,7 +1221,7 @@ S: Brazil
|
||||
|
||||
N: Oded Gabbay
|
||||
E: oded.gabbay@gmail.com
|
||||
D: AMD KFD maintainer
|
||||
D: HabanaLabs and AMD KFD maintainer
|
||||
S: 12 Shraga Raphaeli
|
||||
S: Petah-Tikva, 4906418
|
||||
S: Israel
|
||||
@@ -1361,7 +1360,7 @@ S: Stellenbosch, Western Cape
|
||||
S: South Africa
|
||||
|
||||
N: Grant Grundler
|
||||
E: grundler@parisc-linux.org
|
||||
E: grantgrundler@gmail.com
|
||||
W: http://obmouse.sourceforge.net/
|
||||
W: http://www.parisc-linux.org/
|
||||
D: obmouse - rewrote Olivier Florent's Omnibook 600 "pop-up" mouse driver
|
||||
@@ -2208,6 +2207,12 @@ N: Christopher Li
|
||||
E: sparse@chrisli.org
|
||||
D: Sparse maintainer 2009 - 2018
|
||||
|
||||
N: Shaohua Li
|
||||
D: Worked on many parts of the kernel, from core x86, ACPI, PCI, KVM, MM,
|
||||
D: and much more. He was the maintainer of MD from 2016 to 2018. Shaohua
|
||||
D: passed away late 2018, he will be greatly missed.
|
||||
W: https://www.spinics.net/lists/raid/msg61993.html
|
||||
|
||||
N: Stephan Linz
|
||||
E: linz@mazet.de
|
||||
E: Stephan.Linz@gmx.de
|
||||
@@ -2486,7 +2491,7 @@ S: Syracuse, New York 13206
|
||||
S: USA
|
||||
|
||||
N: Kyle McMartin
|
||||
E: kyle@parisc-linux.org
|
||||
E: kyle@mcmartin.ca
|
||||
D: Linux/PARISC hacker
|
||||
D: AD1889 sound driver
|
||||
S: Ottawa, Canada
|
||||
@@ -3774,14 +3779,13 @@ S: 21513 Conradia Ct
|
||||
S: Cupertino, CA 95014
|
||||
S: USA
|
||||
|
||||
N: Thibaut Varene
|
||||
E: T-Bone@parisc-linux.org
|
||||
W: http://www.parisc-linux.org/~varenet/
|
||||
P: 1024D/B7D2F063 E67C 0D43 A75E 12A5 BB1C FA2F 1E32 C3DA B7D2 F063
|
||||
N: Thibaut Varène
|
||||
E: hacks+kernel@slashdirt.org
|
||||
W: http://hacks.slashdirt.org/
|
||||
D: PA-RISC port minion, PDC and GSCPS2 drivers, debuglocks and other bits
|
||||
D: Some ARM at91rm9200 bits, S1D13XXX FB driver, random patches here and there
|
||||
D: AD1889 sound driver
|
||||
S: Paris, France
|
||||
S: France
|
||||
|
||||
N: Heikki Vatiainen
|
||||
E: hessu@cs.tut.fi
|
||||
|
22
Documentation/ABI/obsolete/sysfs-class-dax
Normal file
@@ -0,0 +1,22 @@
|
||||
What: /sys/class/dax/
|
||||
Date: May, 2016
|
||||
KernelVersion: v4.7
|
||||
Contact: linux-nvdimm@lists.01.org
|
||||
Description: Device DAX is the device-centric analogue of Filesystem
|
||||
DAX (CONFIG_FS_DAX). It allows memory ranges to be
|
||||
allocated and mapped without need of an intervening file
|
||||
system. Device DAX is strict, precise and predictable.
|
||||
Specifically this interface:
|
||||
|
||||
1/ Guarantees fault granularity with respect to a given
|
||||
page size (pte, pmd, or pud) set at configuration time.
|
||||
|
||||
2/ Enforces deterministic behavior by being strict about
|
||||
what fault scenarios are supported.
|
||||
|
||||
The /sys/class/dax/ interface enumerates all the
|
||||
device-dax instances in the system. The ABI is
|
||||
deprecated and will be removed after 2020. It is
|
||||
replaced with the DAX bus interface /sys/bus/dax/ where
|
||||
device-dax instances can be found under
|
||||
/sys/bus/dax/devices/
|
@@ -146,3 +146,36 @@ KernelVersion: 4.16
|
||||
Contact: Stephen Hemminger <sthemmin@microsoft.com>
|
||||
Description: Binary file created by uio_hv_generic for ring buffer
|
||||
Users: Userspace drivers
|
||||
|
||||
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/intr_in_full
|
||||
Date: February 2019
|
||||
KernelVersion: 5.0
|
||||
Contact: Michael Kelley <mikelley@microsoft.com>
|
||||
Description: Number of guest to host interrupts caused by the inbound ring
|
||||
buffer transitioning from full to not full while a packet is
|
||||
waiting for buffer space to become available
|
||||
Users: Debugging tools
|
||||
|
||||
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/intr_out_empty
|
||||
Date: February 2019
|
||||
KernelVersion: 5.0
|
||||
Contact: Michael Kelley <mikelley@microsoft.com>
|
||||
Description: Number of guest to host interrupts caused by the outbound ring
|
||||
buffer transitioning from empty to not empty
|
||||
Users: Debugging tools
|
||||
|
||||
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/out_full_first
|
||||
Date: February 2019
|
||||
KernelVersion: 5.0
|
||||
Contact: Michael Kelley <mikelley@microsoft.com>
|
||||
Description: Number of write operations that were the first to encounter an
|
||||
outbound ring buffer full condition
|
||||
Users: Debugging tools
|
||||
|
||||
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/out_full_total
|
||||
Date: February 2019
|
||||
KernelVersion: 5.0
|
||||
Contact: Michael Kelley <mikelley@microsoft.com>
|
||||
Description: Total number of write operations that encountered an outbound
|
||||
ring buffer full condition
|
||||
Users: Debugging tools
|
||||
|
@@ -12,7 +12,6 @@ Description: This file shows ASIC health status. The possible values are:
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
|
||||
cpld1_version
|
||||
cpld2_version
|
||||
|
||||
Date: June 2018
|
||||
KernelVersion: 4.19
|
||||
Contact: Vadim Pasternak <vadimp@mellanox.com>
|
||||
@@ -21,6 +20,40 @@ Description: These files show with which CPLD versions have been burned
|
||||
|
||||
The files are read only.
|
||||
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
|
||||
fan_dir
|
||||
|
||||
Date: December 2018
|
||||
KernelVersion: 5.0
|
||||
Contact: Vadim Pasternak <vadimp@mellanox.com>
|
||||
Description: This file shows the system fans direction:
|
||||
forward direction - relevant bit is set 0;
|
||||
reversed direction - relevant bit is set 1.
|
||||
|
||||
The files are read only.
|
||||
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
|
||||
cpld3_version
|
||||
|
||||
Date: November 2018
|
||||
KernelVersion: 5.0
|
||||
Contact: Vadim Pasternak <vadimp@mellanox.com>
|
||||
Description: These files show with which CPLD versions have been burned
|
||||
on LED board.
|
||||
|
||||
The files are read only.
|
||||
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
|
||||
jtag_enable
|
||||
|
||||
Date: November 2018
|
||||
KernelVersion: 5.0
|
||||
Contact: Vadim Pasternak <vadimp@mellanox.com>
|
||||
Description: These files enable and disable the access to the JTAG domain.
|
||||
By default access to the JTAG domain is disabled.
|
||||
|
||||
The file is read/write.
|
||||
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/select_iio
|
||||
Date: June 2018
|
||||
KernelVersion: 4.19
|
||||
@@ -76,3 +109,21 @@ Description: These files show the system reset cause, as following: power
|
||||
reset cause.
|
||||
|
||||
The files are read only.
|
||||
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
|
||||
reset_comex_pwr_fail
|
||||
reset_from_comex
|
||||
reset_system
|
||||
reset_voltmon_upgrade_fail
|
||||
|
||||
Date: November 2018
|
||||
KernelVersion: 5.0
|
||||
Contact: Vadim Pasternak <vadimp@mellanox.com>
|
||||
Description: These files show the system reset cause, as following: ComEx
|
||||
power fail, reset from ComEx, system platform reset, reset
|
||||
due to voltage monitor devices upgrade failure.
|
||||
Value 1 in file means this is reset cause, 0 - otherwise.
|
||||
Only one bit could be 1 at the same time, representing only
|
||||
the last reset cause.
|
||||
|
||||
The files are read only.
|
||||
|
126
Documentation/ABI/testing/debugfs-driver-habanalabs
Normal file
@@ -0,0 +1,126 @@
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/addr
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Sets the device address to be used for read or write through
|
||||
PCI bar. The acceptable value is a string that starts with "0x"
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/command_buffers
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays a list with information about the currently allocated
|
||||
command buffers
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/command_submission
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays a list with information about the currently active
|
||||
command submissions
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/command_submission_jobs
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays a list with detailed information about each JOB (CB) of
|
||||
each active command submission
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/data32
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Allows the root user to read or write directly through the
|
||||
device's PCI bar. Writing to this file generates a write
|
||||
transaction while reading from the file generates a read
|
||||
transaction. This custom interface is needed (instead of using
|
||||
the generic Linux user-space PCI mapping) because the DDR bar
|
||||
is very small compared to the DDR memory and only the driver can
|
||||
move the bar before and after the transaction
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/device
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Enables the root user to set the device to a specific state.
|
||||
Valid values are "disable", "enable", "suspend", "resume".
|
||||
User can read this property to see the valid values
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/i2c_addr
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Sets I2C device address for I2C transaction that is generated
|
||||
by the device's CPU
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/i2c_bus
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Sets I2C bus address for I2C transaction that is generated by
|
||||
the device's CPU
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/i2c_data
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Triggers an I2C transaction that is generated by the device's
|
||||
CPU. Writing to this file generates a write transaction while
|
||||
reading from the file generates a read transaction
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/i2c_reg
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Sets I2C register id for I2C transaction that is generated by
|
||||
the device's CPU
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/led0
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Sets the state of the first S/W led on the device
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/led1
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Sets the state of the second S/W led on the device
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/led2
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Sets the state of the third S/W led on the device
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/mmu
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays the hop values and physical address for a given ASID
|
||||
and virtual address. The user should write the ASID and VA into
|
||||
the file and then read the file to get the result.
|
||||
e.g. to display info about VA 0x1000 for ASID 1 you need to do:
|
||||
echo "1 0x1000" > /sys/kernel/debug/habanalabs/hl0/mmu
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/set_power_state
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Sets the PCI power state. Valid values are "1" for D0 and "2"
|
||||
for D3Hot
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/userptr
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays a list with information about the current user
|
||||
pointers (user virtual addresses) that are pinned and mapped
|
||||
to DMA addresses
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/vm
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays a list with information about all the active virtual
|
||||
address mappings per ASID
|
23
Documentation/ABI/testing/debugfs-wilco-ec
Normal file
@@ -0,0 +1,23 @@
|
||||
What: /sys/kernel/debug/wilco_ec/raw
|
||||
Date: January 2019
|
||||
KernelVersion: 5.1
|
||||
Description:
|
||||
Write and read raw mailbox commands to the EC.
|
||||
|
||||
For writing:
|
||||
Bytes 0-1 indicate the message type:
|
||||
00 F0 = Execute Legacy Command
|
||||
00 F2 = Read/Write NVRAM Property
|
||||
Byte 2 provides the command code
|
||||
Bytes 3+ consist of the data passed in the request
|
||||
|
||||
At least three bytes are required, for the msg type and command,
|
||||
with additional bytes optional for additional data.
|
||||
|
||||
Example:
|
||||
// Request EC info type 3 (EC firmware build date)
|
||||
$ echo 00 f0 38 00 03 00 > raw
|
||||
// View the result. The decoded ASCII result "12/21/18" is
|
||||
// included after the raw hex.
|
||||
$ cat raw
|
||||
00 31 32 2f 32 31 2f 31 38 00 38 00 01 00 2f 00 .12/21/18.8...
|
@@ -244,7 +244,7 @@ Description:
|
||||
|
||||
What: /sys/block/<disk>/queue/zoned
|
||||
Date: September 2016
|
||||
Contact: Damien Le Moal <damien.lemoal@hgst.com>
|
||||
Contact: Damien Le Moal <damien.lemoal@wdc.com>
|
||||
Description:
|
||||
zoned indicates if the device is a zoned block device
|
||||
and the zone model of the device if it is indeed zoned.
|
||||
@@ -259,6 +259,14 @@ Description:
|
||||
zone commands, they will be treated as regular block
|
||||
devices and zoned will report "none".
|
||||
|
||||
What: /sys/block/<disk>/queue/nr_zones
|
||||
Date: November 2018
|
||||
Contact: Damien Le Moal <damien.lemoal@wdc.com>
|
||||
Description:
|
||||
nr_zones indicates the total number of zones of a zoned block
|
||||
device ("host-aware" or "host-managed" zone model). For regular
|
||||
block devices, the value is always 0.
|
||||
|
||||
What: /sys/block/<disk>/queue/chunk_sectors
|
||||
Date: September 2016
|
||||
Contact: Hannes Reinecke <hare@suse.com>
|
||||
@@ -268,6 +276,15 @@ Description:
|
||||
indicates the size in 512B sectors of the RAID volume
|
||||
stripe segment. For a zoned block device, either
|
||||
host-aware or host-managed, chunk_sectors indicates the
|
||||
size of 512B sectors of the zones of the device, with
|
||||
size in 512B sectors of the zones of the device, with
|
||||
the eventual exception of the last zone of the device
|
||||
which may be smaller.
|
||||
|
||||
What: /sys/block/<disk>/queue/io_timeout
|
||||
Date: November 2018
|
||||
Contact: Weiping Zhang <zhangweiping@didiglobal.com>
|
||||
Description:
|
||||
io_timeout is the request timeout in milliseconds. If a request
|
||||
does not complete in this time then the block driver timeout
|
||||
handler is invoked. That timeout handler can decide to retry
|
||||
the request, to fail it or to start a device recovery strategy.
|
||||
|
@@ -98,3 +98,42 @@ Description:
|
||||
The backing_dev file is read-write and set up backing
|
||||
device for zram to write incompressible pages.
|
||||
For using, user should enable CONFIG_ZRAM_WRITEBACK.
|
||||
|
||||
What: /sys/block/zram<id>/idle
|
||||
Date: November 2018
|
||||
Contact: Minchan Kim <minchan@kernel.org>
|
||||
Description:
|
||||
idle file is write-only and marks zram slot as idle.
|
||||
If system has mounted debugfs, user can see which slots
|
||||
are idle via /sys/kernel/debug/zram/zram<id>/block_state
|
||||
|
||||
What: /sys/block/zram<id>/writeback
|
||||
Date: November 2018
|
||||
Contact: Minchan Kim <minchan@kernel.org>
|
||||
Description:
|
||||
The writeback file is write-only and triggers idle and/or
|
||||
huge page writeback to backing device.
|
||||
|
||||
What: /sys/block/zram<id>/bd_stat
|
||||
Date: November 2018
|
||||
Contact: Minchan Kim <minchan@kernel.org>
|
||||
Description:
|
||||
The bd_stat file is read-only and represents backing device's
|
||||
statistics (bd_count, bd_reads, bd_writes) in a format
|
||||
similar to block layer statistics file format.
|
||||
|
||||
What: /sys/block/zram<id>/writeback_limit_enable
|
||||
Date: November 2018
|
||||
Contact: Minchan Kim <minchan@kernel.org>
|
||||
Description:
|
||||
The writeback_limit_enable file is read-write and specifies
|
||||
whether the writeback_limit feature is enabled. "1" enables the feature.
|
||||
"0" (no limit) is the initial state.
|
||||
|
||||
What: /sys/block/zram<id>/writeback_limit
|
||||
Date: November 2018
|
||||
Contact: Minchan Kim <minchan@kernel.org>
|
||||
Description:
|
||||
The writeback_limit file is read-write and specifies the maximum
|
||||
amount of writeback ZRAM can do. The limit could be changed
|
||||
in run time.
|
||||
|
146
Documentation/ABI/testing/sysfs-bus-i3c
Normal file
@@ -0,0 +1,146 @@
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
An I3C bus. This directory will contain one sub-directory per
|
||||
I3C device present on the bus.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/current_master
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
Expose the master that owns the bus (<bus-id>-<master-pid>) at
|
||||
the time this file is read. Note that bus ownership can change
|
||||
over time, so there's no guarantee that when the read() call
|
||||
returns, the value returned is still valid.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/mode
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
I3C bus mode. Can be "pure", "mixed-fast" or "mixed-slow". See
|
||||
the I3C specification for a detailed description of what each
|
||||
of these modes implies.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/i3c_scl_frequency
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
The frequency (expressed in Hz) of the SCL signal when
|
||||
operating in I3C SDR mode.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/i2c_scl_frequency
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
The frequency (expressed in Hz) of the SCL signal when
|
||||
operating in I2C mode.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/dynamic_address
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
Dynamic address assigned to the master controller. This
|
||||
address may change if the bus is re-initialized.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/bcr
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
BCR stands for Bus Characteristics Register and expresses the
|
||||
device capabilities in terms of speed, maximum read/write
|
||||
length, etc. See the I3C specification for more details.
|
||||
This entry describes the BCR of the master controller driving
|
||||
the bus.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/dcr
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
DCR stands for Device Characteristics Register and expresses the
|
||||
device capabilities in terms of exposed features. See the I3C
|
||||
specification for more details.
|
||||
This entry describes the DCR of the master controller driving
|
||||
the bus.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/pid
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
PID stands for Provisional ID and is used to uniquely identify
|
||||
a device on a bus. This PID contains information about the
|
||||
vendor, the part and an instance ID so that several devices of
|
||||
the same type can be connected on the same bus.
|
||||
See the I3C specification for more details.
|
||||
This entry describes the PID of the master controller driving
|
||||
the bus.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/hdrcap
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
Expose the HDR (High Data Rate) capabilities of a device.
|
||||
Returns a list of supported HDR modes, each element is separated
|
||||
by space. Modes can be "hdr-ddr", "hdr-tsp" and "hdr-tsl".
|
||||
See the I3C specification for more details about these HDR
|
||||
modes.
|
||||
This entry describes the HDRCAP of the master controller
|
||||
driving the bus.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
An I3C device present on I3C bus identified by <bus-id>. Note
|
||||
that all devices are represented including the master driving
|
||||
the bus.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>/dynamic_address
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
Dynamic address assigned to device <bus-id>-<device-pid>. This
|
||||
address may change if the bus is re-initialized.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>/bcr
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
BCR stands for Bus Characteristics Register and expresses the
|
||||
device capabilities in terms of speed, maximum read/write
|
||||
length, etc. See the I3C specification for more details.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>/dcr
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
DCR stands for Device Characteristics Register and expresses the
|
||||
device capabilities in terms of exposed features. See the I3C
|
||||
specification for more details.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>/pid
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
PID stands for Provisional ID and is used to uniquely identify
|
||||
a device on a bus. This PID contains information about the
|
||||
vendor, the part and an instance ID so that several devices of
|
||||
the same type can be connected on the same bus.
|
||||
See the I3C specification for more details.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>/hdrcap
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
Expose the HDR (High Data Rate) capabilities of a device.
|
||||
Returns a list of supported HDR modes, each element is separated
|
||||
by space. Modes can be "hdr-ddr", "hdr-tsp" and "hdr-tsl".
|
||||
See the I3C specification for more details about these HDR
|
||||
modes.
|
||||
|
||||
What: /sys/bus/i3c/devices/<bus-id>-<device-pid>
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
These directories are just symbolic links to
|
||||
/sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>.
|
@@ -1554,6 +1554,10 @@ What: /sys/bus/iio/devices/iio:deviceX/in_concentration_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_concentration_co2_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_co2_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_concentration_ethanol_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_ethanol_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_concentration_h2_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_h2_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_concentration_voc_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_voc_raw
|
||||
KernelVersion: 4.3
|
||||
@@ -1685,3 +1689,18 @@ Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Raw (unscaled) phase difference reading from channel Y
|
||||
that can be processed to radians.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm1_input
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm1_input
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm2p5_input
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm2p5_input
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm4_input
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm4_input
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm10_input
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm10_input
|
||||
KernelVersion: 4.22
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Mass concentration reading of particulate matter in ug / m3.
|
||||
pmX consists of particles with aerodynamic diameter less than or
|
||||
equal to X micrometers.
|
||||
|
28
Documentation/ABI/testing/sysfs-bus-iio-sps30
Normal file
@@ -0,0 +1,28 @@
|
||||
What: /sys/bus/iio/devices/iio:deviceX/start_cleaning
|
||||
Date: December 2018
|
||||
KernelVersion: 4.22
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Writing 1 starts sensor self cleaning. Internal fan accelerates
|
||||
to its maximum speed and keeps spinning for about 10 seconds in
|
||||
order to blow out accumulated dust.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/cleaning_period
|
||||
Date: January 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Sensor is capable of triggering self cleaning periodically.
|
||||
Period can be changed by writing a new value here. Upon reading
|
||||
the current one is returned. Units are seconds.
|
||||
|
||||
Writing 0 disables periodical self cleaning entirely.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/cleaning_period_available
|
||||
Date: January 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
The range of available values in seconds represented as the
|
||||
minimum value, the step and the maximum value, all enclosed in
|
||||
square brackets.
|
@@ -3,11 +3,13 @@ Date: June 2015
|
||||
KernelVersion: 4.3
|
||||
Contact: Alexander Shishkin <alexander.shishkin@linux.intel.com>
|
||||
Description: (RW) Writes of 1 or 0 enable or disable trace output to this
|
||||
output device. Reads return current status.
|
||||
output device. Reads return current status. Requires that the
|
||||
corresponding output port driver be loaded.
|
||||
|
||||
What: /sys/bus/intel_th/devices/<intel_th_id>-msc<msc-id>/port
|
||||
Date: June 2015
|
||||
KernelVersion: 4.3
|
||||
Contact: Alexander Shishkin <alexander.shishkin@linux.intel.com>
|
||||
Description: (RO) Port number, corresponding to this output device on the
|
||||
switch (GTH).
|
||||
switch (GTH) or "unassigned" if the corresponding output
|
||||
port driver is not loaded.
|
||||
|
@@ -21,6 +21,15 @@ Description: Holds a comma separated list of device unique_ids that
|
||||
If a device is authorized automatically during boot its
|
||||
boot attribute is set to 1.
|
||||
|
||||
What: /sys/bus/thunderbolt/devices/.../domainX/iommu_dma_protection
|
||||
Date: Mar 2019
|
||||
KernelVersion: 4.21
|
||||
Contact: thunderbolt-software@lists.01.org
|
||||
Description: This attribute tells whether the system uses IOMMU
|
||||
for DMA protection. Value of 1 means IOMMU is used, 0 means
|
||||
it is not (DMA protection is solely based on Thunderbolt
|
||||
security levels).
|
||||
|
||||
What: /sys/bus/thunderbolt/devices/.../domainX/security
|
||||
Date: Sep 2017
|
||||
KernelVersion: 4.13
|
||||
|
@@ -186,7 +186,7 @@ Contact: Lan Tianyu <tianyu.lan@intel.com>
|
||||
Description:
|
||||
Some platforms provide usb port connect types through ACPI.
|
||||
This attribute is to expose this information to user space.
|
||||
The file will read "hotplug", "wired" and "not used" if the
|
||||
The file will read "hotplug", "hardwired" and "not used" if the
|
||||
information is available, and "unknown" otherwise.
|
||||
|
||||
What: /sys/bus/usb/devices/.../(hub interface)/portX/location
|
||||
|
32
Documentation/ABI/testing/sysfs-class-chromeos
Normal file
@@ -0,0 +1,32 @@
|
||||
What: /sys/class/chromeos/<ec-device-name>/flashinfo
|
||||
Date: August 2015
|
||||
KernelVersion: 4.2
|
||||
Description:
|
||||
Show the EC flash information.
|
||||
|
||||
What: /sys/class/chromeos/<ec-device-name>/kb_wake_angle
|
||||
Date: March 2018
|
||||
KernelVersion: 4.17
|
||||
Description:
|
||||
Control the keyboard wake lid angle. Values are between
|
||||
0 and 360. This file will also show the keyboard wake lid
|
||||
angle by querying the hardware.
|
||||
|
||||
What: /sys/class/chromeos/<ec-device-name>/reboot
|
||||
Date: August 2015
|
||||
KernelVersion: 4.2
|
||||
Description:
|
||||
Tell the EC to reboot in various ways. Options are:
|
||||
"cancel": Cancel a pending reboot.
|
||||
"ro": Jump to RO without rebooting.
|
||||
"rw": Jump to RW without rebooting.
|
||||
"cold": Cold reboot.
|
||||
"disable-jump": Disable jump until next reboot.
|
||||
"hibernate": Hibernate the EC.
|
||||
"at-shutdown": Reboot after an AP shutdown.
|
||||
|
||||
What: /sys/class/chromeos/<ec-device-name>/version
|
||||
Date: August 2015
|
||||
KernelVersion: 4.2
|
||||
Description:
|
||||
Show the information about the EC software and hardware.
|
@@ -0,0 +1,74 @@
|
||||
What: /sys/class/chromeos/<ec-device-name>/lightbar/brightness
|
||||
Date: August 2015
|
||||
KernelVersion: 4.2
|
||||
Description:
|
||||
Writing to this file adjusts the overall brightness of
|
||||
the lightbar, separate from any color intensity. The
|
||||
valid range is 0 (off) to 255 (maximum brightness).
|
||||
|
||||
What: /sys/class/chromeos/<ec-device-name>/lightbar/interval_msec
|
||||
Date: August 2015
|
||||
KernelVersion: 4.2
|
||||
Description:
|
||||
The lightbar is controlled by an embedded controller (EC),
|
||||
which also manages the keyboard, battery charging, fans,
|
||||
and other system hardware. To prevent unprivileged users
|
||||
from interfering with the other EC functions, the rate at
|
||||
which the lightbar control files can be read or written is
|
||||
limited.
|
||||
|
||||
Reading this file will return the number of milliseconds
|
||||
that must elapse between accessing any of the lightbar
|
||||
functions through this interface. Going faster will simply
|
||||
block until the necessary interval has lapsed. The interval
|
||||
applies uniformly to all accesses of any kind by any user.
|
||||
|
||||
What: /sys/class/chromeos/<ec-device-name>/lightbar/led_rgb
|
||||
Date: August 2015
|
||||
KernelVersion: 4.2
|
||||
Description:
|
||||
This allows you to control each LED segment. If the
|
||||
lightbar is already running one of the automatic
|
||||
sequences, you probably won’t see anything change because
|
||||
your color setting will be almost immediately replaced.
|
||||
To get useful results, you should stop the lightbar
|
||||
sequence first.
|
||||
|
||||
The values written to this file are sets of four integers,
|
||||
indicating LED, RED, GREEN, BLUE. The LED number is 0 to 3
|
||||
to select a single segment, or 4 to set all four segments
|
||||
to the same value at once. The RED, GREEN, and BLUE
|
||||
numbers should be in the range 0 (off) to 255 (maximum).
|
||||
You can update more than one segment at a time by writing
|
||||
more than one set of four integers.
|
||||
|
||||
What: /sys/class/chromeos/<ec-device-name>/lightbar/program
|
||||
Date: August 2015
|
||||
KernelVersion: 4.2
|
||||
Description:
|
||||
This allows you to upload and run custom lightbar sequences.
|
||||
|
||||
What: /sys/class/chromeos/<ec-device-name>/lightbar/sequence
|
||||
Date: August 2015
|
||||
KernelVersion: 4.2
|
||||
Description:
|
||||
The Pixel lightbar has a number of built-in sequences
|
||||
that it displays under various conditions, such as at
|
||||
power on, shut down, or while running. Reading from this
|
||||
file displays the current sequence that the lightbar is
|
||||
displaying. Writing to this file allows you to change the
|
||||
sequence.
|
||||
|
||||
What: /sys/class/chromeos/<ec-device-name>/lightbar/userspace_control
|
||||
Date: August 2015
|
||||
KernelVersion: 4.2
|
||||
Description:
|
||||
This allows you to take control of the lightbar. This
|
||||
prevents the kernel from going through its normal
|
||||
sequences.
|
||||
|
||||
What: /sys/class/chromeos/<ec-device-name>/lightbar/version
|
||||
Date: August 2015
|
||||
KernelVersion: 4.2
|
||||
Description:
|
||||
Show the information about the lightbar version.
|
@@ -0,0 +1,6 @@
|
||||
What: /sys/class/chromeos/<ec-device-name>/vbc/vboot_context
|
||||
Date: October 2015
|
||||
KernelVersion: 4.4
|
||||
Description:
|
||||
Read/write the verified boot context data included on a
|
||||
small nvram space on some EC implementations.
|
@@ -7,55 +7,10 @@ Description:
|
||||
timer. It can do gradual dimming and step change of brightness.
|
||||
|
||||
The pattern is given by a series of tuples, of brightness and
|
||||
duration (ms). The LED is expected to traverse the series and
|
||||
each brightness value for the specified duration. Duration of
|
||||
0 means brightness should immediately change to new value, and
|
||||
writing malformed pattern deactivates any active one.
|
||||
duration (ms).
|
||||
|
||||
1. For gradual dimming, the dimming interval now is set as 50
|
||||
milliseconds. So the tuple with duration less than dimming
|
||||
interval (50ms) is treated as a step change of brightness,
|
||||
i.e. the subsequent brightness will be applied without adding
|
||||
intervening dimming intervals.
|
||||
|
||||
The gradual dimming format of the software pattern values should be:
|
||||
"brightness_1 duration_1 brightness_2 duration_2 brightness_3
|
||||
duration_3 ...". For example:
|
||||
|
||||
echo 0 1000 255 2000 > pattern
|
||||
|
||||
It will make the LED go gradually from zero-intensity to max (255)
|
||||
intensity in 1000 milliseconds, then back to zero intensity in 2000
|
||||
milliseconds:
|
||||
|
||||
LED brightness
|
||||
^
|
||||
255-| / \ / \ /
|
||||
| / \ / \ /
|
||||
| / \ / \ /
|
||||
| / \ / \ /
|
||||
0-| / \/ \/
|
||||
+---0----1----2----3----4----5----6------------> time (s)
|
||||
|
||||
2. To make the LED go instantly from one brightness value to another,
|
||||
we should use zero-time lengths (the brightness must be same as
|
||||
the previous tuple's). So the format should be:
|
||||
"brightness_1 duration_1 brightness_1 0 brightness_2 duration_2
|
||||
brightness_2 0 ...". For example:
|
||||
|
||||
echo 0 1000 0 0 255 2000 255 0 > pattern
|
||||
|
||||
It will make the LED stay off for one second, then stay at max brightness
|
||||
for two seconds:
|
||||
|
||||
LED brightness
|
||||
^
|
||||
255-| +---------+ +---------+
|
||||
| | | | |
|
||||
| | | | |
|
||||
| | | | |
|
||||
0-| -----+ +----+ +----
|
||||
+---0----1----2----3----4----5----6------------> time (s)
|
||||
The exact format is described in:
|
||||
Documentation/devicetree/bindings/leds/leds-trigger-pattern.txt
|
||||
|
||||
What: /sys/class/leds/<led>/hw_pattern
|
||||
Date: September 2018
|
||||
|
@@ -49,3 +49,26 @@ Contact: Wim Van Sebroeck <wim@iguana.be>
|
||||
Description:
|
||||
It is a read only file. It is read to know about current
|
||||
value of timeout programmed.
|
||||
|
||||
What: /sys/class/watchdog/watchdogn/pretimeout
|
||||
Date: December 2016
|
||||
Contact: Wim Van Sebroeck <wim@iguana.be>
|
||||
Description:
|
||||
It is a read only file. It specifies the time in seconds before
|
||||
timeout when the pretimeout interrupt is delivered. Pretimeout
|
||||
is an optional feature.
|
||||
|
||||
What: /sys/class/watchdog/watchdogn/pretimeout_available_governors
|
||||
Date: February 2017
|
||||
Contact: Wim Van Sebroeck <wim@iguana.be>
|
||||
Description:
|
||||
It is a read only file. It shows the pretimeout governors
|
||||
available for this watchdog.
|
||||
|
||||
What: /sys/class/watchdog/watchdogn/pretimeout_governor
|
||||
Date: February 2017
|
||||
Contact: Wim Van Sebroeck <wim@iguana.be>
|
||||
Description:
|
||||
It is a read/write file. When read, the currently assigned
|
||||
pretimeout governor is returned. When written, it sets
|
||||
the pretimeout governor.
|
||||
|
10
Documentation/ABI/testing/sysfs-devices-software_node
Normal file
@@ -0,0 +1,10 @@
|
||||
What: /sys/devices/.../software_node/
|
||||
Date: January 2019
|
||||
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||
Description:
|
||||
This directory contains the details about the device that are
|
||||
assigned in kernel (i.e. software), as opposed to the
|
||||
firmware_node directory which contains the details that are
|
||||
assigned for the device in firmware. The main attributes in the
|
||||
directory will show the properties the device has, and the
|
||||
relationship it has to some of the other devices.
|
@@ -145,6 +145,8 @@ What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/name
|
||||
/sys/devices/system/cpu/cpuX/cpuidle/stateN/power
|
||||
/sys/devices/system/cpu/cpuX/cpuidle/stateN/time
|
||||
/sys/devices/system/cpu/cpuX/cpuidle/stateN/usage
|
||||
/sys/devices/system/cpu/cpuX/cpuidle/stateN/above
|
||||
/sys/devices/system/cpu/cpuX/cpuidle/stateN/below
|
||||
Date: September 2007
|
||||
KernelVersion: v2.6.24
|
||||
Contact: Linux power management list <linux-pm@vger.kernel.org>
|
||||
@@ -166,6 +168,11 @@ Description:
|
||||
|
||||
usage: (RO) Number of times this state was entered (a count).
|
||||
|
||||
above: (RO) Number of times this state was entered, but the
|
||||
observed CPU idle duration was too short for it (a count).
|
||||
|
||||
below: (RO) Number of times this state was entered, but the
|
||||
observed CPU idle duration was too long for it (a count).
|
||||
|
||||
What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/desc
|
||||
Date: February 2008
|
||||
|
190
Documentation/ABI/testing/sysfs-driver-habanalabs
Normal file
@@ -0,0 +1,190 @@
|
||||
What: /sys/class/habanalabs/hl<n>/armcp_kernel_ver
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Version of the Linux kernel running on the device's CPU
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/armcp_ver
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Version of the application running on the device's CPU
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/cpld_ver
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Version of the Device's CPLD F/W
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/device_type
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays the code name of the device according to its type.
|
||||
The supported values are: "GOYA"
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/eeprom
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: A binary file attribute that contains the contents of the
|
||||
on-board EEPROM
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/fuse_ver
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays the device's version from the eFuse
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/hard_reset
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Interface to trigger a hard-reset operation for the device.
|
||||
Hard-reset will reset ALL internal components of the device
|
||||
except for the PCI interface and the internal PLLs
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/hard_reset_cnt
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays how many times the device has undergone a hard-reset
|
||||
operation since the driver was loaded
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/high_pll
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Allows the user to set the maximum clock frequency for MME, TPC
|
||||
and IC when the power management profile is set to "automatic".
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/ic_clk
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Allows the user to set the maximum clock frequency of the
|
||||
Interconnect fabric. Writes to this parameter affect the device
|
||||
only when the power management profile is set to "manual" mode.
|
||||
The device IC clock might be set to a lower value than the
|
||||
maximum. The user should read the ic_clk_curr to see the actual
|
||||
frequency value of the IC
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/ic_clk_curr
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays the current clock frequency of the Interconnect fabric
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/infineon_ver
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Version of the Device's power supply F/W code
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/max_power
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Allows the user to set the maximum power consumption of the
|
||||
device in milliwatts.
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/mme_clk
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Allows the user to set the maximum clock frequency of the
|
||||
MME compute engine. Writes to this parameter affect the device
|
||||
only when the power management profile is set to "manual" mode.
|
||||
The device MME clock might be set to a lower value than the
|
||||
maximum. The user should read the mme_clk_curr to see the actual
|
||||
frequency value of the MME
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/mme_clk_curr
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays the current clock frequency of the MME compute engine
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/pci_addr
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays the PCI address of the device. This is needed so the
|
||||
user would be able to open a device based on its PCI address
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/pm_mng_profile
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Power management profile. Values are "auto", "manual". In "auto"
|
||||
mode, the driver will set the maximum clock frequency to a high
|
||||
value when a user-space process opens the device's file (unless
|
||||
it was already opened by another process). The driver will set
|
||||
the max clock frequency to a low value when there are no user
|
||||
processes that are opened on the device's file. In "manual"
|
||||
mode, the user sets the maximum clock frequency by writing to
|
||||
ic_clk, mme_clk and tpc_clk
|
||||
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/preboot_btl_ver
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Version of the device's preboot F/W code
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/soft_reset
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Interface to trigger a soft-reset operation for the device.
|
||||
Soft-reset will reset only the compute and DMA engines of the
|
||||
device
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/soft_reset_cnt
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays how many times the device has undergone a soft-reset
|
||||
operation since the driver was loaded
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/status
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Status of the card: "Operational", "Malfunction", "In reset".
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/thermal_ver
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Version of the Device's thermal daemon
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/tpc_clk
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Allows the user to set the maximum clock frequency of the
|
||||
TPC compute engines. Writes to this parameter affect the device
|
||||
only when the power management profile is set to "manual" mode.
|
||||
The device TPC clock might be set to a lower value than the
|
||||
maximum. The user should read the tpc_clk_curr to see the actual
|
||||
frequency value of the TPC
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/tpc_clk_curr
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays the current clock frequency of the TPC compute engines
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/uboot_ver
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Version of the u-boot running on the device's CPU
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/write_open_cnt
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays the total number of user processes that are currently
|
||||
opened on the device's file
|
@@ -109,3 +109,10 @@ Description:
|
||||
write operation (since a 4k random write might turn
|
||||
into a much larger write due to the zeroout
|
||||
operation).
|
||||
|
||||
What: /sys/fs/ext4/<disk>/journal_task
|
||||
Date: February 2019
|
||||
Contact: "Theodore Ts'o" <tytso@mit.edu>
|
||||
Description:
|
||||
This file is read-only and shows the pid of journal thread in
|
||||
current pid-namespace or 0 if task is unreachable.
|
||||
|
@@ -86,12 +86,28 @@ Description:
|
||||
The unit size is one block, now only support configuring in range
|
||||
of [1, 512].
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/umount_discard_timeout
|
||||
Date: January 2019
|
||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||
Description:
|
||||
Set timeout to issue discard commands during umount.
|
||||
Default: 5 secs
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/max_victim_search
|
||||
Date: January 2014
|
||||
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||
Description:
|
||||
Controls the number of trials to find a victim segment.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/migration_granularity
|
||||
Date: October 2018
|
||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||
Description:
|
||||
Controls migration granularity of garbage collection on large
|
||||
section, it can let GC move partial segment{s} of one section
|
||||
in one GC cycle, spreading the heavy overhead of GC across
|
||||
multiple lightweight cycles.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/dir_level
|
||||
Date: March 2014
|
||||
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||
|
@@ -33,18 +33,6 @@ Description:
|
||||
An attribute which indicates whether the patch is currently in
|
||||
transition.
|
||||
|
||||
What: /sys/kernel/livepatch/<patch>/signal
|
||||
Date: Nov 2017
|
||||
KernelVersion: 4.15.0
|
||||
Contact: live-patching@vger.kernel.org
|
||||
Description:
|
||||
A writable attribute that allows administrator to affect the
|
||||
course of an existing transition. Writing 1 sends a fake
|
||||
signal to all remaining blocking tasks. The fake signal
|
||||
means that no proper signal is delivered (there is no data in
|
||||
signal pending structures). Tasks are interrupted or woken up,
|
||||
and forced to change their patched state.
|
||||
|
||||
What: /sys/kernel/livepatch/<patch>/force
|
||||
Date: Nov 2017
|
||||
KernelVersion: 4.15.0
|
||||
|
@@ -146,114 +146,75 @@ What about block I/O and networking buffers? The block I/O and
|
||||
networking subsystems make sure that the buffers they use are valid
|
||||
for you to DMA from/to.
|
||||
|
||||
DMA addressing limitations
|
||||
DMA addressing capabilities
|
||||
==========================
|
||||
|
||||
Does your device have any DMA addressing limitations? For example, is
|
||||
your device only capable of driving the low order 24-bits of address?
|
||||
If so, you need to inform the kernel of this fact.
|
||||
By default, the kernel assumes that your device can address 32-bits of DMA
|
||||
addressing. For a 64-bit capable device, this needs to be increased, and for
|
||||
a device with limitations, it needs to be decreased.
|
||||
|
||||
By default, the kernel assumes that your device can address the full
|
||||
32-bits. For a 64-bit capable device, this needs to be increased.
|
||||
And for a device with limitations, as discussed in the previous
|
||||
paragraph, it needs to be decreased.
|
||||
Special note about PCI: PCI-X specification requires PCI-X devices to support
|
||||
64-bit addressing (DAC) for all transactions. And at least one platform (SGI
|
||||
SN2) requires 64-bit consistent allocations to operate correctly when the IO
|
||||
bus is in PCI-X mode.
|
||||
|
||||
Special note about PCI: PCI-X specification requires PCI-X devices to
|
||||
support 64-bit addressing (DAC) for all transactions. And at least
|
||||
one platform (SGI SN2) requires 64-bit consistent allocations to
|
||||
operate correctly when the IO bus is in PCI-X mode.
|
||||
For correct operation, you must set the DMA mask to inform the kernel about
|
||||
your device's DMA addressing capabilities.
|
||||
|
||||
For correct operation, you must interrogate the kernel in your device
|
||||
probe routine to see if the DMA controller on the machine can properly
|
||||
support the DMA addressing limitation your device has. It is good
|
||||
style to do this even if your device holds the default setting,
|
||||
because this shows that you did think about these issues wrt. your
|
||||
device.
|
||||
|
||||
The query is performed via a call to dma_set_mask_and_coherent()::
|
||||
This is performed via a call to dma_set_mask_and_coherent()::
|
||||
|
||||
int dma_set_mask_and_coherent(struct device *dev, u64 mask);
|
||||
|
||||
which will query the mask for both streaming and coherent APIs together.
|
||||
If you have some special requirements, then the following two separate
|
||||
queries can be used instead:
|
||||
which will set the mask for both streaming and coherent APIs together. If you
|
||||
have some special requirements, then the following two separate calls can be
|
||||
used instead:
|
||||
|
||||
The query for streaming mappings is performed via a call to
|
||||
The setup for streaming mappings is performed via a call to
|
||||
dma_set_mask()::
|
||||
|
||||
int dma_set_mask(struct device *dev, u64 mask);
|
||||
|
||||
The query for consistent allocations is performed via a call
|
||||
The setup for consistent allocations is performed via a call
|
||||
to dma_set_coherent_mask()::
|
||||
|
||||
int dma_set_coherent_mask(struct device *dev, u64 mask);
|
||||
|
||||
Here, dev is a pointer to the device struct of your device, and mask
|
||||
is a bit mask describing which bits of an address your device
|
||||
supports. It returns zero if your card can perform DMA properly on
|
||||
the machine given the address mask you provided. In general, the
|
||||
device struct of your device is embedded in the bus-specific device
|
||||
struct of your device. For example, &pdev->dev is a pointer to the
|
||||
device struct of a PCI device (pdev is a pointer to the PCI device
|
||||
struct of your device).
|
||||
Here, dev is a pointer to the device struct of your device, and mask is a bit
|
||||
mask describing which bits of an address your device supports. Often the
|
||||
device struct of your device is embedded in the bus-specific device struct of
|
||||
your device. For example, &pdev->dev is a pointer to the device struct of a
|
||||
PCI device (pdev is a pointer to the PCI device struct of your device).
|
||||
|
||||
If it returns non-zero, your device cannot perform DMA properly on
|
||||
this platform, and attempting to do so will result in undefined
|
||||
behavior. You must either use a different mask, or not use DMA.
|
||||
These calls usually return zero to indicate that your device can perform DMA
|
||||
properly on the machine given the address mask you provided, but they might
|
||||
return an error if the mask is too small to be supportable on the given
|
||||
system. If it returns non-zero, your device cannot perform DMA properly on
|
||||
this platform, and attempting to do so will result in undefined behavior.
|
||||
You must not use DMA on this device unless the dma_set_mask family of
|
||||
functions has returned success.
|
||||
|
||||
This means that in the failure case, you have three options:
|
||||
This means that in the failure case, you have two options:
|
||||
|
||||
1) Use another DMA mask, if possible (see below).
|
||||
2) Use some non-DMA mode for data transfer, if possible.
|
||||
3) Ignore this device and do not initialize it.
|
||||
1) Use some non-DMA mode for data transfer, if possible.
|
||||
2) Ignore this device and do not initialize it.
|
||||
|
||||
It is recommended that your driver print a kernel KERN_WARNING message
|
||||
when you end up performing either #2 or #3. In this manner, if a user
|
||||
of your driver reports that performance is bad or that the device is not
|
||||
even detected, you can ask them for the kernel messages to find out
|
||||
exactly why.
|
||||
It is recommended that your driver print a kernel KERN_WARNING message when
|
||||
setting the DMA mask fails. In this manner, if a user of your driver reports
|
||||
that performance is bad or that the device is not even detected, you can ask
|
||||
them for the kernel messages to find out exactly why.
|
||||
|
||||
The standard 32-bit addressing device would do something like this::
|
||||
The standard 64-bit addressing device would do something like this::
|
||||
|
||||
if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32))) {
|
||||
if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64))) {
|
||||
dev_warn(dev, "mydev: No suitable DMA available\n");
|
||||
goto ignore_this_device;
|
||||
}
|
||||
|
||||
Another common scenario is a 64-bit capable device. The approach here
|
||||
is to try for 64-bit addressing, but back down to a 32-bit mask that
|
||||
should not fail. The kernel may fail the 64-bit mask not because the
|
||||
platform is not capable of 64-bit addressing. Rather, it may fail in
|
||||
this case simply because 32-bit addressing is done more efficiently
|
||||
than 64-bit addressing. For example, Sparc64 PCI SAC addressing is
|
||||
more efficient than DAC addressing.
|
||||
If the device only supports 32-bit addressing for descriptors in the
|
||||
coherent allocations, but supports full 64-bits for streaming mappings
|
||||
it would look like this::
|
||||
|
||||
Here is how you would handle a 64-bit capable device which can drive
|
||||
all 64-bits when accessing streaming DMA::
|
||||
|
||||
int using_dac;
|
||||
|
||||
if (!dma_set_mask(dev, DMA_BIT_MASK(64))) {
|
||||
using_dac = 1;
|
||||
} else if (!dma_set_mask(dev, DMA_BIT_MASK(32))) {
|
||||
using_dac = 0;
|
||||
} else {
|
||||
dev_warn(dev, "mydev: No suitable DMA available\n");
|
||||
goto ignore_this_device;
|
||||
}
|
||||
|
||||
If a card is capable of using 64-bit consistent allocations as well,
|
||||
the case would look like this::
|
||||
|
||||
int using_dac, consistent_using_dac;
|
||||
|
||||
if (!dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64))) {
|
||||
using_dac = 1;
|
||||
consistent_using_dac = 1;
|
||||
} else if (!dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32))) {
|
||||
using_dac = 0;
|
||||
consistent_using_dac = 0;
|
||||
} else {
|
||||
if (dma_set_mask(dev, DMA_BIT_MASK(64))) {
|
||||
dev_warn(dev, "mydev: No suitable DMA available\n");
|
||||
goto ignore_this_device;
|
||||
}
|
||||
|
@@ -58,15 +58,6 @@ specify the ``GFP_`` flags (see kmalloc()) for the allocation (the
|
||||
implementation may choose to ignore flags that affect the location of
|
||||
the returned memory, like GFP_DMA).
|
||||
|
||||
::
|
||||
|
||||
void *
|
||||
dma_zalloc_coherent(struct device *dev, size_t size,
|
||||
dma_addr_t *dma_handle, gfp_t flag)
|
||||
|
||||
Wraps dma_alloc_coherent() and also zeroes the returned memory if the
|
||||
allocation attempt succeeded.
|
||||
|
||||
::
|
||||
|
||||
void
|
||||
@@ -204,6 +195,14 @@ Requesting the required mask does not alter the current mask. If you
|
||||
wish to take advantage of it, you should issue a dma_set_mask()
|
||||
call to set the mask to the value returned.
|
||||
|
||||
::
|
||||
|
||||
size_t
|
||||
dma_max_mapping_size(struct device *dev);
|
||||
|
||||
Returns the maximum size of a mapping for the device. The size parameter
|
||||
of the mapping functions like dma_map_single(), dma_map_page() and
|
||||
others should not be larger than the returned value.
|
||||
|
||||
Part Id - Streaming DMA mappings
|
||||
--------------------------------
|
||||
@@ -539,8 +538,8 @@ that simply cannot make consistent memory.
|
||||
dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
|
||||
dma_addr_t dma_handle, unsigned long attrs)
|
||||
|
||||
Free memory allocated by the dma_alloc_attrs(). All parameters common
|
||||
parameters must identical to those otherwise passed to dma_fre_coherent,
|
||||
Free memory allocated by the dma_alloc_attrs(). All common
|
||||
parameters must be identical to those otherwise passed to dma_free_coherent,
|
||||
and the attrs argument must be identical to the attrs passed to
|
||||
dma_alloc_attrs().
|
||||
|
||||
@@ -575,8 +574,7 @@ boundaries when doing this.
|
||||
|
||||
int
|
||||
dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
|
||||
dma_addr_t device_addr, size_t size, int
|
||||
flags)
|
||||
dma_addr_t device_addr, size_t size);
|
||||
|
||||
Declare region of memory to be handed out by dma_alloc_coherent() when
|
||||
it's asked for coherent memory for this device.
|
||||
@@ -590,12 +588,6 @@ dma_addr_t in dma_alloc_coherent()).
|
||||
|
||||
size is the size of the area (must be a multiple of PAGE_SIZE).
|
||||
|
||||
flags can be ORed together and are:
|
||||
|
||||
- DMA_MEMORY_EXCLUSIVE - only allocate memory from the declared regions.
|
||||
Do not allow dma_alloc_coherent() to fall back to system memory when
|
||||
it's out of memory in the declared region.
|
||||
|
||||
As a simplification for the platforms, only *one* such region of
|
||||
memory may be declared per device.
|
||||
|
||||
@@ -614,23 +606,6 @@ unconditionally having removed all the required structures. It is the
|
||||
driver's job to ensure that no parts of this memory region are
|
||||
currently in use.
|
||||
|
||||
::
|
||||
|
||||
void *
|
||||
dma_mark_declared_memory_occupied(struct device *dev,
|
||||
dma_addr_t device_addr, size_t size)
|
||||
|
||||
This is used to occupy specific regions of the declared space
|
||||
(dma_alloc_coherent() will hand out the first free region it finds).
|
||||
|
||||
device_addr is the *device* address of the region requested.
|
||||
|
||||
size is the size (and should be a page-sized multiple).
|
||||
|
||||
The return value will be either a pointer to the processor virtual
|
||||
address of the memory, or an error (via PTR_ERR()) if any part of the
|
||||
region is occupied.
|
||||
|
||||
Part III - Debug drivers use of the DMA-API
|
||||
-------------------------------------------
|
||||
|
||||
@@ -705,6 +680,9 @@ dma-api/disabled This read-only file contains the character 'Y'
|
||||
happen when it runs out of memory or if it was
|
||||
disabled at boot time
|
||||
|
||||
dma-api/dump This read-only file contains current DMA
|
||||
mappings.
|
||||
|
||||
dma-api/error_count This file is read-only and shows the total
|
||||
numbers of errors found.
|
||||
|
||||
@@ -717,13 +695,16 @@ dma-api/num_errors The number in this file shows how many
|
||||
dma-api/min_free_entries This read-only file can be read to get the
|
||||
minimum number of free dma_debug_entries the
|
||||
allocator has ever seen. If this value goes
|
||||
down to zero the code will disable itself
|
||||
because it is not longer reliable.
|
||||
down to zero the code will attempt to increase
|
||||
nr_total_entries to compensate.
|
||||
|
||||
dma-api/num_free_entries The current number of free dma_debug_entries
|
||||
in the allocator.
|
||||
|
||||
dma-api/driver-filter You can write a name of a driver into this file
|
||||
dma-api/nr_total_entries The total number of dma_debug_entries in the
|
||||
allocator, both free and used.
|
||||
|
||||
dma-api/driver_filter You can write a name of a driver into this file
|
||||
to limit the debug output to requests from that
|
||||
particular driver. Write an empty string to
|
||||
that file to disable the filter and see
|
||||
@@ -742,10 +723,15 @@ driver filter at boot time. The debug code will only print errors for that
|
||||
driver afterwards. This filter can be disabled or changed later using debugfs.
|
||||
|
||||
When the code disables itself at runtime this is most likely because it ran
|
||||
out of dma_debug_entries. These entries are preallocated at boot. The number
|
||||
of preallocated entries is defined per architecture. If it is too low for you
|
||||
boot with 'dma_debug_entries=<your_desired_number>' to overwrite the
|
||||
architectural default.
|
||||
out of dma_debug_entries and was unable to allocate more on-demand. 65536
|
||||
entries are preallocated at boot - if this is too low for you boot with
|
||||
'dma_debug_entries=<your_desired_number>' to overwrite the default. Note
|
||||
that the code allocates entries in batches, so the exact number of
|
||||
preallocated entries may be greater than the actual number requested. The
|
||||
code will print to the kernel log each time it has dynamically allocated
|
||||
as many entries as were initially preallocated. This is to indicate that a
|
||||
larger preallocation size may be appropriate, or if it happens continually
|
||||
that a driver may be leaking mappings.
|
||||
|
||||
::
|
||||
|
||||
|
@@ -52,8 +52,8 @@ Address translation
|
||||
-------------------
|
||||
|
||||
To translate the virtual address to a bus address, use the normal DMA
|
||||
API. Do _not_ use isa_virt_to_phys() even though it does the same
|
||||
thing. The reason for this is that the function isa_virt_to_phys()
|
||||
API. Do _not_ use isa_virt_to_bus() even though it does the same
|
||||
thing. The reason for this is that the function isa_virt_to_bus()
|
||||
will require a Kconfig dependency to ISA, not just ISA_DMA_API which
|
||||
is really all you need. Remember that even though the DMA controller
|
||||
has its origins in ISA it is used elsewhere.
|
||||
|
@@ -31,14 +31,13 @@
|
||||
#define YBLANK 38
|
||||
#define XOFFSET 8
|
||||
#define XPULSE 144
|
||||
#define YOFFSET (63+3)
|
||||
#define YPULSE (63+6)
|
||||
#define YOFFSET 3
|
||||
#define YPULSE 6
|
||||
#define DPI 72
|
||||
#define VFREQ 60 /* Hz */
|
||||
#define TIMING_NAME "Linux XGA"
|
||||
#define ESTABLISHED_TIMING2_BITS 0x08 /* Bit 3 -> 1024x768 @60 Hz */
|
||||
#define HSYNC_POL 0
|
||||
#define VSYNC_POL 0
|
||||
#define CRC 0x55
|
||||
|
||||
#include "edid.S"
|
||||
|
@@ -31,14 +31,13 @@
|
||||
#define YBLANK 42
|
||||
#define XOFFSET 48
|
||||
#define XPULSE 112
|
||||
#define YOFFSET (63+1)
|
||||
#define YPULSE (63+3)
|
||||
#define YOFFSET 1
|
||||
#define YPULSE 3
|
||||
#define DPI 72
|
||||
#define VFREQ 60 /* Hz */
|
||||
#define TIMING_NAME "Linux SXGA"
|
||||
/* No ESTABLISHED_TIMINGx_BITS */
|
||||
#define HSYNC_POL 1
|
||||
#define VSYNC_POL 1
|
||||
#define CRC 0xa0
|
||||
|
||||
#include "edid.S"
|
||||
|
@@ -31,14 +31,13 @@
|
||||
#define YBLANK 50
|
||||
#define XOFFSET 64
|
||||
#define XPULSE 192
|
||||
#define YOFFSET (63+1)
|
||||
#define YPULSE (63+3)
|
||||
#define YOFFSET 1
|
||||
#define YPULSE 3
|
||||
#define DPI 72
|
||||
#define VFREQ 60 /* Hz */
|
||||
#define TIMING_NAME "Linux UXGA"
|
||||
/* No ESTABLISHED_TIMINGx_BITS */
|
||||
#define HSYNC_POL 1
|
||||
#define VSYNC_POL 1
|
||||
#define CRC 0x9d
|
||||
|
||||
#include "edid.S"
|
||||
|
@@ -31,14 +31,13 @@
|
||||
#define YBLANK 39
|
||||
#define XOFFSET 104
|
||||
#define XPULSE 176
|
||||
#define YOFFSET (63+3)
|
||||
#define YPULSE (63+6)
|
||||
#define YOFFSET 3
|
||||
#define YPULSE 6
|
||||
#define DPI 96
|
||||
#define VFREQ 60 /* Hz */
|
||||
#define TIMING_NAME "Linux WSXGA"
|
||||
/* No ESTABLISHED_TIMINGx_BITS */
|
||||
#define HSYNC_POL 1
|
||||
#define VSYNC_POL 1
|
||||
#define CRC 0x26
|
||||
|
||||
#include "edid.S"
|
||||
|
@@ -31,14 +31,13 @@
|
||||
#define YBLANK 45
|
||||
#define XOFFSET 88
|
||||
#define XPULSE 44
|
||||
#define YOFFSET (63+4)
|
||||
#define YPULSE (63+5)
|
||||
#define YOFFSET 4
|
||||
#define YPULSE 5
|
||||
#define DPI 96
|
||||
#define VFREQ 60 /* Hz */
|
||||
#define TIMING_NAME "Linux FHD"
|
||||
/* No ESTABLISHED_TIMINGx_BITS */
|
||||
#define HSYNC_POL 1
|
||||
#define VSYNC_POL 1
|
||||
#define CRC 0x05
|
||||
|
||||
#include "edid.S"
|
||||
|
@@ -28,14 +28,13 @@
|
||||
#define YBLANK 28
|
||||
#define XOFFSET 40
|
||||
#define XPULSE 128
|
||||
#define YOFFSET (63+1)
|
||||
#define YPULSE (63+4)
|
||||
#define YOFFSET 1
|
||||
#define YPULSE 4
|
||||
#define DPI 72
|
||||
#define VFREQ 60 /* Hz */
|
||||
#define TIMING_NAME "Linux SVGA"
|
||||
#define ESTABLISHED_TIMING1_BITS 0x01 /* Bit 0: 800x600 @ 60Hz */
|
||||
#define HSYNC_POL 1
|
||||
#define VSYNC_POL 1
|
||||
#define CRC 0xc2
|
||||
|
||||
#include "edid.S"
|
||||
|
@@ -45,14 +45,5 @@ EDID:
|
||||
|
||||
#define YPIX vdisp
|
||||
#define YBLANK vtotal-vdisp
|
||||
#define YOFFSET (63+(vsyncstart-vdisp))
|
||||
#define YPULSE (63+(vsyncend-vsyncstart))
|
||||
|
||||
The CRC value in the last line
|
||||
#define CRC 0x55
|
||||
also is a bit tricky. After a first version of the binary data set is
|
||||
created, it must be checked with the "edid-decode" utility which will
|
||||
most probably complain about a wrong CRC. Fortunately, the utility also
|
||||
displays the correct CRC which must then be inserted into the source
|
||||
file. After the make procedure is repeated, the EDID data set is ready
|
||||
to be used.
|
||||
#define YOFFSET vsyncstart-vdisp
|
||||
#define YPULSE vsyncend-vsyncstart
|
||||
|
@@ -15,10 +15,21 @@ clean:
|
||||
%.o: %.S
|
||||
@cc -c $^
|
||||
|
||||
%.bin: %.o
|
||||
%.bin.nocrc: %.o
|
||||
@objcopy -Obinary $^ $@
|
||||
|
||||
%.bin.ihex: %.o
|
||||
%.crc: %.bin.nocrc
|
||||
@list=$$(for i in `seq 1 127`; do head -c$$i $^ | tail -c1 \
|
||||
| hexdump -v -e '/1 "%02X+"'; done); \
|
||||
echo "ibase=16;100-($${list%?})%100" | bc >$@
|
||||
|
||||
%.p: %.crc %.S
|
||||
@cc -c -DCRC="$$(cat $*.crc)" -o $@ $*.S
|
||||
|
||||
%.bin: %.p
|
||||
@objcopy -Obinary $^ $@
|
||||
|
||||
%.bin.ihex: %.p
|
||||
@objcopy -Oihex $^ $@
|
||||
@dos2unix $@ 2>/dev/null
|
||||
|
||||
|
@@ -47,9 +47,11 @@
|
||||
#define mfgname2id(v1,v2,v3) \
|
||||
((((v1-'@')&0x1f)<<10)+(((v2-'@')&0x1f)<<5)+((v3-'@')&0x1f))
|
||||
#define swap16(v1) ((v1>>8)+((v1&0xff)<<8))
|
||||
#define lsbs2(v1,v2) (((v1&0x0f)<<4)+(v2&0x0f))
|
||||
#define msbs2(v1,v2) ((((v1>>8)&0x0f)<<4)+((v2>>8)&0x0f))
|
||||
#define msbs4(v1,v2,v3,v4) \
|
||||
(((v1&0x03)>>2)+((v2&0x03)>>4)+((v3&0x03)>>6)+((v4&0x03)>>8))
|
||||
((((v1>>8)&0x03)<<6)+(((v2>>8)&0x03)<<4)+\
|
||||
(((v3>>4)&0x03)<<2)+((v4>>4)&0x03))
|
||||
#define pixdpi2mm(pix,dpi) ((pix*25)/dpi)
|
||||
#define xsize pixdpi2mm(XPIX,DPI)
|
||||
#define ysize pixdpi2mm(YPIX,DPI)
|
||||
@@ -200,9 +202,9 @@ y_msbs: .byte msbs2(YPIX,YBLANK)
|
||||
x_snc_off_lsb: .byte XOFFSET&0xff
|
||||
/* Horizontal sync pulse width pixels 8 lsbits (0-1023) */
|
||||
x_snc_pls_lsb: .byte XPULSE&0xff
|
||||
/* Bits 7-4 Vertical sync offset lines 4 lsbits -63)
|
||||
Bits 3-0 Vertical sync pulse width lines 4 lsbits -63) */
|
||||
y_snc_lsb: .byte ((YOFFSET-63)<<4)+(YPULSE-63)
|
||||
/* Bits 7-4 Vertical sync offset lines 4 lsbits (0-63)
|
||||
Bits 3-0 Vertical sync pulse width lines 4 lsbits (0-63) */
|
||||
y_snc_lsb: .byte lsbs2(YOFFSET, YPULSE)
|
||||
/* Bits 7-6 Horizontal sync offset pixels 2 msbits
|
||||
Bits 5-4 Horizontal sync pulse width pixels 2 msbits
|
||||
Bits 3-2 Vertical sync offset lines 2 msbits
|
||||
|
@@ -2,7 +2,7 @@
|
||||
# Makefile for Sphinx documentation
|
||||
#
|
||||
|
||||
subdir-y :=
|
||||
subdir-y := devicetree/bindings/
|
||||
|
||||
# You can set these variables from the command line.
|
||||
SPHINXBUILD = sphinx-build
|
||||
|
@@ -1,499 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!-- Creator: fig2dev Version 3.2 Patchlevel 5e -->
|
||||
|
||||
<!-- CreationDate: Wed Dec 9 17:26:09 2015 -->
|
||||
|
||||
<!-- Magnification: 2.000 -->
|
||||
|
||||
<svg
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:cc="http://creativecommons.org/ns#"
|
||||
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
xmlns:svg="http://www.w3.org/2000/svg"
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
||||
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
||||
width="5.7in"
|
||||
height="6.6in"
|
||||
viewBox="-44 -44 6838 7888"
|
||||
id="svg2"
|
||||
version="1.1"
|
||||
inkscape:version="0.48.4 r9939"
|
||||
sodipodi:docname="BigTreeClassicRCUBH.fig">
|
||||
<metadata
|
||||
id="metadata110">
|
||||
<rdf:RDF>
|
||||
<cc:Work
|
||||
rdf:about="">
|
||||
<dc:format>image/svg+xml</dc:format>
|
||||
<dc:type
|
||||
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
||||
<dc:title></dc:title>
|
||||
</cc:Work>
|
||||
</rdf:RDF>
|
||||
</metadata>
|
||||
<defs
|
||||
id="defs108">
|
||||
<marker
|
||||
inkscape:stockid="Arrow1Mend"
|
||||
orient="auto"
|
||||
refY="0.0"
|
||||
refX="0.0"
|
||||
id="Arrow1Mend"
|
||||
style="overflow:visible;">
|
||||
<path
|
||||
id="path3868"
|
||||
d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
|
||||
style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;"
|
||||
transform="scale(0.4) rotate(180) translate(10,0)" />
|
||||
</marker>
|
||||
<marker
|
||||
inkscape:stockid="Arrow2Mend"
|
||||
orient="auto"
|
||||
refY="0.0"
|
||||
refX="0.0"
|
||||
id="Arrow2Mend"
|
||||
style="overflow:visible;">
|
||||
<path
|
||||
id="path3886"
|
||||
style="fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;"
|
||||
d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
|
||||
transform="scale(0.6) rotate(180) translate(0,0)" />
|
||||
</marker>
|
||||
</defs>
|
||||
<sodipodi:namedview
|
||||
pagecolor="#ffffff"
|
||||
bordercolor="#666666"
|
||||
borderopacity="1"
|
||||
objecttolerance="10"
|
||||
gridtolerance="10"
|
||||
guidetolerance="10"
|
||||
inkscape:pageopacity="0"
|
||||
inkscape:pageshadow="2"
|
||||
inkscape:window-width="878"
|
||||
inkscape:window-height="1148"
|
||||
id="namedview106"
|
||||
showgrid="false"
|
||||
inkscape:zoom="1.3547758"
|
||||
inkscape:cx="256.5"
|
||||
inkscape:cy="297"
|
||||
inkscape:window-x="45"
|
||||
inkscape:window-y="24"
|
||||
inkscape:window-maximized="0"
|
||||
inkscape:current-layer="g4" />
|
||||
<g
|
||||
style="stroke-width:.025in; fill:none"
|
||||
id="g4">
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="450"
|
||||
y="0"
|
||||
width="6300"
|
||||
height="7350"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
|
||||
id="rect6" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="4950"
|
||||
y="4950"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect8" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="750"
|
||||
y="600"
|
||||
width="5700"
|
||||
height="3750"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
|
||||
id="rect10" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="0"
|
||||
y="450"
|
||||
width="6300"
|
||||
height="7350"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
|
||||
id="rect12" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="300"
|
||||
y="1050"
|
||||
width="5700"
|
||||
height="3750"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
|
||||
id="rect14" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="2850"
|
||||
cy="3900"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle16" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="3150"
|
||||
cy="3900"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle18" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="3450"
|
||||
cy="3900"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle20" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="1350"
|
||||
cy="5100"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle22" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="1650"
|
||||
cy="5100"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle24" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="1950"
|
||||
cy="5100"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle26" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="4350"
|
||||
cy="5100"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle28" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="4650"
|
||||
cy="5100"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle30" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="4950"
|
||||
cy="5100"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle32" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="1350,3450 2350,2590 "
|
||||
style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline34" />
|
||||
<!-- Arrowhead on XXXpoint 1350 3450 - 2444 2510-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="4950,3450 3948,2590 "
|
||||
style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline38" />
|
||||
<!-- Arrowhead on XXXpoint 4950 3450 - 3854 2510-->
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="750"
|
||||
y="3450"
|
||||
width="1800"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
|
||||
id="rect42" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="2250,5400 2250,4414 "
|
||||
style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline44" />
|
||||
<!-- Arrowhead on XXXpoint 2250 5400 - 2250 4290-->
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="1500"
|
||||
y="5400"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect48" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="300"
|
||||
y="6600"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect50" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="3750"
|
||||
y="3450"
|
||||
width="1800"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
|
||||
id="rect52" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="4500"
|
||||
y="5400"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect54" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="3300"
|
||||
y="6600"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect56" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="2250"
|
||||
y="1650"
|
||||
width="1800"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
|
||||
id="rect58" />
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="6450"
|
||||
y="300"
|
||||
fill="#000000"
|
||||
font-family="Helvetica"
|
||||
font-style="normal"
|
||||
font-weight="normal"
|
||||
font-size="192"
|
||||
text-anchor="end"
|
||||
id="text60">rcu_bh</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="3150"
|
||||
y="1950"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text62">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="3150"
|
||||
y="2250"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text64">rcu_node</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1650"
|
||||
y="3750"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text66">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1650"
|
||||
y="4050"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text68">rcu_node</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4650"
|
||||
y="4050"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text70">rcu_node</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4650"
|
||||
y="3750"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text72">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="2250"
|
||||
y="5700"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text74">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="2250"
|
||||
y="6000"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text76">rcu_data</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1050"
|
||||
y="6900"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text78">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1050"
|
||||
y="7200"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text80">rcu_data</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="5250"
|
||||
y="5700"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text82">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="5250"
|
||||
y="6000"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text84">rcu_data</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4050"
|
||||
y="6900"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text86">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4050"
|
||||
y="7200"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text88">rcu_data</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="450"
|
||||
y="1350"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="start"
|
||||
id="text90">struct rcu_state</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="6000"
|
||||
y="750"
|
||||
fill="#000000"
|
||||
font-family="Helvetica"
|
||||
font-style="normal"
|
||||
font-weight="normal"
|
||||
font-size="192"
|
||||
text-anchor="end"
|
||||
id="text92">rcu_sched</text>
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="5250,5400 5250,4414 "
|
||||
style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline94" />
|
||||
<!-- Arrowhead on XXXpoint 5250 5400 - 5250 4290-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="4050,6600 4050,4414 "
|
||||
style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline98" />
|
||||
<!-- Arrowhead on XXXpoint 4050 6600 - 4050 4290-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="1050,6600 1050,4414 "
|
||||
style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline102" />
|
||||
<!-- Arrowhead on XXXpoint 1050 6600 - 1050 4290-->
|
||||
</g>
|
||||
</svg>
|
Before Width: | Height: | Size: 13 KiB |
@@ -1,695 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!-- Creator: fig2dev Version 3.2 Patchlevel 5e -->
|
||||
|
||||
<!-- CreationDate: Wed Dec 9 17:20:02 2015 -->
|
||||
|
||||
<!-- Magnification: 2.000 -->
|
||||
|
||||
<svg
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:cc="http://creativecommons.org/ns#"
|
||||
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
xmlns:svg="http://www.w3.org/2000/svg"
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
||||
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
||||
width="5.7in"
|
||||
height="8.6in"
|
||||
viewBox="-44 -44 6838 10288"
|
||||
id="svg2"
|
||||
version="1.1"
|
||||
inkscape:version="0.48.4 r9939"
|
||||
sodipodi:docname="BigTreeClassicRCUBHdyntick.fig">
|
||||
<metadata
|
||||
id="metadata166">
|
||||
<rdf:RDF>
|
||||
<cc:Work
|
||||
rdf:about="">
|
||||
<dc:format>image/svg+xml</dc:format>
|
||||
<dc:type
|
||||
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
||||
<dc:title></dc:title>
|
||||
</cc:Work>
|
||||
</rdf:RDF>
|
||||
</metadata>
|
||||
<defs
|
||||
id="defs164">
|
||||
<marker
|
||||
inkscape:stockid="Arrow1Mend"
|
||||
orient="auto"
|
||||
refY="0.0"
|
||||
refX="0.0"
|
||||
id="Arrow1Mend"
|
||||
style="overflow:visible;">
|
||||
<path
|
||||
id="path3924"
|
||||
d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
|
||||
style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;"
|
||||
transform="scale(0.4) rotate(180) translate(10,0)" />
|
||||
</marker>
|
||||
<marker
|
||||
inkscape:stockid="Arrow2Lend"
|
||||
orient="auto"
|
||||
refY="0.0"
|
||||
refX="0.0"
|
||||
id="Arrow2Lend"
|
||||
style="overflow:visible;">
|
||||
<path
|
||||
id="path3936"
|
||||
style="fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;"
|
||||
d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
|
||||
transform="scale(1.1) rotate(180) translate(1,0)" />
|
||||
</marker>
|
||||
</defs>
|
||||
<sodipodi:namedview
|
||||
pagecolor="#ffffff"
|
||||
bordercolor="#666666"
|
||||
borderopacity="1"
|
||||
objecttolerance="10"
|
||||
gridtolerance="10"
|
||||
guidetolerance="10"
|
||||
inkscape:pageopacity="0"
|
||||
inkscape:pageshadow="2"
|
||||
inkscape:window-width="845"
|
||||
inkscape:window-height="988"
|
||||
id="namedview162"
|
||||
showgrid="false"
|
||||
inkscape:zoom="1.0452196"
|
||||
inkscape:cx="256.5"
|
||||
inkscape:cy="387.00003"
|
||||
inkscape:window-x="356"
|
||||
inkscape:window-y="61"
|
||||
inkscape:window-maximized="0"
|
||||
inkscape:current-layer="g4" />
|
||||
<g
|
||||
style="stroke-width:.025in; fill:none"
|
||||
id="g4">
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="450"
|
||||
y="0"
|
||||
width="6300"
|
||||
height="7350"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
|
||||
id="rect6" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="4950"
|
||||
y="4950"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect8" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="750"
|
||||
y="600"
|
||||
width="5700"
|
||||
height="3750"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
|
||||
id="rect10" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="5250,8100 5688,5912 "
|
||||
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
|
||||
id="polyline12" />
|
||||
<!-- Arrowhead on XXXpoint 5250 8100 - 5710 5790-->
|
||||
<polyline
|
||||
points="5714 6068 5704 5822 5598 6044 "
|
||||
style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
|
||||
id="polyline14" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="4050,9300 4486,7262 "
|
||||
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
|
||||
id="polyline16" />
|
||||
<!-- Arrowhead on XXXpoint 4050 9300 - 4512 7140-->
|
||||
<polyline
|
||||
points="4514 7418 4506 7172 4396 7394 "
|
||||
style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
|
||||
id="polyline18" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="1040,9300 1476,7262 "
|
||||
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
|
||||
id="polyline20" />
|
||||
<!-- Arrowhead on XXXpoint 1040 9300 - 1502 7140-->
|
||||
<polyline
|
||||
points="1504 7418 1496 7172 1386 7394 "
|
||||
style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
|
||||
id="polyline22" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="2240,8100 2676,6062 "
|
||||
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
|
||||
id="polyline24" />
|
||||
<!-- Arrowhead on XXXpoint 2240 8100 - 2702 5940-->
|
||||
<polyline
|
||||
points="2704 6218 2696 5972 2586 6194 "
|
||||
style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
|
||||
id="polyline26" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="0"
|
||||
y="450"
|
||||
width="6300"
|
||||
height="7350"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
|
||||
id="rect28" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="300"
|
||||
y="1050"
|
||||
width="5700"
|
||||
height="3750"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
|
||||
id="rect30" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="1350,3450 2350,2590 "
|
||||
style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline32" />
|
||||
<!-- Arrowhead on XXXpoint 1350 3450 - 2444 2510-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="4950,3450 3948,2590 "
|
||||
style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline36" />
|
||||
<!-- Arrowhead on XXXpoint 4950 3450 - 3854 2510-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="4050,6600 4050,4414 "
|
||||
style="stroke:#00d1d1;stroke-width:30.00455750000000066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline40" />
|
||||
<!-- Arrowhead on XXXpoint 4050 6600 - 4050 4290-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="1050,6600 1050,4414 "
|
||||
style="stroke:#00d1d1;stroke-width:30.00455750000000066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline44" />
|
||||
<!-- Arrowhead on XXXpoint 1050 6600 - 1050 4290-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="2250,5400 2250,4414 "
|
||||
style="stroke:#00d1d1;stroke-width:30.00455750000000066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline48" />
|
||||
<!-- Arrowhead on XXXpoint 2250 5400 - 2250 4290-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="2250,8100 2250,6364 "
|
||||
style="stroke:#00ff00;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline52" />
|
||||
<!-- Arrowhead on XXXpoint 2250 8100 - 2250 6240-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="1050,9300 1050,7564 "
|
||||
style="stroke:#00ff00;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline56" />
|
||||
<!-- Arrowhead on XXXpoint 1050 9300 - 1050 7440-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="4050,9300 4050,7564 "
|
||||
style="stroke:#00ff00;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline60" />
|
||||
<!-- Arrowhead on XXXpoint 4050 9300 - 4050 7440-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="5250,8100 5250,6364 "
|
||||
style="stroke:#00ff00;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline64" />
|
||||
<!-- Arrowhead on XXXpoint 5250 8100 - 5250 6240-->
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="2850"
|
||||
cy="3900"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle68" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="3150"
|
||||
cy="3900"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle70" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="3450"
|
||||
cy="3900"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle72" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="1350"
|
||||
cy="5100"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle74" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="1650"
|
||||
cy="5100"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle76" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="1950"
|
||||
cy="5100"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle78" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="4350"
|
||||
cy="5100"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle80" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="4650"
|
||||
cy="5100"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle82" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="4950"
|
||||
cy="5100"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle84" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="750"
|
||||
y="3450"
|
||||
width="1800"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
|
||||
id="rect86" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="300"
|
||||
y="6600"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect88" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="3750"
|
||||
y="3450"
|
||||
width="1800"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
|
||||
id="rect90" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="4500"
|
||||
y="5400"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect92" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="3300"
|
||||
y="6600"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect94" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="2250"
|
||||
y="1650"
|
||||
width="1800"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
|
||||
id="rect96" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="0"
|
||||
y="9300"
|
||||
width="2100"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
|
||||
id="rect98" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="1350"
|
||||
y="8100"
|
||||
width="2100"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
|
||||
id="rect100" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="3000"
|
||||
y="9300"
|
||||
width="2100"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
|
||||
id="rect102" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="4350"
|
||||
y="8100"
|
||||
width="2100"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
|
||||
id="rect104" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="1500"
|
||||
y="5400"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect106" />
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="6450"
|
||||
y="300"
|
||||
fill="#000000"
|
||||
font-family="Helvetica"
|
||||
font-style="normal"
|
||||
font-weight="normal"
|
||||
font-size="192"
|
||||
text-anchor="end"
|
||||
id="text108">rcu_bh</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="3150"
|
||||
y="1950"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text110">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="3150"
|
||||
y="2250"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text112">rcu_node</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1650"
|
||||
y="3750"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text114">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1650"
|
||||
y="4050"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text116">rcu_node</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4650"
|
||||
y="4050"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text118">rcu_node</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4650"
|
||||
y="3750"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text120">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="2250"
|
||||
y="5700"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text122">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="2250"
|
||||
y="6000"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text124">rcu_data</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1050"
|
||||
y="6900"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text126">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1050"
|
||||
y="7200"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text128">rcu_data</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="5250"
|
||||
y="5700"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text130">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="5250"
|
||||
y="6000"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text132">rcu_data</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4050"
|
||||
y="6900"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text134">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4050"
|
||||
y="7200"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text136">rcu_data</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="450"
|
||||
y="1350"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="start"
|
||||
id="text138">struct rcu_state</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1050"
|
||||
y="9600"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text140">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1050"
|
||||
y="9900"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text142">rcu_dynticks</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4050"
|
||||
y="9600"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text144">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4050"
|
||||
y="9900"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text146">rcu_dynticks</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="2400"
|
||||
y="8400"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text148">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="2400"
|
||||
y="8700"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text150">rcu_dynticks</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="5400"
|
||||
y="8400"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text152">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="5400"
|
||||
y="8700"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text154">rcu_dynticks</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="6000"
|
||||
y="750"
|
||||
fill="#000000"
|
||||
font-family="Helvetica"
|
||||
font-style="normal"
|
||||
font-weight="normal"
|
||||
font-size="192"
|
||||
text-anchor="end"
|
||||
id="text156">rcu_sched</text>
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="5250,5400 5250,4414 "
|
||||
style="stroke:#00d1d1;stroke-width:30.00455750000000066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline158" />
|
||||
<!-- Arrowhead on XXXpoint 5250 5400 - 5250 4290-->
|
||||
</g>
|
||||
</svg>
|
Before Width: | Height: | Size: 19 KiB |
@@ -1,741 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!-- Creator: fig2dev Version 3.2 Patchlevel 5e -->
|
||||
|
||||
<!-- CreationDate: Wed Dec 9 17:32:59 2015 -->
|
||||
|
||||
<!-- Magnification: 2.000 -->
|
||||
|
||||
<svg
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:cc="http://creativecommons.org/ns#"
|
||||
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
xmlns:svg="http://www.w3.org/2000/svg"
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
||||
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
||||
width="6.1in"
|
||||
height="8.9in"
|
||||
viewBox="-44 -44 7288 10738"
|
||||
id="svg2"
|
||||
version="1.1"
|
||||
inkscape:version="0.48.4 r9939"
|
||||
sodipodi:docname="BigTreePreemptRCUBHdyntick.fig">
|
||||
<metadata
|
||||
id="metadata182">
|
||||
<rdf:RDF>
|
||||
<cc:Work
|
||||
rdf:about="">
|
||||
<dc:format>image/svg+xml</dc:format>
|
||||
<dc:type
|
||||
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
||||
<dc:title></dc:title>
|
||||
</cc:Work>
|
||||
</rdf:RDF>
|
||||
</metadata>
|
||||
<defs
|
||||
id="defs180">
|
||||
<marker
|
||||
inkscape:stockid="Arrow1Mend"
|
||||
orient="auto"
|
||||
refY="0.0"
|
||||
refX="0.0"
|
||||
id="Arrow1Mend"
|
||||
style="overflow:visible;">
|
||||
<path
|
||||
id="path3940"
|
||||
d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
|
||||
style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;"
|
||||
transform="scale(0.4) rotate(180) translate(10,0)" />
|
||||
</marker>
|
||||
</defs>
|
||||
<sodipodi:namedview
|
||||
pagecolor="#ffffff"
|
||||
bordercolor="#666666"
|
||||
borderopacity="1"
|
||||
objecttolerance="10"
|
||||
gridtolerance="10"
|
||||
guidetolerance="10"
|
||||
inkscape:pageopacity="0"
|
||||
inkscape:pageshadow="2"
|
||||
inkscape:window-width="874"
|
||||
inkscape:window-height="1148"
|
||||
id="namedview178"
|
||||
showgrid="false"
|
||||
inkscape:zoom="1.2097379"
|
||||
inkscape:cx="274.5"
|
||||
inkscape:cy="400.49997"
|
||||
inkscape:window-x="946"
|
||||
inkscape:window-y="24"
|
||||
inkscape:window-maximized="0"
|
||||
inkscape:current-layer="g4" />
|
||||
<g
|
||||
style="stroke-width:.025in; fill:none"
|
||||
id="g4">
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="900"
|
||||
y="0"
|
||||
width="6300"
|
||||
height="7350"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
|
||||
id="rect6" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="1200"
|
||||
y="600"
|
||||
width="5700"
|
||||
height="3750"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
|
||||
id="rect8" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="5400"
|
||||
y="4950"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect10" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="450"
|
||||
y="450"
|
||||
width="6300"
|
||||
height="7350"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
|
||||
id="rect12" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="750"
|
||||
y="1050"
|
||||
width="5700"
|
||||
height="3750"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
|
||||
id="rect14" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="4950"
|
||||
y="5400"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect16" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="5250,8550 5688,6362 "
|
||||
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
|
||||
id="polyline18" />
|
||||
<!-- Arrowhead on XXXpoint 5250 8550 - 5710 6240-->
|
||||
<polyline
|
||||
points="5714 6518 5704 6272 5598 6494 "
|
||||
style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
|
||||
id="polyline20" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="4050,9750 4486,7712 "
|
||||
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
|
||||
id="polyline22" />
|
||||
<!-- Arrowhead on XXXpoint 4050 9750 - 4512 7590-->
|
||||
<polyline
|
||||
points="4514 7868 4506 7622 4396 7844 "
|
||||
style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
|
||||
id="polyline24" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="1040,9750 1476,7712 "
|
||||
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
|
||||
id="polyline26" />
|
||||
<!-- Arrowhead on XXXpoint 1040 9750 - 1502 7590-->
|
||||
<polyline
|
||||
points="1504 7868 1496 7622 1386 7844 "
|
||||
style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
|
||||
id="polyline28" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="2240,8550 2676,6512 "
|
||||
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
|
||||
id="polyline30" />
|
||||
<!-- Arrowhead on XXXpoint 2240 8550 - 2702 6390-->
|
||||
<polyline
|
||||
points="2704 6668 2696 6422 2586 6644 "
|
||||
style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
|
||||
id="polyline32" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="4050,9750 5682,6360 "
|
||||
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
|
||||
id="polyline34" />
|
||||
<!-- Arrowhead on XXXpoint 4050 9750 - 5736 6246-->
|
||||
<polyline
|
||||
points="5672 6518 5722 6276 5562 6466 "
|
||||
style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
|
||||
id="polyline36" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="1010,9750 2642,6360 "
|
||||
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
|
||||
id="polyline38" />
|
||||
<!-- Arrowhead on XXXpoint 1010 9750 - 2696 6246-->
|
||||
<polyline
|
||||
points="2632 6518 2682 6276 2522 6466 "
|
||||
style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
|
||||
id="polyline40" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="0"
|
||||
y="900"
|
||||
width="6300"
|
||||
height="7350"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
|
||||
id="rect42" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="300"
|
||||
y="1500"
|
||||
width="5700"
|
||||
height="3750"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
|
||||
id="rect44" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="1350,3900 2350,3040 "
|
||||
style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline46" />
|
||||
<!-- Arrowhead on XXXpoint 1350 3900 - 2444 2960-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="4950,3900 3948,3040 "
|
||||
style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline50" />
|
||||
<!-- Arrowhead on XXXpoint 4950 3900 - 3854 2960-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="4050,7050 4050,4864 "
|
||||
style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline54" />
|
||||
<!-- Arrowhead on XXXpoint 4050 7050 - 4050 4740-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="1050,7050 1050,4864 "
|
||||
style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline58" />
|
||||
<!-- Arrowhead on XXXpoint 1050 7050 - 1050 4740-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="2250,5850 2250,4864 "
|
||||
style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline62" />
|
||||
<!-- Arrowhead on XXXpoint 2250 5850 - 2250 4740-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="2250,8550 2250,6814 "
|
||||
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
|
||||
id="polyline66" />
|
||||
<!-- Arrowhead on XXXpoint 2250 8550 - 2250 6690-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="1050,9750 1050,8014 "
|
||||
style="stroke:#00ff00;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline70" />
|
||||
<!-- Arrowhead on XXXpoint 1050 9750 - 1050 7890-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="4050,9750 4050,8014 "
|
||||
style="stroke:#00ff00;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline74" />
|
||||
<!-- Arrowhead on XXXpoint 4050 9750 - 4050 7890-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="5250,8550 5250,6814 "
|
||||
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
|
||||
id="polyline78" />
|
||||
<!-- Arrowhead on XXXpoint 5250 8550 - 5250 6690-->
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="2850"
|
||||
cy="4350"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle82" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="3150"
|
||||
cy="4350"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle84" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="3450"
|
||||
cy="4350"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle86" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="1350"
|
||||
cy="5550"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle88" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="1650"
|
||||
cy="5550"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle90" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="1950"
|
||||
cy="5550"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle92" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="4350"
|
||||
cy="5550"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle94" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="4650"
|
||||
cy="5550"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle96" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="4950"
|
||||
cy="5550"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle98" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="750"
|
||||
y="3900"
|
||||
width="1800"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
|
||||
id="rect100" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="300"
|
||||
y="7050"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect102" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="3750"
|
||||
y="3900"
|
||||
width="1800"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
|
||||
id="rect104" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="4500"
|
||||
y="5850"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect106" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="3300"
|
||||
y="7050"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect108" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="2250"
|
||||
y="2100"
|
||||
width="1800"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
|
||||
id="rect110" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="0"
|
||||
y="9750"
|
||||
width="2100"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
|
||||
id="rect112" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="1350"
|
||||
y="8550"
|
||||
width="2100"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
|
||||
id="rect114" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="3000"
|
||||
y="9750"
|
||||
width="2100"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
|
||||
id="rect116" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="4350"
|
||||
y="8550"
|
||||
width="2100"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
|
||||
id="rect118" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="1500"
|
||||
y="5850"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect120" />
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="6450"
|
||||
y="750"
|
||||
fill="#000000"
|
||||
font-family="Helvetica"
|
||||
font-style="normal"
|
||||
font-weight="normal"
|
||||
font-size="192"
|
||||
text-anchor="end"
|
||||
id="text122">rcu_bh</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="3150"
|
||||
y="2400"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text124">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="3150"
|
||||
y="2700"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text126">rcu_node</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1650"
|
||||
y="4200"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text128">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1650"
|
||||
y="4500"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text130">rcu_node</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4650"
|
||||
y="4500"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text132">rcu_node</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4650"
|
||||
y="4200"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text134">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="2250"
|
||||
y="6150"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text136">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="2250"
|
||||
y="6450"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text138">rcu_data</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1050"
|
||||
y="7350"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text140">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1050"
|
||||
y="7650"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text142">rcu_data</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="5250"
|
||||
y="6150"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text144">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="5250"
|
||||
y="6450"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text146">rcu_data</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4050"
|
||||
y="7350"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text148">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4050"
|
||||
y="7650"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text150">rcu_data</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="450"
|
||||
y="1800"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="start"
|
||||
id="text152">struct rcu_state</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1050"
|
||||
y="10050"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text154">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1050"
|
||||
y="10350"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text156">rcu_dynticks</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4050"
|
||||
y="10050"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text158">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4050"
|
||||
y="10350"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text160">rcu_dynticks</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="2400"
|
||||
y="8850"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text162">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="2400"
|
||||
y="9150"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text164">rcu_dynticks</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="5400"
|
||||
y="8850"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text166">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="5400"
|
||||
y="9150"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text168">rcu_dynticks</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="6900"
|
||||
y="300"
|
||||
fill="#000000"
|
||||
font-family="Helvetica"
|
||||
font-style="normal"
|
||||
font-weight="normal"
|
||||
font-size="192"
|
||||
text-anchor="end"
|
||||
id="text170">rcu_preempt</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="6000"
|
||||
y="1200"
|
||||
fill="#000000"
|
||||
font-family="Helvetica"
|
||||
font-style="normal"
|
||||
font-weight="normal"
|
||||
font-size="192"
|
||||
text-anchor="end"
|
||||
id="text172">rcu_sched</text>
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="5250,5850 5250,4864 "
|
||||
style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline174" />
|
||||
<!-- Arrowhead on XXXpoint 5250 5850 - 5250 4740-->
|
||||
</g>
|
||||
</svg>
|
Before Width: | Height: | Size: 20 KiB |
Before Width: | Height: | Size: 24 KiB After Width: | Height: | Size: 22 KiB |
@@ -23,8 +23,6 @@ to each other.
|
||||
The <tt>rcu_segcblist</tt> Structure</a>
|
||||
<li> <a href="#The rcu_data Structure">
|
||||
The <tt>rcu_data</tt> Structure</a>
|
||||
<li> <a href="#The rcu_dynticks Structure">
|
||||
The <tt>rcu_dynticks</tt> Structure</a>
|
||||
<li> <a href="#The rcu_head Structure">
|
||||
The <tt>rcu_head</tt> Structure</a>
|
||||
<li> <a href="#RCU-Specific Fields in the task_struct Structure">
|
||||
@@ -127,9 +125,11 @@ CPUs, RCU would configure the <tt>rcu_node</tt> tree as follows:
|
||||
</p><p>RCU currently permits up to a four-level tree, which on a 64-bit system
|
||||
accommodates up to 4,194,304 CPUs, though only a mere 524,288 CPUs for
|
||||
32-bit systems.
|
||||
On the other hand, you can set <tt>CONFIG_RCU_FANOUT</tt> to be
|
||||
as small as 2 if you wish, which would permit only 16 CPUs, which
|
||||
is useful for testing.
|
||||
On the other hand, you can set both <tt>CONFIG_RCU_FANOUT</tt> and
|
||||
<tt>CONFIG_RCU_FANOUT_LEAF</tt> to be as small as 2, which would result
|
||||
in a 16-CPU test using a 4-level tree.
|
||||
This can be useful for testing large-system capabilities on small test
|
||||
machines.
|
||||
|
||||
</p><p>This multi-level combining tree allows us to get most of the
|
||||
performance and scalability
|
||||
@@ -154,44 +154,9 @@ on that root <tt>rcu_node</tt> structure remains acceptably low.
|
||||
keeping lock contention under control at all tree levels regardless
|
||||
of the level of loading on the system.
|
||||
|
||||
</p><p>The Linux kernel actually supports multiple flavors of RCU
|
||||
running concurrently, so RCU builds separate data structures for each
|
||||
flavor.
|
||||
For example, for <tt>CONFIG_TREE_RCU=y</tt> kernels, RCU provides
|
||||
rcu_sched and rcu_bh, as shown below:
|
||||
|
||||
</p><p><img src="BigTreeClassicRCUBH.svg" alt="BigTreeClassicRCUBH.svg" width="33%">
|
||||
|
||||
</p><p>Energy efficiency is increasingly important, and for that
|
||||
reason the Linux kernel provides <tt>CONFIG_NO_HZ_IDLE</tt>, which
|
||||
turns off the scheduling-clock interrupts on idle CPUs, which in
|
||||
turn allows those CPUs to attain deeper sleep states and to consume
|
||||
less energy.
|
||||
CPUs whose scheduling-clock interrupts have been turned off are
|
||||
said to be in <i>dyntick-idle mode</i>.
|
||||
RCU must handle dyntick-idle CPUs specially
|
||||
because RCU would otherwise wake up each CPU on every grace period,
|
||||
which would defeat the whole purpose of <tt>CONFIG_NO_HZ_IDLE</tt>.
|
||||
RCU uses the <tt>rcu_dynticks</tt> structure to track
|
||||
which CPUs are in dyntick idle mode, as shown below:
|
||||
|
||||
</p><p><img src="BigTreeClassicRCUBHdyntick.svg" alt="BigTreeClassicRCUBHdyntick.svg" width="33%">
|
||||
|
||||
</p><p>However, if a CPU is in dyntick-idle mode, it is in that mode
|
||||
for all flavors of RCU.
|
||||
Therefore, a single <tt>rcu_dynticks</tt> structure is allocated per
|
||||
CPU, and all of a given CPU's <tt>rcu_data</tt> structures share
|
||||
that <tt>rcu_dynticks</tt>, as shown in the figure.
|
||||
|
||||
</p><p>Kernels built with <tt>CONFIG_PREEMPT_RCU</tt> support
|
||||
rcu_preempt in addition to rcu_sched and rcu_bh, as shown below:
|
||||
|
||||
</p><p><img src="BigTreePreemptRCUBHdyntick.svg" alt="BigTreePreemptRCUBHdyntick.svg" width="35%">
|
||||
|
||||
</p><p>RCU updaters wait for normal grace periods by registering
|
||||
RCU callbacks, either directly via <tt>call_rcu()</tt> and
|
||||
friends (namely <tt>call_rcu_bh()</tt> and <tt>call_rcu_sched()</tt>),
|
||||
there being a separate interface per flavor of RCU)
|
||||
or indirectly via <tt>synchronize_rcu()</tt> and friends.
|
||||
RCU callbacks are represented by <tt>rcu_head</tt> structures,
|
||||
which are queued on <tt>rcu_data</tt> structures while they are
|
||||
@@ -214,9 +179,6 @@ its own synchronization:
|
||||
<li> Each <tt>rcu_node</tt> structure has a spinlock.
|
||||
<li> The fields in <tt>rcu_data</tt> are private to the corresponding
|
||||
CPU, although a few can be read and written by other CPUs.
|
||||
<li> Similarly, the fields in <tt>rcu_dynticks</tt> are private
|
||||
to the corresponding CPU, although a few can be read by
|
||||
other CPUs.
|
||||
</ol>
|
||||
|
||||
<p>It is important to note that different data structures can have
|
||||
@@ -272,11 +234,6 @@ follows:
|
||||
access to this information from the corresponding CPU.
|
||||
Finally, this structure records past dyntick-idle state
|
||||
for the corresponding CPU and also tracks statistics.
|
||||
<li> <tt>rcu_dynticks</tt>:
|
||||
This per-CPU structure tracks the current dyntick-idle
|
||||
state for the corresponding CPU.
|
||||
Unlike the other three structures, the <tt>rcu_dynticks</tt>
|
||||
structure is not replicated per RCU flavor.
|
||||
<li> <tt>rcu_head</tt>:
|
||||
This structure represents RCU callbacks, and is the
|
||||
only structure allocated and managed by RCU users.
|
||||
@@ -287,14 +244,14 @@ follows:
|
||||
<p>If all you wanted from this article was a general notion of how
|
||||
RCU's data structures are related, you are done.
|
||||
Otherwise, each of the following sections gives more details on
|
||||
the <tt>rcu_state</tt>, <tt>rcu_node</tt>, <tt>rcu_data</tt>,
|
||||
and <tt>rcu_dynticks</tt> data structures.
|
||||
the <tt>rcu_state</tt>, <tt>rcu_node</tt> and <tt>rcu_data</tt> data
|
||||
structures.
|
||||
|
||||
<h3><a name="The rcu_state Structure">
|
||||
The <tt>rcu_state</tt> Structure</a></h3>
|
||||
|
||||
<p>The <tt>rcu_state</tt> structure is the base structure that
|
||||
represents a flavor of RCU.
|
||||
represents the state of RCU in the system.
|
||||
This structure forms the interconnection between the
|
||||
<tt>rcu_node</tt> and <tt>rcu_data</tt> structures,
|
||||
tracks grace periods, contains the lock used to
|
||||
@@ -389,7 +346,7 @@ sequence number.
|
||||
The bottom two bits are the state of the current grace period,
|
||||
which can be zero for not yet started or one for in progress.
|
||||
In other words, if the bottom two bits of <tt>->gp_seq</tt> are
|
||||
zero, the corresponding flavor of RCU is idle.
|
||||
zero, then RCU is idle.
|
||||
Any other value in the bottom two bits indicates that something is broken.
|
||||
This field is protected by the root <tt>rcu_node</tt> structure's
|
||||
<tt>->lock</tt> field.
|
||||
@@ -419,10 +376,10 @@ as follows:
|
||||
grace period in jiffies.
|
||||
It is protected by the root <tt>rcu_node</tt>'s <tt>->lock</tt>.
|
||||
|
||||
<p>The <tt>->name</tt> field points to the name of the RCU flavor
|
||||
(for example, “rcu_sched”), and is constant.
|
||||
The <tt>->abbr</tt> field contains a one-character abbreviation,
|
||||
for example, “s” for RCU-sched.
|
||||
<p>The <tt>->name</tt> and <tt>->abbr</tt> fields distinguish
|
||||
between preemptible RCU (“rcu_preempt” and “p”)
|
||||
and non-preemptible RCU (“rcu_sched” and “s”).
|
||||
These fields are used for diagnostic and tracing purposes.
|
||||
|
||||
<h3><a name="The rcu_node Structure">
|
||||
The <tt>rcu_node</tt> Structure</a></h3>
|
||||
@@ -971,25 +928,31 @@ this <tt>rcu_segcblist</tt> structure, <i>not</i> the <tt>->head</tt>
|
||||
pointer.
|
||||
The reason for this is that all the ready-to-invoke callbacks
|
||||
(that is, those in the <tt>RCU_DONE_TAIL</tt> segment) are extracted
|
||||
all at once at callback-invocation time.
|
||||
all at once at callback-invocation time (<tt>rcu_do_batch</tt>), due
|
||||
to which <tt>->head</tt> may be set to NULL if there are no not-done
|
||||
callbacks remaining in the <tt>rcu_segcblist</tt>.
|
||||
If callback invocation must be postponed, for example, because a
|
||||
high-priority process just woke up on this CPU, then the remaining
|
||||
callbacks are placed back on the <tt>RCU_DONE_TAIL</tt> segment.
|
||||
Either way, the <tt>->len</tt> and <tt>->len_lazy</tt> counts
|
||||
are adjusted after the corresponding callbacks have been invoked, and so
|
||||
again it is the <tt>->len</tt> count that accurately reflects whether
|
||||
or not there are callbacks associated with this <tt>rcu_segcblist</tt>
|
||||
structure.
|
||||
callbacks are placed back on the <tt>RCU_DONE_TAIL</tt> segment and
|
||||
<tt>->head</tt> once again points to the start of the segment.
|
||||
In short, the head field can briefly be <tt>NULL</tt> even though the
|
||||
CPU has callbacks present the entire time.
|
||||
Therefore, it is not appropriate to test the <tt>->head</tt> pointer
|
||||
for <tt>NULL</tt>.
|
||||
|
||||
<p>In contrast, the <tt>->len</tt> and <tt>->len_lazy</tt> counts
|
||||
are adjusted only after the corresponding callbacks have been invoked.
|
||||
This means that the <tt>->len</tt> count is zero only if
|
||||
the <tt>rcu_segcblist</tt> structure really is devoid of callbacks.
|
||||
Of course, off-CPU sampling of the <tt>->len</tt> count requires
|
||||
the use of appropriate synchronization, for example, memory barriers.
|
||||
careful use of appropriate synchronization, for example, memory barriers.
|
||||
This synchronization can be a bit subtle, particularly in the case
|
||||
of <tt>rcu_barrier()</tt>.
|
||||
|
||||
<h3><a name="The rcu_data Structure">
|
||||
The <tt>rcu_data</tt> Structure</a></h3>
|
||||
|
||||
<p>The <tt>rcu_data</tt> maintains the per-CPU state for the
|
||||
corresponding flavor of RCU.
|
||||
<p>The <tt>rcu_data</tt> maintains the per-CPU state for the RCU subsystem.
|
||||
The fields in this structure may be accessed only from the corresponding
|
||||
CPU (and from tracing) unless otherwise stated.
|
||||
This structure is the
|
||||
@@ -1015,30 +978,19 @@ as follows:
|
||||
|
||||
<pre>
|
||||
1 int cpu;
|
||||
2 struct rcu_state *rsp;
|
||||
3 struct rcu_node *mynode;
|
||||
4 struct rcu_dynticks *dynticks;
|
||||
5 unsigned long grpmask;
|
||||
6 bool beenonline;
|
||||
2 struct rcu_node *mynode;
|
||||
3 unsigned long grpmask;
|
||||
4 bool beenonline;
|
||||
</pre>
|
||||
|
||||
<p>The <tt>->cpu</tt> field contains the number of the
|
||||
corresponding CPU, the <tt>->rsp</tt> pointer references
|
||||
the corresponding <tt>rcu_state</tt> structure (and is most frequently
|
||||
used to locate the name of the corresponding flavor of RCU for tracing),
|
||||
and the <tt>->mynode</tt> field references the corresponding
|
||||
<tt>rcu_node</tt> structure.
|
||||
corresponding CPU and the <tt>->mynode</tt> field references the
|
||||
corresponding <tt>rcu_node</tt> structure.
|
||||
The <tt>->mynode</tt> is used to propagate quiescent states
|
||||
up the combining tree.
|
||||
<p>The <tt>->dynticks</tt> pointer references the
|
||||
<tt>rcu_dynticks</tt> structure corresponding to this
|
||||
CPU.
|
||||
Recall that a single per-CPU instance of the <tt>rcu_dynticks</tt>
|
||||
structure is shared among all flavors of RCU.
|
||||
These first four fields are constant and therefore require not
|
||||
synchronization.
|
||||
These two fields are constant and therefore do not require synchronization.
|
||||
|
||||
</p><p>The <tt>->grpmask</tt> field indicates the bit in
|
||||
<p>The <tt>->grpmask</tt> field indicates the bit in
|
||||
the <tt>->mynode->qsmask</tt> corresponding to this
|
||||
<tt>rcu_data</tt> structure, and is also used when propagating
|
||||
quiescent states.
|
||||
@@ -1057,12 +1009,12 @@ as follows:
|
||||
3 bool cpu_no_qs;
|
||||
4 bool core_needs_qs;
|
||||
5 bool gpwrap;
|
||||
6 unsigned long rcu_qs_ctr_snap;
|
||||
</pre>
|
||||
|
||||
<p>The <tt>->gp_seq</tt> and <tt>->gp_seq_needed</tt>
|
||||
fields are the counterparts of the fields of the same name
|
||||
in the <tt>rcu_state</tt> and <tt>rcu_node</tt> structures.
|
||||
<p>The <tt>->gp_seq</tt> field is the counterpart of the field of the same
|
||||
name in the <tt>rcu_state</tt> and <tt>rcu_node</tt> structures. The
|
||||
<tt>->gp_seq_needed</tt> field is the counterpart of the field of the same
|
||||
name in the <tt>rcu_node</tt> structure.
|
||||
They may each lag up to one behind their <tt>rcu_node</tt>
|
||||
counterparts, but in <tt>CONFIG_NO_HZ_IDLE</tt> and
|
||||
<tt>CONFIG_NO_HZ_FULL</tt> kernels can lag
|
||||
@@ -1103,10 +1055,6 @@ CPU has remained idle for so long that the
|
||||
<tt>gp_seq</tt> counter is in danger of overflow, which
|
||||
will cause the CPU to disregard the values of its counters on
|
||||
its next exit from idle.
|
||||
Finally, the <tt>rcu_qs_ctr_snap</tt> field is used to detect
|
||||
cases where a given operation has resulted in a quiescent state
|
||||
for all flavors of RCU, for example, <tt>cond_resched()</tt>
|
||||
when RCU has indicated a need for quiescent states.
|
||||
|
||||
<h5>RCU Callback Handling</h5>
|
||||
|
||||
@@ -1179,26 +1127,22 @@ Finally, the <tt>->dynticks_fqs</tt> field is used to
|
||||
count the number of times this CPU is determined to be in
|
||||
dyntick-idle state, and is used for tracing and debugging purposes.
|
||||
|
||||
<h3><a name="The rcu_dynticks Structure">
|
||||
The <tt>rcu_dynticks</tt> Structure</a></h3>
|
||||
|
||||
<p>The <tt>rcu_dynticks</tt> maintains the per-CPU dyntick-idle state
|
||||
for the corresponding CPU.
|
||||
Unlike the other structures, <tt>rcu_dynticks</tt> is not
|
||||
replicated over the different flavors of RCU.
|
||||
The fields in this structure may be accessed only from the corresponding
|
||||
CPU (and from tracing) unless otherwise stated.
|
||||
Its fields are as follows:
|
||||
<p>
|
||||
This portion of the rcu_data structure is declared as follows:
|
||||
|
||||
<pre>
|
||||
1 long dynticks_nesting;
|
||||
2 long dynticks_nmi_nesting;
|
||||
3 atomic_t dynticks;
|
||||
4 bool rcu_need_heavy_qs;
|
||||
5 unsigned long rcu_qs_ctr;
|
||||
6 bool rcu_urgent_qs;
|
||||
5 bool rcu_urgent_qs;
|
||||
</pre>
|
||||
|
||||
<p>These fields in the rcu_data structure maintain the per-CPU dyntick-idle
|
||||
state for the corresponding CPU.
|
||||
The fields may be accessed only from the corresponding CPU (and from tracing)
|
||||
unless otherwise stated.
|
||||
|
||||
<p>The <tt>->dynticks_nesting</tt> field counts the
|
||||
nesting depth of process execution, so that in normal circumstances
|
||||
this counter has value zero or one.
|
||||
@@ -1240,19 +1184,12 @@ it is willing to call for heavy-weight dyntick-counter operations.
|
||||
This flag is checked by RCU's context-switch and <tt>cond_resched()</tt>
|
||||
code, which provide a momentary idle sojourn in response.
|
||||
|
||||
</p><p>The <tt>->rcu_qs_ctr</tt> field is used to record
|
||||
quiescent states from <tt>cond_resched()</tt>.
|
||||
Because <tt>cond_resched()</tt> can execute quite frequently, this
|
||||
must be quite lightweight, as in a non-atomic increment of this
|
||||
per-CPU field.
|
||||
|
||||
</p><p>Finally, the <tt>->rcu_urgent_qs</tt> field is used to record
|
||||
the fact that the RCU core code would really like to see a quiescent
|
||||
state from the corresponding CPU, with the various other fields indicating
|
||||
just how badly RCU wants this quiescent state.
|
||||
This flag is checked by RCU's context-switch and <tt>cond_resched()</tt>
|
||||
code, which, if nothing else, non-atomically increment <tt>->rcu_qs_ctr</tt>
|
||||
in response.
|
||||
the fact that the RCU core code would really like to see a quiescent state from
|
||||
the corresponding CPU, with the various other fields indicating just how badly
|
||||
RCU wants this quiescent state.
|
||||
This flag is checked by RCU's context-switch path
|
||||
(<tt>rcu_note_context_switch()</tt>) and the <tt>cond_resched()</tt> code.
|
||||
|
||||
<table>
|
||||
<tr><th> </th></tr>
|
||||
@@ -1425,11 +1362,11 @@ the last part of the array, thus traversing only the leaf
|
||||
<h3><a name="Summary">
|
||||
Summary</a></h3>
|
||||
|
||||
So each flavor of RCU is represented by an <tt>rcu_state</tt> structure,
|
||||
So the state of RCU is represented by an <tt>rcu_state</tt> structure,
|
||||
which contains a combining tree of <tt>rcu_node</tt> and
|
||||
<tt>rcu_data</tt> structures.
|
||||
Finally, in <tt>CONFIG_NO_HZ_IDLE</tt> kernels, each CPU's dyntick-idle
|
||||
state is tracked by an <tt>rcu_dynticks</tt> structure.
|
||||
state is tracked by dynticks-related fields in the <tt>rcu_data</tt> structure.
|
||||
|
||||
If you made it this far, you are well prepared to read the code
|
||||
walkthroughs in the other articles in this series.
|
||||
|
Before Width: | Height: | Size: 24 KiB After Width: | Height: | Size: 20 KiB |
@@ -328,13 +328,13 @@
|
||||
inkscape:window-height="1148"
|
||||
id="namedview90"
|
||||
showgrid="true"
|
||||
inkscape:zoom="0.80021373"
|
||||
inkscape:cx="462.49289"
|
||||
inkscape:cy="473.6718"
|
||||
inkscape:zoom="0.69092787"
|
||||
inkscape:cx="476.34085"
|
||||
inkscape:cy="712.80957"
|
||||
inkscape:window-x="770"
|
||||
inkscape:window-y="24"
|
||||
inkscape:window-maximized="0"
|
||||
inkscape:current-layer="g4114-9-3-9"
|
||||
inkscape:current-layer="g4"
|
||||
inkscape:snap-grids="false"
|
||||
fit-margin-top="5"
|
||||
fit-margin-right="5"
|
||||
@@ -813,14 +813,18 @@
|
||||
<text
|
||||
sodipodi:linespacing="125%"
|
||||
id="text4110-5-7-6-2-4-0"
|
||||
y="841.88086"
|
||||
y="670.74316"
|
||||
x="1460.1007"
|
||||
style="font-size:267.24359131px;font-style:normal;font-weight:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
|
||||
xml:space="preserve"><tspan
|
||||
y="841.88086"
|
||||
y="670.74316"
|
||||
x="1460.1007"
|
||||
sodipodi:role="line"
|
||||
id="tspan4925-1-2-4-5">reched_cpu()</tspan></text>
|
||||
id="tspan4925-1-2-4-5">Request</tspan><tspan
|
||||
y="1004.7976"
|
||||
x="1460.1007"
|
||||
sodipodi:role="line"
|
||||
id="tspan3100">context switch</tspan></text>
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
||||
|
Before Width: | Height: | Size: 32 KiB After Width: | Height: | Size: 32 KiB |
@@ -72,10 +72,10 @@ will ignore it because idle and offline CPUs are already residing
|
||||
in quiescent states.
|
||||
Otherwise, the expedited grace period will use
|
||||
<tt>smp_call_function_single()</tt> to send the CPU an IPI, which
|
||||
is handled by <tt>sync_rcu_exp_handler()</tt>.
|
||||
is handled by <tt>rcu_exp_handler()</tt>.
|
||||
|
||||
<p>
|
||||
However, because this is preemptible RCU, <tt>sync_rcu_exp_handler()</tt>
|
||||
However, because this is preemptible RCU, <tt>rcu_exp_handler()</tt>
|
||||
can check to see if the CPU is currently running in an RCU read-side
|
||||
critical section.
|
||||
If not, the handler can immediately report a quiescent state.
|
||||
@@ -145,24 +145,23 @@ expedited grace period is shown in the following diagram:
|
||||
<p><img src="ExpSchedFlow.svg" alt="ExpSchedFlow.svg" width="55%">
|
||||
|
||||
<p>
|
||||
As with RCU-preempt's <tt>synchronize_rcu_expedited()</tt>,
|
||||
As with RCU-preempt, RCU-sched's
|
||||
<tt>synchronize_sched_expedited()</tt> ignores offline and
|
||||
idle CPUs, again because they are in remotely detectable
|
||||
quiescent states.
|
||||
However, the <tt>synchronize_rcu_expedited()</tt> handler
|
||||
is <tt>sync_sched_exp_handler()</tt>, and because the
|
||||
However, because the
|
||||
<tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt>
|
||||
leave no trace of their invocation, in general it is not possible to tell
|
||||
whether or not the current CPU is in an RCU read-side critical section.
|
||||
The best that <tt>sync_sched_exp_handler()</tt> can do is to check
|
||||
The best that RCU-sched's <tt>rcu_exp_handler()</tt> can do is to check
|
||||
for idle, on the off-chance that the CPU went idle while the IPI
|
||||
was in flight.
|
||||
If the CPU is idle, then <tt>sync_sched_exp_handler()</tt> reports
|
||||
If the CPU is idle, then <tt>rcu_exp_handler()</tt> reports
|
||||
the quiescent state.
|
||||
|
||||
<p>
|
||||
Otherwise, the handler invokes <tt>resched_cpu()</tt>, which forces
|
||||
a future context switch.
|
||||
<p> Otherwise, the handler forces a future context switch by setting the
|
||||
NEED_RESCHED flag in the current task's thread flags and in the CPU preempt
|
||||
counter.
|
||||
At the time of the context switch, the CPU reports the quiescent state.
|
||||
Should the CPU go offline first, it will report the quiescent state
|
||||
at that time.
|
||||
@@ -298,19 +297,18 @@ Instead, the task pushing the grace period forward will include the
|
||||
idle CPUs in the mask passed to <tt>rcu_report_exp_cpu_mult()</tt>.
|
||||
|
||||
<p>
|
||||
For RCU-sched, there is an additional check for idle in the IPI
|
||||
handler, <tt>sync_sched_exp_handler()</tt>.
|
||||
For RCU-sched, there is an additional check:
|
||||
If the IPI has interrupted the idle loop, then
|
||||
<tt>sync_sched_exp_handler()</tt> invokes <tt>rcu_report_exp_rdp()</tt>
|
||||
<tt>rcu_exp_handler()</tt> invokes <tt>rcu_report_exp_rdp()</tt>
|
||||
to report the corresponding quiescent state.
|
||||
|
||||
<p>
|
||||
For RCU-preempt, there is no specific check for idle in the
|
||||
IPI handler (<tt>sync_rcu_exp_handler()</tt>), but because
|
||||
IPI handler (<tt>rcu_exp_handler()</tt>), but because
|
||||
RCU read-side critical sections are not permitted within the
|
||||
idle loop, if <tt>sync_rcu_exp_handler()</tt> sees that the CPU is within
|
||||
idle loop, if <tt>rcu_exp_handler()</tt> sees that the CPU is within
|
||||
an RCU read-side critical section, the CPU cannot possibly be idle.
|
||||
Otherwise, <tt>sync_rcu_exp_handler()</tt> invokes
|
||||
Otherwise, <tt>rcu_exp_handler()</tt> invokes
|
||||
<tt>rcu_report_exp_rdp()</tt> to report the corresponding quiescent
|
||||
state, regardless of whether or not that quiescent state was due to
|
||||
the CPU being idle.
|
||||
@@ -625,6 +623,8 @@ checks, but only during the mid-boot dead zone.
|
||||
<p>
|
||||
With this refinement, synchronous grace periods can now be used from
|
||||
task context pretty much any time during the life of the kernel.
|
||||
That is, aside from some points in the suspend, hibernate, or shutdown
|
||||
code path.
|
||||
|
||||
<h3><a name="Summary">
|
||||
Summary</a></h3>
|
||||
|
@@ -77,7 +77,7 @@ The key point is that the lock-acquisition functions, including
|
||||
<tt>smp_mb__after_unlock_lock()</tt> immediately after successful
|
||||
acquisition of the lock.
|
||||
|
||||
<p>Therefore, for any given <tt>rcu_node</tt> struction, any access
|
||||
<p>Therefore, for any given <tt>rcu_node</tt> structure, any access
|
||||
happening before one of the above lock-release functions will be seen
|
||||
by all CPUs as happening before any access happening after a later
|
||||
one of the above lock-acquisition functions.
|
||||
@@ -485,13 +485,13 @@ section that the grace period must wait on.
|
||||
noted by <tt>rcu_node_context_switch()</tt> on the left.
|
||||
On the other hand, if the CPU takes a scheduler-clock interrupt
|
||||
while executing in usermode, a quiescent state will be noted by
|
||||
<tt>rcu_check_callbacks()</tt> on the right.
|
||||
<tt>rcu_sched_clock_irq()</tt> on the right.
|
||||
Either way, the passage through a quiescent state will be noted
|
||||
in a per-CPU variable.
|
||||
|
||||
<p>The next time an <tt>RCU_SOFTIRQ</tt> handler executes on
|
||||
this CPU (for example, after the next scheduler-clock
|
||||
interrupt), <tt>__rcu_process_callbacks()</tt> will invoke
|
||||
interrupt), <tt>rcu_core()</tt> will invoke
|
||||
<tt>rcu_check_quiescent_state()</tt>, which will notice the
|
||||
recorded quiescent state, and invoke
|
||||
<tt>rcu_report_qs_rdp()</tt>.
|
||||
@@ -651,7 +651,7 @@ to end.
|
||||
These callbacks are identified by <tt>rcu_advance_cbs()</tt>,
|
||||
which is usually invoked by <tt>__note_gp_changes()</tt>.
|
||||
As shown in the diagram below, this invocation can be triggered by
|
||||
the scheduling-clock interrupt (<tt>rcu_check_callbacks()</tt> on
|
||||
the scheduling-clock interrupt (<tt>rcu_sched_clock_irq()</tt> on
|
||||
the left) or by idle entry (<tt>rcu_cleanup_after_idle()</tt> on
|
||||
the right, but only for kernels built with
|
||||
<tt>CONFIG_RCU_FAST_NO_HZ=y</tt>).
|
||||
|
@@ -349,7 +349,7 @@
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
id="text202-7-5"
|
||||
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_check_callbacks()</text>
|
||||
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_sched_clock_irq()</text>
|
||||
<rect
|
||||
x="7069.6187"
|
||||
y="5087.4678"
|
||||
|
Before Width: | Height: | Size: 16 KiB After Width: | Height: | Size: 16 KiB |
@@ -3902,7 +3902,7 @@
|
||||
font-style="normal"
|
||||
y="-4418.6582"
|
||||
x="3745.7725"
|
||||
xml:space="preserve">rcu_check_callbacks()</text>
|
||||
xml:space="preserve">rcu_sched_clock_irq()</text>
|
||||
</g>
|
||||
<g
|
||||
transform="translate(-850.30204,55463.106)"
|
||||
@@ -3924,7 +3924,7 @@
|
||||
font-style="normal"
|
||||
y="-4418.6582"
|
||||
x="3745.7725"
|
||||
xml:space="preserve">rcu_process_callbacks()</text>
|
||||
xml:space="preserve">rcu_core()</text>
|
||||
<text
|
||||
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"
|
||||
id="text202-7-5-3-27-0"
|
||||
@@ -3933,7 +3933,7 @@
|
||||
font-style="normal"
|
||||
y="-4165.7954"
|
||||
x="3745.7725"
|
||||
xml:space="preserve">rcu_check_quiescent_state())</text>
|
||||
xml:space="preserve">rcu_check_quiescent_state()</text>
|
||||
<text
|
||||
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"
|
||||
id="text202-7-5-3-27-0-9"
|
||||
@@ -4968,7 +4968,7 @@
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
id="text202-7-5-19"
|
||||
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_check_callbacks()</text>
|
||||
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_sched_clock_irq()</text>
|
||||
<rect
|
||||
x="5314.2671"
|
||||
y="82817.688"
|
||||
|
Before Width: | Height: | Size: 209 KiB After Width: | Height: | Size: 209 KiB |
@@ -775,7 +775,7 @@
|
||||
font-style="normal"
|
||||
y="-4418.6582"
|
||||
x="3745.7725"
|
||||
xml:space="preserve">rcu_check_callbacks()</text>
|
||||
xml:space="preserve">rcu_sched_clock_irq()</text>
|
||||
</g>
|
||||
<g
|
||||
transform="translate(399.7744,828.86448)"
|
||||
@@ -797,7 +797,7 @@
|
||||
font-style="normal"
|
||||
y="-4418.6582"
|
||||
x="3745.7725"
|
||||
xml:space="preserve">rcu_process_callbacks()</text>
|
||||
xml:space="preserve">rcu_core()</text>
|
||||
<text
|
||||
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"
|
||||
id="text202-7-5-3-27-0"
|
||||
@@ -806,7 +806,7 @@
|
||||
font-style="normal"
|
||||
y="-4165.7954"
|
||||
x="3745.7725"
|
||||
xml:space="preserve">rcu_check_quiescent_state())</text>
|
||||
xml:space="preserve">rcu_check_quiescent_state()</text>
|
||||
<text
|
||||
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"
|
||||
id="text202-7-5-3-27-0-9"
|
||||
|
Before Width: | Height: | Size: 43 KiB After Width: | Height: | Size: 43 KiB |
@@ -900,8 +900,6 @@ Except where otherwise noted, these non-guarantees were premeditated.
|
||||
Grace Periods Don't Partition Read-Side Critical Sections</a>
|
||||
<li> <a href="#Read-Side Critical Sections Don't Partition Grace Periods">
|
||||
Read-Side Critical Sections Don't Partition Grace Periods</a>
|
||||
<li> <a href="#Disabling Preemption Does Not Block Grace Periods">
|
||||
Disabling Preemption Does Not Block Grace Periods</a>
|
||||
</ol>
|
||||
|
||||
<h3><a name="Readers Impose Minimal Ordering">Readers Impose Minimal Ordering</a></h3>
|
||||
@@ -1259,54 +1257,6 @@ of RCU grace periods.
|
||||
<tr><td> </td></tr>
|
||||
</table>
|
||||
|
||||
<h3><a name="Disabling Preemption Does Not Block Grace Periods">
|
||||
Disabling Preemption Does Not Block Grace Periods</a></h3>
|
||||
|
||||
<p>
|
||||
There was a time when disabling preemption on any given CPU would block
|
||||
subsequent grace periods.
|
||||
However, this was an accident of implementation and is not a requirement.
|
||||
And in the current Linux-kernel implementation, disabling preemption
|
||||
on a given CPU in fact does not block grace periods, as Oleg Nesterov
|
||||
<a href="https://lkml.kernel.org/g/20150614193825.GA19582@redhat.com">demonstrated</a>.
|
||||
|
||||
<p>
|
||||
If you need a preempt-disable region to block grace periods, you need to add
|
||||
<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>, for example
|
||||
as follows:
|
||||
|
||||
<blockquote>
|
||||
<pre>
|
||||
1 preempt_disable();
|
||||
2 rcu_read_lock();
|
||||
3 do_something();
|
||||
4 rcu_read_unlock();
|
||||
5 preempt_enable();
|
||||
6
|
||||
7 /* Spinlocks implicitly disable preemption. */
|
||||
8 spin_lock(&mylock);
|
||||
9 rcu_read_lock();
|
||||
10 do_something();
|
||||
11 rcu_read_unlock();
|
||||
12 spin_unlock(&mylock);
|
||||
</pre>
|
||||
</blockquote>
|
||||
|
||||
<p>
|
||||
In theory, you could enter the RCU read-side critical section first,
|
||||
but it is more efficient to keep the entire RCU read-side critical
|
||||
section contained in the preempt-disable region as shown above.
|
||||
Of course, RCU read-side critical sections that extend outside of
|
||||
preempt-disable regions will work correctly, but such critical sections
|
||||
can be preempted, which forces <tt>rcu_read_unlock()</tt> to do
|
||||
more work.
|
||||
And no, this is <i>not</i> an invitation to enclose all of your RCU
|
||||
read-side critical sections within preempt-disable regions, because
|
||||
doing so would degrade real-time response.
|
||||
|
||||
<p>
|
||||
This non-requirement appeared with preemptible RCU.
|
||||
|
||||
<h2><a name="Parallelism Facts of Life">Parallelism Facts of Life</a></h2>
|
||||
|
||||
<p>
|
||||
@@ -1381,6 +1331,7 @@ Classes of quality-of-implementation requirements are as follows:
|
||||
<ol>
|
||||
<li> <a href="#Specialization">Specialization</a>
|
||||
<li> <a href="#Performance and Scalability">Performance and Scalability</a>
|
||||
<li> <a href="#Forward Progress">Forward Progress</a>
|
||||
<li> <a href="#Composability">Composability</a>
|
||||
<li> <a href="#Corner Cases">Corner Cases</a>
|
||||
</ol>
|
||||
@@ -1645,7 +1596,7 @@ used in place of <tt>synchronize_rcu()</tt> as follows:
|
||||
16 struct foo *p;
|
||||
17
|
||||
18 spin_lock(&gp_lock);
|
||||
19 p = rcu_dereference(gp);
|
||||
19 p = rcu_access_pointer(gp);
|
||||
20 if (!p) {
|
||||
21 spin_unlock(&gp_lock);
|
||||
22 return false;
|
||||
@@ -1822,6 +1773,106 @@ so it is too early to tell whether they will stand the test of time.
|
||||
RCU thus provides a range of tools to allow updaters to strike the
|
||||
required tradeoff between latency, flexibility and CPU overhead.
|
||||
|
||||
<h3><a name="Forward Progress">Forward Progress</a></h3>
|
||||
|
||||
<p>
|
||||
In theory, delaying grace-period completion and callback invocation
|
||||
is harmless.
|
||||
In practice, not only are memory sizes finite but also callbacks sometimes
|
||||
do wakeups, and sufficiently deferred wakeups can be difficult
|
||||
to distinguish from system hangs.
|
||||
Therefore, RCU must provide a number of mechanisms to promote forward
|
||||
progress.
|
||||
|
||||
<p>
|
||||
These mechanisms are not foolproof, nor can they be.
|
||||
For one simple example, an infinite loop in an RCU read-side critical
|
||||
section must by definition prevent later grace periods from ever completing.
|
||||
For a more involved example, consider a 64-CPU system built with
|
||||
<tt>CONFIG_RCU_NOCB_CPU=y</tt> and booted with <tt>rcu_nocbs=1-63</tt>,
|
||||
where CPUs 1 through 63 spin in tight loops that invoke
|
||||
<tt>call_rcu()</tt>.
|
||||
Even if these tight loops also contain calls to <tt>cond_resched()</tt>
|
||||
(thus allowing grace periods to complete), CPU 0 simply will
|
||||
not be able to invoke callbacks as fast as the other 63 CPUs can
|
||||
register them, at least not until the system runs out of memory.
|
||||
In both of these examples, the Spiderman principle applies: With great
|
||||
power comes great responsibility.
|
||||
However, short of this level of abuse, RCU is required to
|
||||
ensure timely completion of grace periods and timely invocation of
|
||||
callbacks.
|
||||
|
||||
<p>
|
||||
RCU takes the following steps to encourage timely completion of
|
||||
grace periods:
|
||||
|
||||
<ol>
|
||||
<li> If a grace period fails to complete within 100 milliseconds,
|
||||
RCU causes future invocations of <tt>cond_resched()</tt> on
|
||||
the holdout CPUs to provide an RCU quiescent state.
|
||||
RCU also causes those CPUs' <tt>need_resched()</tt> invocations
|
||||
to return <tt>true</tt>, but only after the corresponding CPU's
|
||||
next scheduling-clock interrupt.
|
||||
<li> CPUs mentioned in the <tt>nohz_full</tt> kernel boot parameter
|
||||
can run indefinitely in the kernel without scheduling-clock
|
||||
interrupts, which defeats the above <tt>need_resched()</tt>
|
||||
stratagem.
|
||||
RCU will therefore invoke <tt>resched_cpu()</tt> on any
|
||||
<tt>nohz_full</tt> CPUs still holding out after
|
||||
109 milliseconds.
|
||||
<li> In kernels built with <tt>CONFIG_RCU_BOOST=y</tt>, if a given
|
||||
task that has been preempted within an RCU read-side critical
|
||||
section is holding out for more than 500 milliseconds,
|
||||
RCU will resort to priority boosting.
|
||||
<li> If a CPU is still holding out 10 seconds into the grace
|
||||
period, RCU will invoke <tt>resched_cpu()</tt> on it regardless
|
||||
of its <tt>nohz_full</tt> state.
|
||||
</ol>
|
||||
|
||||
<p>
|
||||
The above values are defaults for systems running with <tt>HZ=1000</tt>.
|
||||
They will vary as the value of <tt>HZ</tt> varies, and can also be
|
||||
changed using the relevant Kconfig options and kernel boot parameters.
|
||||
RCU currently does not do much sanity checking of these
|
||||
parameters, so please use caution when changing them.
|
||||
Note that these forward-progress measures are provided only for RCU,
|
||||
not for
|
||||
<a href="#Sleepable RCU">SRCU</a> or
|
||||
<a href="#Tasks RCU">Tasks RCU</a>.
|
||||
|
||||
<p>
|
||||
RCU takes the following steps in <tt>call_rcu()</tt> to encourage timely
|
||||
invocation of callbacks when any given non-<tt>rcu_nocbs</tt> CPU has
|
||||
10,000 callbacks, or has 10,000 more callbacks than it had the last time
|
||||
encouragement was provided:
|
||||
|
||||
<ol>
|
||||
<li> Starts a grace period, if one is not already in progress.
|
||||
<li> Forces immediate checking for quiescent states, rather than
|
||||
waiting for three milliseconds to have elapsed since the
|
||||
beginning of the grace period.
|
||||
<li> Immediately tags the CPU's callbacks with their grace period
|
||||
completion numbers, rather than waiting for the <tt>RCU_SOFTIRQ</tt>
|
||||
handler to get around to it.
|
||||
<li> Lifts callback-execution batch limits, which speeds up callback
|
||||
invocation at the expense of degrading realtime response.
|
||||
</ol>
|
||||
|
||||
<p>
|
||||
Again, these are default values when running at <tt>HZ=1000</tt>,
|
||||
and can be overridden.
|
||||
Again, these forward-progress measures are provided only for RCU,
|
||||
not for
|
||||
<a href="#Sleepable RCU">SRCU</a> or
|
||||
<a href="#Tasks RCU">Tasks RCU</a>.
|
||||
Even for RCU, callback-invocation forward progress for <tt>rcu_nocbs</tt>
|
||||
CPUs is much less well-developed, in part because workloads benefiting
|
||||
from <tt>rcu_nocbs</tt> CPUs tend to invoke <tt>call_rcu()</tt>
|
||||
relatively infrequently.
|
||||
If workloads emerge that need both <tt>rcu_nocbs</tt> CPUs and high
|
||||
<tt>call_rcu()</tt> invocation rates, then additional forward-progress
|
||||
work will be required.
|
||||
|
||||
<h3><a name="Composability">Composability</a></h3>
|
||||
|
||||
<p>
|
||||
@@ -2272,7 +2323,7 @@ that meets this requirement.
|
||||
Furthermore, NMI handlers can be interrupted by what appear to RCU
|
||||
to be normal interrupts.
|
||||
One way that this can happen is for code that directly invokes
|
||||
<tt>rcu_irq_enter()</tt> and </tt>rcu_irq_exit()</tt> to be called
|
||||
<tt>rcu_irq_enter()</tt> and <tt>rcu_irq_exit()</tt> to be called
|
||||
from an NMI handler.
|
||||
This astonishing fact of life prompted the current code structure,
|
||||
which has <tt>rcu_irq_enter()</tt> invoking <tt>rcu_nmi_enter()</tt>
|
||||
@@ -2294,7 +2345,7 @@ via <tt>del_timer_sync()</tt> or similar.
|
||||
<p>
|
||||
Unfortunately, there is no way to cancel an RCU callback;
|
||||
once you invoke <tt>call_rcu()</tt>, the callback function is
|
||||
going to eventually be invoked, unless the system goes down first.
|
||||
eventually going to be invoked, unless the system goes down first.
|
||||
Because it is normally considered socially irresponsible to crash the system
|
||||
in response to a module unload request, we need some other way
|
||||
to deal with in-flight RCU callbacks.
|
||||
@@ -2424,23 +2475,37 @@ for context-switch-heavy <tt>CONFIG_NO_HZ_FULL=y</tt> workloads,
|
||||
but there is room for further improvement.
|
||||
|
||||
<p>
|
||||
In the past, it was forbidden to disable interrupts across an
|
||||
<tt>rcu_read_unlock()</tt> unless that interrupt-disabled region
|
||||
of code also included the matching <tt>rcu_read_lock()</tt>.
|
||||
Violating this restriction could result in deadlocks involving the
|
||||
scheduler's runqueue and priority-inheritance spinlocks.
|
||||
This restriction was lifted when interrupt-disabled calls to
|
||||
<tt>rcu_read_unlock()</tt> started deferring the reporting of
|
||||
the resulting RCU-preempt quiescent state until the end of that
|
||||
It is forbidden to hold any of the scheduler's runqueue or priority-inheritance
|
||||
spinlocks across an <tt>rcu_read_unlock()</tt> unless interrupts have been
|
||||
disabled across the entire RCU read-side critical section, that is,
|
||||
up to and including the matching <tt>rcu_read_lock()</tt>.
|
||||
Violating this restriction can result in deadlocks involving these
|
||||
scheduler spinlocks.
|
||||
There was hope that this restriction might be lifted when interrupt-disabled
|
||||
calls to <tt>rcu_read_unlock()</tt> started deferring the reporting of
|
||||
the resulting RCU-preempt quiescent state until the end of the corresponding
|
||||
interrupts-disabled region.
|
||||
This deferred reporting means that the scheduler's runqueue and
|
||||
priority-inheritance locks cannot be held while reporting an RCU-preempt
|
||||
quiescent state, which lifts the earlier restriction, at least from
|
||||
a deadlock perspective.
|
||||
Unfortunately, real-time systems using RCU priority boosting may
|
||||
Unfortunately, timely reporting of the corresponding quiescent state
|
||||
to expedited grace periods requires a call to <tt>raise_softirq()</tt>,
|
||||
which can acquire these scheduler spinlocks.
|
||||
In addition, real-time systems using RCU priority boosting
|
||||
need this restriction to remain in effect because deferred
|
||||
quiescent-state reporting also defers deboosting, which in turn
|
||||
degrades real-time latencies.
|
||||
quiescent-state reporting would also defer deboosting, which in turn
|
||||
would degrade real-time latencies.
|
||||
|
||||
<p>
|
||||
In theory, if a given RCU read-side critical section could be
|
||||
guaranteed to be less than one second in duration, holding a scheduler
|
||||
spinlock across that critical section's <tt>rcu_read_unlock()</tt>
|
||||
would require only that preemption be disabled across the entire
|
||||
RCU read-side critical section, not interrupts.
|
||||
Unfortunately, given the possibility of vCPU preemption, long-running
|
||||
interrupts, and so on, it is not possible in practice to guarantee
|
||||
that a given RCU read-side critical section will complete in less than
|
||||
one second.
|
||||
Therefore, as noted above, if scheduler spinlocks are held across
|
||||
a given call to <tt>rcu_read_unlock()</tt>, interrupts must be
|
||||
disabled across the entire RCU read-side critical section.
|
||||
|
||||
<h3><a name="Tracing and RCU">Tracing and RCU</a></h3>
|
||||
|
||||
@@ -3034,7 +3099,7 @@ If you block forever in one of a given domain's SRCU read-side critical
|
||||
sections, then that domain's grace periods will also be blocked forever.
|
||||
Of course, one good way to block forever is to deadlock, which can
|
||||
happen if any operation in a given domain's SRCU read-side critical
|
||||
section can block waiting, either directly or indirectly, for that domain's
|
||||
section can wait, either directly or indirectly, for that domain's
|
||||
grace period to elapse.
|
||||
For example, this results in a self-deadlock:
|
||||
|
||||
@@ -3074,12 +3139,18 @@ API, which, in combination with <tt>srcu_read_unlock()</tt>,
|
||||
guarantees a full memory barrier.
|
||||
|
||||
<p>
|
||||
Also unlike other RCU flavors, SRCU's callbacks-wait function
|
||||
<tt>srcu_barrier()</tt> may be invoked from CPU-hotplug notifiers,
|
||||
though this is not necessarily a good idea.
|
||||
The reason that this is possible is that SRCU is insensitive
|
||||
to whether or not a CPU is online, which means that <tt>srcu_barrier()</tt>
|
||||
need not exclude CPU-hotplug operations.
|
||||
Also unlike other RCU flavors, <tt>synchronize_srcu()</tt> may <b>not</b>
|
||||
be invoked from CPU-hotplug notifiers, due to the fact that SRCU grace
|
||||
periods make use of timers and the possibility of timers being temporarily
|
||||
“stranded” on the outgoing CPU.
|
||||
This stranding of timers means that timers posted to the outgoing CPU
|
||||
will not fire until late in the CPU-hotplug process.
|
||||
The problem is that if a notifier is waiting on an SRCU grace period,
|
||||
that grace period is waiting on a timer, and that timer is stranded on the
|
||||
outgoing CPU, then the notifier will never be awakened, in other words,
|
||||
deadlock has occurred.
|
||||
This same situation of course also prohibits <tt>srcu_barrier()</tt>
|
||||
from being invoked from CPU-hotplug notifiers.
|
||||
|
||||
<p>
|
||||
SRCU also differs from other RCU flavors in that SRCU's expedited and
|
||||
@@ -3233,6 +3304,11 @@ For example, RCU callback overhead might be charged back to the
|
||||
originating <tt>call_rcu()</tt> instance, though probably not
|
||||
in production kernels.
|
||||
|
||||
<p>
|
||||
Additional work may be required to provide reasonable forward-progress
|
||||
guarantees under heavy load for grace periods and for callback
|
||||
invocation.
|
||||
|
||||
<h2><a name="Summary">Summary</a></h2>
|
||||
|
||||
<p>
|
||||
|
@@ -63,7 +63,7 @@ over a rather long period of time, but improvements are always welcome!
|
||||
pointer must be covered by rcu_read_lock(), rcu_read_lock_bh(),
|
||||
rcu_read_lock_sched(), or by the appropriate update-side lock.
|
||||
Disabling of preemption can serve as rcu_read_lock_sched(), but
|
||||
is less readable.
|
||||
is less readable and prevents lockdep from detecting locking issues.
|
||||
|
||||
Letting RCU-protected pointers "leak" out of an RCU read-side
|
||||
critical section is every bit as bad as letting them leak out
|
||||
@@ -285,11 +285,7 @@ over a rather long period of time, but improvements are always welcome!
|
||||
here is that superuser already has lots of ways to crash
|
||||
the machine.
|
||||
|
||||
d. Use call_rcu_bh() rather than call_rcu(), in order to take
|
||||
advantage of call_rcu_bh()'s faster grace periods. (This
|
||||
is only a partial solution, though.)
|
||||
|
||||
e. Periodically invoke synchronize_rcu(), permitting a limited
|
||||
d. Periodically invoke synchronize_rcu(), permitting a limited
|
||||
number of updates per grace period.
|
||||
|
||||
The same cautions apply to call_rcu_bh(), call_rcu_sched(),
|
||||
@@ -324,37 +320,14 @@ over a rather long period of time, but improvements are always welcome!
|
||||
will break Alpha, cause aggressive compilers to generate bad code,
|
||||
and confuse people trying to read your code.
|
||||
|
||||
11. Note that synchronize_rcu() -only- guarantees to wait until
|
||||
all currently executing rcu_read_lock()-protected RCU read-side
|
||||
critical sections complete. It does -not- necessarily guarantee
|
||||
that all currently running interrupts, NMIs, preempt_disable()
|
||||
code, or idle loops will complete. Therefore, if your
|
||||
read-side critical sections are protected by something other
|
||||
than rcu_read_lock(), do -not- use synchronize_rcu().
|
||||
|
||||
Similarly, disabling preemption is not an acceptable substitute
|
||||
for rcu_read_lock(). Code that attempts to use preemption
|
||||
disabling where it should be using rcu_read_lock() will break
|
||||
in CONFIG_PREEMPT=y kernel builds.
|
||||
|
||||
If you want to wait for interrupt handlers, NMI handlers, and
|
||||
code under the influence of preempt_disable(), you instead
|
||||
need to use synchronize_irq() or synchronize_sched().
|
||||
|
||||
This same limitation also applies to synchronize_rcu_bh()
|
||||
and synchronize_srcu(), as well as to the asynchronous and
|
||||
expedited forms of the three primitives, namely call_rcu(),
|
||||
call_rcu_bh(), call_srcu(), synchronize_rcu_expedited(),
|
||||
synchronize_rcu_bh_expedited(), and synchronize_srcu_expedited().
|
||||
|
||||
12. Any lock acquired by an RCU callback must be acquired elsewhere
|
||||
11. Any lock acquired by an RCU callback must be acquired elsewhere
|
||||
with softirq disabled, e.g., via spin_lock_irqsave(),
|
||||
spin_lock_bh(), etc. Failing to disable irq on a given
|
||||
acquisition of that lock will result in deadlock as soon as
|
||||
the RCU softirq handler happens to run your RCU callback while
|
||||
interrupting that acquisition's critical section.
|
||||
|
||||
13. RCU callbacks can be and are executed in parallel. In many cases,
|
||||
12. RCU callbacks can be and are executed in parallel. In many cases,
|
||||
the callback code is simply a wrapper around kfree(), so that this
|
||||
is not an issue (or, more accurately, to the extent that it is
|
||||
an issue, the memory-allocator locking handles it). However,
|
||||
@@ -370,7 +343,7 @@ over a rather long period of time, but improvements are always welcome!
|
||||
not the case, a self-spawning RCU callback would prevent the
|
||||
victim CPU from ever going offline.)
|
||||
|
||||
14. Unlike other forms of RCU, it -is- permissible to block in an
|
||||
13. Unlike other forms of RCU, it -is- permissible to block in an
|
||||
SRCU read-side critical section (demarked by srcu_read_lock()
|
||||
and srcu_read_unlock()), hence the "SRCU": "sleepable RCU".
|
||||
Please note that if you don't need to sleep in read-side critical
|
||||
@@ -414,7 +387,7 @@ over a rather long period of time, but improvements are always welcome!
|
||||
Note that rcu_dereference() and rcu_assign_pointer() relate to
|
||||
SRCU just as they do to other forms of RCU.
|
||||
|
||||
15. The whole point of call_rcu(), synchronize_rcu(), and friends
|
||||
14. The whole point of call_rcu(), synchronize_rcu(), and friends
|
||||
is to wait until all pre-existing readers have finished before
|
||||
carrying out some otherwise-destructive operation. It is
|
||||
therefore critically important to -first- remove any path
|
||||
@@ -426,13 +399,13 @@ over a rather long period of time, but improvements are always welcome!
|
||||
is the caller's responsibility to guarantee that any subsequent
|
||||
readers will execute safely.
|
||||
|
||||
16. The various RCU read-side primitives do -not- necessarily contain
|
||||
15. The various RCU read-side primitives do -not- necessarily contain
|
||||
memory barriers. You should therefore plan for the CPU
|
||||
and the compiler to freely reorder code into and out of RCU
|
||||
read-side critical sections. It is the responsibility of the
|
||||
RCU update-side primitives to deal with this.
|
||||
|
||||
17. Use CONFIG_PROVE_LOCKING, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the
|
||||
16. Use CONFIG_PROVE_LOCKING, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the
|
||||
__rcu sparse checks to validate your RCU code. These can help
|
||||
find problems as follows:
|
||||
|
||||
@@ -455,7 +428,7 @@ over a rather long period of time, but improvements are always welcome!
|
||||
These debugging aids can help you find problems that are
|
||||
otherwise extremely difficult to spot.
|
||||
|
||||
18. If you register a callback using call_rcu(), call_rcu_bh(),
|
||||
17. If you register a callback using call_rcu(), call_rcu_bh(),
|
||||
call_rcu_sched(), or call_srcu(), and pass in a function defined
|
||||
within a loadable module, then it is necessary to wait for
|
||||
all pending callbacks to be invoked after the last invocation
|
||||
@@ -469,8 +442,8 @@ over a rather long period of time, but improvements are always welcome!
|
||||
You instead need to use one of the barrier functions:
|
||||
|
||||
o call_rcu() -> rcu_barrier()
|
||||
o call_rcu_bh() -> rcu_barrier_bh()
|
||||
o call_rcu_sched() -> rcu_barrier_sched()
|
||||
o call_rcu_bh() -> rcu_barrier()
|
||||
o call_rcu_sched() -> rcu_barrier()
|
||||
o call_srcu() -> srcu_barrier()
|
||||
|
||||
However, these barrier functions are absolutely -not- guaranteed
|
||||
|
@@ -14,9 +14,9 @@ being the real world and all that.
|
||||
So let's look at an example RCU lockdep splat from 3.0-rc5, one that
|
||||
has long since been fixed:
|
||||
|
||||
===============================
|
||||
[ INFO: suspicious RCU usage. ]
|
||||
-------------------------------
|
||||
=============================
|
||||
WARNING: suspicious RCU usage
|
||||
-----------------------------
|
||||
block/cfq-iosched.c:2776 suspicious rcu_dereference_protected() usage!
|
||||
|
||||
other info that might help us debug this:
|
||||
@@ -24,11 +24,11 @@ other info that might help us debug this:
|
||||
|
||||
rcu_scheduler_active = 1, debug_locks = 0
|
||||
3 locks held by scsi_scan_6/1552:
|
||||
#0: (&shost->scan_mutex){+.+.+.}, at: [<ffffffff8145efca>]
|
||||
#0: (&shost->scan_mutex){+.+.}, at: [<ffffffff8145efca>]
|
||||
scsi_scan_host_selected+0x5a/0x150
|
||||
#1: (&eq->sysfs_lock){+.+...}, at: [<ffffffff812a5032>]
|
||||
#1: (&eq->sysfs_lock){+.+.}, at: [<ffffffff812a5032>]
|
||||
elevator_exit+0x22/0x60
|
||||
#2: (&(&q->__queue_lock)->rlock){-.-...}, at: [<ffffffff812b6233>]
|
||||
#2: (&(&q->__queue_lock)->rlock){-.-.}, at: [<ffffffff812b6233>]
|
||||
cfq_exit_queue+0x43/0x190
|
||||
|
||||
stack backtrace:
|
||||
|
@@ -176,9 +176,8 @@ causing stalls, and that the stall was affecting RCU-sched. This message
|
||||
will normally be followed by stack dumps for each CPU. Please note that
|
||||
PREEMPT_RCU builds can be stalled by tasks as well as by CPUs, and that
|
||||
the tasks will be indicated by PID, for example, "P3421". It is even
|
||||
possible for a rcu_preempt_state stall to be caused by both CPUs -and-
|
||||
tasks, in which case the offending CPUs and tasks will all be called
|
||||
out in the list.
|
||||
possible for an rcu_state stall to be caused by both CPUs -and- tasks,
|
||||
in which case the offending CPUs and tasks will all be called out in the list.
|
||||
|
||||
CPU 2's "(3 GPs behind)" indicates that this CPU has not interacted with
|
||||
the RCU core for the past three grace periods. In contrast, CPU 16's "(0
|
||||
@@ -206,7 +205,7 @@ handlers are no longer able to execute on this CPU. This can happen if
|
||||
the stalled CPU is spinning with interrupts disabled, or, in -rt
|
||||
kernels, if a high-priority process is starving RCU's softirq handler.
|
||||
|
||||
The "fps=" shows the number of force-quiescent-state idle/offline
|
||||
The "fqs=" shows the number of force-quiescent-state idle/offline
|
||||
detection passes that the grace-period kthread has made across this
|
||||
CPU since the last time that this CPU noted the beginning of a grace
|
||||
period.
|
||||
@@ -220,17 +219,18 @@ an estimate of the total number of RCU callbacks queued across all CPUs
|
||||
In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed
|
||||
for each CPU:
|
||||
|
||||
0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 nonlazy_posted: 25 .D
|
||||
0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 Nonlazy posted: ..D
|
||||
|
||||
The "last_accelerate:" prints the low-order 16 bits (in hex) of the
|
||||
jiffies counter when this CPU last invoked rcu_try_advance_all_cbs()
|
||||
from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from
|
||||
rcu_prepare_for_idle(). The "nonlazy_posted:" prints the number
|
||||
of non-lazy callbacks posted since the last call to rcu_needs_cpu().
|
||||
Finally, an "L" indicates that there are currently no non-lazy callbacks
|
||||
("." is printed otherwise, as shown above) and "D" indicates that
|
||||
dyntick-idle processing is enabled ("." is printed otherwise, for example,
|
||||
if disabled via the "nohz=" kernel boot parameter).
|
||||
rcu_prepare_for_idle(). The "Nonlazy posted:" indicates lazy-callback
|
||||
status, so that an "l" indicates that all callbacks were lazy at the start
|
||||
of the last idle period and an "L" indicates that there are currently
|
||||
no non-lazy callbacks (in both cases, "." is printed otherwise, as
|
||||
shown above) and "D" indicates that dyntick-idle processing is enabled
|
||||
("." is printed otherwise, for example, if disabled via the "nohz="
|
||||
kernel boot parameter).
|
||||
|
||||
If the grace period ends just as the stall warning starts printing,
|
||||
there will be a spurious stall-warning message, which will include
|
||||
|
@@ -10,173 +10,8 @@ status messages via printk(), which can be examined via the dmesg
|
||||
command (perhaps grepping for "torture"). The test is started
|
||||
when the module is loaded, and stops when the module is unloaded.
|
||||
|
||||
|
||||
MODULE PARAMETERS
|
||||
|
||||
This module has the following parameters:
|
||||
|
||||
fqs_duration Duration (in microseconds) of artificially induced bursts
|
||||
of force_quiescent_state() invocations. In RCU
|
||||
implementations having force_quiescent_state(), these
|
||||
bursts help force races between forcing a given grace
|
||||
period and that grace period ending on its own.
|
||||
|
||||
fqs_holdoff Holdoff time (in microseconds) between consecutive calls
|
||||
to force_quiescent_state() within a burst.
|
||||
|
||||
fqs_stutter Wait time (in seconds) between consecutive bursts
|
||||
of calls to force_quiescent_state().
|
||||
|
||||
gp_normal Make the fake writers use normal synchronous grace-period
|
||||
primitives.
|
||||
|
||||
gp_exp Make the fake writers use expedited synchronous grace-period
|
||||
primitives. If both gp_normal and gp_exp are set, or
|
||||
if neither gp_normal nor gp_exp are set, then randomly
|
||||
choose the primitive so that about 50% are normal and
|
||||
50% expedited. By default, neither are set, which
|
||||
gives best overall test coverage.
|
||||
|
||||
irqreader Says to invoke RCU readers from irq level. This is currently
|
||||
done via timers. Defaults to "1" for variants of RCU that
|
||||
permit this. (Or, more accurately, variants of RCU that do
|
||||
-not- permit this know to ignore this variable.)
|
||||
|
||||
n_barrier_cbs If this is nonzero, RCU barrier testing will be conducted,
|
||||
in which case n_barrier_cbs specifies the number of
|
||||
RCU callbacks (and corresponding kthreads) to use for
|
||||
this testing. The value cannot be negative. If you
|
||||
specify this to be non-zero when torture_type indicates a
|
||||
synchronous RCU implementation (one for which a member of
|
||||
the synchronize_rcu() rather than the call_rcu() family is
|
||||
used -- see the documentation for torture_type below), an
|
||||
error will be reported and no testing will be carried out.
|
||||
|
||||
nfakewriters This is the number of RCU fake writer threads to run. Fake
|
||||
writer threads repeatedly use the synchronous "wait for
|
||||
current readers" function of the interface selected by
|
||||
torture_type, with a delay between calls to allow for various
|
||||
different numbers of writers running in parallel.
|
||||
nfakewriters defaults to 4, which provides enough parallelism
|
||||
to trigger special cases caused by multiple writers, such as
|
||||
the synchronize_srcu() early return optimization.
|
||||
|
||||
nreaders This is the number of RCU reading threads supported.
|
||||
The default is twice the number of CPUs. Why twice?
|
||||
To properly exercise RCU implementations with preemptible
|
||||
read-side critical sections.
|
||||
|
||||
onoff_interval
|
||||
The number of seconds between each attempt to execute a
|
||||
randomly selected CPU-hotplug operation. Defaults to
|
||||
zero, which disables CPU hotplugging. In HOTPLUG_CPU=n
|
||||
kernels, rcutorture will silently refuse to do any
|
||||
CPU-hotplug operations regardless of what value is
|
||||
specified for onoff_interval.
|
||||
|
||||
onoff_holdoff The number of seconds to wait until starting CPU-hotplug
|
||||
operations. This would normally only be used when
|
||||
rcutorture was built into the kernel and started
|
||||
automatically at boot time, in which case it is useful
|
||||
in order to avoid confusing boot-time code with CPUs
|
||||
coming and going.
|
||||
|
||||
shuffle_interval
|
||||
The number of seconds to keep the test threads affinitied
|
||||
to a particular subset of the CPUs, defaults to 3 seconds.
|
||||
Used in conjunction with test_no_idle_hz.
|
||||
|
||||
shutdown_secs The number of seconds to run the test before terminating
|
||||
the test and powering off the system. The default is
|
||||
zero, which disables test termination and system shutdown.
|
||||
This capability is useful for automated testing.
|
||||
|
||||
stall_cpu The number of seconds that a CPU should be stalled while
|
||||
within both an rcu_read_lock() and a preempt_disable().
|
||||
This stall happens only once per rcutorture run.
|
||||
If you need multiple stalls, use modprobe and rmmod to
|
||||
repeatedly run rcutorture. The default for stall_cpu
|
||||
is zero, which prevents rcutorture from stalling a CPU.
|
||||
|
||||
Note that attempts to rmmod rcutorture while the stall
|
||||
is ongoing will hang, so be careful what value you
|
||||
choose for this module parameter! In addition, too-large
|
||||
values for stall_cpu might well induce failures and
|
||||
warnings in other parts of the kernel. You have been
|
||||
warned!
|
||||
|
||||
stall_cpu_holdoff
|
||||
The number of seconds to wait after rcutorture starts
|
||||
before stalling a CPU. Defaults to 10 seconds.
|
||||
|
||||
stat_interval The number of seconds between output of torture
|
||||
statistics (via printk()). Regardless of the interval,
|
||||
statistics are printed when the module is unloaded.
|
||||
Setting the interval to zero causes the statistics to
|
||||
be printed -only- when the module is unloaded, and this
|
||||
is the default.
|
||||
|
||||
stutter The length of time to run the test before pausing for this
|
||||
same period of time. Defaults to "stutter=5", so as
|
||||
to run and pause for (roughly) five-second intervals.
|
||||
Specifying "stutter=0" causes the test to run continuously
|
||||
without pausing, which is the old default behavior.
|
||||
|
||||
test_boost Whether or not to test the ability of RCU to do priority
|
||||
boosting. Defaults to "test_boost=1", which performs
|
||||
RCU priority-inversion testing only if the selected
|
||||
RCU implementation supports priority boosting. Specifying
|
||||
"test_boost=0" never performs RCU priority-inversion
|
||||
testing. Specifying "test_boost=2" performs RCU
|
||||
priority-inversion testing even if the selected RCU
|
||||
implementation does not support RCU priority boosting,
|
||||
which can be used to test rcutorture's ability to
|
||||
carry out RCU priority-inversion testing.
|
||||
|
||||
test_boost_interval
|
||||
The number of seconds in an RCU priority-inversion test
|
||||
cycle. Defaults to "test_boost_interval=7". It is
|
||||
usually wise for this value to be relatively prime to
|
||||
the value selected for "stutter".
|
||||
|
||||
test_boost_duration
|
||||
The number of seconds to do RCU priority-inversion testing
|
||||
within any given "test_boost_interval". Defaults to
|
||||
"test_boost_duration=4".
|
||||
|
||||
test_no_idle_hz Whether or not to test the ability of RCU to operate in
|
||||
a kernel that disables the scheduling-clock interrupt to
|
||||
idle CPUs. Boolean parameter, "1" to test, "0" otherwise.
|
||||
Defaults to omitting this test.
|
||||
|
||||
torture_type The type of RCU to test, with string values as follows:
|
||||
|
||||
"rcu": rcu_read_lock(), rcu_read_unlock() and call_rcu(),
|
||||
along with expedited, synchronous, and polling
|
||||
variants.
|
||||
|
||||
"rcu_bh": rcu_read_lock_bh(), rcu_read_unlock_bh(), and
|
||||
call_rcu_bh(), along with expedited and synchronous
|
||||
variants.
|
||||
|
||||
"rcu_busted": This tests an intentionally incorrect version
|
||||
of RCU in order to help test rcutorture itself.
|
||||
|
||||
"srcu": srcu_read_lock(), srcu_read_unlock() and
|
||||
call_srcu(), along with expedited and
|
||||
synchronous variants.
|
||||
|
||||
"sched": preempt_disable(), preempt_enable(), and
|
||||
call_rcu_sched(), along with expedited,
|
||||
synchronous, and polling variants.
|
||||
|
||||
"tasks": voluntary context switch and call_rcu_tasks(),
|
||||
along with expedited and synchronous variants.
|
||||
|
||||
Defaults to "rcu".
|
||||
|
||||
verbose Enable debug printk()s. Default is disabled.
|
||||
|
||||
Module parameters are prefixed by "rcutorture." in
|
||||
Documentation/admin-guide/kernel-parameters.txt.
|
||||
|
||||
OUTPUT
|
||||
|
||||
|
@@ -266,7 +266,7 @@ rcu_dereference()
|
||||
unnecessary overhead on Alpha CPUs.
|
||||
|
||||
Note that the value returned by rcu_dereference() is valid
|
||||
only within the enclosing RCU read-side critical section.
|
||||
only within the enclosing RCU read-side critical section [1].
|
||||
For example, the following is -not- legal:
|
||||
|
||||
rcu_read_lock();
|
||||
@@ -292,6 +292,19 @@ rcu_dereference()
|
||||
typically used indirectly, via the _rcu list-manipulation
|
||||
primitives, such as list_for_each_entry_rcu().
|
||||
|
||||
[1] The variant rcu_dereference_protected() can be used outside
|
||||
of an RCU read-side critical section as long as the usage is
|
||||
protected by locks acquired by the update-side code. This variant
|
||||
avoids the lockdep warning that would happen when using (for
|
||||
example) rcu_dereference() without rcu_read_lock() protection.
|
||||
Using rcu_dereference_protected() also has the advantage
|
||||
of permitting compiler optimizations that rcu_dereference()
|
||||
must prohibit. The rcu_dereference_protected() variant takes
|
||||
a lockdep expression to indicate which locks must be acquired
|
||||
by the caller. If the indicated protection is not provided,
|
||||
a lockdep splat is emitted. See RCU/Design/Requirements/Requirements.html
|
||||
and the API's code comments for more details and example usage.
|
||||
|
||||
The following diagram shows how each API communicates among the
|
||||
reader, updater, and reclaimer.
|
||||
|
||||
@@ -322,28 +335,27 @@ to their callers and (2) call_rcu() callbacks may be invoked. Efficient
|
||||
implementations of the RCU infrastructure make heavy use of batching in
|
||||
order to amortize their overhead over many uses of the corresponding APIs.
|
||||
|
||||
There are no fewer than three RCU mechanisms in the Linux kernel; the
|
||||
diagram above shows the first one, which is by far the most commonly used.
|
||||
The rcu_dereference() and rcu_assign_pointer() primitives are used for
|
||||
all three mechanisms, but different defer and protect primitives are
|
||||
used as follows:
|
||||
There are at least three flavors of RCU usage in the Linux kernel. The diagram
|
||||
above shows the most common one. On the updater side, the rcu_assign_pointer(),
|
||||
synchronize_rcu() and call_rcu() primitives used are the same for all three
|
||||
flavors. However for protection (on the reader side), the primitives used vary
|
||||
depending on the flavor:
|
||||
|
||||
Defer Protect
|
||||
a. rcu_read_lock() / rcu_read_unlock()
|
||||
rcu_dereference()
|
||||
|
||||
a. synchronize_rcu() rcu_read_lock() / rcu_read_unlock()
|
||||
call_rcu() rcu_dereference()
|
||||
b. rcu_read_lock_bh() / rcu_read_unlock_bh()
|
||||
local_bh_disable() / local_bh_enable()
|
||||
rcu_dereference_bh()
|
||||
|
||||
b. synchronize_rcu_bh() rcu_read_lock_bh() / rcu_read_unlock_bh()
|
||||
call_rcu_bh() rcu_dereference_bh()
|
||||
|
||||
c. synchronize_sched() rcu_read_lock_sched() / rcu_read_unlock_sched()
|
||||
call_rcu_sched() preempt_disable() / preempt_enable()
|
||||
c. rcu_read_lock_sched() / rcu_read_unlock_sched()
|
||||
preempt_disable() / preempt_enable()
|
||||
local_irq_save() / local_irq_restore()
|
||||
hardirq enter / hardirq exit
|
||||
NMI enter / NMI exit
|
||||
rcu_dereference_sched()
|
||||
|
||||
These three mechanisms are used as follows:
|
||||
These three flavors are used as follows:
|
||||
|
||||
a. RCU applied to normal data structures.
|
||||
|
||||
@@ -548,7 +560,7 @@ presents two such "toy" implementations of RCU, one that is implemented
|
||||
in terms of familiar locking primitives, and another that more closely
|
||||
resembles "classic" RCU. Both are way too simple for real-world use,
|
||||
lacking both functionality and performance. However, they are useful
|
||||
in getting a feel for how RCU works. See kernel/rcupdate.c for a
|
||||
in getting a feel for how RCU works. See kernel/rcu/update.c for a
|
||||
production-quality implementation, and see:
|
||||
|
||||
http://www.rdrop.com/users/paulmck/RCU
|
||||
@@ -867,18 +879,20 @@ RCU: Critical sections Grace period Barrier
|
||||
|
||||
bh: Critical sections Grace period Barrier
|
||||
|
||||
rcu_read_lock_bh call_rcu_bh rcu_barrier_bh
|
||||
rcu_read_unlock_bh synchronize_rcu_bh
|
||||
rcu_dereference_bh synchronize_rcu_bh_expedited
|
||||
rcu_read_lock_bh call_rcu rcu_barrier
|
||||
rcu_read_unlock_bh synchronize_rcu
|
||||
[local_bh_disable] synchronize_rcu_expedited
|
||||
[and friends]
|
||||
rcu_dereference_bh
|
||||
rcu_dereference_bh_check
|
||||
rcu_dereference_bh_protected
|
||||
rcu_read_lock_bh_held
|
||||
|
||||
sched: Critical sections Grace period Barrier
|
||||
|
||||
rcu_read_lock_sched synchronize_sched rcu_barrier_sched
|
||||
rcu_read_unlock_sched call_rcu_sched
|
||||
[preempt_disable] synchronize_sched_expedited
|
||||
rcu_read_lock_sched call_rcu rcu_barrier
|
||||
rcu_read_unlock_sched synchronize_rcu
|
||||
[preempt_disable] synchronize_rcu_expedited
|
||||
[and friends]
|
||||
rcu_read_lock_sched_notrace
|
||||
rcu_read_unlock_sched_notrace
|
||||
@@ -890,8 +904,8 @@ sched: Critical sections Grace period Barrier
|
||||
|
||||
SRCU: Critical sections Grace period Barrier
|
||||
|
||||
srcu_read_lock synchronize_srcu srcu_barrier
|
||||
srcu_read_unlock call_srcu
|
||||
srcu_read_lock call_srcu srcu_barrier
|
||||
srcu_read_unlock synchronize_srcu
|
||||
srcu_dereference synchronize_srcu_expedited
|
||||
srcu_dereference_check
|
||||
srcu_read_lock_held
|
||||
@@ -1034,7 +1048,7 @@ Answer: Just as PREEMPT_RT permits preemption of spinlock
|
||||
spinlocks blocking while in RCU read-side critical
|
||||
sections.
|
||||
|
||||
Why the apparent inconsistency? Because it is it
|
||||
Why the apparent inconsistency? Because it is
|
||||
possible to use priority boosting to keep the RCU
|
||||
grace periods short if need be (for example, if running
|
||||
short of memory). In contrast, if blocking waiting
|
||||
|
@@ -56,12 +56,12 @@ situation from a state where some tasks are stalled but the CPU is
|
||||
still doing productive work. As such, time spent in this subset of the
|
||||
stall state is tracked separately and exported in the "full" averages.
|
||||
|
||||
The ratios are tracked as recent trends over ten, sixty, and three
|
||||
hundred second windows, which gives insight into short term events as
|
||||
well as medium and long term trends. The total absolute stall time is
|
||||
tracked and exported as well, to allow detection of latency spikes
|
||||
which wouldn't necessarily make a dent in the time averages, or to
|
||||
average trends over custom time frames.
|
||||
The ratios (in %) are tracked as recent trends over ten, sixty, and
|
||||
three hundred second windows, which gives insight into short term events
|
||||
as well as medium and long term trends. The total absolute stall time
|
||||
(in us) is tracked and exported as well, to allow detection of latency
|
||||
spikes which wouldn't necessarily make a dent in the time averages,
|
||||
or to average trends over custom time frames.
|
||||
|
||||
Cgroup2 interface
|
||||
=================
|
||||
|
@@ -23,7 +23,7 @@ kernel.
|
||||
|
||||
The resultant userspace tool binary is then located at:
|
||||
|
||||
tools/acpi/power/acpi/acpidbg/acpidbg
|
||||
tools/power/acpi/acpidbg
|
||||
|
||||
It can be installed to system directories by running "make install" (as a
|
||||
sufficiently privileged user).
|
||||
@@ -35,7 +35,7 @@ kernel.
|
||||
|
||||
# mount -t debugfs none /sys/kernel/debug
|
||||
# modprobe acpi_dbg
|
||||
# tools/acpi/power/acpi/acpidbg/acpidbg
|
||||
# tools/power/acpi/acpidbg
|
||||
|
||||
That spawns the interactive AML debugger environment where you can execute
|
||||
debugger commands.
|
||||
|
@@ -14,6 +14,10 @@ upgrade the ACPI execution environment that is defined by the ACPI tables
|
||||
via upgrading the ACPI tables provided by the BIOS with an instrumented,
|
||||
modified, more recent version, or installing brand new ACPI tables.
|
||||
|
||||
When building initrd with kernel in a single image, option
|
||||
ACPI_TABLE_OVERRIDE_VIA_BUILTIN_INITRD should also be true for this
|
||||
feature to work.
|
||||
|
||||
For a full list of ACPI tables that can be upgraded/installed, take a look
|
||||
at the char *table_sigs[MAX_ACPI_SIGNATURE]; definition in
|
||||
drivers/acpi/tables.c.
|
||||
|
@@ -6,7 +6,7 @@ If you want to use SELinux, chances are you will want
|
||||
to use the distro-provided policies, or install the
|
||||
latest reference policy release from
|
||||
|
||||
http://oss.tresys.com/projects/refpolicy
|
||||
https://github.com/SELinuxProject/refpolicy
|
||||
|
||||
However, if you want to install a dummy policy for
|
||||
testing, you can do so using ``mdp`` provided under
|
||||
|
107
Documentation/admin-guide/LSM/SafeSetID.rst
Normal file
@@ -0,0 +1,107 @@
|
||||
=========
|
||||
SafeSetID
|
||||
=========
|
||||
SafeSetID is an LSM module that gates the setid family of syscalls to restrict
|
||||
UID/GID transitions from a given UID/GID to only those approved by a
|
||||
system-wide whitelist. These restrictions also prohibit the given UIDs/GIDs
|
||||
from obtaining auxiliary privileges associated with CAP_SET{U/G}ID, such as
|
||||
allowing a user to set up user namespace UID mappings.
|
||||
|
||||
|
||||
Background
|
||||
==========
|
||||
In absence of file capabilities, processes spawned on a Linux system that need
|
||||
to switch to a different user must be spawned with CAP_SETUID privileges.
|
||||
CAP_SETUID is granted to programs running as root or those running as a non-root
|
||||
user that have been explicitly given the CAP_SETUID runtime capability. It is
|
||||
often preferable to use Linux runtime capabilities rather than file
|
||||
capabilities, since using file capabilities to run a program with elevated
|
||||
privileges opens up possible security holes since any user with access to the
|
||||
file can exec() that program to gain the elevated privileges.
|
||||
|
||||
While it is possible to implement a tree of processes by giving full
|
||||
CAP_SET{U/G}ID capabilities, this is often at odds with the goals of running a
|
||||
tree of processes under non-root user(s) in the first place. Specifically,
|
||||
since CAP_SETUID allows changing to any user on the system, including the root
|
||||
user, it is an overpowered capability for what is needed in this scenario,
|
||||
especially since programs often only call setuid() to drop privileges to a
|
||||
lesser-privileged user -- not elevate privileges. Unfortunately, there is no
|
||||
generally feasible way in Linux to restrict the potential UIDs that a user can
|
||||
switch to through setuid() beyond allowing a switch to any user on the system.
|
||||
This SafeSetID LSM seeks to provide a solution for restricting setid
|
||||
capabilities in such a way.
|
||||
|
||||
The main use case for this LSM is to allow a non-root program to transition to
|
||||
other untrusted uids without full blown CAP_SETUID capabilities. The non-root
|
||||
program would still need CAP_SETUID to do any kind of transition, but the
|
||||
additional restrictions imposed by this LSM would mean it is a "safer" version
|
||||
of CAP_SETUID since the non-root program cannot take advantage of CAP_SETUID to
|
||||
do any unapproved actions (e.g. setuid to uid 0 or create/enter new user
|
||||
namespace). The higher level goal is to allow for uid-based sandboxing of system
|
||||
services without having to give out CAP_SETUID all over the place just so that
|
||||
non-root programs can drop to even-lesser-privileged uids. This is especially
|
||||
relevant when one non-root daemon on the system should be allowed to spawn other
|
||||
processes as different uids, but it's undesirable to give the daemon a
|
||||
basically-root-equivalent CAP_SETUID.
|
||||
|
||||
|
||||
Other Approaches Considered
|
||||
===========================
|
||||
|
||||
Solve this problem in userspace
|
||||
-------------------------------
|
||||
For candidate applications that would like to have restricted setid capabilities
|
||||
as implemented in this LSM, an alternative option would be to simply take away
|
||||
setid capabilities from the application completely and refactor the process
|
||||
spawning semantics in the application (e.g. by using a privileged helper program
|
||||
to do process spawning and UID/GID transitions). Unfortunately, there are a
|
||||
number of semantics around process spawning that would be affected by this, such
|
||||
as fork() calls where the program doesn't immediately call exec() after the
|
||||
fork(), parent processes specifying custom environment variables or command line
|
||||
args for spawned child processes, or inheritance of file handles across a
|
||||
fork()/exec(). Because of this, a solution that uses a privileged helper in
|
||||
userspace would likely be less appealing to incorporate into existing projects
|
||||
that rely on certain process-spawning semantics in Linux.
|
||||
|
||||
Use user namespaces
|
||||
-------------------
|
||||
Another possible approach would be to run a given process tree in its own user
|
||||
namespace and give programs in the tree setid capabilities. In this way,
|
||||
programs in the tree could change to any desired UID/GID in the context of their
|
||||
own user namespace, and only approved UIDs/GIDs could be mapped back to the
|
||||
initial system user namespace, effectively preventing privilege escalation.
|
||||
Unfortunately, it is not generally feasible to use user namespaces in isolation,
|
||||
without pairing them with other namespace types, which is not always an option.
|
||||
Linux checks for capabilities based off of the user namespace that "owns" some
|
||||
entity. For example, Linux has the notion that network namespaces are owned by
|
||||
the user namespace in which they were created. A consequence of this is that
|
||||
capability checks for access to a given network namespace are done by checking
|
||||
whether a task has the given capability in the context of the user namespace
|
||||
that owns the network namespace -- not necessarily the user namespace under
|
||||
which the given task runs. Therefore spawning a process in a new user namespace
|
||||
effectively prevents it from accessing the network namespace owned by the
|
||||
initial namespace. This is a deal-breaker for any application that expects to
|
||||
retain the CAP_NET_ADMIN capability for the purpose of adjusting network
|
||||
configurations. Using user namespaces in isolation causes problems regarding
|
||||
other system interactions, including use of pid namespaces and device creation.
|
||||
|
||||
Use an existing LSM
|
||||
-------------------
|
||||
None of the other in-tree LSMs have the capability to gate setid transitions, or
|
||||
even employ the security_task_fix_setuid hook at all. SELinux says of that hook:
|
||||
"Since setuid only affects the current process, and since the SELinux controls
|
||||
are not based on the Linux identity attributes, SELinux does not need to control
|
||||
this operation."
|
||||
|
||||
|
||||
Directions for use
|
||||
==================
|
||||
This LSM hooks the setid syscalls to make sure transitions are allowed if an
|
||||
applicable restriction policy is in place. Policies are configured through
|
||||
securityfs by writing to the safesetid/add_whitelist_policy and
|
||||
safesetid/flush_whitelist_policies files at the location where securityfs is
|
||||
mounted. The format for adding a policy is '<UID>:<UID>', using literal
|
||||
numbers, such as '123:456'. To flush the policies, any write to the file is
|
||||
sufficient. Again, configuring a policy for a UID will prevent that UID from
|
||||
obtaining auxiliary setid privileges, such as allowing a user to set up user
|
||||
namespace UID mappings.
|
@@ -818,6 +818,10 @@ Smack supports some mount options:
|
||||
specifies a label to which all labels set on the
|
||||
filesystem must have read access. Not yet enforced.
|
||||
|
||||
smackfstransmute=label:
|
||||
behaves exactly like smackfsroot except that it also
|
||||
sets the transmute flag on the root of the mount
|
||||
|
||||
These mount options apply to all file system types.
|
||||
|
||||
Smack auditing
|
||||
|
@@ -17,9 +17,8 @@ MAC extensions, other extensions can be built using the LSM to provide
|
||||
specific changes to system operation when these tweaks are not available
|
||||
in the core functionality of Linux itself.
|
||||
|
||||
Without a specific LSM built into the kernel, the default LSM will be the
|
||||
Linux capabilities system. Most LSMs choose to extend the capabilities
|
||||
system, building their checks on top of the defined capability hooks.
|
||||
The Linux capabilities modules will always be included. This may be
|
||||
followed by any number of "minor" modules and at most one "major" module.
|
||||
For more details on capabilities, see ``capabilities(7)`` in the Linux
|
||||
man-pages project.
|
||||
|
||||
@@ -30,6 +29,14 @@ order in which checks are made. The capability module will always
|
||||
be first, followed by any "minor" modules (e.g. Yama) and then
|
||||
the one "major" module (e.g. SELinux) if there is one configured.
|
||||
|
||||
Process attributes associated with "major" security modules should
|
||||
be accessed and maintained using the special files in ``/proc/.../attr``.
|
||||
A security module may maintain a module specific subdirectory there,
|
||||
named after the module. ``/proc/.../attr/smack`` is provided by the Smack
|
||||
security module and contains all its special files. The files directly
|
||||
in ``/proc/.../attr`` remain as legacy interfaces for modules that provide
|
||||
subdirectories.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
@@ -39,3 +46,4 @@ the one "major" module (e.g. SELinux) if there is one configured.
|
||||
Smack
|
||||
tomoyo
|
||||
Yama
|
||||
SafeSetID
|
||||
|
@@ -1,9 +1,9 @@
|
||||
.. _readme:
|
||||
|
||||
Linux kernel release 4.x <http://kernel.org/>
|
||||
Linux kernel release 5.x <http://kernel.org/>
|
||||
=============================================
|
||||
|
||||
These are the release notes for Linux version 4. Read them carefully,
|
||||
These are the release notes for Linux version 5. Read them carefully,
|
||||
as they tell you what this is all about, explain how to install the
|
||||
kernel, and what to do if something goes wrong.
|
||||
|
||||
@@ -63,7 +63,7 @@ Installing the kernel source
|
||||
directory where you have permissions (e.g. your home directory) and
|
||||
unpack it::
|
||||
|
||||
xz -cd linux-4.X.tar.xz | tar xvf -
|
||||
xz -cd linux-5.x.tar.xz | tar xvf -
|
||||
|
||||
Replace "x" with the version number of the latest kernel.
|
||||
|
||||
@@ -72,26 +72,26 @@ Installing the kernel source
|
||||
files. They should match the library, and not get messed up by
|
||||
whatever the kernel-du-jour happens to be.
|
||||
|
||||
- You can also upgrade between 4.x releases by patching. Patches are
|
||||
- You can also upgrade between 5.x releases by patching. Patches are
|
||||
distributed in the xz format. To install by patching, get all the
|
||||
newer patch files, enter the top level directory of the kernel source
|
||||
(linux-4.X) and execute::
|
||||
(linux-5.x) and execute::
|
||||
|
||||
xz -cd ../patch-4.x.xz | patch -p1
|
||||
xz -cd ../patch-5.x.xz | patch -p1
|
||||
|
||||
Replace "x" for all versions bigger than the version "X" of your current
|
||||
Replace "x" for all versions bigger than the version "x" of your current
|
||||
source tree, **in_order**, and you should be ok. You may want to remove
|
||||
the backup files (some-file-name~ or some-file-name.orig), and make sure
|
||||
that there are no failed patches (some-file-name# or some-file-name.rej).
|
||||
If there are, either you or I have made a mistake.
|
||||
|
||||
Unlike patches for the 4.x kernels, patches for the 4.x.y kernels
|
||||
Unlike patches for the 5.x kernels, patches for the 5.x.y kernels
|
||||
(also known as the -stable kernels) are not incremental but instead apply
|
||||
directly to the base 4.x kernel. For example, if your base kernel is 4.0
|
||||
and you want to apply the 4.0.3 patch, you must not first apply the 4.0.1
|
||||
and 4.0.2 patches. Similarly, if you are running kernel version 4.0.2 and
|
||||
want to jump to 4.0.3, you must first reverse the 4.0.2 patch (that is,
|
||||
patch -R) **before** applying the 4.0.3 patch. You can read more on this in
|
||||
directly to the base 5.x kernel. For example, if your base kernel is 5.0
|
||||
and you want to apply the 5.0.3 patch, you must not first apply the 5.0.1
|
||||
and 5.0.2 patches. Similarly, if you are running kernel version 5.0.2 and
|
||||
want to jump to 5.0.3, you must first reverse the 5.0.2 patch (that is,
|
||||
patch -R) **before** applying the 5.0.3 patch. You can read more on this in
|
||||
:ref:`Documentation/process/applying-patches.rst <applying_patches>`.
|
||||
|
||||
Alternatively, the script patch-kernel can be used to automate this
|
||||
@@ -114,7 +114,7 @@ Installing the kernel source
|
||||
Software requirements
|
||||
---------------------
|
||||
|
||||
Compiling and running the 4.x kernels requires up-to-date
|
||||
Compiling and running the 5.x kernels requires up-to-date
|
||||
versions of various software packages. Consult
|
||||
:ref:`Documentation/process/changes.rst <changes>` for the minimum version numbers
|
||||
required and how to get updates for these packages. Beware that using
|
||||
@@ -132,12 +132,12 @@ Build directory for the kernel
|
||||
place for the output files (including .config).
|
||||
Example::
|
||||
|
||||
kernel source code: /usr/src/linux-4.X
|
||||
kernel source code: /usr/src/linux-5.x
|
||||
build directory: /home/name/build/kernel
|
||||
|
||||
To configure and build the kernel, use::
|
||||
|
||||
cd /usr/src/linux-4.X
|
||||
cd /usr/src/linux-5.x
|
||||
make O=/home/name/build/kernel menuconfig
|
||||
make O=/home/name/build/kernel
|
||||
sudo make O=/home/name/build/kernel modules_install install
|
||||
@@ -251,7 +251,7 @@ Configuring the kernel
|
||||
Compiling the kernel
|
||||
--------------------
|
||||
|
||||
- Make sure you have at least gcc 3.2 available.
|
||||
- Make sure you have at least gcc 4.6 available.
|
||||
For more information, refer to :ref:`Documentation/process/changes.rst <changes>`.
|
||||
|
||||
Please note that you can still run a.out user programs with this kernel.
|
||||
|
@@ -56,11 +56,13 @@ v1 is available under Documentation/cgroup-v1/.
|
||||
5-3-3-2. IO Latency Interface Files
|
||||
5-4. PID
|
||||
5-4-1. PID Interface Files
|
||||
5-5. Device
|
||||
5-6. RDMA
|
||||
5-6-1. RDMA Interface Files
|
||||
5-7. Misc
|
||||
5-7-1. perf_event
|
||||
5-5. Cpuset
|
||||
5-5-1. Cpuset Interface Files
|
||||
5-6. Device
|
||||
5-7. RDMA
|
||||
5-7-1. RDMA Interface Files
|
||||
5-8. Misc
|
||||
5-8-1. perf_event
|
||||
5-N. Non-normative information
|
||||
5-N-1. CPU controller root cgroup process behaviour
|
||||
5-N-2. IO controller root cgroup process behaviour
|
||||
@@ -1187,6 +1189,10 @@ PAGE_SIZE multiple when read back.
|
||||
Amount of cached filesystem data that was modified and
|
||||
is currently being written back to disk
|
||||
|
||||
anon_thp
|
||||
Amount of memory used in anonymous mappings backed by
|
||||
transparent hugepages
|
||||
|
||||
inactive_anon, active_anon, inactive_file, active_file, unevictable
|
||||
Amount of memory, swap-backed and filesystem-backed,
|
||||
on the internal memory management lists used by the
|
||||
@@ -1246,6 +1252,18 @@ PAGE_SIZE multiple when read back.
|
||||
|
||||
Amount of reclaimed lazyfree pages
|
||||
|
||||
thp_fault_alloc
|
||||
|
||||
Number of transparent hugepages which were allocated to satisfy
|
||||
a page fault, including COW faults. This counter is not present
|
||||
when CONFIG_TRANSPARENT_HUGEPAGE is not set.
|
||||
|
||||
thp_collapse_alloc
|
||||
|
||||
Number of transparent hugepages which were allocated to allow
|
||||
collapsing an existing range of pages. This counter is not
|
||||
present when CONFIG_TRANSPARENT_HUGEPAGE is not set.
|
||||
|
||||
memory.swap.current
|
||||
A read-only single value file which exists on non-root
|
||||
cgroups.
|
||||
@@ -1501,7 +1519,7 @@ protected workload.
|
||||
|
||||
The limits are only applied at the peer level in the hierarchy. This means that
|
||||
in the diagram below, only groups A, B, and C will influence each other, and
|
||||
groups D and F will influence each other. Group G will influence nobody.
|
||||
groups D and F will influence each other. Group G will influence nobody::
|
||||
|
||||
[root]
|
||||
/ | \
|
||||
@@ -1610,6 +1628,176 @@ through fork() or clone(). These will return -EAGAIN if the creation
|
||||
of a new process would cause a cgroup policy to be violated.
|
||||
|
||||
|
||||
Cpuset
|
||||
------
|
||||
|
||||
The "cpuset" controller provides a mechanism for constraining
|
||||
the CPU and memory node placement of tasks to only the resources
|
||||
specified in the cpuset interface files in a task's current cgroup.
|
||||
This is especially valuable on large NUMA systems where placing jobs
|
||||
on properly sized subsets of the systems with careful processor and
|
||||
memory placement to reduce cross-node memory access and contention
|
||||
can improve overall system performance.
|
||||
|
||||
The "cpuset" controller is hierarchical. That means the controller
|
||||
cannot use CPUs or memory nodes not allowed in its parent.
|
||||
|
||||
|
||||
Cpuset Interface Files
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
cpuset.cpus
|
||||
A read-write multiple values file which exists on non-root
|
||||
cpuset-enabled cgroups.
|
||||
|
||||
It lists the requested CPUs to be used by tasks within this
|
||||
cgroup. The actual list of CPUs to be granted, however, is
|
||||
subjected to constraints imposed by its parent and can differ
|
||||
from the requested CPUs.
|
||||
|
||||
The CPU numbers are comma-separated numbers or ranges.
|
||||
For example::
|
||||
|
||||
# cat cpuset.cpus
|
||||
0-4,6,8-10
|
||||
|
||||
An empty value indicates that the cgroup is using the same
|
||||
setting as the nearest cgroup ancestor with a non-empty
|
||||
"cpuset.cpus" or all the available CPUs if none is found.
|
||||
|
||||
The value of "cpuset.cpus" stays constant until the next update
|
||||
and won't be affected by any CPU hotplug events.
|
||||
|
||||
cpuset.cpus.effective
|
||||
A read-only multiple values file which exists on all
|
||||
cpuset-enabled cgroups.
|
||||
|
||||
It lists the onlined CPUs that are actually granted to this
|
||||
cgroup by its parent. These CPUs are allowed to be used by
|
||||
tasks within the current cgroup.
|
||||
|
||||
If "cpuset.cpus" is empty, the "cpuset.cpus.effective" file shows
|
||||
all the CPUs from the parent cgroup that can be available to
|
||||
be used by this cgroup. Otherwise, it should be a subset of
|
||||
"cpuset.cpus" unless none of the CPUs listed in "cpuset.cpus"
|
||||
can be granted. In this case, it will be treated just like an
|
||||
empty "cpuset.cpus".
|
||||
|
||||
Its value will be affected by CPU hotplug events.
|
||||
|
||||
cpuset.mems
|
||||
A read-write multiple values file which exists on non-root
|
||||
cpuset-enabled cgroups.
|
||||
|
||||
It lists the requested memory nodes to be used by tasks within
|
||||
this cgroup. The actual list of memory nodes granted, however,
|
||||
is subjected to constraints imposed by its parent and can differ
|
||||
from the requested memory nodes.
|
||||
|
||||
The memory node numbers are comma-separated numbers or ranges.
|
||||
For example::
|
||||
|
||||
# cat cpuset.mems
|
||||
0-1,3
|
||||
|
||||
An empty value indicates that the cgroup is using the same
|
||||
setting as the nearest cgroup ancestor with a non-empty
|
||||
"cpuset.mems" or all the available memory nodes if none
|
||||
is found.
|
||||
|
||||
The value of "cpuset.mems" stays constant until the next update
|
||||
and won't be affected by any memory nodes hotplug events.
|
||||
|
||||
cpuset.mems.effective
|
||||
A read-only multiple values file which exists on all
|
||||
cpuset-enabled cgroups.
|
||||
|
||||
It lists the onlined memory nodes that are actually granted to
|
||||
this cgroup by its parent. These memory nodes are allowed to
|
||||
be used by tasks within the current cgroup.
|
||||
|
||||
If "cpuset.mems" is empty, it shows all the memory nodes from the
|
||||
parent cgroup that will be available to be used by this cgroup.
|
||||
Otherwise, it should be a subset of "cpuset.mems" unless none of
|
||||
the memory nodes listed in "cpuset.mems" can be granted. In this
|
||||
case, it will be treated just like an empty "cpuset.mems".
|
||||
|
||||
Its value will be affected by memory nodes hotplug events.
|
||||
|
||||
cpuset.cpus.partition
|
||||
A read-write single value file which exists on non-root
|
||||
cpuset-enabled cgroups. This flag is owned by the parent cgroup
|
||||
and is not delegatable.
|
||||
|
||||
It accepts only the following input values when written to.
|
||||
|
||||
"root" - a partition root
|
||||
"member" - a non-root member of a partition
|
||||
|
||||
When set to be a partition root, the current cgroup is the
|
||||
root of a new partition or scheduling domain that comprises
|
||||
itself and all its descendants except those that are separate
|
||||
partition roots themselves and their descendants. The root
|
||||
cgroup is always a partition root.
|
||||
|
||||
There are constraints on where a partition root can be set.
|
||||
It can only be set in a cgroup if all the following conditions
|
||||
are true.
|
||||
|
||||
1) The "cpuset.cpus" is not empty and the list of CPUs are
|
||||
exclusive, i.e. they are not shared by any of its siblings.
|
||||
2) The parent cgroup is a partition root.
|
||||
3) The "cpuset.cpus" is also a proper subset of the parent's
|
||||
"cpuset.cpus.effective".
|
||||
4) There are no child cgroups with cpuset enabled. This is for
|
||||
eliminating corner cases that have to be handled if such a
|
||||
condition is allowed.
|
||||
|
||||
Setting it to partition root will take the CPUs away from the
|
||||
effective CPUs of the parent cgroup. Once it is set, this
|
||||
file cannot be reverted back to "member" if there are any child
|
||||
cgroups with cpuset enabled.
|
||||
|
||||
A parent partition cannot distribute all its CPUs to its
|
||||
child partitions. There must be at least one cpu left in the
|
||||
parent partition.
|
||||
|
||||
Once becoming a partition root, changes to "cpuset.cpus" are
|
||||
generally allowed as long as the first condition above is true,
|
||||
the change will not take away all the CPUs from the parent
|
||||
partition and the new "cpuset.cpus" value is a superset of its
|
||||
children's "cpuset.cpus" values.
|
||||
|
||||
Sometimes, external factors like changes to ancestors'
|
||||
"cpuset.cpus" or cpu hotplug can cause the state of the partition
|
||||
root to change. On read, the "cpuset.cpus.partition" file
|
||||
can show the following values.
|
||||
|
||||
"member" Non-root member of a partition
|
||||
"root" Partition root
|
||||
"root invalid" Invalid partition root
|
||||
|
||||
It is a partition root if the first 2 partition root conditions
|
||||
above are true and at least one CPU from "cpuset.cpus" is
|
||||
granted by the parent cgroup.
|
||||
|
||||
A partition root can become invalid if none of the CPUs requested
|
||||
in "cpuset.cpus" can be granted by the parent cgroup or the
|
||||
parent cgroup is no longer a partition root itself. In this
|
||||
case, it is not a real partition even though the restriction
|
||||
of the first partition root condition above will still apply.
|
||||
The cpu affinity of all the tasks in the cgroup will then be
|
||||
associated with CPUs in the nearest ancestor partition.
|
||||
|
||||
An invalid partition root can be transitioned back to a
|
||||
real partition root if at least one of the requested CPUs
|
||||
can now be granted by its parent. In this case, the cpu
|
||||
affinity of all the tasks in the formerly invalid partition
|
||||
will be associated to the CPUs of the newly formed partition.
|
||||
Changing the partition state of an invalid partition root to
|
||||
"member" is always allowed even if child cpusets are present.
|
||||
|
||||
|
||||
Device controller
|
||||
-----------------
|
||||
|
||||
@@ -1879,8 +2067,10 @@ following two functions.
|
||||
|
||||
wbc_init_bio(@wbc, @bio)
|
||||
Should be called for each bio carrying writeback data and
|
||||
associates the bio with the inode's owner cgroup. Can be
|
||||
called anytime between bio allocation and submission.
|
||||
associates the bio with the inode's owner cgroup and the
|
||||
corresponding request queue. This must be called after
|
||||
a queue (device) has been associated with the bio and
|
||||
before submission.
|
||||
|
||||
wbc_account_io(@wbc, @page, @bytes)
|
||||
Should be called for each data segment being written out.
|
||||
@@ -1899,7 +2089,7 @@ the configuration, the bio may be executed at a lower priority and if
|
||||
the writeback session is holding shared resources, e.g. a journal
|
||||
entry, may lead to priority inversion. There is no one easy solution
|
||||
for the problem. Filesystems can try to work around specific problem
|
||||
cases by skipping wbc_init_bio() or using bio_associate_blkcg()
|
||||
cases by skipping wbc_init_bio() and using bio_associate_blkg()
|
||||
directly.
|
||||
|
||||
|
||||
|
@@ -1,3 +1,4 @@
|
||||
.. _admin_devices:
|
||||
|
||||
Linux allocated devices (4.x+ version)
|
||||
======================================
|
||||
|
@@ -110,8 +110,8 @@ If your query set is big, you can batch them too::
|
||||
|
||||
~# cat query-batch-file > <debugfs>/dynamic_debug/control
|
||||
|
||||
A another way is to use wildcard. The match rule support ``*`` (matches
|
||||
zero or more characters) and ``?`` (matches exactly one character).For
|
||||
Another way is to use wildcards. The match rule supports ``*`` (matches
|
||||
zero or more characters) and ``?`` (matches exactly one character). For
|
||||
example, you can match all usb drivers::
|
||||
|
||||
~# echo "file drivers/usb/* +p" > <debugfs>/dynamic_debug/control
|
||||
@@ -258,7 +258,7 @@ this boot parameter for debugging purposes.
|
||||
|
||||
If ``foo`` module is not built-in, ``foo.dyndbg`` will still be processed at
|
||||
boot time, without effect, but will be reprocessed when module is
|
||||
loaded later. ``dyndbg_query=`` and bare ``dyndbg=`` are only processed at
|
||||
loaded later. ``ddebug_query=`` and bare ``dyndbg=`` are only processed at
|
||||
boot.
|
||||
|
||||
|
||||
@@ -301,7 +301,7 @@ The ``dyndbg`` option is a "fake" module parameter, which means:
|
||||
|
||||
For ``CONFIG_DYNAMIC_DEBUG`` kernels, any settings given at boot-time (or
|
||||
enabled by ``-DDEBUG`` flag during compilation) can be disabled later via
|
||||
the sysfs interface if the debug messages are no longer needed::
|
||||
the debugfs interface if the debug messages are no longer needed::
|
||||
|
||||
echo "module module_name -p" > <debugfs>/dynamic_debug/control
|
||||
|
||||
|
@@ -76,6 +76,7 @@ configure specific aspects of kernel behavior to your liking.
|
||||
thunderbolt
|
||||
LSM/index
|
||||
mm/index
|
||||
perf-security
|
||||
|
||||
.. only:: subproject and html
|
||||
|
||||
|
@@ -331,7 +331,7 @@
|
||||
APC and your system crashes randomly.
|
||||
|
||||
apic= [APIC,X86] Advanced Programmable Interrupt Controller
|
||||
Change the output verbosity whilst booting
|
||||
Change the output verbosity while booting
|
||||
Format: { quiet (default) | verbose | debug }
|
||||
Change the amount of debugging information output
|
||||
when initialising the APIC and IO-APIC components.
|
||||
@@ -461,6 +461,11 @@
|
||||
possible to determine what the correct size should be.
|
||||
This option provides an override for these situations.
|
||||
|
||||
carrier_timeout=
|
||||
[NET] Specifies amount of time (in seconds) that
|
||||
the kernel should wait for a network carrier. By default
|
||||
it waits 120 seconds.
|
||||
|
||||
ca_keys= [KEYS] This parameter identifies a specific key(s) on
|
||||
the system trusted keyring to be used for certificate
|
||||
trust validation.
|
||||
@@ -486,10 +491,14 @@
|
||||
cut the overhead, others just disable the usage. So
|
||||
only cgroup_disable=memory is actually worthy}
|
||||
|
||||
cgroup_no_v1= [KNL] Disable one, multiple, all cgroup controllers in v1
|
||||
Format: { controller[,controller...] | "all" }
|
||||
cgroup_no_v1= [KNL] Disable cgroup controllers and named hierarchies in v1
|
||||
Format: { { controller | "all" | "named" }
|
||||
[,{ controller | "all" | "named" }...] }
|
||||
Like cgroup_disable, but only applies to cgroup v1;
|
||||
the blacklisted controllers remain available in cgroup2.
|
||||
"all" blacklists all controllers and "named" disables
|
||||
named mounts. Specifying both "all" and "named" disables
|
||||
all v1 hierarchies.
|
||||
|
||||
cgroup.memory= [KNL] Pass options to the cgroup memory controller.
|
||||
Format: <string>
|
||||
@@ -674,6 +683,9 @@
|
||||
cpuidle.off=1 [CPU_IDLE]
|
||||
disable the cpuidle sub-system
|
||||
|
||||
cpuidle.governor=
|
||||
[CPU_IDLE] Name of the cpuidle governor to use.
|
||||
|
||||
cpufreq.off=1 [CPU_FREQ]
|
||||
disable the cpufreq sub-system
|
||||
|
||||
@@ -903,6 +915,10 @@
|
||||
The filter can be disabled or changed to another
|
||||
driver later using sysfs.
|
||||
|
||||
driver_async_probe= [KNL]
|
||||
List of driver names to be probed asynchronously.
|
||||
Format: <driver_name1>,<driver_name2>...
|
||||
|
||||
drm.edid_firmware=[<connector>:]<file>[,[<connector>:]<file>]
|
||||
Broken monitors, graphic adapters, KVMs and EDIDless
|
||||
panels may send no or incorrect EDID data sets.
|
||||
@@ -1021,6 +1037,12 @@
|
||||
specified address. The serial port must already be
|
||||
setup and configured. Options are not yet supported.
|
||||
|
||||
rda,<addr>
|
||||
Start an early, polled-mode console on a serial port
|
||||
of an RDA Micro SoC, such as RDA8810PL, at the
|
||||
specified address. The serial port must already be
|
||||
setup and configured. Options are not yet supported.
|
||||
|
||||
smh Use ARM semihosting calls for early console.
|
||||
|
||||
s3c2410,<addr>
|
||||
@@ -1060,9 +1082,15 @@
|
||||
specified address. The serial port must already be
|
||||
setup and configured. Options are not yet supported.
|
||||
|
||||
efifb,[options]
|
||||
Start an early, unaccelerated console on the EFI
|
||||
memory mapped framebuffer (if available). On cache
|
||||
coherent non-x86 systems that use system memory for
|
||||
the framebuffer, pass the 'ram' option so that it is
|
||||
mapped with the correct attributes.
|
||||
|
||||
earlyprintk= [X86,SH,ARM,M68k,S390]
|
||||
earlyprintk=vga
|
||||
earlyprintk=efi
|
||||
earlyprintk=sclp
|
||||
earlyprintk=xen
|
||||
earlyprintk=serial[,ttySn[,baudrate]]
|
||||
@@ -1169,9 +1197,10 @@
|
||||
arch/x86/kernel/cpu/cpufreq/elanfreq.c.
|
||||
|
||||
elevator= [IOSCHED]
|
||||
Format: {"cfq" | "deadline" | "noop"}
|
||||
See Documentation/block/cfq-iosched.txt and
|
||||
Documentation/block/deadline-iosched.txt for details.
|
||||
Format: { "mq-deadline" | "kyber" | "bfq" }
|
||||
See Documentation/block/deadline-iosched.txt,
|
||||
Documentation/block/kyber-iosched.txt and
|
||||
Documentation/block/bfq-iosched.txt for details.
|
||||
|
||||
elfcorehdr=[size[KMG]@]offset[KMG] [IA64,PPC,SH,X86,S390]
|
||||
Specifies physical address of start of kernel core
|
||||
@@ -1683,12 +1712,11 @@
|
||||
By default, super page will be supported if Intel IOMMU
|
||||
has the capability. With this option, super page will
|
||||
not be supported.
|
||||
ecs_off [Default Off]
|
||||
By default, extended context tables will be supported if
|
||||
the hardware advertises that it has support both for the
|
||||
extended tables themselves, and also PASID support. With
|
||||
this option set, extended tables will not be used even
|
||||
on hardware which claims to support them.
|
||||
sm_on [Default Off]
|
||||
By default, scalable mode will be disabled even if the
|
||||
hardware advertises that it has support for the scalable
|
||||
mode translation. With this option set, scalable mode
|
||||
will be used on hardware which claims to support it.
|
||||
tboot_noforce [Default Off]
|
||||
Do not force the Intel IOMMU enabled under tboot.
|
||||
By default, tboot will force Intel IOMMU on, which
|
||||
@@ -1818,6 +1846,11 @@
|
||||
to let secondary kernels in charge of setting up
|
||||
LPIs.
|
||||
|
||||
irqchip.gicv3_pseudo_nmi= [ARM64]
|
||||
Enables support for pseudo-NMIs in the kernel. This
|
||||
requires the kernel to be built with
|
||||
CONFIG_ARM64_PSEUDO_NMI.
|
||||
|
||||
irqfixup [HW]
|
||||
When an interrupt is not handled search all handlers
|
||||
for it. Intended to get systems with badly broken
|
||||
@@ -1969,6 +2002,12 @@
|
||||
Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y,
|
||||
the default is off.
|
||||
|
||||
kpti= [ARM64] Control page table isolation of user
|
||||
and kernel address spaces.
|
||||
Default: enabled on cores which need mitigation.
|
||||
0: force disabled
|
||||
1: force enabled
|
||||
|
||||
kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
|
||||
Default is 0 (don't ignore, but inject #GP)
|
||||
|
||||
@@ -2096,6 +2135,9 @@
|
||||
off
|
||||
Disables hypervisor mitigations and doesn't
|
||||
emit any warnings.
|
||||
It also drops the swap size and available
|
||||
RAM limit restriction on both hypervisor and
|
||||
bare metal.
|
||||
|
||||
Default is 'flush'.
|
||||
|
||||
@@ -2303,6 +2345,10 @@
|
||||
|
||||
lsm.debug [SECURITY] Enable LSM initialization debugging output.
|
||||
|
||||
lsm=lsm1,...,lsmN
|
||||
[SECURITY] Choose order of LSM initialization. This
|
||||
overrides CONFIG_LSM, and the "security=" parameter.
|
||||
|
||||
machvec= [IA-64] Force the use of a particular machine-vector
|
||||
(machvec) in a generic kernel.
|
||||
Example: machvec=hpzx1_swiotlb
|
||||
@@ -2827,7 +2873,7 @@
|
||||
check bypass). With this option data leaks are possible
|
||||
in the system.
|
||||
|
||||
nospectre_v2 [X86] Disable all mitigations for the Spectre variant 2
|
||||
nospectre_v2 [X86,PPC_FSL_BOOK3E] Disable all mitigations for the Spectre variant 2
|
||||
(indirect branch prediction) vulnerability. System may
|
||||
allow data leaks with this option, which is equivalent
|
||||
to spectre_v2=off.
|
||||
@@ -3082,6 +3128,14 @@
|
||||
timeout < 0: reboot immediately
|
||||
Format: <timeout>
|
||||
|
||||
panic_print= Bitmask for printing system info when panic happens.
|
||||
User can choose a combination of the following bits:
|
||||
bit 0: print all tasks info
|
||||
bit 1: print system memory info
|
||||
bit 2: print timer info
|
||||
bit 3: print locks info if CONFIG_LOCKDEP is on
|
||||
bit 4: print ftrace buffer
|
||||
|
||||
panic_on_warn panic() instead of WARN(). Useful to cause kdump
|
||||
on a WARN().
|
||||
|
||||
@@ -3630,19 +3684,6 @@
|
||||
latencies, which will choose a value aligned
|
||||
with the appropriate hardware boundaries.
|
||||
|
||||
rcutree.jiffies_till_sched_qs= [KNL]
|
||||
Set required age in jiffies for a
|
||||
given grace period before RCU starts
|
||||
soliciting quiescent-state help from
|
||||
rcu_note_context_switch(). If not specified, the
|
||||
kernel will calculate a value based on the most
|
||||
recent settings of rcutree.jiffies_till_first_fqs
|
||||
and rcutree.jiffies_till_next_fqs.
|
||||
This calculated value may be viewed in
|
||||
rcutree.jiffies_to_sched_qs. Any attempt to
|
||||
set rcutree.jiffies_to_sched_qs will be
|
||||
cheerfully overwritten.
|
||||
|
||||
rcutree.jiffies_till_first_fqs= [KNL]
|
||||
Set delay from grace-period initialization to
|
||||
first attempt to force quiescent states.
|
||||
@@ -3654,6 +3695,20 @@
|
||||
quiescent states. Units are jiffies, minimum
|
||||
value is one, and maximum value is HZ.
|
||||
|
||||
rcutree.jiffies_till_sched_qs= [KNL]
|
||||
Set required age in jiffies for a
|
||||
given grace period before RCU starts
|
||||
soliciting quiescent-state help from
|
||||
rcu_note_context_switch() and cond_resched().
|
||||
If not specified, the kernel will calculate
|
||||
a value based on the most recent settings
|
||||
of rcutree.jiffies_till_first_fqs
|
||||
and rcutree.jiffies_till_next_fqs.
|
||||
This calculated value may be viewed in
|
||||
rcutree.jiffies_to_sched_qs. Any attempt to set
|
||||
rcutree.jiffies_to_sched_qs will be cheerfully
|
||||
overwritten.
|
||||
|
||||
rcutree.kthread_prio= [KNL,BOOT]
|
||||
Set the SCHED_FIFO priority of the RCU per-CPU
|
||||
kthreads (rcuc/N). This value is also used for
|
||||
@@ -3697,6 +3752,11 @@
|
||||
This wake_up() will be accompanied by a
|
||||
WARN_ONCE() splat and an ftrace_dump().
|
||||
|
||||
rcutree.sysrq_rcu= [KNL]
|
||||
Commandeer a sysrq key to dump out Tree RCU's
|
||||
rcu_node tree with an eye towards determining
|
||||
why a new grace period has not yet started.
|
||||
|
||||
rcuperf.gp_async= [KNL]
|
||||
Measure performance of asynchronous
|
||||
grace-period primitives such as call_rcu().
|
||||
@@ -3748,24 +3808,6 @@
|
||||
in microseconds. The default of zero says
|
||||
no holdoff.
|
||||
|
||||
rcutorture.cbflood_inter_holdoff= [KNL]
|
||||
Set holdoff time (jiffies) between successive
|
||||
callback-flood tests.
|
||||
|
||||
rcutorture.cbflood_intra_holdoff= [KNL]
|
||||
Set holdoff time (jiffies) between successive
|
||||
bursts of callbacks within a given callback-flood
|
||||
test.
|
||||
|
||||
rcutorture.cbflood_n_burst= [KNL]
|
||||
Set the number of bursts making up a given
|
||||
callback-flood test. Set this to zero to
|
||||
disable callback-flood testing.
|
||||
|
||||
rcutorture.cbflood_n_per_burst= [KNL]
|
||||
Set the number of callbacks to be registered
|
||||
in a given burst of a callback-flood test.
|
||||
|
||||
rcutorture.fqs_duration= [KNL]
|
||||
Set duration of force_quiescent_state bursts
|
||||
in microseconds.
|
||||
@@ -3778,6 +3820,23 @@
|
||||
Set wait time between force_quiescent_state bursts
|
||||
in seconds.
|
||||
|
||||
rcutorture.fwd_progress= [KNL]
|
||||
Enable RCU grace-period forward-progress testing
|
||||
for the types of RCU supporting this notion.
|
||||
|
||||
rcutorture.fwd_progress_div= [KNL]
|
||||
Specify the fraction of a CPU-stall-warning
|
||||
period to do tight-loop forward-progress testing.
|
||||
|
||||
rcutorture.fwd_progress_holdoff= [KNL]
|
||||
Number of seconds to wait between successive
|
||||
forward-progress tests.
|
||||
|
||||
rcutorture.fwd_progress_need_resched= [KNL]
|
||||
Enclose cond_resched() calls within checks for
|
||||
need_resched() during tight-loop forward-progress
|
||||
testing.
|
||||
|
||||
rcutorture.gp_cond= [KNL]
|
||||
Use conditional/asynchronous update-side
|
||||
primitives, if available.
|
||||
@@ -4067,11 +4126,9 @@
|
||||
Note: increases power consumption, thus should only be
|
||||
enabled if running jitter sensitive (HPC/RT) workloads.
|
||||
|
||||
security= [SECURITY] Choose a security module to enable at boot.
|
||||
If this boot parameter is not specified, only the first
|
||||
security module asking for security registration will be
|
||||
loaded. An invalid security module name will be treated
|
||||
as if no module has been chosen.
|
||||
security= [SECURITY] Choose a legacy "major" security module to
|
||||
enable at boot. This has been deprecated by the
|
||||
"lsm=" parameter.
|
||||
|
||||
selinux= [SELINUX] Disable or enable SELinux at boot time.
|
||||
Format: { "0" | "1" }
|
||||
@@ -4675,7 +4732,8 @@
|
||||
usbcore.authorized_default=
|
||||
[USB] Default USB device authorization:
|
||||
(default -1 = authorized except for wireless USB,
|
||||
0 = not authorized, 1 = authorized)
|
||||
0 = not authorized, 1 = authorized, 2 = authorized
|
||||
if device connected to internal port)
|
||||
|
||||
usbcore.autosuspend=
|
||||
[USB] The autosuspend time delay (in seconds) used
|
||||
@@ -5020,6 +5078,14 @@
|
||||
or other driver-specific files in the
|
||||
Documentation/watchdog/ directory.
|
||||
|
||||
watchdog_thresh=
|
||||
[KNL]
|
||||
Set the hard lockup detector stall duration
|
||||
threshold in seconds. The soft lockup detector
|
||||
threshold is set to twice the value. A value of 0
|
||||
disables both lockup detectors. Default is 10
|
||||
seconds.
|
||||
|
||||
workqueue.watchdog_thresh=
|
||||
If CONFIG_WQ_WATCHDOG is configured, workqueue can
|
||||
warn stall conditions and dump internal state to
|
||||
|
@@ -405,6 +405,9 @@ time with the option "l1tf=". The valid arguments for this option are:
|
||||
|
||||
off Disables hypervisor mitigations and doesn't emit any
|
||||
warnings.
|
||||
It also drops the swap size and available RAM limit restrictions
|
||||
on both hypervisor and bare metal.
|
||||
|
||||
============ =============================================================
|
||||
|
||||
The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`.
|
||||
@@ -576,7 +579,8 @@ Default mitigations
|
||||
The kernel default mitigations for vulnerable processors are:
|
||||
|
||||
- PTE inversion to protect against malicious user space. This is done
|
||||
unconditionally and cannot be controlled.
|
||||
unconditionally and cannot be controlled. The swap storage is limited
|
||||
to ~16TB.
|
||||
|
||||
- L1D conditional flushing on VMENTER when EPT is enabled for
|
||||
a guest.
|
||||
|
@@ -756,3 +756,6 @@ These currently include:
|
||||
The cache mode for raid5. raid5 could include an extra disk for
|
||||
caching. The mode can be "write-through" or "write-back". The
|
||||
default is "write-through".
|
||||
|
||||
ppl_write_hint
|
||||
NVMe stream ID to be set for each PPL write request.
|
||||
|
@@ -4,13 +4,13 @@
|
||||
Concepts overview
|
||||
=================
|
||||
|
||||
The memory management in Linux is complex system that evolved over the
|
||||
years and included more and more functionality to support variety of
|
||||
The memory management in Linux is a complex system that evolved over the
|
||||
years and included more and more functionality to support a variety of
|
||||
systems from MMU-less microcontrollers to supercomputers. The memory
|
||||
management for systems without MMU is called ``nommu`` and it
|
||||
management for systems without an MMU is called ``nommu`` and it
|
||||
definitely deserves a dedicated document, which hopefully will be
|
||||
eventually written. Yet, although some of the concepts are the same,
|
||||
here we assume that MMU is available and CPU can translate a virtual
|
||||
here we assume that an MMU is available and a CPU can translate a virtual
|
||||
address to a physical address.
|
||||
|
||||
.. contents:: :local:
|
||||
@@ -21,10 +21,10 @@ Virtual Memory Primer
|
||||
The physical memory in a computer system is a limited resource and
|
||||
even for systems that support memory hotplug there is a hard limit on
|
||||
the amount of memory that can be installed. The physical memory is not
|
||||
necessary contiguous, it might be accessible as a set of distinct
|
||||
necessarily contiguous; it might be accessible as a set of distinct
|
||||
address ranges. Besides, different CPU architectures, and even
|
||||
different implementations of the same architecture have different view
|
||||
how these address ranges defined.
|
||||
different implementations of the same architecture have different views
|
||||
of how these address ranges are defined.
|
||||
|
||||
All this makes dealing directly with physical memory quite complex and
|
||||
to avoid this complexity a concept of virtual memory was developed.
|
||||
@@ -48,8 +48,8 @@ appropriate kernel configuration option.
|
||||
|
||||
Each physical memory page can be mapped as one or more virtual
|
||||
pages. These mappings are described by page tables that allow
|
||||
translation from virtual address used by programs to real address in
|
||||
the physical memory. The page tables organized hierarchically.
|
||||
translation from a virtual address used by programs to the physical
|
||||
memory address. The page tables are organized hierarchically.
|
||||
|
||||
The tables at the lowest level of the hierarchy contain physical
|
||||
addresses of actual pages used by the software. The tables at higher
|
||||
@@ -121,8 +121,8 @@ Nodes
|
||||
Many multi-processor machines are NUMA - Non-Uniform Memory Access -
|
||||
systems. In such systems the memory is arranged into banks that have
|
||||
different access latency depending on the "distance" from the
|
||||
processor. Each bank is referred as `node` and for each node Linux
|
||||
constructs an independent memory management subsystem. A node has it's
|
||||
processor. Each bank is referred to as a `node` and for each node Linux
|
||||
constructs an independent memory management subsystem. A node has its
|
||||
own set of zones, lists of free and used pages and various statistics
|
||||
counters. You can find more details about NUMA in
|
||||
:ref:`Documentation/vm/numa.rst <numa>` and in
|
||||
@@ -149,9 +149,9 @@ for program's stack and heap or by explicit calls to mmap(2) system
|
||||
call. Usually, the anonymous mappings only define virtual memory areas
|
||||
that the program is allowed to access. The read accesses will result
|
||||
in creation of a page table entry that references a special physical
|
||||
page filled with zeroes. When the program performs a write, regular
|
||||
page filled with zeroes. When the program performs a write, a regular
|
||||
physical page will be allocated to hold the written data. The page
|
||||
will be marked dirty and if the kernel will decide to repurpose it,
|
||||
will be marked dirty and if the kernel decides to repurpose it,
|
||||
the dirty page will be swapped out.
|
||||
|
||||
Reclaim
|
||||
@@ -181,8 +181,8 @@ pressure.
|
||||
The process of freeing the reclaimable physical memory pages and
|
||||
repurposing them is called (surprise!) `reclaim`. Linux can reclaim
|
||||
pages either asynchronously or synchronously, depending on the state
|
||||
of the system. When system is not loaded, most of the memory is free
|
||||
and allocation request will be satisfied immediately from the free
|
||||
of the system. When the system is not loaded, most of the memory is free
|
||||
and allocation requests will be satisfied immediately from the free
|
||||
pages supply. As the load increases, the amount of the free pages goes
|
||||
down and when it reaches a certain threshold (high watermark), an
|
||||
allocation request will awaken the ``kswapd`` daemon. It will
|
||||
@@ -190,7 +190,7 @@ asynchronously scan memory pages and either just free them if the data
|
||||
they contain is available elsewhere, or evict to the backing storage
|
||||
device (remember those dirty pages?). As memory usage increases even
|
||||
more and reaches another threshold - min watermark - an allocation
|
||||
will trigger the `direct reclaim`. In this case allocation is stalled
|
||||
will trigger `direct reclaim`. In this case allocation is stalled
|
||||
until enough memory pages are reclaimed to satisfy the request.
|
||||
|
||||
Compaction
|
||||
@@ -200,7 +200,7 @@ As the system runs, tasks allocate and free the memory and it becomes
|
||||
fragmented. Although with virtual memory it is possible to present
|
||||
scattered physical pages as virtually contiguous range, sometimes it is
|
||||
necessary to allocate large physically contiguous memory areas. Such
|
||||
need may arise, for instance, when a device driver requires large
|
||||
need may arise, for instance, when a device driver requires a large
|
||||
buffer for DMA, or when THP allocates a huge page. Memory `compaction`
|
||||
addresses the fragmentation issue. This mechanism moves occupied pages
|
||||
from the lower part of a memory zone to free pages in the upper part
|
||||
@@ -208,15 +208,16 @@ of the zone. When a compaction scan is finished free pages are grouped
|
||||
together at the beginning of the zone and allocations of large
|
||||
physically contiguous areas become possible.
|
||||
|
||||
Like reclaim, the compaction may happen asynchronously in ``kcompactd``
|
||||
daemon or synchronously as a result of memory allocation request.
|
||||
Like reclaim, the compaction may happen asynchronously in the ``kcompactd``
|
||||
daemon or synchronously as a result of a memory allocation request.
|
||||
|
||||
OOM killer
|
||||
==========
|
||||
|
||||
It may happen, that on a loaded machine memory will be exhausted. When
|
||||
the kernel detects that the system runs out of memory (OOM) it invokes
|
||||
`OOM killer`. Its mission is simple: all it has to do is to select a
|
||||
task to sacrifice for the sake of the overall system health. The
|
||||
selected task is killed in a hope that after it exits enough memory
|
||||
will be freed to continue normal operation.
|
||||
It is possible that on a loaded machine memory will be exhausted and the
|
||||
kernel will be unable to reclaim enough memory to continue to operate. In
|
||||
order to save the rest of the system, it invokes the `OOM killer`.
|
||||
|
||||
The `OOM killer` selects a task to sacrifice for the sake of the overall
|
||||
system health. The selected task is killed in the hope that after it exits
|
||||
enough memory will be freed to continue normal operation.
|
||||
|
@@ -75,9 +75,10 @@ number of times a page is mapped.
|
||||
20. NOPAGE
|
||||
21. KSM
|
||||
22. THP
|
||||
23. BALLOON
|
||||
23. OFFLINE
|
||||
24. ZERO_PAGE
|
||||
25. IDLE
|
||||
26. PGTABLE
|
||||
|
||||
* ``/proc/kpagecgroup``. This file contains a 64-bit inode number of the
|
||||
memory cgroup each page is charged to, indexed by PFN. Only available when
|
||||
@@ -118,8 +119,8 @@ Short descriptions to the page flags
|
||||
identical memory pages dynamically shared between one or more processes
|
||||
22 - THP
|
||||
contiguous pages which construct transparent hugepages
|
||||
23 - BALLOON
|
||||
balloon compaction page
|
||||
23 - OFFLINE
|
||||
page is logically offline
|
||||
24 - ZERO_PAGE
|
||||
zero page for pfn_zero or huge_zero page
|
||||
25 - IDLE
|
||||
@@ -128,6 +129,8 @@ Short descriptions to the page flags
|
||||
Note that this flag may be stale in case the page was accessed via
|
||||
a PTE. To make sure the flag is up-to-date one has to read
|
||||
``/sys/kernel/mm/page_idle/bitmap`` first.
|
||||
26 - PGTABLE
|
||||
page is in use as a page table
|
||||
|
||||
IO related page flags
|
||||
---------------------
|
||||
|
230
Documentation/admin-guide/perf-security.rst
Normal file
@@ -0,0 +1,230 @@
|
||||
.. _perf_security:
|
||||
|
||||
Perf Events and tool security
|
||||
=============================
|
||||
|
||||
Overview
|
||||
--------
|
||||
|
||||
Usage of Performance Counters for Linux (perf_events) [1]_ , [2]_ , [3]_
|
||||
can impose a considerable risk of leaking sensitive data accessed by
|
||||
monitored processes. The data leakage is possible both in scenarios of
|
||||
direct usage of perf_events system call API [2]_ and over data files
|
||||
generated by Perf tool user mode utility (Perf) [3]_ , [4]_ . The risk
|
||||
depends on the nature of data that perf_events performance monitoring
|
||||
units (PMU) [2]_ and Perf collect and expose for performance analysis.
|
||||
Collected system and performance data may be split into several
|
||||
categories:
|
||||
|
||||
1. System hardware and software configuration data, for example: a CPU
|
||||
model and its cache configuration, an amount of available memory and
|
||||
its topology, used kernel and Perf versions, performance monitoring
|
||||
setup including experiment time, events configuration, Perf command
|
||||
line parameters, etc.
|
||||
|
||||
2. User and kernel module paths and their load addresses with sizes,
|
||||
process and thread names with their PIDs and TIDs, timestamps for
|
||||
captured hardware and software events.
|
||||
|
||||
3. Content of kernel software counters (e.g., for context switches, page
|
||||
faults, CPU migrations), architectural hardware performance counters
|
||||
(PMC) [8]_ and machine specific registers (MSR) [9]_ that provide
|
||||
execution metrics for various monitored parts of the system (e.g.,
|
||||
memory controller (IMC), interconnect (QPI/UPI) or peripheral (PCIe)
|
||||
uncore counters) without direct attribution to any execution context
|
||||
state.
|
||||
|
||||
4. Content of architectural execution context registers (e.g., RIP, RSP,
|
||||
RBP on x86_64), process user and kernel space memory addresses and
|
||||
data, content of various architectural MSRs that capture data from
|
||||
this category.
|
||||
|
||||
Data that belong to the fourth category can potentially contain
|
||||
sensitive process data. If PMUs in some monitoring modes capture values
|
||||
of execution context registers or data from process memory then access
|
||||
to such monitoring capabilities needs to be ordered and secured
|
||||
properly. So, perf_events/Perf performance monitoring is subject to
|
||||
security access control management [5]_ .
|
||||
|
||||
perf_events/Perf access control
|
||||
-------------------------------
|
||||
|
||||
To perform security checks, the Linux implementation splits processes
|
||||
into two categories [6]_ : a) privileged processes (whose effective user
|
||||
ID is 0, referred to as superuser or root), and b) unprivileged
|
||||
processes (whose effective UID is nonzero). Privileged processes bypass
|
||||
all kernel security permission checks so perf_events performance
|
||||
monitoring is fully available to privileged processes without access,
|
||||
scope and resource restrictions.
|
||||
|
||||
Unprivileged processes are subject to a full security permission check
|
||||
based on the process's credentials [5]_ (usually: effective UID,
|
||||
effective GID, and supplementary group list).
|
||||
|
||||
Linux divides the privileges traditionally associated with superuser
|
||||
into distinct units, known as capabilities [6]_ , which can be
|
||||
independently enabled and disabled on a per-thread basis for processes and
|
||||
files of unprivileged users.
|
||||
|
||||
Unprivileged processes with enabled CAP_SYS_ADMIN capability are treated
|
||||
as privileged processes with respect to perf_events performance
|
||||
monitoring and bypass *scope* permissions checks in the kernel.
|
||||
|
||||
Unprivileged processes using the perf_events system call API are also subject
|
||||
to the PTRACE_MODE_READ_REALCREDS ptrace access mode check [7]_ , whose
|
||||
outcome determines whether monitoring is permitted. So unprivileged
|
||||
processes provided with CAP_SYS_PTRACE capability are effectively
|
||||
permitted to pass the check.
|
||||
|
||||
Other capabilities being granted to unprivileged processes can
|
||||
effectively enable capturing of additional data required for later
|
||||
performance analysis of monitored processes or a system. For example,
|
||||
CAP_SYSLOG capability permits reading kernel space memory addresses from
|
||||
/proc/kallsyms file.
|
||||
|
||||
perf_events/Perf privileged users
|
||||
---------------------------------
|
||||
|
||||
Mechanisms of capabilities, privileged capability-dumb files [6]_ and
|
||||
file system ACLs [10]_ can be used to create a dedicated group of
|
||||
perf_events/Perf privileged users who are permitted to execute
|
||||
performance monitoring without scope limits. The following steps can be
|
||||
taken to create such a group of privileged Perf users.
|
||||
|
||||
1. Create perf_users group of privileged Perf users, assign perf_users
|
||||
group to Perf tool executable and limit access to the executable for
|
||||
other users in the system who are not in the perf_users group:
|
||||
|
||||
::
|
||||
|
||||
# groupadd perf_users
|
||||
# ls -alhF
|
||||
-rwxr-xr-x 2 root root 11M Oct 19 15:12 perf
|
||||
# chgrp perf_users perf
|
||||
# ls -alhF
|
||||
-rwxr-xr-x 2 root perf_users 11M Oct 19 15:12 perf
|
||||
# chmod o-rwx perf
|
||||
# ls -alhF
|
||||
-rwxr-x--- 2 root perf_users 11M Oct 19 15:12 perf
|
||||
|
||||
2. Assign the required capabilities to the Perf tool executable file and
|
||||
enable members of perf_users group with performance monitoring
|
||||
privileges [6]_ :
|
||||
|
||||
::
|
||||
|
||||
# setcap "cap_sys_admin,cap_sys_ptrace,cap_syslog=ep" perf
|
||||
# setcap -v "cap_sys_admin,cap_sys_ptrace,cap_syslog=ep" perf
|
||||
perf: OK
|
||||
# getcap perf
|
||||
perf = cap_sys_ptrace,cap_sys_admin,cap_syslog+ep
|
||||
|
||||
As a result, members of perf_users group are capable of conducting
|
||||
performance monitoring by using functionality of the configured Perf
|
||||
tool executable that, when executed, passes perf_events subsystem scope
|
||||
checks.
|
||||
|
||||
This specific access control management is only available to superuser
|
||||
or root running processes with CAP_SETPCAP, CAP_SETFCAP [6]_
|
||||
capabilities.
|
||||
|
||||
perf_events/Perf unprivileged users
|
||||
-----------------------------------
|
||||
|
||||
perf_events/Perf *scope* and *access* control for unprivileged processes
|
||||
is governed by perf_event_paranoid [2]_ setting:
|
||||
|
||||
-1:
|
||||
Impose no *scope* and *access* restrictions on using perf_events
|
||||
performance monitoring. Per-user per-cpu perf_event_mlock_kb [2]_
|
||||
locking limit is ignored when allocating memory buffers for storing
|
||||
performance data. This is the least secure mode since allowed
|
||||
monitored *scope* is maximized and no perf_events specific limits
|
||||
are imposed on *resources* allocated for performance monitoring.
|
||||
|
||||
>=0:
|
||||
*scope* includes per-process and system wide performance monitoring
|
||||
but excludes raw tracepoints and ftrace function tracepoints
|
||||
monitoring. CPU and system events that happen when executing either in
|
||||
user or in kernel space can be monitored and captured for later
|
||||
analysis. Per-user per-cpu perf_event_mlock_kb locking limit is
|
||||
imposed but ignored for unprivileged processes with CAP_IPC_LOCK
|
||||
[6]_ capability.
|
||||
|
||||
>=1:
|
||||
*scope* includes per-process performance monitoring only and
|
||||
excludes system wide performance monitoring. CPU and system events
|
||||
that happen when executing either in user or in kernel space can be
|
||||
monitored and captured for later analysis. Per-user per-cpu
|
||||
perf_event_mlock_kb locking limit is imposed but ignored for
|
||||
unprivileged processes with CAP_IPC_LOCK capability.
|
||||
|
||||
>=2:
|
||||
*scope* includes per-process performance monitoring only. CPU and
|
||||
system events that happen when executing in user space only can be
|
||||
monitored and captured for later analysis. Per-user per-cpu
|
||||
perf_event_mlock_kb locking limit is imposed but ignored for
|
||||
unprivileged processes with CAP_IPC_LOCK capability.
|
||||
|
||||
perf_events/Perf resource control
|
||||
---------------------------------
|
||||
|
||||
Open file descriptors
|
||||
+++++++++++++++++++++
|
||||
|
||||
The perf_events system call API [2]_ allocates file descriptors for
|
||||
every configured PMU event. Open file descriptors are a per-process
|
||||
accountable resource governed by the RLIMIT_NOFILE [11]_ limit
|
||||
(ulimit -n), which is usually derived from the login shell process. When
|
||||
configuring Perf collection for a long list of events on a large server
|
||||
system, this limit can be easily hit preventing required monitoring
|
||||
configuration. RLIMIT_NOFILE limit can be increased on a per-user basis
|
||||
by modifying the content of the limits.conf file [12]_ . Ordinarily, a Perf
|
||||
sampling session (perf record) requires an amount of open perf_event
|
||||
file descriptors that is not less than the number of monitored events
|
||||
multiplied by the number of monitored CPUs.
|
||||
|
||||
Memory allocation
|
||||
+++++++++++++++++
|
||||
|
||||
The amount of memory available to user processes for capturing
|
||||
performance monitoring data is governed by the perf_event_mlock_kb [2]_
|
||||
setting. This perf_event specific resource setting defines overall
|
||||
per-cpu limits of memory allowed for mapping by the user processes to
|
||||
execute performance monitoring. The setting essentially extends the
|
||||
RLIMIT_MEMLOCK [11]_ limit, but only for memory regions mapped
|
||||
specifically for capturing monitored performance events and related data.
|
||||
|
||||
For example, if a machine has eight cores and perf_event_mlock_kb limit
|
||||
is set to 516 KiB, then a user process is provided with 516 KiB * 8 =
|
||||
4128 KiB of memory above the RLIMIT_MEMLOCK limit (ulimit -l) for
|
||||
perf_event mmap buffers. In particular, this means that, if the user
|
||||
wants to start two or more performance monitoring processes, the user is
|
||||
required to manually distribute the available 4128 KiB between the
|
||||
monitoring processes, for example, using the --mmap-pages Perf record
|
||||
mode option. Otherwise, the first started performance monitoring process
|
||||
allocates all available 4128 KiB and the other processes will fail to
|
||||
proceed due to the lack of memory.
|
||||
|
||||
RLIMIT_MEMLOCK and perf_event_mlock_kb resource constraints are ignored
|
||||
for processes with the CAP_IPC_LOCK capability. Thus, perf_events/Perf
|
||||
privileged users can be provided with memory above the constraints for
|
||||
perf_events/Perf performance monitoring purpose by providing the Perf
|
||||
executable with CAP_IPC_LOCK capability.
|
||||
|
||||
Bibliography
|
||||
------------
|
||||
|
||||
.. [1] `<https://lwn.net/Articles/337493/>`_
|
||||
.. [2] `<http://man7.org/linux/man-pages/man2/perf_event_open.2.html>`_
|
||||
.. [3] `<http://web.eece.maine.edu/~vweaver/projects/perf_events/>`_
|
||||
.. [4] `<https://perf.wiki.kernel.org/index.php/Main_Page>`_
|
||||
.. [5] `<https://www.kernel.org/doc/html/latest/security/credentials.html>`_
|
||||
.. [6] `<http://man7.org/linux/man-pages/man7/capabilities.7.html>`_
|
||||
.. [7] `<http://man7.org/linux/man-pages/man2/ptrace.2.html>`_
|
||||
.. [8] `<https://en.wikipedia.org/wiki/Hardware_performance_counter>`_
|
||||
.. [9] `<https://en.wikipedia.org/wiki/Model-specific_register>`_
|
||||
.. [10] `<http://man7.org/linux/man-pages/man5/acl.5.html>`_
|
||||
.. [11] `<http://man7.org/linux/man-pages/man2/getrlimit.2.html>`_
|
||||
.. [12] `<http://man7.org/linux/man-pages/man5/limits.conf.5.html>`_
|
||||
|
719
Documentation/admin-guide/pm/cpuidle.rst
Normal file
@@ -0,0 +1,719 @@
|
||||
.. |struct cpuidle_state| replace:: :c:type:`struct cpuidle_state <cpuidle_state>`
|
||||
.. |cpufreq| replace:: :doc:`CPU Performance Scaling <cpufreq>`
|
||||
|
||||
========================
|
||||
CPU Idle Time Management
|
||||
========================
|
||||
|
||||
::
|
||||
|
||||
Copyright (c) 2018 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||
|
||||
Concepts
|
||||
========
|
||||
|
||||
Modern processors are generally able to enter states in which the execution of
|
||||
a program is suspended and instructions belonging to it are not fetched from
|
||||
memory or executed. Those states are the *idle* states of the processor.
|
||||
|
||||
Since part of the processor hardware is not used in idle states, entering them
|
||||
generally allows power drawn by the processor to be reduced and, in consequence,
|
||||
it is an opportunity to save energy.
|
||||
|
||||
CPU idle time management is an energy-efficiency feature concerned about using
|
||||
the idle states of processors for this purpose.
|
||||
|
||||
Logical CPUs
|
||||
------------
|
||||
|
||||
CPU idle time management operates on CPUs as seen by the *CPU scheduler* (that
|
||||
is the part of the kernel responsible for the distribution of computational
|
||||
work in the system). In its view, CPUs are *logical* units. That is, they need
|
||||
not be separate physical entities and may just be interfaces appearing to
|
||||
software as individual single-core processors. In other words, a CPU is an
|
||||
entity which appears to be fetching instructions that belong to one sequence
|
||||
(program) from memory and executing them, but it need not work this way
|
||||
physically. Generally, three different cases can be considered here.
|
||||
|
||||
First, if the whole processor can only follow one sequence of instructions (one
|
||||
program) at a time, it is a CPU. In that case, if the hardware is asked to
|
||||
enter an idle state, that applies to the processor as a whole.
|
||||
|
||||
Second, if the processor is multi-core, each core in it is able to follow at
|
||||
least one program at a time. The cores need not be entirely independent of each
|
||||
other (for example, they may share caches), but still most of the time they
|
||||
work physically in parallel with each other, so if each of them executes only
|
||||
one program, those programs run mostly independently of each other at the same
|
||||
time. The entire cores are CPUs in that case and if the hardware is asked to
|
||||
enter an idle state, that applies to the core that asked for it in the first
|
||||
place, but it also may apply to a larger unit (say a "package" or a "cluster")
|
||||
that the core belongs to (in fact, it may apply to an entire hierarchy of larger
|
||||
units containing the core). Namely, if all of the cores in the larger unit
|
||||
except for one have been put into idle states at the "core level" and the
|
||||
remaining core asks the processor to enter an idle state, that may trigger it
|
||||
to put the whole larger unit into an idle state which also will affect the
|
||||
other cores in that unit.
|
||||
|
||||
Finally, each core in a multi-core processor may be able to follow more than one
|
||||
program in the same time frame (that is, each core may be able to fetch
|
||||
instructions from multiple locations in memory and execute them in the same time
|
||||
frame, but not necessarily entirely in parallel with each other). In that case
|
||||
the cores present themselves to software as "bundles" each consisting of
|
||||
multiple individual single-core "processors", referred to as *hardware threads*
|
||||
(or hyper-threads specifically on Intel hardware), that each can follow one
|
||||
sequence of instructions. Then, the hardware threads are CPUs from the CPU idle
|
||||
time management perspective and if the processor is asked to enter an idle state
|
||||
by one of them, the hardware thread (or CPU) that asked for it is stopped, but
|
||||
nothing more happens, unless all of the other hardware threads within the same
|
||||
core also have asked the processor to enter an idle state. In that situation,
|
||||
the core may be put into an idle state individually or a larger unit containing
|
||||
it may be put into an idle state as a whole (if the other cores within the
|
||||
larger unit are in idle states already).
|
||||
|
||||
Idle CPUs
|
||||
---------
|
||||
|
||||
Logical CPUs, simply referred to as "CPUs" in what follows, are regarded as
|
||||
*idle* by the Linux kernel when there are no tasks to run on them except for the
|
||||
special "idle" task.
|
||||
|
||||
Tasks are the CPU scheduler's representation of work. Each task consists of a
|
||||
sequence of instructions to execute, or code, data to be manipulated while
|
||||
running that code, and some context information that needs to be loaded into the
|
||||
processor every time the task's code is run by a CPU. The CPU scheduler
|
||||
distributes work by assigning tasks to run to the CPUs present in the system.
|
||||
|
||||
Tasks can be in various states. In particular, they are *runnable* if there are
|
||||
no specific conditions preventing their code from being run by a CPU as long as
|
||||
there is a CPU available for that (for example, they are not waiting for any
|
||||
events to occur or similar). When a task becomes runnable, the CPU scheduler
|
||||
assigns it to one of the available CPUs to run and if there are no more runnable
|
||||
tasks assigned to it, the CPU will load the given task's context and run its
|
||||
code (from the instruction following the last one executed so far, possibly by
|
||||
another CPU). [If there are multiple runnable tasks assigned to one CPU
|
||||
simultaneously, they will be subject to prioritization and time sharing in order
|
||||
to allow them to make some progress over time.]
|
||||
|
||||
The special "idle" task becomes runnable if there are no other runnable tasks
|
||||
assigned to the given CPU and the CPU is then regarded as idle. In other words,
|
||||
in Linux idle CPUs run the code of the "idle" task called *the idle loop*. That
|
||||
code may cause the processor to be put into one of its idle states, if they are
|
||||
supported, in order to save energy, but if the processor does not support any
|
||||
idle states, or there is not enough time to spend in an idle state before the
|
||||
next wakeup event, or there are strict latency constraints preventing any of the
|
||||
available idle states from being used, the CPU will simply execute more or less
|
||||
useless instructions in a loop until it is assigned a new task to run.
|
||||
|
||||
|
||||
.. _idle-loop:
|
||||
|
||||
The Idle Loop
|
||||
=============
|
||||
|
||||
The idle loop code takes two major steps in every iteration of it. First, it
|
||||
calls into a code module referred to as the *governor* that belongs to the CPU
|
||||
idle time management subsystem called ``CPUIdle`` to select an idle state for
|
||||
the CPU to ask the hardware to enter. Second, it invokes another code module
|
||||
from the ``CPUIdle`` subsystem, called the *driver*, to actually ask the
|
||||
processor hardware to enter the idle state selected by the governor.
|
||||
|
||||
The role of the governor is to find an idle state most suitable for the
|
||||
conditions at hand. For this purpose, idle states that the hardware can be
|
||||
asked to enter by logical CPUs are represented in an abstract way independent of
|
||||
the platform or the processor architecture and organized in a one-dimensional
|
||||
(linear) array. That array has to be prepared and supplied by the ``CPUIdle``
|
||||
driver matching the platform the kernel is running on at the initialization
|
||||
time. This allows ``CPUIdle`` governors to be independent of the underlying
|
||||
hardware and to work with any platforms that the Linux kernel can run on.
|
||||
|
||||
Each idle state present in that array is characterized by two parameters to be
|
||||
taken into account by the governor, the *target residency* and the (worst-case)
|
||||
*exit latency*. The target residency is the minimum time the hardware must
|
||||
spend in the given state, including the time needed to enter it (which may be
|
||||
substantial), in order to save more energy than it would save by entering one of
|
||||
the shallower idle states instead. [The "depth" of an idle state roughly
|
||||
corresponds to the power drawn by the processor in that state.] The exit
|
||||
latency, in turn, is the maximum time it will take a CPU asking the processor
|
||||
hardware to enter an idle state to start executing the first instruction after a
|
||||
wakeup from that state. Note that in general the exit latency also must cover
|
||||
the time needed to enter the given state in case the wakeup occurs when the
|
||||
hardware is entering it and it must be entered completely to be exited in an
|
||||
ordered manner.
|
||||
|
||||
There are two types of information that can influence the governor's decisions.
|
||||
First of all, the governor knows the time until the closest timer event. That
|
||||
time is known exactly, because the kernel programs timers and it knows exactly
|
||||
when they will trigger, and it is the maximum time the hardware that the given
|
||||
CPU depends on can spend in an idle state, including the time necessary to enter
|
||||
and exit it. However, the CPU may be woken up by a non-timer event at any time
|
||||
(in particular, before the closest timer triggers) and it generally is not known
|
||||
when that may happen. The governor can only see how much time the CPU actually
|
||||
was idle after it has been woken up (that time will be referred to as the *idle
|
||||
duration* from now on) and it can use that information somehow along with the
|
||||
time until the closest timer to estimate the idle duration in the future. How the
|
||||
governor uses that information depends on what algorithm is implemented by it
|
||||
and that is the primary reason for having more than one governor in the
|
||||
``CPUIdle`` subsystem.
|
||||
|
||||
There are three ``CPUIdle`` governors available, ``menu``, `TEO <teo-gov_>`_
|
||||
and ``ladder``. Which of them is used by default depends on the configuration
|
||||
of the kernel and in particular on whether or not the scheduler tick can be
|
||||
`stopped by the idle loop <idle-cpus-and-tick_>`_. It is possible to change the
|
||||
governor at run time if the ``cpuidle_sysfs_switch`` command line parameter has
|
||||
been passed to the kernel, but that is not safe in general, so it should not be
|
||||
done on production systems (that may change in the future, though). The name of
|
||||
the ``CPUIdle`` governor currently used by the kernel can be read from the
|
||||
:file:`current_governor_ro` (or :file:`current_governor` if
|
||||
``cpuidle_sysfs_switch`` is present in the kernel command line) file under
|
||||
:file:`/sys/devices/system/cpu/cpuidle/` in ``sysfs``.
|
||||
|
||||
Which ``CPUIdle`` driver is used, on the other hand, usually depends on the
|
||||
platform the kernel is running on, but there are platforms with more than one
|
||||
matching driver. For example, there are two drivers that can work with the
|
||||
majority of Intel platforms, ``intel_idle`` and ``acpi_idle``, one with
|
||||
hardcoded idle states information and the other able to read that information
|
||||
from the system's ACPI tables, respectively. Still, even in those cases, the
|
||||
driver chosen at the system initialization time cannot be replaced later, so the
|
||||
decision on which one of them to use has to be made early (on Intel platforms
|
||||
the ``acpi_idle`` driver will be used if ``intel_idle`` is disabled for some
|
||||
reason or if it does not recognize the processor). The name of the ``CPUIdle``
|
||||
driver currently used by the kernel can be read from the :file:`current_driver`
|
||||
file under :file:`/sys/devices/system/cpu/cpuidle/` in ``sysfs``.
|
||||
|
||||
|
||||
.. _idle-cpus-and-tick:
|
||||
|
||||
Idle CPUs and The Scheduler Tick
|
||||
================================
|
||||
|
||||
The scheduler tick is a timer that triggers periodically in order to implement
|
||||
the time sharing strategy of the CPU scheduler. Of course, if there are
|
||||
multiple runnable tasks assigned to one CPU at the same time, the only way to
|
||||
allow them to make reasonable progress in a given time frame is to make them
|
||||
share the available CPU time. Namely, in rough approximation, each task is
|
||||
given a slice of the CPU time to run its code, subject to the scheduling class,
|
||||
prioritization and so on and when that time slice is used up, the CPU should be
|
||||
switched over to running (the code of) another task. The currently running task
|
||||
may not want to give the CPU away voluntarily, however, and the scheduler tick
|
||||
is there to make the switch happen regardless. That is not the only role of the
|
||||
tick, but it is the primary reason for using it.
|
||||
|
||||
The scheduler tick is problematic from the CPU idle time management perspective,
|
||||
because it triggers periodically and relatively often (depending on the kernel
|
||||
configuration, the length of the tick period is between 1 ms and 10 ms).
|
||||
Thus, if the tick is allowed to trigger on idle CPUs, it will not make sense
|
||||
for them to ask the hardware to enter idle states with target residencies above
|
||||
the tick period length. Moreover, in that case the idle duration of any CPU
|
||||
will never exceed the tick period length and the energy used for entering and
|
||||
exiting idle states due to the tick wakeups on idle CPUs will be wasted.
|
||||
|
||||
Fortunately, it is not really necessary to allow the tick to trigger on idle
|
||||
CPUs, because (by definition) they have no tasks to run except for the special
|
||||
"idle" one. In other words, from the CPU scheduler perspective, the only user
|
||||
of the CPU time on them is the idle loop. Since the time of an idle CPU need
|
||||
not be shared between multiple runnable tasks, the primary reason for using the
|
||||
tick goes away if the given CPU is idle. Consequently, it is possible to stop
|
||||
the scheduler tick entirely on idle CPUs in principle, even though that may not
|
||||
always be worth the effort.
|
||||
|
||||
Whether or not it makes sense to stop the scheduler tick in the idle loop
|
||||
depends on what is expected by the governor. First, if there is another
|
||||
(non-tick) timer due to trigger within the tick range, stopping the tick clearly
|
||||
would be a waste of time, even though the timer hardware may not need to be
|
||||
reprogrammed in that case. Second, if the governor is expecting a non-timer
|
||||
wakeup within the tick range, stopping the tick is not necessary and it may even
|
||||
be harmful. Namely, in that case the governor will select an idle state with
|
||||
the target residency within the time until the expected wakeup, so that state is
|
||||
going to be relatively shallow. The governor really cannot select a deep idle
|
||||
state then, as that would contradict its own expectation of a wakeup in short
|
||||
order. Now, if the wakeup really occurs shortly, stopping the tick would be a
|
||||
waste of time and in this case the timer hardware would need to be reprogrammed,
|
||||
which is expensive. On the other hand, if the tick is stopped and the wakeup
|
||||
does not occur any time soon, the hardware may spend an indefinite amount of time
|
||||
in the shallow idle state selected by the governor, which will be a waste of
|
||||
energy. Hence, if the governor is expecting a wakeup of any kind within the
|
||||
tick range, it is better to allow the tick to trigger. Otherwise, however, the
|
||||
governor will select a relatively deep idle state, so the tick should be stopped
|
||||
so that it does not wake up the CPU too early.
|
||||
|
||||
In any case, the governor knows what it is expecting and the decision on whether
|
||||
or not to stop the scheduler tick belongs to it. Still, if the tick has been
|
||||
stopped already (in one of the previous iterations of the loop), it is better
|
||||
to leave it as is and the governor needs to take that into account.
|
||||
|
||||
The kernel can be configured to disable stopping the scheduler tick in the idle
|
||||
loop altogether. That can be done through the build-time configuration of it
|
||||
(by unsetting the ``CONFIG_NO_HZ_IDLE`` configuration option) or by passing
|
||||
``nohz=off`` to it in the command line. In both cases, as the stopping of the
|
||||
scheduler tick is disabled, the governor's decisions regarding it are simply
|
||||
ignored by the idle loop code and the tick is never stopped.
|
||||
|
||||
The systems that run kernels configured to allow the scheduler tick to be
|
||||
stopped on idle CPUs are referred to as *tickless* systems and they are
|
||||
generally regarded as more energy-efficient than the systems running kernels in
|
||||
which the tick cannot be stopped. If the given system is tickless, it will use
|
||||
the ``menu`` governor by default and if it is not tickless, the default
|
||||
``CPUIdle`` governor on it will be ``ladder``.
|
||||
|
||||
|
||||
.. _menu-gov:
|
||||
|
||||
The ``menu`` Governor
|
||||
=====================
|
||||
|
||||
The ``menu`` governor is the default ``CPUIdle`` governor for tickless systems.
|
||||
It is quite complex, but the basic principle of its design is straightforward.
|
||||
Namely, when invoked to select an idle state for a CPU (i.e. an idle state that
|
||||
the CPU will ask the processor hardware to enter), it attempts to predict the
|
||||
idle duration and uses the predicted value for idle state selection.
|
||||
|
||||
It first obtains the time until the closest timer event with the assumption
|
||||
that the scheduler tick will be stopped. That time, referred to as the *sleep
|
||||
length* in what follows, is the upper bound on the time before the next CPU
|
||||
wakeup. It is used to determine the sleep length range, which in turn is needed
|
||||
to get the sleep length correction factor.
|
||||
|
||||
The ``menu`` governor maintains two arrays of sleep length correction factors.
|
||||
One of them is used when tasks previously running on the given CPU are waiting
|
||||
for some I/O operations to complete and the other one is used when that is not
|
||||
the case. Each array contains several correction factor values that correspond
|
||||
to different sleep length ranges organized so that each range represented in the
|
||||
array is approximately 10 times wider than the previous one.
|
||||
|
||||
The correction factor for the given sleep length range (determined before
|
||||
selecting the idle state for the CPU) is updated after the CPU has been woken
|
||||
up and the closer the sleep length is to the observed idle duration, the closer
|
||||
to 1 the correction factor becomes (it must fall between 0 and 1 inclusive).
|
||||
The sleep length is multiplied by the correction factor for the range that it
|
||||
falls into to obtain the first approximation of the predicted idle duration.
|
||||
|
||||
Next, the governor uses a simple pattern recognition algorithm to refine its
|
||||
idle duration prediction. Namely, it saves the last 8 observed idle duration
|
||||
values and, when predicting the idle duration next time, it computes the average
|
||||
and variance of them. If the variance is small (smaller than 400 square
|
||||
milliseconds) or it is small relative to the average (the average is greater
|
||||
than 6 times the standard deviation), the average is regarded as the "typical
|
||||
interval" value. Otherwise, the longest of the saved observed idle duration
|
||||
values is discarded and the computation is repeated for the remaining ones.
|
||||
Again, if the variance of them is small (in the above sense), the average is
|
||||
taken as the "typical interval" value and so on, until either the "typical
|
||||
interval" is determined or too many data points are disregarded, in which case
|
||||
the "typical interval" is assumed to equal "infinity" (the maximum unsigned
|
||||
integer value). The "typical interval" computed this way is compared with the
|
||||
sleep length multiplied by the correction factor and the minimum of the two is
|
||||
taken as the predicted idle duration.
|
||||
|
||||
Then, the governor computes an extra latency limit to help "interactive"
|
||||
workloads. It uses the observation that if the exit latency of the selected
|
||||
idle state is comparable with the predicted idle duration, the total time spent
|
||||
in that state probably will be very short and the amount of energy to save by
|
||||
entering it will be relatively small, so likely it is better to avoid the
|
||||
overhead related to entering that state and exiting it. Thus selecting a
|
||||
shallower state is likely to be a better option then. The first approximation
|
||||
of the extra latency limit is the predicted idle duration itself which
|
||||
additionally is divided by a value depending on the number of tasks that
|
||||
previously ran on the given CPU and now they are waiting for I/O operations to
|
||||
complete. The result of that division is compared with the latency limit coming
|
||||
from the power management quality of service, or `PM QoS <cpu-pm-qos_>`_,
|
||||
framework and the minimum of the two is taken as the limit for the idle states'
|
||||
exit latency.
|
||||
|
||||
Now, the governor is ready to walk the list of idle states and choose one of
|
||||
them. For this purpose, it compares the target residency of each state with
|
||||
the predicted idle duration and the exit latency of it with the computed latency
|
||||
limit. It selects the state with the target residency closest to the predicted
|
||||
idle duration, but still below it, and exit latency that does not exceed the
|
||||
limit.
|
||||
|
||||
In the final step the governor may still need to refine the idle state selection
|
||||
if it has not decided to `stop the scheduler tick <idle-cpus-and-tick_>`_. That
|
||||
happens if the idle duration predicted by it is less than the tick period and
|
||||
the tick has not been stopped already (in a previous iteration of the idle
|
||||
loop). Then, the sleep length used in the previous computations may not reflect
|
||||
the real time until the closest timer event and if it really is greater than
|
||||
that time, the governor may need to select a shallower state with a suitable
|
||||
target residency.
|
||||
|
||||
|
||||
.. _teo-gov:
|
||||
|
||||
The Timer Events Oriented (TEO) Governor
|
||||
========================================
|
||||
|
||||
The timer events oriented (TEO) governor is an alternative ``CPUIdle`` governor
|
||||
for tickless systems. It follows the same basic strategy as the ``menu`` `one
|
||||
<menu-gov_>`_: it always tries to find the deepest idle state suitable for the
|
||||
given conditions. However, it applies a different approach to that problem.
|
||||
|
||||
First, it does not use sleep length correction factors, but instead it attempts
|
||||
to correlate the observed idle duration values with the available idle states
|
||||
and use that information to pick up the idle state that is most likely to
|
||||
"match" the upcoming CPU idle interval. Second, it does not take the tasks
|
||||
that were running on the given CPU in the past and are waiting on some I/O
|
||||
operations to complete now into account at all (there is no guarantee that they will run on
|
||||
the same CPU when they become runnable again) and the pattern detection code in
|
||||
it avoids taking timer wakeups into account. It also only uses idle duration
|
||||
values less than the current time till the closest timer (with the scheduler
|
||||
tick excluded) for that purpose.
|
||||
|
||||
Like in the ``menu`` governor `case <menu-gov_>`_, the first step is to obtain
|
||||
the *sleep length*, which is the time until the closest timer event with the
|
||||
assumption that the scheduler tick will be stopped (that also is the upper bound
|
||||
on the time until the next CPU wakeup). That value is then used to preselect an
|
||||
idle state on the basis of three metrics maintained for each idle state provided
|
||||
by the ``CPUIdle`` driver: ``hits``, ``misses`` and ``early_hits``.
|
||||
|
||||
The ``hits`` and ``misses`` metrics measure the likelihood that a given idle
|
||||
state will "match" the observed (post-wakeup) idle duration if it "matches" the
|
||||
sleep length. They both are subject to decay (after a CPU wakeup) every time
|
||||
the target residency of the idle state corresponding to them is less than or
|
||||
equal to the sleep length and the target residency of the next idle state is
|
||||
greater than the sleep length (that is, when the idle state corresponding to
|
||||
them "matches" the sleep length). The ``hits`` metric is increased if the
|
||||
former condition is satisfied and the target residency of the given idle state
|
||||
is less than or equal to the observed idle duration and the target residency of
|
||||
the next idle state is greater than the observed idle duration at the same time
|
||||
(that is, it is increased when the given idle state "matches" both the sleep
|
||||
length and the observed idle duration). In turn, the ``misses`` metric is
|
||||
increased when the given idle state "matches" the sleep length only and the
|
||||
observed idle duration is too short for its target residency.
|
||||
|
||||
The ``early_hits`` metric measures the likelihood that a given idle state will
|
||||
"match" the observed (post-wakeup) idle duration if it does not "match" the
|
||||
sleep length. It is subject to decay on every CPU wakeup and it is increased
|
||||
when the idle state corresponding to it "matches" the observed (post-wakeup)
|
||||
idle duration and the target residency of the next idle state is less than or
|
||||
equal to the sleep length (i.e. the idle state "matching" the sleep length is
|
||||
deeper than the given one).
|
||||
|
||||
The governor walks the list of idle states provided by the ``CPUIdle`` driver
|
||||
and finds the last (deepest) one with the target residency less than or equal
|
||||
to the sleep length. Then, the ``hits`` and ``misses`` metrics of that idle
|
||||
state are compared with each other and it is preselected if the ``hits`` one is
|
||||
greater (which means that that idle state is likely to "match" the observed idle
|
||||
duration after CPU wakeup). If the ``misses`` one is greater, the governor
|
||||
preselects the shallower idle state with the maximum ``early_hits`` metric
|
||||
(or if there are multiple shallower idle states with equal ``early_hits``
|
||||
metric which also is the maximum, the shallowest of them will be preselected).
|
||||
[If there is a wakeup latency constraint coming from the `PM QoS framework
|
||||
<cpu-pm-qos_>`_ which is hit before reaching the deepest idle state with the
|
||||
target residency within the sleep length, the deepest idle state with the exit
|
||||
latency within the constraint is preselected without consulting the ``hits``,
|
||||
``misses`` and ``early_hits`` metrics.]
|
||||
|
||||
Next, the governor takes several idle duration values observed most recently
|
||||
into consideration and if at least a half of them are greater than or equal to
|
||||
the target residency of the preselected idle state, that idle state becomes the
|
||||
final candidate to ask for. Otherwise, the average of the most recent idle
|
||||
duration values below the target residency of the preselected idle state is
|
||||
computed and the governor walks the idle states shallower than the preselected
|
||||
one and finds the deepest of them with the target residency within that average.
|
||||
That idle state is then taken as the final candidate to ask for.
|
||||
|
||||
Still, at this point the governor may need to refine the idle state selection if
|
||||
it has not decided to `stop the scheduler tick <idle-cpus-and-tick_>`_. That
|
||||
generally happens if the target residency of the idle state selected so far is
|
||||
less than the tick period and the tick has not been stopped already (in a
|
||||
previous iteration of the idle loop). Then, like in the ``menu`` governor
|
||||
`case <menu-gov_>`_, the sleep length used in the previous computations may not
|
||||
reflect the real time until the closest timer event and if it really is greater
|
||||
than that time, a shallower state with a suitable target residency may need to
|
||||
be selected.
|
||||
|
||||
|
||||
.. _idle-states-representation:
|
||||
|
||||
Representation of Idle States
|
||||
=============================
|
||||
|
||||
For the CPU idle time management purposes all of the physical idle states
|
||||
supported by the processor have to be represented as a one-dimensional array of
|
||||
|struct cpuidle_state| objects each allowing an individual (logical) CPU to ask
|
||||
the processor hardware to enter an idle state of certain properties. If there
|
||||
is a hierarchy of units in the processor, one |struct cpuidle_state| object can
|
||||
cover a combination of idle states supported by the units at different levels of
|
||||
the hierarchy. In that case, the `target residency and exit latency parameters
|
||||
of it <idle-loop_>`_ must reflect the properties of the idle state at the
|
||||
deepest level (i.e. the idle state of the unit containing all of the other
|
||||
units).
|
||||
|
||||
For example, take a processor with two cores in a larger unit referred to as
|
||||
a "module" and suppose that asking the hardware to enter a specific idle state
|
||||
(say "X") at the "core" level by one core will trigger the module to try to
|
||||
enter a specific idle state of its own (say "MX") if the other core is in idle
|
||||
state "X" already. In other words, asking for idle state "X" at the "core"
|
||||
level gives the hardware a license to go as deep as to idle state "MX" at the
|
||||
"module" level, but there is no guarantee that this is going to happen (the core
|
||||
asking for idle state "X" may just end up in that state by itself instead).
|
||||
Then, the target residency of the |struct cpuidle_state| object representing
|
||||
idle state "X" must reflect the minimum time to spend in idle state "MX" of
|
||||
the module (including the time needed to enter it), because that is the minimum
|
||||
time the CPU needs to be idle to save any energy in case the hardware enters
|
||||
that state. Analogously, the exit latency parameter of that object must cover
|
||||
the exit time of idle state "MX" of the module (and usually its entry time too),
|
||||
because that is the maximum delay between a wakeup signal and the time the CPU
|
||||
will start to execute the first new instruction (assuming that both cores in the
|
||||
module will always be ready to execute instructions as soon as the module
|
||||
becomes operational as a whole).
|
||||
|
||||
There are processors without direct coordination between different levels of the
|
||||
hierarchy of units inside them, however. In those cases asking for an idle
|
||||
state at the "core" level does not automatically affect the "module" level, for
|
||||
example, in any way and the ``CPUIdle`` driver is responsible for the entire
|
||||
handling of the hierarchy. Then, the definition of the idle state objects is
|
||||
entirely up to the driver, but still the physical properties of the idle state
|
||||
that the processor hardware finally goes into must always follow the parameters
|
||||
used by the governor for idle state selection (for instance, the actual exit
|
||||
latency of that idle state must not exceed the exit latency parameter of the
|
||||
idle state object selected by the governor).
|
||||
|
||||
In addition to the target residency and exit latency idle state parameters
|
||||
discussed above, the objects representing idle states each contain a few other
|
||||
parameters describing the idle state and a pointer to the function to run in
|
||||
order to ask the hardware to enter that state. Also, for each
|
||||
|struct cpuidle_state| object, there is a corresponding
|
||||
:c:type:`struct cpuidle_state_usage <cpuidle_state_usage>` one containing usage
|
||||
statistics of the given idle state. That information is exposed by the kernel
|
||||
via ``sysfs``.
|
||||
|
||||
For each CPU in the system, there is a :file:`/sys/devices/system/cpu/cpu<N>/cpuidle/`
|
||||
directory in ``sysfs``, where the number ``<N>`` is assigned to the given
|
||||
CPU at the initialization time. That directory contains a set of subdirectories
|
||||
called :file:`state0`, :file:`state1` and so on, up to the number of idle state
|
||||
objects defined for the given CPU minus one. Each of these directories
|
||||
corresponds to one idle state object and the larger the number in its name, the
|
||||
deeper the (effective) idle state represented by it. Each of them contains
|
||||
a number of files (attributes) representing the properties of the idle state
|
||||
object corresponding to it, as follows:
|
||||
|
||||
``above``
|
||||
Total number of times this idle state had been asked for, but the
|
||||
observed idle duration was certainly too short to match its target
|
||||
residency.
|
||||
|
||||
``below``
|
||||
Total number of times this idle state had been asked for, but certainly
|
||||
a deeper idle state would have been a better match for the observed idle
|
||||
duration.
|
||||
|
||||
``desc``
|
||||
Description of the idle state.
|
||||
|
||||
``disable``
|
||||
Whether or not this idle state is disabled.
|
||||
|
||||
``latency``
|
||||
Exit latency of the idle state in microseconds.
|
||||
|
||||
``name``
|
||||
Name of the idle state.
|
||||
|
||||
``power``
|
||||
Power drawn by hardware in this idle state in milliwatts (if specified,
|
||||
0 otherwise).
|
||||
|
||||
``residency``
|
||||
Target residency of the idle state in microseconds.
|
||||
|
||||
``time``
|
||||
Total time spent in this idle state by the given CPU (as measured by the
|
||||
kernel) in microseconds.
|
||||
|
||||
``usage``
|
||||
Total number of times the hardware has been asked by the given CPU to
|
||||
enter this idle state.
|
||||
|
||||
The :file:`desc` and :file:`name` files both contain strings. The difference
|
||||
between them is that the name is expected to be more concise, while the
|
||||
description may be longer and it may contain white space or special characters.
|
||||
The other files listed above contain integer numbers.
|
||||
|
||||
The :file:`disable` attribute is the only writeable one. If it contains 1, the
|
||||
given idle state is disabled for this particular CPU, which means that the
|
||||
governor will never select it for this particular CPU and the ``CPUIdle``
|
||||
driver will never ask the hardware to enter it for that CPU as a result.
|
||||
However, disabling an idle state for one CPU does not prevent it from being
|
||||
asked for by the other CPUs, so it must be disabled for all of them in order to
|
||||
never be asked for by any of them. [Note that, due to the way the ``ladder``
|
||||
governor is implemented, disabling an idle state prevents that governor from
|
||||
selecting any idle states deeper than the disabled one too.]
|
||||
|
||||
If the :file:`disable` attribute contains 0, the given idle state is enabled for
|
||||
this particular CPU, but it still may be disabled for some or all of the other
|
||||
CPUs in the system at the same time. Writing 1 to it causes the idle state to
|
||||
be disabled for this particular CPU and writing 0 to it allows the governor to
|
||||
take it into consideration for the given CPU and the driver to ask for it,
|
||||
unless that state was disabled globally in the driver (in which case it cannot
|
||||
be used at all).
|
||||
|
||||
The :file:`power` attribute is not defined very well, especially for idle state
|
||||
objects representing combinations of idle states at different levels of the
|
||||
hierarchy of units in the processor, and it generally is hard to obtain idle
|
||||
state power numbers for complex hardware, so :file:`power` often contains 0 (not
|
||||
available) and if it contains a nonzero number, that number may not be very
|
||||
accurate and it should not be relied on for anything meaningful.
|
||||
|
||||
The number in the :file:`time` file generally may be greater than the total time
|
||||
really spent by the given CPU in the given idle state, because it is measured by
|
||||
the kernel and it may not cover the cases in which the hardware refused to enter
|
||||
this idle state and entered a shallower one instead of it (or even it did not
|
||||
enter any idle state at all). The kernel can only measure the time span between
|
||||
asking the hardware to enter an idle state and the subsequent wakeup of the CPU
|
||||
and it cannot say what really happened in the meantime at the hardware level.
|
||||
Moreover, if the idle state object in question represents a combination of idle
|
||||
states at different levels of the hierarchy of units in the processor,
|
||||
the kernel can never say how deep the hardware went down the hierarchy in any
|
||||
particular case. For these reasons, the only reliable way to find out how
|
||||
much time has been spent by the hardware in different idle states supported by
|
||||
it is to use idle state residency counters in the hardware, if available.
|
||||
|
||||
|
||||
.. _cpu-pm-qos:
|
||||
|
||||
Power Management Quality of Service for CPUs
|
||||
============================================
|
||||
|
||||
The power management quality of service (PM QoS) framework in the Linux kernel
|
||||
allows kernel code and user space processes to set constraints on various
|
||||
energy-efficiency features of the kernel to prevent performance from dropping
|
||||
below a required level. The PM QoS constraints can be set globally, in
|
||||
predefined categories referred to as PM QoS classes, or against individual
|
||||
devices.
|
||||
|
||||
CPU idle time management can be affected by PM QoS in two ways, through the
|
||||
global constraint in the ``PM_QOS_CPU_DMA_LATENCY`` class and through the
|
||||
resume latency constraints for individual CPUs. Kernel code (e.g. device
|
||||
drivers) can set both of them with the help of special internal interfaces
|
||||
provided by the PM QoS framework. User space can modify the former by opening
|
||||
the :file:`cpu_dma_latency` special device file under :file:`/dev/` and writing
|
||||
a binary value (interpreted as a signed 32-bit integer) to it. In turn, the
|
||||
resume latency constraint for a CPU can be modified by user space by writing a
|
||||
string (representing a signed 32-bit integer) to the
|
||||
:file:`power/pm_qos_resume_latency_us` file under
|
||||
:file:`/sys/devices/system/cpu/cpu<N>/` in ``sysfs``, where the CPU number
|
||||
``<N>`` is allocated at the system initialization time. Negative values
|
||||
will be rejected in both cases and, also in both cases, the written integer
|
||||
number will be interpreted as a requested PM QoS constraint in microseconds.
|
||||
|
||||
The requested value is not automatically applied as a new constraint, however,
|
||||
as it may be less restrictive (greater in this particular case) than another
|
||||
constraint previously requested by someone else. For this reason, the PM QoS
|
||||
framework maintains a list of requests that have been made so far in each
|
||||
global class and for each device, aggregates them and applies the effective
|
||||
(minimum in this particular case) value as the new constraint.
|
||||
|
||||
In fact, opening the :file:`cpu_dma_latency` special device file causes a new
|
||||
PM QoS request to be created and added to the priority list of requests in the
|
||||
``PM_QOS_CPU_DMA_LATENCY`` class and the file descriptor coming from the
|
||||
"open" operation represents that request. If that file descriptor is then
|
||||
used for writing, the number written to it will be associated with the PM QoS
|
||||
request represented by it as a new requested constraint value. Next, the
|
||||
priority list mechanism will be used to determine the new effective value of
|
||||
the entire list of requests and that effective value will be set as a new
|
||||
constraint. Thus setting a new requested constraint value will only change the
|
||||
real constraint if the effective "list" value is affected by it. In particular,
|
||||
for the ``PM_QOS_CPU_DMA_LATENCY`` class it only affects the real constraint if
|
||||
it is the minimum of the requested constraints in the list. The process holding
|
||||
a file descriptor obtained by opening the :file:`cpu_dma_latency` special device
|
||||
file controls the PM QoS request associated with that file descriptor, but it
|
||||
controls this particular PM QoS request only.
|
||||
|
||||
Closing the :file:`cpu_dma_latency` special device file or, more precisely, the
|
||||
file descriptor obtained while opening it, causes the PM QoS request associated
|
||||
with that file descriptor to be removed from the ``PM_QOS_CPU_DMA_LATENCY``
|
||||
class priority list and destroyed. If that happens, the priority list mechanism
|
||||
will be used, again, to determine the new effective value for the whole list
|
||||
and that value will become the new real constraint.
|
||||
|
||||
In turn, for each CPU there is only one resume latency PM QoS request
|
||||
associated with the :file:`power/pm_qos_resume_latency_us` file under
|
||||
:file:`/sys/devices/system/cpu/cpu<N>/` in ``sysfs`` and writing to it causes
|
||||
this single PM QoS request to be updated regardless of which user space
|
||||
process does that. In other words, this PM QoS request is shared by the entire
|
||||
user space, so access to the file associated with it needs to be arbitrated
|
||||
to avoid confusion. [Arguably, the only legitimate use of this mechanism in
|
||||
practice is to pin a process to the CPU in question and let it use the
|
||||
``sysfs`` interface to control the resume latency constraint for it.] It
|
||||
still only is a request, however. It is a member of a priority list used to
|
||||
determine the effective value to be set as the resume latency constraint for the
|
||||
CPU in question every time the list of requests is updated this way or another
|
||||
(there may be other requests coming from kernel code in that list).
|
||||
|
||||
CPU idle time governors are expected to regard the minimum of the global
|
||||
effective ``PM_QOS_CPU_DMA_LATENCY`` class constraint and the effective
|
||||
resume latency constraint for the given CPU as the upper limit for the exit
|
||||
latency of the idle states they can select for that CPU. They should never
|
||||
select any idle states with exit latency beyond that limit.
|
||||
|
||||
|
||||
Idle States Control Via Kernel Command Line
|
||||
===========================================
|
||||
|
||||
In addition to the ``sysfs`` interface allowing individual idle states to be
|
||||
`disabled for individual CPUs <idle-states-representation_>`_, there are kernel
|
||||
command line parameters affecting CPU idle time management.
|
||||
|
||||
The ``cpuidle.off=1`` kernel command line option can be used to disable the
|
||||
CPU idle time management entirely. It does not prevent the idle loop from
|
||||
running on idle CPUs, but it prevents the CPU idle time governors and drivers
|
||||
from being invoked. If it is added to the kernel command line, the idle loop
|
||||
will ask the hardware to enter idle states on idle CPUs via the CPU architecture
|
||||
support code that is expected to provide a default mechanism for this purpose.
|
||||
That default mechanism usually is the least common denominator for all of the
|
||||
processors implementing the architecture (i.e. CPU instruction set) in question,
|
||||
however, so it is rather crude and not very energy-efficient. For this reason,
|
||||
it is not recommended for production use.
|
||||
|
||||
The ``cpuidle.governor=`` kernel command line switch allows the ``CPUIdle``
|
||||
governor to use to be specified. It has to be appended with a string matching
|
||||
the name of an available governor (e.g. ``cpuidle.governor=menu``) and that
|
||||
governor will be used instead of the default one. It is possible to force
|
||||
the ``menu`` governor to be used on the systems that use the ``ladder`` governor
|
||||
by default this way, for example.
|
||||
|
||||
The other kernel command line parameters controlling CPU idle time management
|
||||
described below are only relevant for the *x86* architecture and some of
|
||||
them affect Intel processors only.
|
||||
|
||||
The *x86* architecture support code recognizes three kernel command line
|
||||
options related to CPU idle time management: ``idle=poll``, ``idle=halt``,
|
||||
and ``idle=nomwait``. The first two of them disable the ``acpi_idle`` and
|
||||
``intel_idle`` drivers altogether, which effectively causes the entire
|
||||
``CPUIdle`` subsystem to be disabled and makes the idle loop invoke the
|
||||
architecture support code to deal with idle CPUs. How it does that depends on
|
||||
which of the two parameters is added to the kernel command line. In the
|
||||
``idle=halt`` case, the architecture support code will use the ``HLT``
|
||||
instruction of the CPUs (which, as a rule, suspends the execution of the program
|
||||
and causes the hardware to attempt to enter the shallowest available idle state)
|
||||
for this purpose, and if ``idle=poll`` is used, idle CPUs will execute a
|
||||
more or less "lightweight" sequence of instructions in a tight loop. [Note
|
||||
that using ``idle=poll`` is somewhat drastic in many cases, as preventing idle
|
||||
CPUs from saving almost any energy at all may not be the only effect of it.
|
||||
For example, on Intel hardware it effectively prevents CPUs from using
|
||||
P-states (see |cpufreq|) that require any number of CPUs in a package to be
|
||||
idle, so it very well may hurt single-thread computations performance as well as
|
||||
energy-efficiency. Thus using it for performance reasons may not be a good idea
|
||||
at all.]
|
||||
|
||||
The ``idle=nomwait`` option disables the ``intel_idle`` driver and causes
|
||||
``acpi_idle`` to be used (as long as all of the information needed by it is
|
||||
there in the system's ACPI tables), but it is not allowed to use the
|
||||
``MWAIT`` instruction of the CPUs to ask the hardware to enter idle states.
|
||||
|
||||
In addition to the architecture-level kernel command line options affecting CPU
|
||||
idle time management, there are parameters affecting individual ``CPUIdle``
|
||||
drivers that can be passed to them via the kernel command line. Specifically,
|
||||
the ``intel_idle.max_cstate=<n>`` and ``processor.max_cstate=<n>`` parameters,
|
||||
where ``<n>`` is an idle state index also used in the name of the given
|
||||
state's directory in ``sysfs`` (see
|
||||
`Representation of Idle States <idle-states-representation_>`_), cause the
|
||||
``intel_idle`` and ``acpi_idle`` drivers, respectively, to discard all of the
|
||||
idle states deeper than idle state ``<n>``. In that case, they will never ask
|
||||
for any of those idle states or expose them to the governor. [The behavior of
|
||||
the two drivers is different for ``<n>`` equal to ``0``. Adding
|
||||
``intel_idle.max_cstate=0`` to the kernel command line disables the
|
||||
``intel_idle`` driver and allows ``acpi_idle`` to be used, whereas
|
||||
``processor.max_cstate=0`` is equivalent to ``processor.max_cstate=1``.
|
||||
Also, the ``acpi_idle`` driver is part of the ``processor`` kernel module that
|
||||
can be loaded separately and ``max_cstate=<n>`` can be passed to it as a module
|
||||
parameter when it is loaded.]
|
@@ -495,7 +495,15 @@ on the following rules, regardless of the current operation mode of the driver:
|
||||
|
||||
2. Each individual CPU is affected by its own per-policy limits (that is, it
|
||||
cannot be requested to run faster than its own per-policy maximum and it
|
||||
cannot be requested to run slower than its own per-policy minimum).
|
||||
cannot be requested to run slower than its own per-policy minimum). The
|
||||
effective performance depends on whether the platform supports per core
|
||||
P-states, hyper-threading is enabled and on current performance requests
|
||||
from other CPUs. When the platform doesn't support per core P-states, the
|
||||
effective performance can be more than the policy limits set on a CPU, if
|
||||
other CPUs are requesting higher performance at that moment. Even with per
|
||||
core P-states support, when hyper-threading is enabled, if the sibling CPU
|
||||
is requesting higher performance, the other siblings will get higher
|
||||
performance than their policy limits.
|
||||
|
||||
3. The global and per-policy limits can be set independently.
|
||||
|
||||
|
@@ -5,5 +5,6 @@ Working-State Power Management
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
cpuidle
|
||||
cpufreq
|
||||
intel_pstate
|
||||
|
@@ -54,7 +54,7 @@ those errors are correctable.
|
||||
Types of errors
|
||||
---------------
|
||||
|
||||
Most mechanisms used on modern systems use use technologies like Hamming
|
||||
Most mechanisms used on modern systems use technologies like Hamming
|
||||
Codes that allow error correction when the number of errors on a bit packet
|
||||
is below a threshold. If the number of errors is above, those mechanisms
|
||||
can indicate with a high degree of confidence that an error happened, but
|
||||
|
@@ -67,7 +67,7 @@ If you can't figure out which subsystem caused the issue, you should file
|
||||
a bug in kernel.org bugzilla and send email to
|
||||
linux-kernel@vger.kernel.org, referencing the bugzilla URL. (For more
|
||||
information on the linux-kernel mailing list see
|
||||
http://www.tux.org/lkml/).
|
||||
http://vger.kernel.org/lkml/).
|
||||
|
||||
|
||||
Tips for reporting bugs
|
||||
|
@@ -44,7 +44,7 @@ only valid reason for deferring the publication of a fix is to accommodate
|
||||
the logistics of QA and large scale rollouts which require release
|
||||
coordination.
|
||||
|
||||
Whilst embargoed information may be shared with trusted individuals in
|
||||
While embargoed information may be shared with trusted individuals in
|
||||
order to develop a fix, such information will not be published alongside
|
||||
the fix or on any other disclosure channel without the permission of the
|
||||
reporter. This includes but is not limited to the original bug report
|
||||
|
@@ -1,59 +1,164 @@
|
||||
Tainted kernels
|
||||
---------------
|
||||
|
||||
Some oops reports contain the string **'Tainted: '** after the program
|
||||
counter. This indicates that the kernel has been tainted by some
|
||||
mechanism. The string is followed by a series of position-sensitive
|
||||
characters, each representing a particular tainted value.
|
||||
The kernel will mark itself as 'tainted' when something occurs that might be
|
||||
relevant later when investigating problems. Don't worry too much about this,
|
||||
most of the time it's not a problem to run a tainted kernel; the information is
|
||||
mainly of interest once someone wants to investigate some problem, as its real
|
||||
cause might be the event that got the kernel tainted. That's why bug reports
|
||||
from tainted kernels will often be ignored by developers, hence try to reproduce
|
||||
problems with an untainted kernel.
|
||||
|
||||
1) ``G`` if all modules loaded have a GPL or compatible license, ``P`` if
|
||||
Note the kernel will remain tainted even after you undo what caused the taint
|
||||
(i.e. unload a proprietary kernel module), to indicate the kernel remains not
|
||||
trustworthy. That's also why the kernel will print the tainted state when it
|
||||
notices an internal problem (a 'kernel bug'), a recoverable error
|
||||
('kernel oops') or a non-recoverable error ('kernel panic') and writes debug
|
||||
information about this to the logs ``dmesg`` outputs. It's also possible to
|
||||
check the tainted state at runtime through a file in ``/proc/``.
|
||||
|
||||
|
||||
Tainted flag in bugs, oops or panics messages
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
You find the tainted state near the top in a line starting with 'CPU:'; if or
|
||||
why the kernel was tainted is shown after the Process ID ('PID:') and a shortened
|
||||
name of the command ('Comm:') that triggered the event::
|
||||
|
||||
BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
|
||||
Oops: 0002 [#1] SMP PTI
|
||||
CPU: 0 PID: 4424 Comm: insmod Tainted: P W O 4.20.0-0.rc6.fc30 #1
|
||||
Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
|
||||
RIP: 0010:my_oops_init+0x13/0x1000 [kpanic]
|
||||
[...]
|
||||
|
||||
You'll find a 'Not tainted: ' there if the kernel was not tainted at the
|
||||
time of the event; if it was, then it will print 'Tainted: ' and characters
|
||||
either letters or blanks. In the above example it looks like this::
|
||||
|
||||
Tainted: P W O
|
||||
|
||||
The meaning of those characters is explained in the table below. In this case
|
||||
the kernel got tainted earlier because a proprietary Module (``P``) was loaded,
|
||||
a warning occurred (``W``), and an externally-built module was loaded (``O``).
|
||||
To decode other letters use the table below.
|
||||
|
||||
|
||||
Decoding tainted state at runtime
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
At runtime, you can query the tainted state by running
|
||||
``cat /proc/sys/kernel/tainted``. If that returns ``0``, the kernel is not
|
||||
tainted; any other number indicates the reasons why it is. The easiest way to
|
||||
decode that number is the script ``tools/debugging/kernel-chktaint``, which your
|
||||
distribution might ship as part of a package called ``linux-tools`` or
|
||||
``kernel-tools``; if it doesn't you can download the script from
|
||||
`git.kernel.org <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/tools/debugging/kernel-chktaint>`_
|
||||
and execute it with ``sh kernel-chktaint``, which would print something like
|
||||
this on the machine that had the statements in the logs that were quoted earlier::
|
||||
|
||||
Kernel is Tainted for following reasons:
|
||||
* Proprietary module was loaded (#0)
|
||||
* Kernel issued warning (#9)
|
||||
* Externally-built ('out-of-tree') module was loaded (#12)
|
||||
See Documentation/admin-guide/tainted-kernels.rst in the the Linux kernel or
|
||||
https://www.kernel.org/doc/html/latest/admin-guide/tainted-kernels.html for
|
||||
a more details explanation of the various taint flags.
|
||||
Raw taint value as int/string: 4609/'P W O '
|
||||
|
||||
You can try to decode the number yourself. That's easy if there was only one
|
||||
reason that got your kernel tainted, as in this case you can find the number
|
||||
with the table below. If there were multiple reasons you need to decode the
|
||||
number, as it is a bitfield, where each bit indicates the absence or presence of
|
||||
a particular type of taint. It's best to leave that to the aforementioned
|
||||
script, but if you need something quick you can use this shell command to check
|
||||
which bits are set::
|
||||
|
||||
$ for i in $(seq 18); do echo $(($i-1)) $(($(cat /proc/sys/kernel/tainted)>>($i-1)&1));done
|
||||
|
||||
Table for decoding tainted state
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
=== === ====== ========================================================
|
||||
Bit Log Number Reason that got the kernel tainted
|
||||
=== === ====== ========================================================
|
||||
0 G/P 1 proprietary module was loaded
|
||||
1 _/F 2 module was force loaded
|
||||
2 _/S 4 SMP kernel oops on an officially SMP incapable processor
|
||||
3 _/R 8 module was force unloaded
|
||||
4 _/M 16 processor reported a Machine Check Exception (MCE)
|
||||
5 _/B 32 bad page referenced or some unexpected page flags
|
||||
6 _/U 64 taint requested by userspace application
|
||||
7 _/D 128 kernel died recently, i.e. there was an OOPS or BUG
|
||||
8 _/A 256 ACPI table overridden by user
|
||||
9 _/W 512 kernel issued warning
|
||||
10 _/C 1024 staging driver was loaded
|
||||
11 _/I 2048 workaround for bug in platform firmware applied
|
||||
12 _/O 4096 externally-built ("out-of-tree") module was loaded
|
||||
13 _/E 8192 unsigned module was loaded
|
||||
14 _/L 16384 soft lockup occurred
|
||||
15 _/K 32768 kernel has been live patched
|
||||
16 _/X 65536 auxiliary taint, defined for and used by distros
|
||||
17 _/T 131072 kernel was built with the struct randomization plugin
|
||||
=== === ====== ========================================================
|
||||
|
||||
Note: The character ``_`` is representing a blank in this table to make reading
|
||||
easier.
|
||||
|
||||
More detailed explanation for tainting
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
0) ``G`` if all modules loaded have a GPL or compatible license, ``P`` if
|
||||
any proprietary module has been loaded. Modules without a
|
||||
MODULE_LICENSE or with a MODULE_LICENSE that is not recognised by
|
||||
insmod as GPL compatible are assumed to be proprietary.
|
||||
|
||||
2) ``F`` if any module was force loaded by ``insmod -f``, ``' '`` if all
|
||||
1) ``F`` if any module was force loaded by ``insmod -f``, ``' '`` if all
|
||||
modules were loaded normally.
|
||||
|
||||
3) ``S`` if the oops occurred on an SMP kernel running on hardware that
|
||||
2) ``S`` if the oops occurred on an SMP kernel running on hardware that
|
||||
hasn't been certified as safe to run multiprocessor.
|
||||
Currently this occurs only on various Athlons that are not
|
||||
SMP capable.
|
||||
|
||||
4) ``R`` if a module was force unloaded by ``rmmod -f``, ``' '`` if all
|
||||
3) ``R`` if a module was force unloaded by ``rmmod -f``, ``' '`` if all
|
||||
modules were unloaded normally.
|
||||
|
||||
5) ``M`` if any processor has reported a Machine Check Exception,
|
||||
4) ``M`` if any processor has reported a Machine Check Exception,
|
||||
``' '`` if no Machine Check Exceptions have occurred.
|
||||
|
||||
6) ``B`` if a page-release function has found a bad page reference or
|
||||
some unexpected page flags.
|
||||
5) ``B`` If a page-release function has found a bad page reference or some
|
||||
unexpected page flags. This indicates a hardware problem or a kernel bug;
|
||||
there should be other information in the log indicating why this tainting
|
||||
occurred.
|
||||
|
||||
7) ``U`` if a user or user application specifically requested that the
|
||||
6) ``U`` if a user or user application specifically requested that the
|
||||
Tainted flag be set, ``' '`` otherwise.
|
||||
|
||||
8) ``D`` if the kernel has died recently, i.e. there was an OOPS or BUG.
|
||||
7) ``D`` if the kernel has died recently, i.e. there was an OOPS or BUG.
|
||||
|
||||
9) ``A`` if the ACPI table has been overridden.
|
||||
8) ``A`` if an ACPI table has been overridden.
|
||||
|
||||
10) ``W`` if a warning has previously been issued by the kernel.
|
||||
9) ``W`` if a warning has previously been issued by the kernel.
|
||||
(Though some warnings may set more specific taint flags.)
|
||||
|
||||
11) ``C`` if a staging driver has been loaded.
|
||||
10) ``C`` if a staging driver has been loaded.
|
||||
|
||||
12) ``I`` if the kernel is working around a severe bug in the platform
|
||||
11) ``I`` if the kernel is working around a severe bug in the platform
|
||||
firmware (BIOS or similar).
|
||||
|
||||
13) ``O`` if an externally-built ("out-of-tree") module has been loaded.
|
||||
12) ``O`` if an externally-built ("out-of-tree") module has been loaded.
|
||||
|
||||
14) ``E`` if an unsigned module has been loaded in a kernel supporting
|
||||
13) ``E`` if an unsigned module has been loaded in a kernel supporting
|
||||
module signature.
|
||||
|
||||
15) ``L`` if a soft lockup has previously occurred on the system.
|
||||
14) ``L`` if a soft lockup has previously occurred on the system.
|
||||
|
||||
16) ``K`` if the kernel has been live patched.
|
||||
15) ``K`` if the kernel has been live patched.
|
||||
|
||||
The primary reason for the **'Tainted: '** string is to tell kernel
|
||||
debuggers if this is a clean kernel or if anything unusual has
|
||||
occurred. Tainting is permanent: even if an offending module is
|
||||
unloaded, the tainted value remains to indicate that the kernel is not
|
||||
trustworthy.
|
||||
16) ``X`` Auxiliary taint, defined for and used by Linux distributors.
|
||||
|
||||
17) ``T`` Kernel was built with the randstruct plugin, which can intentionally
|
||||
produce extremely unusual kernel structure layouts (even performance
|
||||
pathological ones), which is important to know when debugging. Set at
|
||||
build time.
|
||||
|
@@ -133,6 +133,26 @@ If the user still wants to connect the device they can either approve
|
||||
the device without a key or write a new key and write 1 to the
|
||||
``authorized`` file to get the new key stored on the device NVM.
|
||||
|
||||
DMA protection utilizing IOMMU
|
||||
------------------------------
|
||||
Recent systems from 2018 and forward with Thunderbolt ports may natively
|
||||
support IOMMU. This means that Thunderbolt security is handled by an IOMMU
|
||||
so connected devices cannot access memory regions outside of what is
|
||||
allocated for them by drivers. When Linux is running on such a system it
|
||||
automatically enables IOMMU if not enabled by the user already. These
|
||||
systems can be identified by reading ``1`` from
|
||||
``/sys/bus/thunderbolt/devices/domainX/iommu_dma_protection`` attribute.
|
||||
|
||||
The driver does not do anything special in this case but because DMA
|
||||
protection is handled by the IOMMU, security levels (if set) are
|
||||
redundant. For this reason some systems ship with security level set to
|
||||
``none``. Other systems have security level set to ``user`` in order to
|
||||
support downgrade to older OS, so users who want to automatically
|
||||
authorize devices when IOMMU DMA protection is enabled can use the
|
||||
following ``udev`` rule::
|
||||
|
||||
ACTION=="add", SUBSYSTEM=="thunderbolt", ATTRS{iommu_dma_protection}=="1", ATTR{authorized}=="0", ATTR{authorized}="1"
|
||||
|
||||
Upgrading NVM on Thunderbolt device or host
|
||||
-------------------------------------------
|
||||
Since most of the functionality is handled in firmware running on a
|
||||
|
@@ -126,7 +126,7 @@ tagged list.
|
||||
The boot loader must pass at a minimum the size and location of the
|
||||
system memory, and the root filesystem location. The dtb must be
|
||||
placed in a region of memory where the kernel decompressor will not
|
||||
overwrite it, whilst remaining within the region which will be covered
|
||||
overwrite it, while remaining within the region which will be covered
|
||||
by the kernel's low-memory mapping.
|
||||
|
||||
A safe location is just above the 128MiB boundary from start of RAM.
|
||||
|
@@ -55,7 +55,7 @@ out s3c2410 API, then here are some notes on the process.
|
||||
as they have the same arguments, and can either take the pin specific
|
||||
values, or the more generic special-function-number arguments.
|
||||
|
||||
3) s3c2410_gpio_pullup() changes have the problem that whilst the
|
||||
3) s3c2410_gpio_pullup() changes have the problem that while the
|
||||
s3c2410_gpio_pullup(x, 1) can be easily translated to the
|
||||
s3c_gpio_setpull(x, S3C_GPIO_PULL_NONE), the s3c2410_gpio_pullup(x, 0)
|
||||
are not so easy.
|
||||
|
@@ -17,7 +17,7 @@ Introduction
|
||||
versions.
|
||||
|
||||
The S3C2416 and S3C2450 devices are very similar and S3C2450 support is
|
||||
included under the arch/arm/mach-s3c2416 directory. Note, whilst core
|
||||
included under the arch/arm/mach-s3c2416 directory. Note, while core
|
||||
support for these SoCs is in, work on some of the extra peripherals
|
||||
and extra interrupts is still ongoing.
|
||||
|
||||
|
@@ -87,7 +87,7 @@ Debugging
|
||||
suspending, which means that use of printascii() or similar direct
|
||||
access to the UARTs will cause the debug to stop.
|
||||
|
||||
2) Whilst the pm code itself will attempt to re-enable the UART clocks,
|
||||
2) While the pm code itself will attempt to re-enable the UART clocks,
|
||||
care should be taken that any external clock sources that the UARTs
|
||||
rely on are still enabled at that point.
|
||||
|
||||
|
@@ -6,7 +6,7 @@ TL;DR summary
|
||||
* Use only NEON instructions, or VFP instructions that don't rely on support
|
||||
code
|
||||
* Isolate your NEON code in a separate compilation unit, and compile it with
|
||||
'-mfpu=neon -mfloat-abi=softfp'
|
||||
'-march=armv7-a -mfpu=neon -mfloat-abi=softfp'
|
||||
* Put kernel_neon_begin() and kernel_neon_end() calls around the calls into your
|
||||
NEON code
|
||||
* Don't sleep in your NEON code, and be aware that it will be executed with
|
||||
@@ -87,7 +87,7 @@ instructions appearing in unexpected places if no special care is taken.
|
||||
Therefore, the recommended and only supported way of using NEON/VFP in the
|
||||
kernel is by adhering to the following rules:
|
||||
* isolate the NEON code in a separate compilation unit and compile it with
|
||||
'-mfpu=neon -mfloat-abi=softfp';
|
||||
'-march=armv7-a -mfpu=neon -mfloat-abi=softfp';
|
||||
* issue the calls to kernel_neon_begin(), kernel_neon_end() as well as the calls
|
||||
into the unit containing the NEON code from a compilation unit which is *not*
|
||||
built with the GCC flag '-mfpu=neon' set.
|
||||
|
@@ -188,6 +188,11 @@ Before jumping into the kernel, the following conditions must be met:
|
||||
the kernel image will be entered must be initialised by software at a
|
||||
higher exception level to prevent execution in an UNKNOWN state.
|
||||
|
||||
- SCR_EL3.FIQ must have the same value across all CPUs the kernel is
|
||||
executing on.
|
||||
- The value of SCR_EL3.FIQ must be the same as the one present at boot
|
||||
time whenever the kernel is executing.
|
||||
|
||||
For systems with a GICv3 interrupt controller to be used in v3 mode:
|
||||
- If EL3 is present:
|
||||
ICC_SRE_EL3.Enable (bit 3) must be initialised to 0b1.
|
||||
@@ -205,6 +210,14 @@ Before jumping into the kernel, the following conditions must be met:
|
||||
ICC_SRE_EL2.SRE (bit 0) must be initialised to 0b0.
|
||||
- The DT or ACPI tables must describe a GICv2 interrupt controller.
|
||||
|
||||
For CPUs with pointer authentication functionality:
|
||||
- If EL3 is present:
|
||||
SCR_EL3.APK (bit 16) must be initialised to 0b1
|
||||
SCR_EL3.API (bit 17) must be initialised to 0b1
|
||||
- If the kernel is entered at EL1:
|
||||
HCR_EL2.APK (bit 40) must be initialised to 0b1
|
||||
HCR_EL2.API (bit 41) must be initialised to 0b1
|
||||
|
||||
The requirements described above for CPU mode, caches, MMUs, architected
|
||||
timers, coherency and system registers apply to all CPUs. All CPUs must
|
||||
enter the kernel in the same exception level.
|
||||
|
@@ -184,12 +184,20 @@ infrastructure:
|
||||
x--------------------------------------------------x
|
||||
| Name | bits | visible |
|
||||
|--------------------------------------------------|
|
||||
| GPI | [31-28] | y |
|
||||
|--------------------------------------------------|
|
||||
| GPA | [27-24] | y |
|
||||
|--------------------------------------------------|
|
||||
| LRCPC | [23-20] | y |
|
||||
|--------------------------------------------------|
|
||||
| FCMA | [19-16] | y |
|
||||
|--------------------------------------------------|
|
||||
| JSCVT | [15-12] | y |
|
||||
|--------------------------------------------------|
|
||||
| API | [11-8] | y |
|
||||
|--------------------------------------------------|
|
||||
| APA | [7-4] | y |
|
||||
|--------------------------------------------------|
|
||||
| DPB | [3-0] | y |
|
||||
x--------------------------------------------------x
|
||||
|
||||
|
@@ -182,3 +182,15 @@ HWCAP_FLAGM
|
||||
HWCAP_SSBS
|
||||
|
||||
Functionality implied by ID_AA64PFR1_EL1.SSBS == 0b0010.
|
||||
|
||||
HWCAP_PACA
|
||||
|
||||
Functionality implied by ID_AA64ISAR1_EL1.APA == 0b0001 or
|
||||
ID_AA64ISAR1_EL1.API == 0b0001, as described by
|
||||
Documentation/arm64/pointer-authentication.txt.
|
||||
|
||||
HWCAP_PACG
|
||||
|
||||
Functionality implied by ID_AA64ISAR1_EL1.GPA == 0b0001 or
|
||||
ID_AA64ISAR1_EL1.GPI == 0b0001, as described by
|
||||
Documentation/arm64/pointer-authentication.txt.
|
||||
|
93
Documentation/arm64/pointer-authentication.txt
Normal file
@@ -0,0 +1,93 @@
|
||||
Pointer authentication in AArch64 Linux
|
||||
=======================================
|
||||
|
||||
Author: Mark Rutland <mark.rutland@arm.com>
|
||||
Date: 2017-07-19
|
||||
|
||||
This document briefly describes the provision of pointer authentication
|
||||
functionality in AArch64 Linux.
|
||||
|
||||
|
||||
Architecture overview
|
||||
---------------------
|
||||
|
||||
The ARMv8.3 Pointer Authentication extension adds primitives that can be
|
||||
used to mitigate certain classes of attack where an attacker can corrupt
|
||||
the contents of some memory (e.g. the stack).
|
||||
|
||||
The extension uses a Pointer Authentication Code (PAC) to determine
|
||||
whether pointers have been modified unexpectedly. A PAC is derived from
|
||||
a pointer, another value (such as the stack pointer), and a secret key
|
||||
held in system registers.
|
||||
|
||||
The extension adds instructions to insert a valid PAC into a pointer,
|
||||
and to verify/remove the PAC from a pointer. The PAC occupies a number
|
||||
of high-order bits of the pointer, which varies dependent on the
|
||||
configured virtual address size and whether pointer tagging is in use.
|
||||
|
||||
A subset of these instructions have been allocated from the HINT
|
||||
encoding space. In the absence of the extension (or when disabled),
|
||||
these instructions behave as NOPs. Applications and libraries using
|
||||
these instructions operate correctly regardless of the presence of the
|
||||
extension.
|
||||
|
||||
The extension provides five separate keys to generate PACs - two for
|
||||
instruction addresses (APIAKey, APIBKey), two for data addresses
|
||||
(APDAKey, APDBKey), and one for generic authentication (APGAKey).
|
||||
|
||||
|
||||
Basic support
|
||||
-------------
|
||||
|
||||
When CONFIG_ARM64_PTR_AUTH is selected, and relevant HW support is
|
||||
present, the kernel will assign random key values to each process at
|
||||
exec*() time. The keys are shared by all threads within the process, and
|
||||
are preserved across fork().
|
||||
|
||||
Presence of address authentication functionality is advertised via
|
||||
HWCAP_PACA, and generic authentication functionality via HWCAP_PACG.
|
||||
|
||||
The number of bits that the PAC occupies in a pointer is 55 minus the
|
||||
virtual address size configured by the kernel. For example, with a
|
||||
virtual address size of 48, the PAC is 7 bits wide.
|
||||
|
||||
Recent versions of GCC can compile code with APIAKey-based return
|
||||
address protection when passed the -msign-return-address option. This
|
||||
uses instructions in the HINT space (unless -march=armv8.3-a or higher
|
||||
is also passed), and such code can run on systems without the pointer
|
||||
authentication extension.
|
||||
|
||||
In addition to exec(), keys can also be reinitialized to random values
|
||||
using the PR_PAC_RESET_KEYS prctl. A bitmask of PR_PAC_APIAKEY,
|
||||
PR_PAC_APIBKEY, PR_PAC_APDAKEY, PR_PAC_APDBKEY and PR_PAC_APGAKEY
|
||||
specifies which keys are to be reinitialized; specifying 0 means "all
|
||||
keys".
|
||||
|
||||
|
||||
Debugging
|
||||
---------
|
||||
|
||||
When CONFIG_ARM64_PTR_AUTH is selected, and HW support for address
|
||||
authentication is present, the kernel will expose the position of TTBR0
|
||||
PAC bits in the NT_ARM_PAC_MASK regset (struct user_pac_mask), which
|
||||
userspace can acquire via PTRACE_GETREGSET.
|
||||
|
||||
The regset is exposed only when HWCAP_PACA is set. Separate masks are
|
||||
exposed for data pointers and instruction pointers, as the set of PAC
|
||||
bits can vary between the two. Note that the masks apply to TTBR0
|
||||
addresses, and are not valid to apply to TTBR1 addresses (e.g. kernel
|
||||
pointers).
|
||||
|
||||
Additionally, when CONFIG_CHECKPOINT_RESTORE is also set, the kernel
|
||||
will expose the NT_ARM_PACA_KEYS and NT_ARM_PACG_KEYS regsets (struct
|
||||
user_pac_address_keys and struct user_pac_generic_keys). These can be
|
||||
used to get and set the keys for a thread.
|
||||
|
||||
|
||||
Virtualization
|
||||
--------------
|
||||
|
||||
Pointer authentication is not currently supported in KVM guests. KVM
|
||||
will mask the feature bits from ID_AA64ISAR1_EL1, and attempted use of
|
||||
the feature will result in an UNDEFINED exception being injected into
|
||||
the guest.
|
@@ -44,6 +44,8 @@ stable kernels.
|
||||
|
||||
| Implementor | Component | Erratum ID | Kconfig |
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
| Allwinner | A64/R18 | UNKNOWN1 | SUN50I_ERRATUM_UNKNOWN1 |
|
||||
| | | | |
|
||||
| ARM | Cortex-A53 | #826319 | ARM64_ERRATUM_826319 |
|
||||
| ARM | Cortex-A53 | #827319 | ARM64_ERRATUM_827319 |
|
||||
| ARM | Cortex-A53 | #824069 | ARM64_ERRATUM_824069 |
|
||||
@@ -57,6 +59,7 @@ stable kernels.
|
||||
| ARM | Cortex-A73 | #858921 | ARM64_ERRATUM_858921 |
|
||||
| ARM | Cortex-A55 | #1024718 | ARM64_ERRATUM_1024718 |
|
||||
| ARM | Cortex-A76 | #1188873 | ARM64_ERRATUM_1188873 |
|
||||
| ARM | Cortex-A76 | #1165522 | ARM64_ERRATUM_1165522 |
|
||||
| ARM | Cortex-A76 | #1286807 | ARM64_ERRATUM_1286807 |
|
||||
| ARM | MMU-500 | #841119,#826419 | N/A |
|
||||
| | | | |
|
||||
@@ -79,3 +82,4 @@ stable kernels.
|
||||
| Qualcomm Tech. | Falkor v1 | E1009 | QCOM_FALKOR_ERRATUM_1009 |
|
||||
| Qualcomm Tech. | QDF2400 ITS | E0065 | QCOM_QDF2400_ERRATUM_0065 |
|
||||
| Qualcomm Tech. | Falkor v{1,2} | E1041 | QCOM_FALKOR_ERRATUM_1041 |
|
||||
| Fujitsu | A64FX | E#010001 | FUJITSU_ERRATUM_010001 |
|
||||
|
@@ -357,6 +357,13 @@ video playing/streaming, a very low drop rate may be more important
|
||||
than maximum throughput. In these cases, consider setting the
|
||||
strict_guarantees parameter.
|
||||
|
||||
slice_idle_us
|
||||
-------------
|
||||
|
||||
Controls the same tuning parameter as slice_idle, but in microseconds.
|
||||
Either tunable can be used to set idling behavior. Afterwards, the
|
||||
other tunable will reflect the newly set value in sysfs.
|
||||
|
||||
strict_guarantees
|
||||
-----------------
|
||||
|
||||
|
@@ -65,7 +65,6 @@ Description of Contents:
|
||||
3.2.3 I/O completion
|
||||
3.2.4 Implications for drivers that do not interpret bios (don't handle
|
||||
multiple segments)
|
||||
3.2.5 Request command tagging
|
||||
3.3 I/O submission
|
||||
4. The I/O scheduler
|
||||
5. Scalability related changes
|
||||
@@ -708,93 +707,6 @@ is crossed on completion of a transfer. (The end*request* functions should
|
||||
be used if and only if the request has come down from block/bio path, not for
|
||||
direct access requests which only specify rq->buffer without a valid rq->bio)
|
||||
|
||||
3.2.5 Generic request command tagging
|
||||
|
||||
3.2.5.1 Tag helpers
|
||||
|
||||
Block now offers some simple generic functionality to help support command
|
||||
queueing (typically known as tagged command queueing), ie manage more than
|
||||
one outstanding command on a queue at any given time.
|
||||
|
||||
blk_queue_init_tags(struct request_queue *q, int depth)
|
||||
|
||||
Initialize internal command tagging structures for a maximum
|
||||
depth of 'depth'.
|
||||
|
||||
blk_queue_free_tags((struct request_queue *q)
|
||||
|
||||
Teardown tag info associated with the queue. This will be done
|
||||
automatically by block if blk_queue_cleanup() is called on a queue
|
||||
that is using tagging.
|
||||
|
||||
The above are initialization and exit management, the main helpers during
|
||||
normal operations are:
|
||||
|
||||
blk_queue_start_tag(struct request_queue *q, struct request *rq)
|
||||
|
||||
Start tagged operation for this request. A free tag number between
|
||||
0 and 'depth' is assigned to the request (rq->tag holds this number),
|
||||
and 'rq' is added to the internal tag management. If the maximum depth
|
||||
for this queue is already achieved (or if the tag wasn't started for
|
||||
some other reason), 1 is returned. Otherwise 0 is returned.
|
||||
|
||||
blk_queue_end_tag(struct request_queue *q, struct request *rq)
|
||||
|
||||
End tagged operation on this request. 'rq' is removed from the internal
|
||||
book keeping structures.
|
||||
|
||||
To minimize struct request and queue overhead, the tag helpers utilize some
|
||||
of the same request members that are used for normal request queue management.
|
||||
This means that a request cannot both be an active tag and be on the queue
|
||||
list at the same time. blk_queue_start_tag() will remove the request, but
|
||||
the driver must remember to call blk_queue_end_tag() before signalling
|
||||
completion of the request to the block layer. This means ending tag
|
||||
operations before calling end_that_request_last()! For an example of a user
|
||||
of these helpers, see the IDE tagged command queueing support.
|
||||
|
||||
3.2.5.2 Tag info
|
||||
|
||||
Some block functions exist to query current tag status or to go from a
|
||||
tag number to the associated request. These are, in no particular order:
|
||||
|
||||
blk_queue_tagged(q)
|
||||
|
||||
Returns 1 if the queue 'q' is using tagging, 0 if not.
|
||||
|
||||
blk_queue_tag_request(q, tag)
|
||||
|
||||
Returns a pointer to the request associated with tag 'tag'.
|
||||
|
||||
blk_queue_tag_depth(q)
|
||||
|
||||
Return current queue depth.
|
||||
|
||||
blk_queue_tag_queue(q)
|
||||
|
||||
Returns 1 if the queue can accept a new queued command, 0 if we are
|
||||
at the maximum depth already.
|
||||
|
||||
blk_queue_rq_tagged(rq)
|
||||
|
||||
Returns 1 if the request 'rq' is tagged.
|
||||
|
||||
3.2.5.2 Internal structure
|
||||
|
||||
Internally, block manages tags in the blk_queue_tag structure:
|
||||
|
||||
struct blk_queue_tag {
|
||||
struct request **tag_index; /* array or pointers to rq */
|
||||
unsigned long *tag_map; /* bitmap of free tags */
|
||||
struct list_head busy_list; /* fifo list of busy tags */
|
||||
int busy; /* queue depth */
|
||||
int max_depth; /* max queue depth */
|
||||
};
|
||||
|
||||
Most of the above is simple and straight forward, however busy_list may need
|
||||
a bit of explaining. Normally we don't care too much about request ordering,
|
||||
but in the event of any barrier requests in the tag queue we need to ensure
|
||||
that requests are restarted in the order they were queue.
|
||||
|
||||
3.3 I/O Submission
|
||||
|
||||
The routine submit_bio() is used to submit a single io. Higher level i/o
|
||||
|
@@ -117,3 +117,28 @@ Other implications:
|
||||
size limitations and the limitations of the underlying devices. Thus
|
||||
there's no need to define ->merge_bvec_fn() callbacks for individual block
|
||||
drivers.
|
||||
|
||||
Usage of helpers:
|
||||
=================
|
||||
|
||||
* The following helpers whose names have the suffix of "_all" can only be used
|
||||
on non-BIO_CLONED bio. They are usually used by filesystem code. Drivers
|
||||
shouldn't use them because the bio may have been split before it reached the
|
||||
driver.
|
||||
|
||||
bio_for_each_segment_all()
|
||||
bio_first_bvec_all()
|
||||
bio_first_page_all()
|
||||
bio_last_bvec_all()
|
||||
|
||||
* The following helpers iterate over single-page segment. The passed 'struct
|
||||
bio_vec' will contain a single-page IO vector during the iteration
|
||||
|
||||
bio_for_each_segment()
|
||||
bio_for_each_segment_all()
|
||||
|
||||
* The following helpers iterate over multi-page bvec. The passed 'struct
|
||||
bio_vec' will contain a multi-page IO vector during the iteration
|
||||
|
||||
bio_for_each_bvec()
|
||||
rq_for_each_bvec()
|
||||
|
@@ -1,291 +0,0 @@
|
||||
CFQ (Complete Fairness Queueing)
|
||||
===============================
|
||||
|
||||
The main aim of CFQ scheduler is to provide a fair allocation of the disk
|
||||
I/O bandwidth for all the processes which request an I/O operation.
|
||||
|
||||
CFQ maintains the per process queue for the processes which request I/O
|
||||
operation (synchronous requests). In case of asynchronous requests, all the
|
||||
requests from all the processes are batched together according to their
|
||||
process's I/O priority.
|
||||
|
||||
CFQ ioscheduler tunables
|
||||
========================
|
||||
|
||||
slice_idle
|
||||
----------
|
||||
This specifies how long CFQ should idle for next request on certain cfq queues
|
||||
(for sequential workloads) and service trees (for random workloads) before
|
||||
queue is expired and CFQ selects next queue to dispatch from.
|
||||
|
||||
By default slice_idle is a non-zero value. That means by default we idle on
|
||||
queues/service trees. This can be very helpful on highly seeky media like
|
||||
single spindle SATA/SAS disks where we can cut down on overall number of
|
||||
seeks and see improved throughput.
|
||||
|
||||
Setting slice_idle to 0 will remove all the idling on queues/service tree
|
||||
level and one should see an overall improved throughput on faster storage
|
||||
devices like multiple SATA/SAS disks in hardware RAID configuration. The down
|
||||
side is that isolation provided from WRITES also goes down and notion of
|
||||
IO priority becomes weaker.
|
||||
|
||||
So depending on storage and workload, it might be useful to set slice_idle=0.
|
||||
In general I think for SATA/SAS disks and software RAID of SATA/SAS disks
|
||||
keeping slice_idle enabled should be useful. For any configurations where
|
||||
there are multiple spindles behind single LUN (Host based hardware RAID
|
||||
controller or for storage arrays), setting slice_idle=0 might end up in better
|
||||
throughput and acceptable latencies.
|
||||
|
||||
back_seek_max
|
||||
-------------
|
||||
This specifies, given in Kbytes, the maximum "distance" for backward seeking.
|
||||
The distance is the amount of space from the current head location to the
|
||||
sectors that are backward in terms of distance.
|
||||
|
||||
This parameter allows the scheduler to anticipate requests in the "backward"
|
||||
direction and consider them as being the "next" if they are within this
|
||||
distance from the current head location.
|
||||
|
||||
back_seek_penalty
|
||||
-----------------
|
||||
This parameter is used to compute the cost of backward seeking. If the
|
||||
backward distance of request is just 1/back_seek_penalty from a "front"
|
||||
request, then the seeking cost of two requests is considered equivalent.
|
||||
|
||||
So scheduler will not bias toward one or the other request (otherwise scheduler
|
||||
will bias toward front request). Default value of back_seek_penalty is 2.
|
||||
|
||||
fifo_expire_async
|
||||
-----------------
|
||||
This parameter is used to set the timeout of asynchronous requests. Default
|
||||
value of this is 248ms.
|
||||
|
||||
fifo_expire_sync
|
||||
----------------
|
||||
This parameter is used to set the timeout of synchronous requests. Default
|
||||
value of this is 124ms. To favor synchronous requests over asynchronous
|
||||
ones, this value should be decreased relative to fifo_expire_async.
|
||||
|
||||
group_idle
|
||||
-----------
|
||||
This parameter forces idling at the CFQ group level instead of CFQ
|
||||
queue level. This was introduced after a bottleneck was observed
|
||||
in higher end storage due to idle on sequential queue and allow dispatch
|
||||
from a single queue. The idea with this parameter is that it can be run with
|
||||
slice_idle=0 and group_idle=8, so that idling does not happen on individual
|
||||
queues in the group but happens overall on the group and thus still keeps the
|
||||
IO controller working.
|
||||
Not idling on individual queues in the group will dispatch requests from
|
||||
multiple queues in the group at the same time and achieve higher throughput
|
||||
on higher end storage.
|
||||
|
||||
Default value for this parameter is 8ms.
|
||||
|
||||
low_latency
|
||||
-----------
|
||||
This parameter is used to enable/disable the low latency mode of the CFQ
|
||||
scheduler. If enabled, CFQ tries to recompute the slice time for each process
|
||||
based on the target_latency set for the system. This favors fairness over
|
||||
throughput. Disabling low latency (setting it to 0) ignores target latency,
|
||||
allowing each process in the system to get a full time slice.
|
||||
|
||||
By default low latency mode is enabled.
|
||||
|
||||
target_latency
|
||||
--------------
|
||||
This parameter is used to calculate the time slice for a process if cfq's
|
||||
latency mode is enabled. It will ensure that sync requests have an estimated
|
||||
latency. But if sequential workload is higher (e.g. sequential read),
|
||||
then to meet the latency constraints, throughput may decrease because of less
|
||||
time for each process to issue I/O request before the cfq queue is switched.
|
||||
|
||||
Though this can be overcome by disabling the latency_mode, it may increase
|
||||
the read latency for some applications. This parameter allows for changing
|
||||
target_latency through the sysfs interface which can provide the balanced
|
||||
throughput and read latency.
|
||||
|
||||
Default value for target_latency is 300ms.
|
||||
|
||||
slice_async
|
||||
-----------
|
||||
This parameter is the same as slice_sync but for asynchronous queues. The
|
||||
default value is 40ms.
|
||||
|
||||
slice_async_rq
|
||||
--------------
|
||||
This parameter is used to limit the dispatching of asynchronous request to
|
||||
device request queue in queue's slice time. The maximum number of request that
|
||||
are allowed to be dispatched also depends upon the io priority. Default value
|
||||
for this is 2.
|
||||
|
||||
slice_sync
|
||||
----------
|
||||
When a queue is selected for execution, the queue's IO requests are only
|
||||
executed for a certain amount of time (time_slice) before switching to another
|
||||
queue. This parameter is used to calculate the time slice of synchronous
|
||||
queue.
|
||||
|
||||
time_slice is computed using the below equation:-
|
||||
time_slice = slice_sync + (slice_sync/5 * (4 - prio)). To increase the
|
||||
time_slice of synchronous queue, increase the value of slice_sync. Default
|
||||
value is 100ms.
|
||||
|
||||
quantum
|
||||
-------
|
||||
This specifies the number of request dispatched to the device queue. In a
|
||||
queue's time slice, a request will not be dispatched if the number of request
|
||||
in the device exceeds this parameter. This parameter is used for synchronous
|
||||
request.
|
||||
|
||||
In case of storage with several disk, this setting can limit the parallel
|
||||
processing of request. Therefore, increasing the value can improve the
|
||||
performance although this can cause the latency of some I/O to increase due
|
||||
to more number of requests.
|
||||
|
||||
CFQ Group scheduling
|
||||
====================
|
||||
|
||||
CFQ supports blkio cgroup and has "blkio." prefixed files in each
|
||||
blkio cgroup directory. It is weight-based and there are four knobs
|
||||
for configuration - weight[_device] and leaf_weight[_device].
|
||||
Internal cgroup nodes (the ones with children) can also have tasks in
|
||||
them, so the former two configure how much proportion the cgroup as a
|
||||
whole is entitled to at its parent's level while the latter two
|
||||
configure how much proportion the tasks in the cgroup have compared to
|
||||
its direct children.
|
||||
|
||||
Another way to think about it is assuming that each internal node has
|
||||
an implicit leaf child node which hosts all the tasks whose weight is
|
||||
configured by leaf_weight[_device]. Let's assume a blkio hierarchy
|
||||
composed of five cgroups - root, A, B, AA and AB - with the following
|
||||
weights where the names represent the hierarchy.
|
||||
|
||||
weight leaf_weight
|
||||
root : 125 125
|
||||
A : 500 750
|
||||
B : 250 500
|
||||
AA : 500 500
|
||||
AB : 1000 500
|
||||
|
||||
root never has a parent, making its weight meaningless. For backward
|
||||
compatibility, weight is always kept in sync with leaf_weight. B, AA
|
||||
and AB have no children and thus their tasks have no child cgroups to
|
||||
compete with. They always get 100% of what the cgroup won at the
|
||||
parent level. Considering only the weights which matter, the hierarchy
|
||||
looks like the following.
|
||||
|
||||
root
|
||||
/ | \
|
||||
A B leaf
|
||||
500 250 125
|
||||
/ | \
|
||||
AA AB leaf
|
||||
500 1000 750
|
||||
|
||||
If all cgroups have active IOs and competing with each other, disk
|
||||
time will be distributed like the following.
|
||||
|
||||
Distribution below root. The total active weight at this level is
|
||||
A:500 + B:250 + root-leaf:125 = 875.
|
||||
|
||||
root-leaf : 125 / 875 =~ 14%
|
||||
A : 500 / 875 =~ 57%
|
||||
B(-leaf) : 250 / 875 =~ 28%
|
||||
|
||||
A has children and further distributes its 57% among the children and
|
||||
the implicit leaf node. The total active weight at this level is
|
||||
AA:500 + AB:1000 + A-leaf:750 = 2250.
|
||||
|
||||
A-leaf : ( 750 / 2250) * A =~ 19%
|
||||
AA(-leaf) : ( 500 / 2250) * A =~ 12%
|
||||
AB(-leaf) : (1000 / 2250) * A =~ 25%
|
||||
|
||||
CFQ IOPS Mode for group scheduling
|
||||
===================================
|
||||
Basic CFQ design is to provide priority based time slices. Higher priority
|
||||
process gets bigger time slice and lower priority process gets smaller time
|
||||
slice. Measuring time becomes harder if storage is fast and supports NCQ and
|
||||
it would be better to dispatch multiple requests from multiple cfq queues in
|
||||
request queue at a time. In such scenario, it is not possible to measure time
|
||||
consumed by single queue accurately.
|
||||
|
||||
What is possible though is to measure number of requests dispatched from a
|
||||
single queue and also allow dispatch from multiple cfq queue at the same time.
|
||||
This effectively becomes the fairness in terms of IOPS (IO operations per
|
||||
second).
|
||||
|
||||
If one sets slice_idle=0 and if storage supports NCQ, CFQ internally switches
|
||||
to IOPS mode and starts providing fairness in terms of number of requests
|
||||
dispatched. Note that this mode switching takes effect only for group
|
||||
scheduling. For non-cgroup users nothing should change.
|
||||
|
||||
CFQ IO scheduler Idling Theory
|
||||
===============================
|
||||
Idling on a queue is primarily about waiting for the next request to come
|
||||
on same queue after completion of a request. In this process CFQ will not
|
||||
dispatch requests from other cfq queues even if requests are pending there.
|
||||
|
||||
The rationale behind idling is that it can cut down on number of seeks
|
||||
on rotational media. For example, if a process is doing dependent
|
||||
sequential reads (next read will come on only after completion of previous
|
||||
one), then not dispatching request from other queue should help as we
|
||||
did not move the disk head and kept on dispatching sequential IO from
|
||||
one queue.
|
||||
|
||||
CFQ has following service trees and various queues are put on these trees.
|
||||
|
||||
sync-idle sync-noidle async
|
||||
|
||||
All cfq queues doing synchronous sequential IO go on to sync-idle tree.
|
||||
On this tree we idle on each queue individually.
|
||||
|
||||
All synchronous non-sequential queues go on sync-noidle tree. Also any
|
||||
synchronous write request which is not marked with REQ_IDLE goes on this
|
||||
service tree. On this tree we do not idle on individual queues instead idle
|
||||
on the whole group of queues or the tree. So if there are 4 queues waiting
|
||||
for IO to dispatch we will idle only once last queue has dispatched the IO
|
||||
and there is no more IO on this service tree.
|
||||
|
||||
All async writes go on async service tree. There is no idling on async
|
||||
queues.
|
||||
|
||||
CFQ has some optimizations for SSDs and if it detects a non-rotational
|
||||
media which can support higher queue depth (multiple requests in
|
||||
flight at a time), then it cuts down on idling of individual queues and
|
||||
all the queues move to sync-noidle tree and only tree idle remains. This
|
||||
tree idling provides isolation with buffered write queues on async tree.
|
||||
|
||||
FAQ
|
||||
===
|
||||
Q1. Why idle at all on queues not marked with REQ_IDLE?
|
||||
|
||||
A1. We only do tree idle (all queues on sync-noidle tree) on queues not marked
|
||||
with REQ_IDLE. This helps in providing isolation with all the sync-idle
|
||||
queues. Otherwise in presence of many sequential readers, other
|
||||
synchronous IO might not get fair share of disk.
|
||||
|
||||
For example, if there are 10 sequential readers doing IO and they get
|
||||
100ms each. If a !REQ_IDLE request comes in, it will be scheduled
|
||||
roughly after 1 second. If after completion of !REQ_IDLE request we
|
||||
do not idle, and after a couple of milliseconds another !REQ_IDLE
|
||||
request comes in, again it will be scheduled after 1 second. Repeat it
|
||||
and notice how a workload can lose its disk share and suffer due to
|
||||
multiple sequential readers.
|
||||
|
||||
fsync can generate dependent IO where bunch of data is written in the
|
||||
context of fsync, and later some journaling data is written. Journaling
|
||||
data comes in only after fsync has finished its IO (at least for ext4
|
||||
that seemed to be the case). Now if one decides not to idle on fsync
|
||||
thread due to !REQ_IDLE, then next journaling write will not get
|
||||
scheduled for another second. A process doing small fsync, will suffer
|
||||
badly in presence of multiple sequential readers.
|
||||
|
||||
Hence doing tree idling on threads using !REQ_IDLE flag on requests
|
||||
provides isolation from multiple sequential readers and at the same
|
||||
time we do not idle on individual threads.
|
||||
|
||||
Q2. When to specify REQ_IDLE
|
||||
A2. I would think whenever one is doing synchronous write and expecting
|
||||
more writes to be dispatched from same context soon, should be able
|
||||
to specify REQ_IDLE on writes and that probably should work well for
|
||||
most of the cases.
|
@@ -88,7 +88,8 @@ shared_tags=[0/1]: Default: 0
|
||||
|
||||
zoned=[0/1]: Default: 0
|
||||
0: Block device is exposed as a random-access block device.
|
||||
1: Block device is exposed as a host-managed zoned block device.
|
||||
1: Block device is exposed as a host-managed zoned block device. Requires
|
||||
CONFIG_BLK_DEV_ZONED.
|
||||
|
||||
zone_size=[MB]: Default: 256
|
||||
Per zone size when exposed as a zoned block device. Must be a power of two.
|
||||
|