Merge 5.5-rc1 into android-mainline
Linux 5.5-rc1 Signed-off-by: Greg Kroah-Hartman <gregkh@google.com> Change-Id: I6f952ebdd40746115165a2f99bab340482f5c237
This commit is contained in:
2
.gitattributes
vendored
2
.gitattributes
vendored
@@ -1,2 +1,4 @@
|
|||||||
*.c diff=cpp
|
*.c diff=cpp
|
||||||
*.h diff=cpp
|
*.h diff=cpp
|
||||||
|
*.dtsi diff=dts
|
||||||
|
*.dts diff=dts
|
||||||
|
2
.gitignore
vendored
2
.gitignore
vendored
@@ -32,7 +32,6 @@
|
|||||||
*.lzo
|
*.lzo
|
||||||
*.mod
|
*.mod
|
||||||
*.mod.c
|
*.mod.c
|
||||||
*.ns_deps
|
|
||||||
*.o
|
*.o
|
||||||
*.o.*
|
*.o.*
|
||||||
*.patch
|
*.patch
|
||||||
@@ -61,6 +60,7 @@ modules.order
|
|||||||
/System.map
|
/System.map
|
||||||
/Module.markers
|
/Module.markers
|
||||||
/modules.builtin.modinfo
|
/modules.builtin.modinfo
|
||||||
|
/modules.nsdeps
|
||||||
|
|
||||||
#
|
#
|
||||||
# RPM spec file (make rpm-pkg)
|
# RPM spec file (make rpm-pkg)
|
||||||
|
5
.mailmap
5
.mailmap
@@ -32,6 +32,7 @@ Andy Adamson <andros@citi.umich.edu>
|
|||||||
Antoine Tenart <antoine.tenart@free-electrons.com>
|
Antoine Tenart <antoine.tenart@free-electrons.com>
|
||||||
Antonio Ospite <ao2@ao2.it> <ao2@amarulasolutions.com>
|
Antonio Ospite <ao2@ao2.it> <ao2@amarulasolutions.com>
|
||||||
Archit Taneja <archit@ti.com>
|
Archit Taneja <archit@ti.com>
|
||||||
|
Ard Biesheuvel <ardb@kernel.org> <ard.biesheuvel@linaro.org>
|
||||||
Arnaud Patard <arnaud.patard@rtp-net.org>
|
Arnaud Patard <arnaud.patard@rtp-net.org>
|
||||||
Arnd Bergmann <arnd@arndb.de>
|
Arnd Bergmann <arnd@arndb.de>
|
||||||
Axel Dyks <xl@xlsigned.net>
|
Axel Dyks <xl@xlsigned.net>
|
||||||
@@ -104,6 +105,9 @@ James E Wilson <wilson@specifix.com>
|
|||||||
James Hogan <jhogan@kernel.org> <james.hogan@imgtec.com>
|
James Hogan <jhogan@kernel.org> <james.hogan@imgtec.com>
|
||||||
James Hogan <jhogan@kernel.org> <james@albanarts.com>
|
James Hogan <jhogan@kernel.org> <james@albanarts.com>
|
||||||
James Ketrenos <jketreno@io.(none)>
|
James Ketrenos <jketreno@io.(none)>
|
||||||
|
Jan Glauber <jan.glauber@gmail.com> <jang@de.ibm.com>
|
||||||
|
Jan Glauber <jan.glauber@gmail.com> <jang@linux.vnet.ibm.com>
|
||||||
|
Jan Glauber <jan.glauber@gmail.com> <jglauber@cavium.com>
|
||||||
Jason Gunthorpe <jgg@ziepe.ca> <jgg@mellanox.com>
|
Jason Gunthorpe <jgg@ziepe.ca> <jgg@mellanox.com>
|
||||||
Jason Gunthorpe <jgg@ziepe.ca> <jgunthorpe@obsidianresearch.com>
|
Jason Gunthorpe <jgg@ziepe.ca> <jgunthorpe@obsidianresearch.com>
|
||||||
Javi Merino <javi.merino@kernel.org> <javi.merino@arm.com>
|
Javi Merino <javi.merino@kernel.org> <javi.merino@arm.com>
|
||||||
@@ -155,6 +159,7 @@ Mark Brown <broonie@sirena.org.uk>
|
|||||||
Mark Yao <markyao0591@gmail.com> <mark.yao@rock-chips.com>
|
Mark Yao <markyao0591@gmail.com> <mark.yao@rock-chips.com>
|
||||||
Martin Kepplinger <martink@posteo.de> <martin.kepplinger@theobroma-systems.com>
|
Martin Kepplinger <martink@posteo.de> <martin.kepplinger@theobroma-systems.com>
|
||||||
Martin Kepplinger <martink@posteo.de> <martin.kepplinger@ginzinger.com>
|
Martin Kepplinger <martink@posteo.de> <martin.kepplinger@ginzinger.com>
|
||||||
|
Martin Kepplinger <martink@posteo.de> <martin.kepplinger@puri.sm>
|
||||||
Mathieu Othacehe <m.othacehe@gmail.com>
|
Mathieu Othacehe <m.othacehe@gmail.com>
|
||||||
Matthew Wilcox <willy@infradead.org> <matthew.r.wilcox@intel.com>
|
Matthew Wilcox <willy@infradead.org> <matthew.r.wilcox@intel.com>
|
||||||
Matthew Wilcox <willy@infradead.org> <matthew@wil.cx>
|
Matthew Wilcox <willy@infradead.org> <matthew@wil.cx>
|
||||||
|
3
CREDITS
3
CREDITS
@@ -1875,8 +1875,9 @@ S: The Netherlands
|
|||||||
|
|
||||||
N: Martin Kepplinger
|
N: Martin Kepplinger
|
||||||
E: martink@posteo.de
|
E: martink@posteo.de
|
||||||
E: martin.kepplinger@ginzinger.com
|
E: martin.kepplinger@puri.sm
|
||||||
W: http://www.martinkepplinger.com
|
W: http://www.martinkepplinger.com
|
||||||
|
P: 4096R/5AB387D3 F208 2B88 0F9E 4239 3468 6E3F 5003 98DF 5AB3 87D3
|
||||||
D: mma8452 accelerators iio driver
|
D: mma8452 accelerators iio driver
|
||||||
D: pegasus_notetaker input driver
|
D: pegasus_notetaker input driver
|
||||||
D: Kernel fixes and cleanups
|
D: Kernel fixes and cleanups
|
||||||
|
@@ -314,25 +314,6 @@ Description:
|
|||||||
board_id: (RO) Manufacturing board ID
|
board_id: (RO) Manufacturing board ID
|
||||||
|
|
||||||
|
|
||||||
sysfs interface for Chelsio T3 RDMA Driver (cxgb3)
|
|
||||||
--------------------------------------------------
|
|
||||||
|
|
||||||
What: /sys/class/infiniband/cxgb3_X/hw_rev
|
|
||||||
What: /sys/class/infiniband/cxgb3_X/hca_type
|
|
||||||
What: /sys/class/infiniband/cxgb3_X/board_id
|
|
||||||
Date: Feb, 2007
|
|
||||||
KernelVersion: v2.6.21
|
|
||||||
Contact: linux-rdma@vger.kernel.org
|
|
||||||
Description:
|
|
||||||
hw_rev: (RO) Hardware revision number
|
|
||||||
|
|
||||||
hca_type: (RO) HCA type. Here it is a driver short name.
|
|
||||||
It should normally match the name in its bus
|
|
||||||
driver structure (e.g. pci_driver::name).
|
|
||||||
|
|
||||||
board_id: (RO) Manufacturing board id
|
|
||||||
|
|
||||||
|
|
||||||
sysfs interface for Mellanox ConnectX HCA IB driver (mlx4)
|
sysfs interface for Mellanox ConnectX HCA IB driver (mlx4)
|
||||||
----------------------------------------------------------
|
----------------------------------------------------------
|
||||||
|
|
||||||
|
@@ -6,10 +6,19 @@ Description: Configures which IO port the host side of the UART
|
|||||||
Users: OpenBMC. Proposed changes should be mailed to
|
Users: OpenBMC. Proposed changes should be mailed to
|
||||||
openbmc@lists.ozlabs.org
|
openbmc@lists.ozlabs.org
|
||||||
|
|
||||||
What: /sys/bus/platform/drivers/aspeed-vuart*/sirq
|
What: /sys/bus/platform/drivers/aspeed-vuart/*/sirq
|
||||||
Date: April 2017
|
Date: April 2017
|
||||||
Contact: Jeremy Kerr <jk@ozlabs.org>
|
Contact: Jeremy Kerr <jk@ozlabs.org>
|
||||||
Description: Configures which interrupt number the host side of
|
Description: Configures which interrupt number the host side of
|
||||||
the UART will appear on the host <-> BMC LPC bus.
|
the UART will appear on the host <-> BMC LPC bus.
|
||||||
Users: OpenBMC. Proposed changes should be mailed to
|
Users: OpenBMC. Proposed changes should be mailed to
|
||||||
openbmc@lists.ozlabs.org
|
openbmc@lists.ozlabs.org
|
||||||
|
|
||||||
|
What: /sys/bus/platform/drivers/aspeed-vuart/*/sirq_polarity
|
||||||
|
Date: July 2019
|
||||||
|
Contact: Oskar Senft <osk@google.com>
|
||||||
|
Description: Configures the polarity of the serial interrupt to the
|
||||||
|
host via the BMC LPC bus.
|
||||||
|
Set to 0 for active-low or 1 for active-high.
|
||||||
|
Users: OpenBMC. Proposed changes should be mailed to
|
||||||
|
openbmc@lists.ozlabs.org
|
||||||
|
@@ -67,6 +67,8 @@ Description: Interface for making ib_srp connect to a new target.
|
|||||||
initiator is allowed to queue per SCSI host. The default
|
initiator is allowed to queue per SCSI host. The default
|
||||||
value for this parameter is 62. The lowest supported value
|
value for this parameter is 62. The lowest supported value
|
||||||
is 2.
|
is 2.
|
||||||
|
* max_it_iu_size, a decimal number specifying the maximum
|
||||||
|
initiator to target information unit length.
|
||||||
|
|
||||||
What: /sys/class/infiniband_srp/srp-<hca>-<port_number>/ibdev
|
What: /sys/class/infiniband_srp/srp-<hca>-<port_number>/ibdev
|
||||||
Date: January 2, 2006
|
Date: January 2, 2006
|
||||||
|
57
Documentation/ABI/testing/debugfs-hisi-hpre
Normal file
57
Documentation/ABI/testing/debugfs-hisi-hpre
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
What: /sys/kernel/debug/hisi_hpre/<bdf>/cluster[0-3]/regs
|
||||||
|
Date: Sep 2019
|
||||||
|
Contact: linux-crypto@vger.kernel.org
|
||||||
|
Description: Dump debug registers from the HPRE cluster.
|
||||||
|
Only available for PF.
|
||||||
|
|
||||||
|
What: /sys/kernel/debug/hisi_hpre/<bdf>/cluster[0-3]/cluster_ctrl
|
||||||
|
Date: Sep 2019
|
||||||
|
Contact: linux-crypto@vger.kernel.org
|
||||||
|
Description: Write the HPRE core selection in the cluster into this file,
|
||||||
|
and then we can read the debug information of the core.
|
||||||
|
Only available for PF.
|
||||||
|
|
||||||
|
What: /sys/kernel/debug/hisi_hpre/<bdf>/rdclr_en
|
||||||
|
Date: Sep 2019
|
||||||
|
Contact: linux-crypto@vger.kernel.org
|
||||||
|
Description: HPRE cores debug registers read clear control. 1 means enable
|
||||||
|
register read clear, otherwise 0. Writing to this file has no
|
||||||
|
functional effect, only enable or disable counters clear after
|
||||||
|
reading of these registers.
|
||||||
|
Only available for PF.
|
||||||
|
|
||||||
|
What: /sys/kernel/debug/hisi_hpre/<bdf>/current_qm
|
||||||
|
Date: Sep 2019
|
||||||
|
Contact: linux-crypto@vger.kernel.org
|
||||||
|
Description: One HPRE controller has one PF and multiple VFs, each function
|
||||||
|
has a QM. Select the QM that the 'qm' entries below refer to.
|
||||||
|
Only available for PF.
|
||||||
|
|
||||||
|
What: /sys/kernel/debug/hisi_hpre/<bdf>/regs
|
||||||
|
Date: Sep 2019
|
||||||
|
Contact: linux-crypto@vger.kernel.org
|
||||||
|
Description: Dump debug registers from the HPRE.
|
||||||
|
Only available for PF.
|
||||||
|
|
||||||
|
What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/qm_regs
|
||||||
|
Date: Sep 2019
|
||||||
|
Contact: linux-crypto@vger.kernel.org
|
||||||
|
Description: Dump debug registers from the QM.
|
||||||
|
Available for PF and VF in host. VF in guest currently only
|
||||||
|
has one debug register.
|
||||||
|
|
||||||
|
What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/current_q
|
||||||
|
Date: Sep 2019
|
||||||
|
Contact: linux-crypto@vger.kernel.org
|
||||||
|
Description: One QM may contain multiple queues. Select specific queue to
|
||||||
|
show its debug registers in above qm_regs.
|
||||||
|
Only available for PF.
|
||||||
|
|
||||||
|
What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/clear_enable
|
||||||
|
Date: Sep 2019
|
||||||
|
Contact: linux-crypto@vger.kernel.org
|
||||||
|
Description: QM debug registers(qm_regs) read clear control. 1 means enable
|
||||||
|
register read clear, otherwise 0.
|
||||||
|
Writing to this file has no functional effect, only enable or
|
||||||
|
disable counters clear after reading of these registers.
|
||||||
|
Only available for PF.
|
43
Documentation/ABI/testing/debugfs-hisi-sec
Normal file
43
Documentation/ABI/testing/debugfs-hisi-sec
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
What: /sys/kernel/debug/hisi_sec/<bdf>/sec_dfx
|
||||||
|
Date: Oct 2019
|
||||||
|
Contact: linux-crypto@vger.kernel.org
|
||||||
|
Description: Dump the debug registers of SEC cores.
|
||||||
|
Only available for PF.
|
||||||
|
|
||||||
|
What: /sys/kernel/debug/hisi_sec/<bdf>/clear_enable
|
||||||
|
Date: Oct 2019
|
||||||
|
Contact: linux-crypto@vger.kernel.org
|
||||||
|
Description: Enabling/disabling of clear action after reading
|
||||||
|
the SEC debug registers.
|
||||||
|
0: disable, 1: enable.
|
||||||
|
Only available for PF, and has no other effect on SEC.
|
||||||
|
|
||||||
|
What: /sys/kernel/debug/hisi_sec/<bdf>/current_qm
|
||||||
|
Date: Oct 2019
|
||||||
|
Contact: linux-crypto@vger.kernel.org
|
||||||
|
Description: One SEC controller has one PF and multiple VFs, each function
|
||||||
|
has a QM. This file can be used to select the QM which below
|
||||||
|
qm refers to.
|
||||||
|
Only available for PF.
|
||||||
|
|
||||||
|
What: /sys/kernel/debug/hisi_sec/<bdf>/qm/qm_regs
|
||||||
|
Date: Oct 2019
|
||||||
|
Contact: linux-crypto@vger.kernel.org
|
||||||
|
Description: Dump of QM related debug registers.
|
||||||
|
Available for PF and VF in host. VF in guest currently only
|
||||||
|
has one debug register.
|
||||||
|
|
||||||
|
What: /sys/kernel/debug/hisi_sec/<bdf>/qm/current_q
|
||||||
|
Date: Oct 2019
|
||||||
|
Contact: linux-crypto@vger.kernel.org
|
||||||
|
Description: One QM of SEC may contain multiple queues. Select specific
|
||||||
|
queue to show its debug registers in above 'qm_regs'.
|
||||||
|
Only available for PF.
|
||||||
|
|
||||||
|
What: /sys/kernel/debug/hisi_sec/<bdf>/qm/clear_enable
|
||||||
|
Date: Oct 2019
|
||||||
|
Contact: linux-crypto@vger.kernel.org
|
||||||
|
Description: Enabling/disabling of clear action after reading
|
||||||
|
the SEC's QM debug registers.
|
||||||
|
0: disable, 1: enable.
|
||||||
|
Only available for PF, and has no other effect on SEC.
|
23
Documentation/ABI/testing/debugfs-hyperv
Normal file
23
Documentation/ABI/testing/debugfs-hyperv
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
What: /sys/kernel/debug/hyperv/<UUID>/fuzz_test_state
|
||||||
|
Date: October 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Branden Bonaby <brandonbonaby94@gmail.com>
|
||||||
|
Description: Fuzz testing status of a vmbus device, whether it's in an ON
|
||||||
|
state or an OFF state
|
||||||
|
Users: Debugging tools
|
||||||
|
|
||||||
|
What: /sys/kernel/debug/hyperv/<UUID>/delay/fuzz_test_buffer_interrupt_delay
|
||||||
|
Date: October 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Branden Bonaby <brandonbonaby94@gmail.com>
|
||||||
|
Description: Fuzz testing buffer interrupt delay value between 0 - 1000
|
||||||
|
microseconds (inclusive).
|
||||||
|
Users: Debugging tools
|
||||||
|
|
||||||
|
What: /sys/kernel/debug/hyperv/<UUID>/delay/fuzz_test_message_delay
|
||||||
|
Date: October 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Branden Bonaby <brandonbonaby94@gmail.com>
|
||||||
|
Description: Fuzz testing message delay value between 0 - 1000 microseconds
|
||||||
|
(inclusive).
|
||||||
|
Users: Debugging tools
|
@@ -25,6 +25,7 @@ Description:
|
|||||||
lsm: [[subj_user=] [subj_role=] [subj_type=]
|
lsm: [[subj_user=] [subj_role=] [subj_type=]
|
||||||
[obj_user=] [obj_role=] [obj_type=]]
|
[obj_user=] [obj_role=] [obj_type=]]
|
||||||
option: [[appraise_type=]] [template=] [permit_directio]
|
option: [[appraise_type=]] [template=] [permit_directio]
|
||||||
|
[appraise_flag=]
|
||||||
base: func:= [BPRM_CHECK][MMAP_CHECK][CREDS_CHECK][FILE_CHECK][MODULE_CHECK]
|
base: func:= [BPRM_CHECK][MMAP_CHECK][CREDS_CHECK][FILE_CHECK][MODULE_CHECK]
|
||||||
[FIRMWARE_CHECK]
|
[FIRMWARE_CHECK]
|
||||||
[KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK]
|
[KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK]
|
||||||
@@ -38,6 +39,9 @@ Description:
|
|||||||
fowner:= decimal value
|
fowner:= decimal value
|
||||||
lsm: are LSM specific
|
lsm: are LSM specific
|
||||||
option: appraise_type:= [imasig] [imasig|modsig]
|
option: appraise_type:= [imasig] [imasig|modsig]
|
||||||
|
appraise_flag:= [check_blacklist]
|
||||||
|
Currently, blacklist check is only for files signed with appended
|
||||||
|
signature.
|
||||||
template:= name of a defined IMA template type
|
template:= name of a defined IMA template type
|
||||||
(eg, ima-ng). Only valid when action is "measure".
|
(eg, ima-ng). Only valid when action is "measure".
|
||||||
pcr:= decimal value
|
pcr:= decimal value
|
||||||
|
@@ -29,4 +29,9 @@ Description:
|
|||||||
17 - sectors discarded
|
17 - sectors discarded
|
||||||
18 - time spent discarding
|
18 - time spent discarding
|
||||||
|
|
||||||
|
Kernel 5.5+ appends two more fields for flush requests:
|
||||||
|
|
||||||
|
19 - flush requests completed successfully
|
||||||
|
20 - time spent flushing
|
||||||
|
|
||||||
For more details refer to Documentation/admin-guide/iostats.rst
|
For more details refer to Documentation/admin-guide/iostats.rst
|
||||||
|
@@ -15,6 +15,12 @@ Description:
|
|||||||
9 - I/Os currently in progress
|
9 - I/Os currently in progress
|
||||||
10 - time spent doing I/Os (ms)
|
10 - time spent doing I/Os (ms)
|
||||||
11 - weighted time spent doing I/Os (ms)
|
11 - weighted time spent doing I/Os (ms)
|
||||||
|
12 - discards completed
|
||||||
|
13 - discards merged
|
||||||
|
14 - sectors discarded
|
||||||
|
15 - time spent discarding (ms)
|
||||||
|
16 - flush requests completed
|
||||||
|
17 - time spent flushing (ms)
|
||||||
For more details refer to Documentation/admin-guide/iostats.rst
|
For more details refer to Documentation/admin-guide/iostats.rst
|
||||||
|
|
||||||
|
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/enable_source
|
What: /sys/bus/coresight/devices/etm<N>/enable_source
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
@@ -8,82 +8,82 @@ Description: (RW) Enable/disable tracing on this specific trace entity.
|
|||||||
of coresight components linking the source to the sink is
|
of coresight components linking the source to the sink is
|
||||||
configured and managed automatically by the coresight framework.
|
configured and managed automatically by the coresight framework.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/cpu
|
What: /sys/bus/coresight/devices/etm<N>/cpu
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) The CPU this tracing entity is associated with.
|
Description: (R) The CPU this tracing entity is associated with.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/nr_pe_cmp
|
What: /sys/bus/coresight/devices/etm<N>/nr_pe_cmp
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Indicates the number of PE comparator inputs that are
|
Description: (R) Indicates the number of PE comparator inputs that are
|
||||||
available for tracing.
|
available for tracing.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/nr_addr_cmp
|
What: /sys/bus/coresight/devices/etm<N>/nr_addr_cmp
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Indicates the number of address comparator pairs that are
|
Description: (R) Indicates the number of address comparator pairs that are
|
||||||
available for tracing.
|
available for tracing.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/nr_cntr
|
What: /sys/bus/coresight/devices/etm<N>/nr_cntr
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Indicates the number of counters that are available for
|
Description: (R) Indicates the number of counters that are available for
|
||||||
tracing.
|
tracing.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/nr_ext_inp
|
What: /sys/bus/coresight/devices/etm<N>/nr_ext_inp
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Indicates how many external inputs are implemented.
|
Description: (R) Indicates how many external inputs are implemented.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/numcidc
|
What: /sys/bus/coresight/devices/etm<N>/numcidc
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Indicates the number of Context ID comparators that are
|
Description: (R) Indicates the number of Context ID comparators that are
|
||||||
available for tracing.
|
available for tracing.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/numvmidc
|
What: /sys/bus/coresight/devices/etm<N>/numvmidc
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Indicates the number of VMID comparators that are available
|
Description: (R) Indicates the number of VMID comparators that are available
|
||||||
for tracing.
|
for tracing.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/nrseqstate
|
What: /sys/bus/coresight/devices/etm<N>/nrseqstate
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Indicates the number of sequencer states that are
|
Description: (R) Indicates the number of sequencer states that are
|
||||||
implemented.
|
implemented.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/nr_resource
|
What: /sys/bus/coresight/devices/etm<N>/nr_resource
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Indicates the number of resource selection pairs that are
|
Description: (R) Indicates the number of resource selection pairs that are
|
||||||
available for tracing.
|
available for tracing.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/nr_ss_cmp
|
What: /sys/bus/coresight/devices/etm<N>/nr_ss_cmp
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Indicates the number of single-shot comparator controls that
|
Description: (R) Indicates the number of single-shot comparator controls that
|
||||||
are available for tracing.
|
are available for tracing.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/reset
|
What: /sys/bus/coresight/devices/etm<N>/reset
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (W) Cancels all configuration on a trace unit and set it back
|
Description: (W) Cancels all configuration on a trace unit and set it back
|
||||||
to its boot configuration.
|
to its boot configuration.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/mode
|
What: /sys/bus/coresight/devices/etm<N>/mode
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
@@ -91,302 +91,349 @@ Description: (RW) Controls various modes supported by this ETM, for example
|
|||||||
P0 instruction tracing, branch broadcast, cycle counting and
|
P0 instruction tracing, branch broadcast, cycle counting and
|
||||||
context ID tracing.
|
context ID tracing.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/pe
|
What: /sys/bus/coresight/devices/etm<N>/pe
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Controls which PE to trace.
|
Description: (RW) Controls which PE to trace.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/event
|
What: /sys/bus/coresight/devices/etm<N>/event
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Controls the tracing of arbitrary events from bank 0 to 3.
|
Description: (RW) Controls the tracing of arbitrary events from bank 0 to 3.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/event_instren
|
What: /sys/bus/coresight/devices/etm<N>/event_instren
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Controls the behavior of the events in bank 0 to 3.
|
Description: (RW) Controls the behavior of the events in bank 0 to 3.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/event_ts
|
What: /sys/bus/coresight/devices/etm<N>/event_ts
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Controls the insertion of global timestamps in the trace
|
Description: (RW) Controls the insertion of global timestamps in the trace
|
||||||
streams.
|
streams.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/syncfreq
|
What: /sys/bus/coresight/devices/etm<N>/syncfreq
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Controls how often trace synchronization requests occur.
|
Description: (RW) Controls how often trace synchronization requests occur.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/cyc_threshold
|
What: /sys/bus/coresight/devices/etm<N>/cyc_threshold
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Sets the threshold value for cycle counting.
|
Description: (RW) Sets the threshold value for cycle counting.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/bb_ctrl
|
What: /sys/bus/coresight/devices/etm<N>/bb_ctrl
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Controls which regions in the memory map are enabled to
|
Description: (RW) Controls which regions in the memory map are enabled to
|
||||||
use branch broadcasting.
|
use branch broadcasting.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/event_vinst
|
What: /sys/bus/coresight/devices/etm<N>/event_vinst
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Controls instruction trace filtering.
|
Description: (RW) Controls instruction trace filtering.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/s_exlevel_vinst
|
What: /sys/bus/coresight/devices/etm<N>/s_exlevel_vinst
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) In Secure state, each bit controls whether instruction
|
Description: (RW) In Secure state, each bit controls whether instruction
|
||||||
tracing is enabled for the corresponding exception level.
|
tracing is enabled for the corresponding exception level.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/ns_exlevel_vinst
|
What: /sys/bus/coresight/devices/etm<N>/ns_exlevel_vinst
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) In non-secure state, each bit controls whether instruction
|
Description: (RW) In non-secure state, each bit controls whether instruction
|
||||||
tracing is enabled for the corresponding exception level.
|
tracing is enabled for the corresponding exception level.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/addr_idx
|
What: /sys/bus/coresight/devices/etm<N>/addr_idx
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Select which address comparator or pair (of comparators) to
|
Description: (RW) Select which address comparator or pair (of comparators) to
|
||||||
work with.
|
work with.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/addr_instdatatype
|
What: /sys/bus/coresight/devices/etm<N>/addr_instdatatype
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Controls what type of comparison the trace unit performs.
|
Description: (RW) Controls what type of comparison the trace unit performs.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/addr_single
|
What: /sys/bus/coresight/devices/etm<N>/addr_single
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Used to setup single address comparator values.
|
Description: (RW) Used to setup single address comparator values.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/addr_range
|
What: /sys/bus/coresight/devices/etm<N>/addr_range
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Used to setup address range comparator values.
|
Description: (RW) Used to setup address range comparator values.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/seq_idx
|
What: /sys/bus/coresight/devices/etm<N>/seq_idx
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Select which sequencer.
|
Description: (RW) Select which sequencer.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/seq_state
|
What: /sys/bus/coresight/devices/etm<N>/seq_state
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Use this to set, or read, the sequencer state.
|
Description: (RW) Use this to set, or read, the sequencer state.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/seq_event
|
What: /sys/bus/coresight/devices/etm<N>/seq_event
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Moves the sequencer state to a specific state.
|
Description: (RW) Moves the sequencer state to a specific state.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/seq_reset_event
|
What: /sys/bus/coresight/devices/etm<N>/seq_reset_event
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Moves the sequencer to state 0 when a programmed event
|
Description: (RW) Moves the sequencer to state 0 when a programmed event
|
||||||
occurs.
|
occurs.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/cntr_idx
|
What: /sys/bus/coresight/devices/etm<N>/cntr_idx
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Select which counter unit to work with.
|
Description: (RW) Select which counter unit to work with.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/cntrldvr
|
What: /sys/bus/coresight/devices/etm<N>/cntrldvr
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) This sets or returns the reload count value of the
|
Description: (RW) This sets or returns the reload count value of the
|
||||||
specific counter.
|
specific counter.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/cntr_val
|
What: /sys/bus/coresight/devices/etm<N>/cntr_val
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) This sets or returns the current count value of the
|
Description: (RW) This sets or returns the current count value of the
|
||||||
specific counter.
|
specific counter.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/cntr_ctrl
|
What: /sys/bus/coresight/devices/etm<N>/cntr_ctrl
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Controls the operation of the selected counter.
|
Description: (RW) Controls the operation of the selected counter.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/res_idx
|
What: /sys/bus/coresight/devices/etm<N>/res_idx
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Select which resource selection unit to work with.
|
Description: (RW) Select which resource selection unit to work with.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/res_ctrl
|
What: /sys/bus/coresight/devices/etm<N>/res_ctrl
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Controls the selection of the resources in the trace unit.
|
Description: (RW) Controls the selection of the resources in the trace unit.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/ctxid_idx
|
What: /sys/bus/coresight/devices/etm<N>/ctxid_idx
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Select which context ID comparator to work with.
|
Description: (RW) Select which context ID comparator to work with.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/ctxid_pid
|
What: /sys/bus/coresight/devices/etm<N>/ctxid_pid
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Get/Set the context ID comparator value to trigger on.
|
Description: (RW) Get/Set the context ID comparator value to trigger on.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/ctxid_masks
|
What: /sys/bus/coresight/devices/etm<N>/ctxid_masks
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Mask for all 8 context ID comparator value
|
Description: (RW) Mask for all 8 context ID comparator value
|
||||||
registers (if implemented).
|
registers (if implemented).
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/vmid_idx
|
What: /sys/bus/coresight/devices/etm<N>/vmid_idx
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Select which virtual machine ID comparator to work with.
|
Description: (RW) Select which virtual machine ID comparator to work with.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/vmid_val
|
What: /sys/bus/coresight/devices/etm<N>/vmid_val
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Get/Set the virtual machine ID comparator value to
|
Description: (RW) Get/Set the virtual machine ID comparator value to
|
||||||
trigger on.
|
trigger on.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/vmid_masks
|
What: /sys/bus/coresight/devices/etm<N>/vmid_masks
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (RW) Mask for all 8 virtual machine ID comparator value
|
Description: (RW) Mask for all 8 virtual machine ID comparator value
|
||||||
registers (if implemented).
|
registers (if implemented).
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/mgmt/trcoslsr
|
What: /sys/bus/coresight/devices/etm<N>/addr_exlevel_s_ns
|
||||||
|
Date: December 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
|
Description: (RW) Set the Exception Level matching bits for secure and
|
||||||
|
non-secure exception levels.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/etm<N>/vinst_pe_cmp_start_stop
|
||||||
|
Date: December 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
|
Description: (RW) Access the start stop control register for PE input
|
||||||
|
comparators.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/etm<N>/addr_cmp_view
|
||||||
|
Date: December 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
|
Description: (R) Print the current settings for the selected address
|
||||||
|
comparator.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/etm<N>/sshot_idx
|
||||||
|
Date: December 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
|
Description: (RW) Select the single shot control register to access.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/etm<N>/sshot_ctrl
|
||||||
|
Date: December 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
|
Description: (RW) Access the selected single shot control register.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/etm<N>/sshot_status
|
||||||
|
Date: December 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
|
Description: (R) Print the current value of the selected single shot
|
||||||
|
status register.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/etm<N>/sshot_pe_ctrl
|
||||||
|
Date: December 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
|
Description: (RW) Access the selected single shot PE comparator control
|
||||||
|
register.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/etm<N>/mgmt/trcoslsr
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Print the content of the OS Lock Status Register (0x304).
|
Description: (R) Print the content of the OS Lock Status Register (0x304).
|
||||||
The value is taken directly from the HW.
|
The value is taken directly from the HW.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/mgmt/trcpdcr
|
What: /sys/bus/coresight/devices/etm<N>/mgmt/trcpdcr
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Print the content of the Power Down Control Register
|
Description: (R) Print the content of the Power Down Control Register
|
||||||
(0x310). The value is taken directly from the HW.
|
(0x310). The value is taken directly from the HW.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/mgmt/trcpdsr
|
What: /sys/bus/coresight/devices/etm<N>/mgmt/trcpdsr
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Print the content of the Power Down Status Register
|
Description: (R) Print the content of the Power Down Status Register
|
||||||
(0x314). The value is taken directly from the HW.
|
(0x314). The value is taken directly from the HW.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/mgmt/trclsr
|
What: /sys/bus/coresight/devices/etm<N>/mgmt/trclsr
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Print the content of the SW Lock Status Register
|
Description: (R) Print the content of the SW Lock Status Register
|
||||||
(0xFB4). The value is taken directly from the HW.
|
(0xFB4). The value is taken directly from the HW.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/mgmt/trcauthstatus
|
What: /sys/bus/coresight/devices/etm<N>/mgmt/trcauthstatus
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Print the content of the Authentication Status Register
|
Description: (R) Print the content of the Authentication Status Register
|
||||||
(0xFB8). The value is taken directly from the HW.
|
(0xFB8). The value is taken directly from the HW.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/mgmt/trcdevid
|
What: /sys/bus/coresight/devices/etm<N>/mgmt/trcdevid
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Print the content of the Device ID Register
|
Description: (R) Print the content of the Device ID Register
|
||||||
(0xFC8). The value is taken directly from the HW.
|
(0xFC8). The value is taken directly from the HW.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/mgmt/trcdevtype
|
What: /sys/bus/coresight/devices/etm<N>/mgmt/trcdevtype
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Print the content of the Device Type Register
|
Description: (R) Print the content of the Device Type Register
|
||||||
(0xFCC). The value is taken directly from the HW.
|
(0xFCC). The value is taken directly from the HW.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/mgmt/trcpidr0
|
What: /sys/bus/coresight/devices/etm<N>/mgmt/trcpidr0
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Print the content of the Peripheral ID0 Register
|
Description: (R) Print the content of the Peripheral ID0 Register
|
||||||
(0xFE0). The value is taken directly from the HW.
|
(0xFE0). The value is taken directly from the HW.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/mgmt/trcpidr1
|
What: /sys/bus/coresight/devices/etm<N>/mgmt/trcpidr1
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Print the content of the Peripheral ID1 Register
|
Description: (R) Print the content of the Peripheral ID1 Register
|
||||||
(0xFE4). The value is taken directly from the HW.
|
(0xFE4). The value is taken directly from the HW.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/mgmt/trcpidr2
|
What: /sys/bus/coresight/devices/etm<N>/mgmt/trcpidr2
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Print the content of the Peripheral ID2 Register
|
Description: (R) Print the content of the Peripheral ID2 Register
|
||||||
(0xFE8). The value is taken directly from the HW.
|
(0xFE8). The value is taken directly from the HW.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/mgmt/trcpidr3
|
What: /sys/bus/coresight/devices/etm<N>/mgmt/trcpidr3
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Print the content of the Peripheral ID3 Register
|
Description: (R) Print the content of the Peripheral ID3 Register
|
||||||
(0xFEC). The value is taken directly from the HW.
|
(0xFEC). The value is taken directly from the HW.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/mgmt/trcconfig
|
What: /sys/bus/coresight/devices/etm<N>/mgmt/trcconfig
|
||||||
Date: February 2016
|
Date: February 2016
|
||||||
KernelVersion: 4.07
|
KernelVersion: 4.07
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Print the content of the trace configuration register
|
Description: (R) Print the content of the trace configuration register
|
||||||
(0x010) as currently set by SW.
|
(0x010) as currently set by SW.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/mgmt/trctraceid
|
What: /sys/bus/coresight/devices/etm<N>/mgmt/trctraceid
|
||||||
Date: February 2016
|
Date: February 2016
|
||||||
KernelVersion: 4.07
|
KernelVersion: 4.07
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Print the content of the trace ID register (0x040).
|
Description: (R) Print the content of the trace ID register (0x040).
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/trcidr/trcidr0
|
What: /sys/bus/coresight/devices/etm<N>/trcidr/trcidr0
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Returns the tracing capabilities of the trace unit (0x1E0).
|
Description: (R) Returns the tracing capabilities of the trace unit (0x1E0).
|
||||||
The value is taken directly from the HW.
|
The value is taken directly from the HW.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/trcidr/trcidr1
|
What: /sys/bus/coresight/devices/etm<N>/trcidr/trcidr1
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Returns the tracing capabilities of the trace unit (0x1E4).
|
Description: (R) Returns the tracing capabilities of the trace unit (0x1E4).
|
||||||
The value is taken directly from the HW.
|
The value is taken directly from the HW.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/trcidr/trcidr2
|
What: /sys/bus/coresight/devices/etm<N>/trcidr/trcidr2
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
@@ -394,7 +441,7 @@ Description: (R) Returns the maximum size of the data value, data address,
|
|||||||
VMID, context ID and instruction address in the trace unit
|
VMID, context ID and instruction address in the trace unit
|
||||||
(0x1E8). The value is taken directly from the HW.
|
(0x1E8). The value is taken directly from the HW.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/trcidr/trcidr3
|
What: /sys/bus/coresight/devices/etm<N>/trcidr/trcidr3
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
@@ -403,42 +450,42 @@ Description: (R) Returns the value associated with various resources
|
|||||||
architecture specification for more details (0x1EC).
|
architecture specification for more details (0x1EC).
|
||||||
The value is taken directly from the HW.
|
The value is taken directly from the HW.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/trcidr/trcidr4
|
What: /sys/bus/coresight/devices/etm<N>/trcidr/trcidr4
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Returns how many resources the trace unit supports (0x1F0).
|
Description: (R) Returns how many resources the trace unit supports (0x1F0).
|
||||||
The value is taken directly from the HW.
|
The value is taken directly from the HW.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/trcidr/trcidr5
|
What: /sys/bus/coresight/devices/etm<N>/trcidr/trcidr5
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Returns how many resources the trace unit supports (0x1F4).
|
Description: (R) Returns how many resources the trace unit supports (0x1F4).
|
||||||
The value is taken directly from the HW.
|
The value is taken directly from the HW.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/trcidr/trcidr8
|
What: /sys/bus/coresight/devices/etm<N>/trcidr/trcidr8
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Returns the maximum speculation depth of the instruction
|
Description: (R) Returns the maximum speculation depth of the instruction
|
||||||
trace stream. (0x180). The value is taken directly from the HW.
|
trace stream. (0x180). The value is taken directly from the HW.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/trcidr/trcidr9
|
What: /sys/bus/coresight/devices/etm<N>/trcidr/trcidr9
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Returns the number of P0 right-hand keys that the trace unit
|
Description: (R) Returns the number of P0 right-hand keys that the trace unit
|
||||||
can use (0x184). The value is taken directly from the HW.
|
can use (0x184). The value is taken directly from the HW.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/trcidr/trcidr10
|
What: /sys/bus/coresight/devices/etm<N>/trcidr/trcidr10
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
Description: (R) Returns the number of P1 right-hand keys that the trace unit
|
Description: (R) Returns the number of P1 right-hand keys that the trace unit
|
||||||
can use (0x188). The value is taken directly from the HW.
|
can use (0x188). The value is taken directly from the HW.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/trcidr/trcidr11
|
What: /sys/bus/coresight/devices/etm<N>/trcidr/trcidr11
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
@@ -446,7 +493,7 @@ Description: (R) Returns the number of special P1 right-hand keys that the
|
|||||||
trace unit can use (0x18C). The value is taken directly from
|
trace unit can use (0x18C). The value is taken directly from
|
||||||
the HW.
|
the HW.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/trcidr/trcidr12
|
What: /sys/bus/coresight/devices/etm<N>/trcidr/trcidr12
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
@@ -454,7 +501,7 @@ Description: (R) Returns the number of conditional P1 right-hand keys that
|
|||||||
the trace unit can use (0x190). The value is taken directly
|
the trace unit can use (0x190). The value is taken directly
|
||||||
from the HW.
|
from the HW.
|
||||||
|
|
||||||
What: /sys/bus/coresight/devices/<memory_map>.etm/trcidr/trcidr13
|
What: /sys/bus/coresight/devices/etm<N>/trcidr/trcidr13
|
||||||
Date: April 2015
|
Date: April 2015
|
||||||
KernelVersion: 4.01
|
KernelVersion: 4.01
|
||||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||||
|
@@ -1,25 +1,25 @@
|
|||||||
What: /sys/bus/platform/devices/fsi-master/rescan
|
What: /sys/bus/platform/devices/../fsi-master/fsi0/rescan
|
||||||
Date: May 2017
|
Date: May 2017
|
||||||
KernelVersion: 4.12
|
KernelVersion: 4.12
|
||||||
Contact: cbostic@linux.vnet.ibm.com
|
Contact: linux-fsi@lists.ozlabs.org
|
||||||
Description:
|
Description:
|
||||||
Initiates a FSI master scan for all connected slave devices
|
Initiates a FSI master scan for all connected slave devices
|
||||||
on its links.
|
on its links.
|
||||||
|
|
||||||
What: /sys/bus/platform/devices/fsi-master/break
|
What: /sys/bus/platform/devices/../fsi-master/fsi0/break
|
||||||
Date: May 2017
|
Date: May 2017
|
||||||
KernelVersion: 4.12
|
KernelVersion: 4.12
|
||||||
Contact: cbostic@linux.vnet.ibm.com
|
Contact: linux-fsi@lists.ozlabs.org
|
||||||
Description:
|
Description:
|
||||||
Sends an FSI BREAK command on a master's communication
|
Sends an FSI BREAK command on a master's communication
|
||||||
link to any connected slaves. A BREAK resets connected
|
link to any connected slaves. A BREAK resets connected
|
||||||
device's logic and preps it to receive further commands
|
device's logic and preps it to receive further commands
|
||||||
from the master.
|
from the master.
|
||||||
|
|
||||||
What: /sys/bus/platform/devices/fsi-master/slave@00:00/term
|
What: /sys/bus/platform/devices/../fsi-master/fsi0/slave@00:00/term
|
||||||
Date: May 2017
|
Date: May 2017
|
||||||
KernelVersion: 4.12
|
KernelVersion: 4.12
|
||||||
Contact: cbostic@linux.vnet.ibm.com
|
Contact: linux-fsi@lists.ozlabs.org
|
||||||
Description:
|
Description:
|
||||||
Sends an FSI terminate command from the master to its
|
Sends an FSI terminate command from the master to its
|
||||||
connected slave. A terminate resets the slave's state machines
|
connected slave. A terminate resets the slave's state machines
|
||||||
@@ -29,10 +29,10 @@ Description:
|
|||||||
ongoing operation in case of an expired 'Master Time Out'
|
ongoing operation in case of an expired 'Master Time Out'
|
||||||
timer.
|
timer.
|
||||||
|
|
||||||
What: /sys/bus/platform/devices/fsi-master/slave@00:00/raw
|
What: /sys/bus/platform/devices/../fsi-master/fsi0/slave@00:00/raw
|
||||||
Date: May 2017
|
Date: May 2017
|
||||||
KernelVersion: 4.12
|
KernelVersion: 4.12
|
||||||
Contact: cbostic@linux.vnet.ibm.com
|
Contact: linux-fsi@lists.ozlabs.org
|
||||||
Description:
|
Description:
|
||||||
Provides a means of reading/writing a 32 bit value from/to a
|
Provides a means of reading/writing a 32 bit value from/to a
|
||||||
specified FSI bus address.
|
specified FSI bus address.
|
||||||
|
@@ -753,6 +753,8 @@ What: /sys/.../events/in_illuminance0_thresh_falling_value
|
|||||||
what: /sys/.../events/in_illuminance0_thresh_rising_value
|
what: /sys/.../events/in_illuminance0_thresh_rising_value
|
||||||
what: /sys/.../events/in_proximity0_thresh_falling_value
|
what: /sys/.../events/in_proximity0_thresh_falling_value
|
||||||
what: /sys/.../events/in_proximity0_thresh_rising_value
|
what: /sys/.../events/in_proximity0_thresh_rising_value
|
||||||
|
What: /sys/.../events/in_illuminance_thresh_rising_value
|
||||||
|
What: /sys/.../events/in_illuminance_thresh_falling_value
|
||||||
KernelVersion: 2.6.37
|
KernelVersion: 2.6.37
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
@@ -972,6 +974,7 @@ What: /sys/.../events/in_activity_jogging_thresh_rising_period
|
|||||||
What: /sys/.../events/in_activity_jogging_thresh_falling_period
|
What: /sys/.../events/in_activity_jogging_thresh_falling_period
|
||||||
What: /sys/.../events/in_activity_running_thresh_rising_period
|
What: /sys/.../events/in_activity_running_thresh_rising_period
|
||||||
What: /sys/.../events/in_activity_running_thresh_falling_period
|
What: /sys/.../events/in_activity_running_thresh_falling_period
|
||||||
|
What: /sys/.../events/in_illuminance_thresh_either_period
|
||||||
KernelVersion: 2.6.37
|
KernelVersion: 2.6.37
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
@@ -1715,3 +1718,11 @@ Description:
|
|||||||
Mass concentration reading of particulate matter in ug / m3.
|
Mass concentration reading of particulate matter in ug / m3.
|
||||||
pmX consists of particles with aerodynamic diameter less or
|
pmX consists of particles with aerodynamic diameter less or
|
||||||
equal to X micrometers.
|
equal to X micrometers.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/events/in_illuminance_period_available
|
||||||
|
Date: November 2019
|
||||||
|
KernelVersion: 5.4
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
List of valid periods (in seconds) for which the light intensity
|
||||||
|
must be above the threshold level before interrupt is asserted.
|
||||||
|
39
Documentation/ABI/testing/sysfs-bus-iio-adc-ad7192
Normal file
39
Documentation/ABI/testing/sysfs-bus-iio-adc-ad7192
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
What: /sys/bus/iio/devices/iio:deviceX/ac_excitation_en
|
||||||
|
KernelVersion:
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Reading gives the state of AC excitation.
|
||||||
|
Writing '1' enables AC excitation.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/bridge_switch_en
|
||||||
|
KernelVersion:
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
This bridge switch is used to disconnect it when there is a
|
||||||
|
need to minimize the system current consumption.
|
||||||
|
Reading gives the state of the bridge switch.
|
||||||
|
Writing '1' enables the bridge switch.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_voltagex_sys_calibration
|
||||||
|
KernelVersion:
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Initiates the system calibration procedure. This is done on a
|
||||||
|
single channel at a time. Write '1' to start the calibration.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_voltagex_sys_calibration_mode_available
|
||||||
|
KernelVersion:
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Reading returns a list with the possible calibration modes.
|
||||||
|
There are two available options:
|
||||||
|
"zero_scale" - calibrate to zero scale
|
||||||
|
"full_scale" - calibrate to full scale
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_voltagex_sys_calibration_mode
|
||||||
|
KernelVersion:
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Sets up the calibration mode used in the system calibration
|
||||||
|
procedure. Reading returns the current calibration mode.
|
||||||
|
Writing sets the system calibration mode.
|
@@ -4,7 +4,7 @@ KernelVersion: 3.10
|
|||||||
Contact: Samuel Ortiz <sameo@linux.intel.com>
|
Contact: Samuel Ortiz <sameo@linux.intel.com>
|
||||||
linux-mei@linux.intel.com
|
linux-mei@linux.intel.com
|
||||||
Description: Stores the same MODALIAS value emitted by uevent
|
Description: Stores the same MODALIAS value emitted by uevent
|
||||||
Format: mei:<mei device name>:<device uuid>:
|
Format: mei:<mei device name>:<device uuid>:<protocol version>
|
||||||
|
|
||||||
What: /sys/bus/mei/devices/.../name
|
What: /sys/bus/mei/devices/.../name
|
||||||
Date: May 2015
|
Date: May 2015
|
||||||
@@ -26,3 +26,24 @@ KernelVersion: 4.3
|
|||||||
Contact: Tomas Winkler <tomas.winkler@intel.com>
|
Contact: Tomas Winkler <tomas.winkler@intel.com>
|
||||||
Description: Stores mei client protocol version
|
Description: Stores mei client protocol version
|
||||||
Format: %d
|
Format: %d
|
||||||
|
|
||||||
|
What: /sys/bus/mei/devices/.../max_conn
|
||||||
|
Date: Nov 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Tomas Winkler <tomas.winkler@intel.com>
|
||||||
|
Description: Stores mei client maximum number of connections
|
||||||
|
Format: %d
|
||||||
|
|
||||||
|
What: /sys/bus/mei/devices/.../fixed
|
||||||
|
Date: Nov 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Tomas Winkler <tomas.winkler@intel.com>
|
||||||
|
Description: Stores mei client fixed address, if any
|
||||||
|
Format: %d
|
||||||
|
|
||||||
|
What: /sys/bus/mei/devices/.../max_len
|
||||||
|
Date: Nov 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Tomas Winkler <tomas.winkler@intel.com>
|
||||||
|
Description: Stores mei client maximum message length
|
||||||
|
Format: %d
|
||||||
|
@@ -347,3 +347,16 @@ Description:
|
|||||||
If the device has any Peer-to-Peer memory registered, this
|
If the device has any Peer-to-Peer memory registered, this
|
||||||
file contains a '1' if the memory has been published for
|
file contains a '1' if the memory has been published for
|
||||||
use outside the driver that owns the device.
|
use outside the driver that owns the device.
|
||||||
|
|
||||||
|
What: /sys/bus/pci/devices/.../link/clkpm
|
||||||
|
/sys/bus/pci/devices/.../link/l0s_aspm
|
||||||
|
/sys/bus/pci/devices/.../link/l1_aspm
|
||||||
|
/sys/bus/pci/devices/.../link/l1_1_aspm
|
||||||
|
/sys/bus/pci/devices/.../link/l1_2_aspm
|
||||||
|
/sys/bus/pci/devices/.../link/l1_1_pcipm
|
||||||
|
/sys/bus/pci/devices/.../link/l1_2_pcipm
|
||||||
|
Date: October 2019
|
||||||
|
Contact: Heiner Kallweit <hkallweit1@gmail.com>
|
||||||
|
Description: If ASPM is supported for an endpoint, these files can be
|
||||||
|
used to disable or enable the individual power management
|
||||||
|
states. Write y/1/on to enable, n/0/off to disable.
|
||||||
|
@@ -80,6 +80,14 @@ Contact: thunderbolt-software@lists.01.org
|
|||||||
Description: This attribute contains 1 if Thunderbolt device was already
|
Description: This attribute contains 1 if Thunderbolt device was already
|
||||||
authorized on boot and 0 otherwise.
|
authorized on boot and 0 otherwise.
|
||||||
|
|
||||||
|
What: /sys/bus/thunderbolt/devices/.../generation
|
||||||
|
Date: Jan 2020
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Christian Kellner <christian@kellner.me>
|
||||||
|
Description: This attribute contains the generation of the Thunderbolt
|
||||||
|
controller associated with the device. It will contain 4
|
||||||
|
for USB4.
|
||||||
|
|
||||||
What: /sys/bus/thunderbolt/devices/.../key
|
What: /sys/bus/thunderbolt/devices/.../key
|
||||||
Date: Sep 2017
|
Date: Sep 2017
|
||||||
KernelVersion: 4.13
|
KernelVersion: 4.13
|
||||||
@@ -104,6 +112,34 @@ Contact: thunderbolt-software@lists.01.org
|
|||||||
Description: This attribute contains name of this device extracted from
|
Description: This attribute contains name of this device extracted from
|
||||||
the device DROM.
|
the device DROM.
|
||||||
|
|
||||||
|
What: /sys/bus/thunderbolt/devices/.../rx_speed
|
||||||
|
Date: Jan 2020
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Mika Westerberg <mika.westerberg@linux.intel.com>
|
||||||
|
Description: This attribute reports the device RX speed per lane.
|
||||||
|
All RX lanes run at the same speed.
|
||||||
|
|
||||||
|
What: /sys/bus/thunderbolt/devices/.../rx_lanes
|
||||||
|
Date: Jan 2020
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Mika Westerberg <mika.westerberg@linux.intel.com>
|
||||||
|
Description: This attribute reports number of RX lanes the device is
|
||||||
|
using simultaneously through its upstream port.
|
||||||
|
|
||||||
|
What: /sys/bus/thunderbolt/devices/.../tx_speed
|
||||||
|
Date: Jan 2020
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Mika Westerberg <mika.westerberg@linux.intel.com>
|
||||||
|
Description: This attribute reports the TX speed per lane.
|
||||||
|
All TX lanes run at the same speed.
|
||||||
|
|
||||||
|
What: /sys/bus/thunderbolt/devices/.../tx_lanes
|
||||||
|
Date: Jan 2020
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Mika Westerberg <mika.westerberg@linux.intel.com>
|
||||||
|
Description: This attribute reports number of TX lanes the device is
|
||||||
|
using simultaneously through its upstream port.
|
||||||
|
|
||||||
What: /sys/bus/thunderbolt/devices/.../vendor
|
What: /sys/bus/thunderbolt/devices/.../vendor
|
||||||
Date: Sep 2017
|
Date: Sep 2017
|
||||||
KernelVersion: 4.13
|
KernelVersion: 4.13
|
||||||
|
139
Documentation/ABI/testing/sysfs-class-led-driver-el15203000
Normal file
139
Documentation/ABI/testing/sysfs-class-led-driver-el15203000
Normal file
@@ -0,0 +1,139 @@
|
|||||||
|
What: /sys/class/leds/<led>/hw_pattern
|
||||||
|
Date: September 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Description:
|
||||||
|
Specify a hardware pattern for the EL15203000 LED.
|
||||||
|
The LEDs board supports only predefined patterns by firmware
|
||||||
|
for specific LEDs.
|
||||||
|
|
||||||
|
Breathing mode for Screen frame light tube:
|
||||||
|
"0 4000 1 4000"
|
||||||
|
|
||||||
|
^
|
||||||
|
|
|
||||||
|
Max-| ---
|
||||||
|
| / \
|
||||||
|
| / \
|
||||||
|
| / \ /
|
||||||
|
| / \ /
|
||||||
|
Min-|- ---
|
||||||
|
|
|
||||||
|
0------4------8--> time (sec)
|
||||||
|
|
||||||
|
Cascade mode for Pipe LED:
|
||||||
|
"1 800 2 800 4 800 8 800 16 800"
|
||||||
|
|
||||||
|
^
|
||||||
|
|
|
||||||
|
0 On -|----+ +----+ +---
|
||||||
|
| | | | |
|
||||||
|
Off-| +-------------------+ +-------------------+
|
||||||
|
|
|
||||||
|
1 On -| +----+ +----+
|
||||||
|
| | | | |
|
||||||
|
Off |----+ +-------------------+ +------------------
|
||||||
|
|
|
||||||
|
2 On -| +----+ +----+
|
||||||
|
| | | | |
|
||||||
|
Off-|---------+ +-------------------+ +-------------
|
||||||
|
|
|
||||||
|
3 On -| +----+ +----+
|
||||||
|
| | | | |
|
||||||
|
Off-|--------------+ +-------------------+ +--------
|
||||||
|
|
|
||||||
|
4 On -| +----+ +----+
|
||||||
|
| | | | |
|
||||||
|
Off-|-------------------+ +-------------------+ +---
|
||||||
|
|
|
||||||
|
0---0.8--1.6--2.4--3.2---4---4.8--5.6--6.4--7.2---8--> time (sec)
|
||||||
|
|
||||||
|
Inverted cascade mode for Pipe LED:
|
||||||
|
"30 800 29 800 27 800 23 800 15 800"
|
||||||
|
|
||||||
|
^
|
||||||
|
|
|
||||||
|
0 On -| +-------------------+ +-------------------+
|
||||||
|
| | | | |
|
||||||
|
Off-|----+ +----+ +---
|
||||||
|
|
|
||||||
|
1 On -|----+ +-------------------+ +------------------
|
||||||
|
| | | | |
|
||||||
|
Off | +----+ +----+
|
||||||
|
|
|
||||||
|
2 On -|---------+ +-------------------+ +-------------
|
||||||
|
| | | | |
|
||||||
|
Off-| +----+ +----+
|
||||||
|
|
|
||||||
|
3 On -|--------------+ +-------------------+ +--------
|
||||||
|
| | | | |
|
||||||
|
Off-| +----+ +----+
|
||||||
|
|
|
||||||
|
4 On -|-------------------+ +-------------------+ +---
|
||||||
|
| | | | |
|
||||||
|
Off-| +----+ +----+
|
||||||
|
|
|
||||||
|
0---0.8--1.6--2.4--3.2---4---4.8--5.6--6.4--7.2---8--> time (sec)
|
||||||
|
|
||||||
|
Bounce mode for Pipe LED:
|
||||||
|
"1 800 2 800 4 800 8 800 16 800 16 800 8 800 4 800 2 800 1 800"
|
||||||
|
|
||||||
|
^
|
||||||
|
|
|
||||||
|
0 On -|----+ +--------
|
||||||
|
| | |
|
||||||
|
Off-| +---------------------------------------+
|
||||||
|
|
|
||||||
|
1 On -| +----+ +----+
|
||||||
|
| | | | |
|
||||||
|
Off |----+ +-----------------------------+ +--------
|
||||||
|
|
|
||||||
|
2 On -| +----+ +----+
|
||||||
|
| | | | |
|
||||||
|
Off-|---------+ +-------------------+ +-------------
|
||||||
|
|
|
||||||
|
3 On -| +----+ +----+
|
||||||
|
| | | | |
|
||||||
|
Off-|--------------+ +---------+ +------------------
|
||||||
|
|
|
||||||
|
4 On -| +---------+
|
||||||
|
| | |
|
||||||
|
Off-|-------------------+ +-----------------------
|
||||||
|
|
|
||||||
|
0---0.8--1.6--2.4--3.2---4---4.8--5.6--6.4--7.2---8--> time (sec)
|
||||||
|
|
||||||
|
Inverted bounce mode for Pipe LED:
|
||||||
|
"30 800 29 800 27 800 23 800 15 800 15 800 23 800 27 800 29 800 30 800"
|
||||||
|
|
||||||
|
^
|
||||||
|
|
|
||||||
|
0 On -| +---------------------------------------+
|
||||||
|
| | |
|
||||||
|
Off-|----+ +--------
|
||||||
|
|
|
||||||
|
1 On -|----+ +-----------------------------+ +--------
|
||||||
|
| | | | |
|
||||||
|
Off | +----+ +----+
|
||||||
|
|
|
||||||
|
2 On -|---------+ +-------------------+ +-------------
|
||||||
|
| | | | |
|
||||||
|
Off-| +----+ +----+
|
||||||
|
|
|
||||||
|
3 On -|--------------+ +---------+ +------------------
|
||||||
|
| | | | |
|
||||||
|
Off-| +----+ +----+
|
||||||
|
|
|
||||||
|
4 On -|-------------------+ +-----------------------
|
||||||
|
| | |
|
||||||
|
Off-| +---------+
|
||||||
|
|
|
||||||
|
0---0.8--1.6--2.4--3.2---4---4.8--5.6--6.4--7.2---8--> time (sec)
|
||||||
|
|
||||||
|
What: /sys/class/leds/<led>/repeat
|
||||||
|
Date: September 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Description:
|
||||||
|
EL15203000 supports only indefinitely repeating patterns,
|
||||||
|
so this file should always store -1.
|
||||||
|
|
||||||
|
For more info, please see:
|
||||||
|
Documentation/ABI/testing/sysfs-class-led-trigger-pattern
|
@@ -80,3 +80,13 @@ Description: Display the ME device state.
|
|||||||
DISABLED
|
DISABLED
|
||||||
POWER_DOWN
|
POWER_DOWN
|
||||||
POWER_UP
|
POWER_UP
|
||||||
|
|
||||||
|
What: /sys/class/mei/meiN/trc
|
||||||
|
Date: Nov 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Tomas Winkler <tomas.winkler@intel.com>
|
||||||
|
Description: Display trc status register content
|
||||||
|
|
||||||
|
The ME FW writes Glitch Detection HW (TRC)
|
||||||
|
status information into trc status register
|
||||||
|
for BIOS and OS to monitor fw health.
|
||||||
|
@@ -51,6 +51,14 @@ Description:
|
|||||||
packet processing. See the network driver for the exact
|
packet processing. See the network driver for the exact
|
||||||
meaning of this value.
|
meaning of this value.
|
||||||
|
|
||||||
|
What: /sys/class/<iface>/statistics/rx_errors
|
||||||
|
Date: April 2005
|
||||||
|
KernelVersion: 2.6.12
|
||||||
|
Contact: netdev@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Indicates the number of receive errors on this network device.
|
||||||
|
See the network driver for the exact meaning of this value.
|
||||||
|
|
||||||
What: /sys/class/<iface>/statistics/rx_fifo_errors
|
What: /sys/class/<iface>/statistics/rx_fifo_errors
|
||||||
Date: April 2005
|
Date: April 2005
|
||||||
KernelVersion: 2.6.12
|
KernelVersion: 2.6.12
|
||||||
@@ -88,6 +96,14 @@ Description:
|
|||||||
due to lack of capacity in the receive side. See the network
|
due to lack of capacity in the receive side. See the network
|
||||||
driver for the exact meaning of this value.
|
driver for the exact meaning of this value.
|
||||||
|
|
||||||
|
What: /sys/class/<iface>/statistics/rx_nohandler
|
||||||
|
Date: February 2016
|
||||||
|
KernelVersion: 4.6
|
||||||
|
Contact: netdev@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Indicates the number of received packets that were dropped on
|
||||||
|
an inactive device by the network core.
|
||||||
|
|
||||||
What: /sys/class/<iface>/statistics/rx_over_errors
|
What: /sys/class/<iface>/statistics/rx_over_errors
|
||||||
Date: April 2005
|
Date: April 2005
|
||||||
KernelVersion: 2.6.12
|
KernelVersion: 2.6.12
|
||||||
|
@@ -17,8 +17,13 @@ What: /sys/class/watchdog/watchdogn/nowayout
|
|||||||
Date: August 2015
|
Date: August 2015
|
||||||
Contact: Wim Van Sebroeck <wim@iguana.be>
|
Contact: Wim Van Sebroeck <wim@iguana.be>
|
||||||
Description:
|
Description:
|
||||||
It is a read only file. While reading, it gives '1' if that
|
It is a read/write file. While reading, it gives '1'
|
||||||
device supports nowayout feature else, it gives '0'.
|
if the device has the nowayout feature set, otherwise
|
||||||
|
it gives '0'. Writing a '1' to the file enables the
|
||||||
|
nowayout feature. Once set, the nowayout feature
|
||||||
|
cannot be disabled, so writing a '0' either has no
|
||||||
|
effect (if the feature was already disabled) or
|
||||||
|
results in a permission error.
|
||||||
|
|
||||||
What: /sys/class/watchdog/watchdogn/state
|
What: /sys/class/watchdog/watchdogn/state
|
||||||
Date: August 2015
|
Date: August 2015
|
||||||
|
@@ -31,6 +31,12 @@ Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
|||||||
Description:
|
Description:
|
||||||
Controls the issue rate of segment discard commands.
|
Controls the issue rate of segment discard commands.
|
||||||
|
|
||||||
|
What: /sys/fs/f2fs/<disk>/main_blkaddr
|
||||||
|
Date: November 2019
|
||||||
|
Contact: "Ramon Pantin" <pantin@google.com>
|
||||||
|
Description:
|
||||||
|
Shows first block address of MAIN area.
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/ipu_policy
|
What: /sys/fs/f2fs/<disk>/ipu_policy
|
||||||
Date: November 2013
|
Date: November 2013
|
||||||
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||||
|
@@ -106,3 +106,135 @@ KernelVersion: 5.4
|
|||||||
Contact: Wu Hao <hao.wu@intel.com>
|
Contact: Wu Hao <hao.wu@intel.com>
|
||||||
Description: Read-only. Read this file to get the second error detected by
|
Description: Read-only. Read this file to get the second error detected by
|
||||||
hardware.
|
hardware.
|
||||||
|
|
||||||
|
What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/name
|
||||||
|
Date: October 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Wu Hao <hao.wu@intel.com>
|
||||||
|
Description: Read-Only. Read this file to get the name of hwmon device, it
|
||||||
|
supports values:
|
||||||
|
'dfl_fme_thermal' - thermal hwmon device name
|
||||||
|
'dfl_fme_power' - power hwmon device name
|
||||||
|
|
||||||
|
What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/temp1_input
|
||||||
|
Date: October 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Wu Hao <hao.wu@intel.com>
|
||||||
|
Description: Read-Only. It returns FPGA device temperature in millidegrees
|
||||||
|
Celsius.
|
||||||
|
|
||||||
|
What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/temp1_max
|
||||||
|
Date: October 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Wu Hao <hao.wu@intel.com>
|
||||||
|
Description: Read-Only. It returns hardware threshold1 temperature in
|
||||||
|
millidegrees Celsius. If temperature rises at or above this
|
||||||
|
threshold, hardware starts 50% or 90% throttling (see
|
||||||
|
'temp1_max_policy').
|
||||||
|
|
||||||
|
What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/temp1_crit
|
||||||
|
Date: October 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Wu Hao <hao.wu@intel.com>
|
||||||
|
Description: Read-Only. It returns hardware threshold2 temperature in
|
||||||
|
millidegrees Celsius. If temperature rises at or above this
|
||||||
|
threshold, hardware starts 100% throttling.
|
||||||
|
|
||||||
|
What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/temp1_emergency
|
||||||
|
Date: October 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Wu Hao <hao.wu@intel.com>
|
||||||
|
Description: Read-Only. It returns hardware trip threshold temperature in
|
||||||
|
millidegrees Celsius. If temperature rises at or above this
|
||||||
|
threshold, a fatal event will be triggered to board management
|
||||||
|
controller (BMC) to shutdown FPGA.
|
||||||
|
|
||||||
|
What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/temp1_max_alarm
|
||||||
|
Date: October 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Wu Hao <hao.wu@intel.com>
|
||||||
|
Description: Read-only. It returns 1 if temperature is currently at or above
|
||||||
|
hardware threshold1 (see 'temp1_max'), otherwise 0.
|
||||||
|
|
||||||
|
What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/temp1_crit_alarm
|
||||||
|
Date: October 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Wu Hao <hao.wu@intel.com>
|
||||||
|
Description: Read-only. It returns 1 if temperature is currently at or above
|
||||||
|
hardware threshold2 (see 'temp1_crit'), otherwise 0.
|
||||||
|
|
||||||
|
What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/temp1_max_policy
|
||||||
|
Date: October 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Wu Hao <hao.wu@intel.com>
|
||||||
|
Description: Read-Only. Read this file to get the policy of hardware threshold1
|
||||||
|
(see 'temp1_max'). It only supports two values (policies):
|
||||||
|
0 - AP2 state (90% throttling)
|
||||||
|
1 - AP1 state (50% throttling)
|
||||||
|
|
||||||
|
What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/power1_input
|
||||||
|
Date: October 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Wu Hao <hao.wu@intel.com>
|
||||||
|
Description: Read-Only. It returns current FPGA power consumption in uW.
|
||||||
|
|
||||||
|
What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/power1_max
|
||||||
|
Date: October 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Wu Hao <hao.wu@intel.com>
|
||||||
|
Description: Read-Write. Read this file to get current hardware power
|
||||||
|
threshold1 in uW. If power consumption rises at or above
|
||||||
|
this threshold, hardware starts 50% throttling.
|
||||||
|
Write this file to set current hardware power threshold1 in uW.
|
||||||
|
As hardware only accepts values in Watts, the input value will
|
||||||
|
be rounded down to whole Watts (< 1 Watt part will be discarded) and
|
||||||
|
clamped within the range from 0 to 127 Watts. Write fails with
|
||||||
|
-EINVAL if input parsing fails.
|
||||||
|
|
||||||
|
What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/power1_crit
|
||||||
|
Date: October 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Wu Hao <hao.wu@intel.com>
|
||||||
|
Description: Read-Write. Read this file to get current hardware power
|
||||||
|
threshold2 in uW. If power consumption rises at or above
|
||||||
|
this threshold, hardware starts 90% throttling.
|
||||||
|
Write this file to set current hardware power threshold2 in uW.
|
||||||
|
As hardware only accepts values in Watts, the input value will
|
||||||
|
be rounded down to whole Watts (< 1 Watt part will be discarded) and
|
||||||
|
clamped within the range from 0 to 127 Watts. Write fails with
|
||||||
|
-EINVAL if input parsing fails.
|
||||||
|
|
||||||
|
What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/power1_max_alarm
|
||||||
|
Date: October 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Wu Hao <hao.wu@intel.com>
|
||||||
|
Description: Read-only. It returns 1 if power consumption is currently at or
|
||||||
|
above hardware threshold1 (see 'power1_max'), otherwise 0.
|
||||||
|
|
||||||
|
What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/power1_crit_alarm
|
||||||
|
Date: October 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Wu Hao <hao.wu@intel.com>
|
||||||
|
Description: Read-only. It returns 1 if power consumption is currently at or
|
||||||
|
above hardware threshold2 (see 'power1_crit'), otherwise 0.
|
||||||
|
|
||||||
|
What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/power1_xeon_limit
|
||||||
|
Date: October 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Wu Hao <hao.wu@intel.com>
|
||||||
|
Description: Read-Only. It returns power limit for XEON in uW.
|
||||||
|
|
||||||
|
What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/power1_fpga_limit
|
||||||
|
Date: October 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Wu Hao <hao.wu@intel.com>
|
||||||
|
Description: Read-Only. It returns power limit for FPGA in uW.
|
||||||
|
|
||||||
|
What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/power1_ltr
|
||||||
|
Date: October 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: Wu Hao <hao.wu@intel.com>
|
||||||
|
Description: Read-only. Read this file to get current Latency Tolerance
|
||||||
|
Reporting (ltr) value. It returns 1 if all Accelerated
|
||||||
|
Function Units (AFUs) can tolerate latency >= 40us for memory
|
||||||
|
access or 0 if any AFU is latency sensitive (< 40us).
|
||||||
|
58
Documentation/ABI/testing/sysfs-platform-mellanox-bootctl
Normal file
58
Documentation/ABI/testing/sysfs-platform-mellanox-bootctl
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
What: /sys/bus/platform/devices/MLNXBF04:00/driver/lifecycle_state
|
||||||
|
Date: Oct 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: "Liming Sun <lsun@mellanox.com>"
|
||||||
|
Description:
|
||||||
|
The Life-cycle state of the SoC, which could be one of the
|
||||||
|
following values.
|
||||||
|
Production - Production state and can be updated to secure
|
||||||
|
GA Secured - Secure chip and not able to change state
|
||||||
|
GA Non-Secured - Non-Secure chip and not able to change state
|
||||||
|
RMA - Return Merchandise Authorization
|
||||||
|
|
||||||
|
What: /sys/bus/platform/devices/MLNXBF04:00/driver/post_reset_wdog
|
||||||
|
Date: Oct 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: "Liming Sun <lsun@mellanox.com>"
|
||||||
|
Description:
|
||||||
|
The watchdog setting in seconds for the next boot. It's used
|
||||||
|
to reboot the chip and recover it to the old state if the new
|
||||||
|
boot partition fails.
|
||||||
|
|
||||||
|
What: /sys/bus/platform/devices/MLNXBF04:00/driver/reset_action
|
||||||
|
Date: Oct 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: "Liming Sun <lsun@mellanox.com>"
|
||||||
|
Description:
|
||||||
|
The source of the boot stream for the next reset. It could be
|
||||||
|
one of the following values.
|
||||||
|
external - boot from external source (USB or PCIe)
|
||||||
|
emmc - boot from the onchip eMMC
|
||||||
|
emmc_legacy - boot from the onchip eMMC in legacy (slow) mode
|
||||||
|
|
||||||
|
What: /sys/bus/platform/devices/MLNXBF04:00/driver/second_reset_action
|
||||||
|
Date: Oct 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: "Liming Sun <lsun@mellanox.com>"
|
||||||
|
Description:
|
||||||
|
Update the source of the boot stream after next reset. It could
|
||||||
|
be one of the following values and will be applied after next
|
||||||
|
reset.
|
||||||
|
external - boot from external source (USB or PCIe)
|
||||||
|
emmc - boot from the onchip eMMC
|
||||||
|
emmc_legacy - boot from the onchip eMMC in legacy (slow) mode
|
||||||
|
swap_emmc - swap the primary / secondary boot partition
|
||||||
|
none - cancel the action
|
||||||
|
|
||||||
|
What: /sys/bus/platform/devices/MLNXBF04:00/driver/secure_boot_fuse_state
|
||||||
|
Date: Oct 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: "Liming Sun <lsun@mellanox.com>"
|
||||||
|
Description:
|
||||||
|
The state of eFuse versions with the following values.
|
||||||
|
InUse - burnt, valid and currently in use
|
||||||
|
Used - burnt and valid
|
||||||
|
Free - not burnt and free to use
|
||||||
|
Skipped - not burnt but not free (skipped)
|
||||||
|
Wasted - burnt and invalid
|
||||||
|
Invalid - not burnt but marked as valid (error state).
|
@@ -31,6 +31,23 @@ Description:
|
|||||||
Output will be a version string similar to the example below:
|
Output will be a version string similar to the example below:
|
||||||
08B6
|
08B6
|
||||||
|
|
||||||
|
What: /sys/bus/platform/devices/GOOG000C\:00/usb_charge
|
||||||
|
Date: October 2019
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Description:
|
||||||
|
Control the USB PowerShare Policy. USB PowerShare is a policy
|
||||||
|
which affects charging via the special USB PowerShare port
|
||||||
|
(marked with a small lightning bolt or battery icon) when in
|
||||||
|
low power states:
|
||||||
|
- In S0, the port will always provide power.
|
||||||
|
- In S0ix, if usb_charge is enabled, then power will be
|
||||||
|
supplied to the port when on AC or if battery is > 50%.
|
||||||
|
Else no power is supplied.
|
||||||
|
- In S5, if usb_charge is enabled, then power will be supplied
|
||||||
|
to the port when on AC. Else no power is supplied.
|
||||||
|
|
||||||
|
Input should be either "0" or "1".
|
||||||
|
|
||||||
What: /sys/bus/platform/devices/GOOG000C\:00/version
|
What: /sys/bus/platform/devices/GOOG000C\:00/version
|
||||||
Date: May 2019
|
Date: May 2019
|
||||||
KernelVersion: 5.3
|
KernelVersion: 5.3
|
||||||
|
46
Documentation/ABI/testing/sysfs-secvar
Normal file
46
Documentation/ABI/testing/sysfs-secvar
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
What: /sys/firmware/secvar
|
||||||
|
Date: August 2019
|
||||||
|
Contact: Nayna Jain <nayna@linux.ibm.com>
|
||||||
|
Description: This directory is created if the POWER firmware supports OS
|
||||||
|
secureboot and thereby secure variables. It exposes an interface
|
||||||
|
for reading/writing the secure variables
|
||||||
|
|
||||||
|
What: /sys/firmware/secvar/vars
|
||||||
|
Date: August 2019
|
||||||
|
Contact: Nayna Jain <nayna@linux.ibm.com>
|
||||||
|
Description: This directory lists all the secure variables that are supported
|
||||||
|
by the firmware.
|
||||||
|
|
||||||
|
What: /sys/firmware/secvar/format
|
||||||
|
Date: August 2019
|
||||||
|
Contact: Nayna Jain <nayna@linux.ibm.com>
|
||||||
|
Description: A string indicating which backend is in use by the firmware.
|
||||||
|
This determines the format of the variable and the accepted
|
||||||
|
format of variable updates.
|
||||||
|
|
||||||
|
What: /sys/firmware/secvar/vars/<variable_name>
|
||||||
|
Date: August 2019
|
||||||
|
Contact: Nayna Jain <nayna@linux.ibm.com>
|
||||||
|
Description: Each secure variable is represented as a directory named as
|
||||||
|
<variable_name>. The variable name is unique and is in ASCII
|
||||||
|
representation. The data and size can be determined by reading
|
||||||
|
their respective attribute files.
|
||||||
|
|
||||||
|
What: /sys/firmware/secvar/vars/<variable_name>/size
|
||||||
|
Date: August 2019
|
||||||
|
Contact: Nayna Jain <nayna@linux.ibm.com>
|
||||||
|
Description: An integer representation of the size of the content of the
|
||||||
|
variable. In other words, it represents the size of the data.
|
||||||
|
|
||||||
|
What: /sys/firmware/secvar/vars/<variable_name>/data
|
||||||
|
Date: August 2019
|
||||||
|
Contact: Nayna Jain <nayna@linux.ibm.com>
|
||||||
|
Description: A read-only file containing the value of the variable. The size
|
||||||
|
of the file represents the maximum size of the variable data.
|
||||||
|
|
||||||
|
What: /sys/firmware/secvar/vars/<variable_name>/update
|
||||||
|
Date: August 2019
|
||||||
|
Contact: Nayna Jain <nayna@linux.ibm.com>
|
||||||
|
Description: A write-only file that is used to submit the new value for the
|
||||||
|
variable. The size of the file represents the maximum size of
|
||||||
|
the variable data that can be written.
|
@@ -5,24 +5,6 @@ DMA attributes
|
|||||||
This document describes the semantics of the DMA attributes that are
|
This document describes the semantics of the DMA attributes that are
|
||||||
defined in linux/dma-mapping.h.
|
defined in linux/dma-mapping.h.
|
||||||
|
|
||||||
DMA_ATTR_WRITE_BARRIER
|
|
||||||
----------------------
|
|
||||||
|
|
||||||
DMA_ATTR_WRITE_BARRIER is a (write) barrier attribute for DMA. DMA
|
|
||||||
to a memory region with the DMA_ATTR_WRITE_BARRIER attribute forces
|
|
||||||
all pending DMA writes to complete, and thus provides a mechanism to
|
|
||||||
strictly order DMA from a device across all intervening busses and
|
|
||||||
bridges. This barrier is not specific to a particular type of
|
|
||||||
interconnect, it applies to the system as a whole, and so its
|
|
||||||
implementation must account for the idiosyncrasies of the system all
|
|
||||||
the way from the DMA device to memory.
|
|
||||||
|
|
||||||
As an example of a situation where DMA_ATTR_WRITE_BARRIER would be
|
|
||||||
useful, suppose that a device does a DMA write to indicate that data is
|
|
||||||
ready and available in memory. The DMA of the "completion indication"
|
|
||||||
could race with data DMA. Mapping the memory used for completion
|
|
||||||
indications with DMA_ATTR_WRITE_BARRIER would prevent the race.
|
|
||||||
|
|
||||||
DMA_ATTR_WEAK_ORDERING
|
DMA_ATTR_WEAK_ORDERING
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
@@ -13,7 +13,7 @@ endif
|
|||||||
SPHINXBUILD = sphinx-build
|
SPHINXBUILD = sphinx-build
|
||||||
SPHINXOPTS =
|
SPHINXOPTS =
|
||||||
SPHINXDIRS = .
|
SPHINXDIRS = .
|
||||||
_SPHINXDIRS = $(patsubst $(srctree)/Documentation/%/conf.py,%,$(wildcard $(srctree)/Documentation/*/conf.py))
|
_SPHINXDIRS = $(patsubst $(srctree)/Documentation/%/index.rst,%,$(wildcard $(srctree)/Documentation/*/index.rst))
|
||||||
SPHINX_CONF = conf.py
|
SPHINX_CONF = conf.py
|
||||||
PAPER =
|
PAPER =
|
||||||
BUILDDIR = $(obj)/output
|
BUILDDIR = $(obj)/output
|
||||||
@@ -33,8 +33,6 @@ ifeq ($(HAVE_SPHINX),0)
|
|||||||
|
|
||||||
else # HAVE_SPHINX
|
else # HAVE_SPHINX
|
||||||
|
|
||||||
export SPHINXOPTS = $(shell perl -e 'open IN,"sphinx-build --version 2>&1 |"; while (<IN>) { if (m/([\d\.]+)/) { print "-jauto" if ($$1 >= "1.7") } ;} close IN')
|
|
||||||
|
|
||||||
# User-friendly check for pdflatex and latexmk
|
# User-friendly check for pdflatex and latexmk
|
||||||
HAVE_PDFLATEX := $(shell if which $(PDFLATEX) >/dev/null 2>&1; then echo 1; else echo 0; fi)
|
HAVE_PDFLATEX := $(shell if which $(PDFLATEX) >/dev/null 2>&1; then echo 1; else echo 0; fi)
|
||||||
HAVE_LATEXMK := $(shell if which latexmk >/dev/null 2>&1; then echo 1; else echo 0; fi)
|
HAVE_LATEXMK := $(shell if which latexmk >/dev/null 2>&1; then echo 1; else echo 0; fi)
|
||||||
@@ -67,6 +65,8 @@ quiet_cmd_sphinx = SPHINX $@ --> file://$(abspath $(BUILDDIR)/$3/$4)
|
|||||||
cmd_sphinx = $(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media $2 && \
|
cmd_sphinx = $(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media $2 && \
|
||||||
PYTHONDONTWRITEBYTECODE=1 \
|
PYTHONDONTWRITEBYTECODE=1 \
|
||||||
BUILDDIR=$(abspath $(BUILDDIR)) SPHINX_CONF=$(abspath $(srctree)/$(src)/$5/$(SPHINX_CONF)) \
|
BUILDDIR=$(abspath $(BUILDDIR)) SPHINX_CONF=$(abspath $(srctree)/$(src)/$5/$(SPHINX_CONF)) \
|
||||||
|
$(PYTHON) $(srctree)/scripts/jobserver-exec \
|
||||||
|
$(SHELL) $(srctree)/Documentation/sphinx/parallel-wrapper.sh \
|
||||||
$(SPHINXBUILD) \
|
$(SPHINXBUILD) \
|
||||||
-b $2 \
|
-b $2 \
|
||||||
-c $(abspath $(srctree)/$(src)) \
|
-c $(abspath $(srctree)/$(src)) \
|
||||||
@@ -128,8 +128,10 @@ dochelp:
|
|||||||
@echo ' pdfdocs - PDF'
|
@echo ' pdfdocs - PDF'
|
||||||
@echo ' epubdocs - EPUB'
|
@echo ' epubdocs - EPUB'
|
||||||
@echo ' xmldocs - XML'
|
@echo ' xmldocs - XML'
|
||||||
@echo ' linkcheckdocs - check for broken external links (will connect to external hosts)'
|
@echo ' linkcheckdocs - check for broken external links'
|
||||||
@echo ' refcheckdocs - check for references to non-existing files under Documentation'
|
@echo ' (will connect to external hosts)'
|
||||||
|
@echo ' refcheckdocs - check for references to non-existing files under'
|
||||||
|
@echo ' Documentation'
|
||||||
@echo ' cleandocs - clean all generated files'
|
@echo ' cleandocs - clean all generated files'
|
||||||
@echo
|
@echo
|
||||||
@echo ' make SPHINXDIRS="s1 s2" [target] Generate only docs of folder s1, s2'
|
@echo ' make SPHINXDIRS="s1 s2" [target] Generate only docs of folder s1, s2'
|
||||||
|
File diff suppressed because it is too large
Load Diff
1163
Documentation/RCU/Design/Data-Structures/Data-Structures.rst
Normal file
1163
Documentation/RCU/Design/Data-Structures/Data-Structures.rst
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,668 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
|
||||||
"http://www.w3.org/TR/html4/loose.dtd">
|
|
||||||
<html>
|
|
||||||
<head><title>A Tour Through TREE_RCU's Expedited Grace Periods</title>
|
|
||||||
<meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=iso-8859-1">
|
|
||||||
|
|
||||||
<h2>Introduction</h2>
|
|
||||||
|
|
||||||
This document describes RCU's expedited grace periods.
|
|
||||||
Unlike RCU's normal grace periods, which accept long latencies to attain
|
|
||||||
high efficiency and minimal disturbance, expedited grace periods accept
|
|
||||||
lower efficiency and significant disturbance to attain shorter latencies.
|
|
||||||
|
|
||||||
<p>
|
|
||||||
There are two flavors of RCU (RCU-preempt and RCU-sched), with an earlier
|
|
||||||
third RCU-bh flavor having been implemented in terms of the other two.
|
|
||||||
Each of the two implementations is covered in its own section.
|
|
||||||
|
|
||||||
<ol>
|
|
||||||
<li> <a href="#Expedited Grace Period Design">
|
|
||||||
Expedited Grace Period Design</a>
|
|
||||||
<li> <a href="#RCU-preempt Expedited Grace Periods">
|
|
||||||
RCU-preempt Expedited Grace Periods</a>
|
|
||||||
<li> <a href="#RCU-sched Expedited Grace Periods">
|
|
||||||
RCU-sched Expedited Grace Periods</a>
|
|
||||||
<li> <a href="#Expedited Grace Period and CPU Hotplug">
|
|
||||||
Expedited Grace Period and CPU Hotplug</a>
|
|
||||||
<li> <a href="#Expedited Grace Period Refinements">
|
|
||||||
Expedited Grace Period Refinements</a>
|
|
||||||
</ol>
|
|
||||||
|
|
||||||
<h2><a name="Expedited Grace Period Design">
|
|
||||||
Expedited Grace Period Design</a></h2>
|
|
||||||
|
|
||||||
<p>
|
|
||||||
The expedited RCU grace periods cannot be accused of being subtle,
|
|
||||||
given that they for all intents and purposes hammer every CPU that
|
|
||||||
has not yet provided a quiescent state for the current expedited
|
|
||||||
grace period.
|
|
||||||
The one saving grace is that the hammer has grown a bit smaller
|
|
||||||
over time: The old call to <tt>try_stop_cpus()</tt> has been
|
|
||||||
replaced with a set of calls to <tt>smp_call_function_single()</tt>,
|
|
||||||
each of which results in an IPI to the target CPU.
|
|
||||||
The corresponding handler function checks the CPU's state, motivating
|
|
||||||
a faster quiescent state where possible, and triggering a report
|
|
||||||
of that quiescent state.
|
|
||||||
As always for RCU, once everything has spent some time in a quiescent
|
|
||||||
state, the expedited grace period has completed.
|
|
||||||
|
|
||||||
<p>
|
|
||||||
The details of the <tt>smp_call_function_single()</tt> handler's
|
|
||||||
operation depend on the RCU flavor, as described in the following
|
|
||||||
sections.
|
|
||||||
|
|
||||||
<h2><a name="RCU-preempt Expedited Grace Periods">
|
|
||||||
RCU-preempt Expedited Grace Periods</a></h2>
|
|
||||||
|
|
||||||
<p>
|
|
||||||
<tt>CONFIG_PREEMPT=y</tt> kernels implement RCU-preempt.
|
|
||||||
The overall flow of the handling of a given CPU by an RCU-preempt
|
|
||||||
expedited grace period is shown in the following diagram:
|
|
||||||
|
|
||||||
<p><img src="ExpRCUFlow.svg" alt="ExpRCUFlow.svg" width="55%">
|
|
||||||
|
|
||||||
<p>
|
|
||||||
The solid arrows denote direct action, for example, a function call.
|
|
||||||
The dotted arrows denote indirect action, for example, an IPI
|
|
||||||
or a state that is reached after some time.
|
|
||||||
|
|
||||||
<p>
|
|
||||||
If a given CPU is offline or idle, <tt>synchronize_rcu_expedited()</tt>
|
|
||||||
will ignore it because idle and offline CPUs are already residing
|
|
||||||
in quiescent states.
|
|
||||||
Otherwise, the expedited grace period will use
|
|
||||||
<tt>smp_call_function_single()</tt> to send the CPU an IPI, which
|
|
||||||
is handled by <tt>rcu_exp_handler()</tt>.
|
|
||||||
|
|
||||||
<p>
|
|
||||||
However, because this is preemptible RCU, <tt>rcu_exp_handler()</tt>
|
|
||||||
can check to see if the CPU is currently running in an RCU read-side
|
|
||||||
critical section.
|
|
||||||
If not, the handler can immediately report a quiescent state.
|
|
||||||
Otherwise, it sets flags so that the outermost <tt>rcu_read_unlock()</tt>
|
|
||||||
invocation will provide the needed quiescent-state report.
|
|
||||||
This flag-setting avoids the previous forced preemption of all
|
|
||||||
CPUs that might have RCU read-side critical sections.
|
|
||||||
In addition, this flag-setting is done so as to avoid increasing
|
|
||||||
the overhead of the common-case fastpath through the scheduler.
|
|
||||||
|
|
||||||
<p>
|
|
||||||
Again because this is preemptible RCU, an RCU read-side critical section
|
|
||||||
can be preempted.
|
|
||||||
When that happens, RCU will enqueue the task, which will then continue to
|
|
||||||
block the current expedited grace period until it resumes and finds its
|
|
||||||
outermost <tt>rcu_read_unlock()</tt>.
|
|
||||||
The CPU will report a quiescent state just after enqueuing the task because
|
|
||||||
the CPU is no longer blocking the grace period.
|
|
||||||
It is instead the preempted task doing the blocking.
|
|
||||||
The list of blocked tasks is managed by <tt>rcu_preempt_ctxt_queue()</tt>,
|
|
||||||
which is called from <tt>rcu_preempt_note_context_switch()</tt>, which
|
|
||||||
in turn is called from <tt>rcu_note_context_switch()</tt>, which in
|
|
||||||
turn is called from the scheduler.
|
|
||||||
|
|
||||||
<table>
|
|
||||||
<tr><th> </th></tr>
|
|
||||||
<tr><th align="left">Quick Quiz:</th></tr>
|
|
||||||
<tr><td>
|
|
||||||
Why not just have the expedited grace period check the
|
|
||||||
state of all the CPUs?
|
|
||||||
After all, that would avoid all those real-time-unfriendly IPIs.
|
|
||||||
</td></tr>
|
|
||||||
<tr><th align="left">Answer:</th></tr>
|
|
||||||
<tr><td bgcolor="#ffffff"><font color="ffffff">
|
|
||||||
Because we want the RCU read-side critical sections to run fast,
|
|
||||||
which means no memory barriers.
|
|
||||||
Therefore, it is not possible to safely check the state from some
|
|
||||||
other CPU.
|
|
||||||
And even if it was possible to safely check the state, it would
|
|
||||||
still be necessary to IPI the CPU to safely interact with the
|
|
||||||
upcoming <tt>rcu_read_unlock()</tt> invocation, which means that
|
|
||||||
the remote state testing would not help the worst-case
|
|
||||||
latency that real-time applications care about.
|
|
||||||
|
|
||||||
<p><font color="ffffff">One way to prevent your real-time
|
|
||||||
application from getting hit with these IPIs is to
|
|
||||||
build your kernel with <tt>CONFIG_NO_HZ_FULL=y</tt>.
|
|
||||||
RCU would then perceive the CPU running your application
|
|
||||||
as being idle, and it would be able to safely detect that
|
|
||||||
state without needing to IPI the CPU.
|
|
||||||
</font></td></tr>
|
|
||||||
<tr><td> </td></tr>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
<p>
|
|
||||||
Please note that this is just the overall flow:
|
|
||||||
Additional complications can arise due to races with CPUs going idle
|
|
||||||
or offline, among other things.
|
|
||||||
|
|
||||||
<h2><a name="RCU-sched Expedited Grace Periods">
|
|
||||||
RCU-sched Expedited Grace Periods</a></h2>
|
|
||||||
|
|
||||||
<p>
|
|
||||||
<tt>CONFIG_PREEMPT=n</tt> kernels implement RCU-sched.
|
|
||||||
The overall flow of the handling of a given CPU by an RCU-sched
|
|
||||||
expedited grace period is shown in the following diagram:
|
|
||||||
|
|
||||||
<p><img src="ExpSchedFlow.svg" alt="ExpSchedFlow.svg" width="55%">
|
|
||||||
|
|
||||||
<p>
|
|
||||||
As with RCU-preempt, RCU-sched's
|
|
||||||
<tt>synchronize_rcu_expedited()</tt> ignores offline and
|
|
||||||
idle CPUs, again because they are in remotely detectable
|
|
||||||
quiescent states.
|
|
||||||
However, because the
|
|
||||||
<tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt>
|
|
||||||
leave no trace of their invocation, in general it is not possible to tell
|
|
||||||
whether or not the current CPU is in an RCU read-side critical section.
|
|
||||||
The best that RCU-sched's <tt>rcu_exp_handler()</tt> can do is to check
|
|
||||||
for idle, on the off-chance that the CPU went idle while the IPI
|
|
||||||
was in flight.
|
|
||||||
If the CPU is idle, then <tt>rcu_exp_handler()</tt> reports
|
|
||||||
the quiescent state.
|
|
||||||
|
|
||||||
<p> Otherwise, the handler forces a future context switch by setting the
|
|
||||||
NEED_RESCHED flag of the current task's thread flag and the CPU preempt
|
|
||||||
counter.
|
|
||||||
At the time of the context switch, the CPU reports the quiescent state.
|
|
||||||
Should the CPU go offline first, it will report the quiescent state
|
|
||||||
at that time.
|
|
||||||
|
|
||||||
<h2><a name="Expedited Grace Period and CPU Hotplug">
|
|
||||||
Expedited Grace Period and CPU Hotplug</a></h2>
|
|
||||||
|
|
||||||
<p>
|
|
||||||
The expedited nature of expedited grace periods requires a much tighter
|
|
||||||
interaction with CPU hotplug operations than is required for normal
|
|
||||||
grace periods.
|
|
||||||
In addition, attempting to IPI offline CPUs will result in splats, but
|
|
||||||
failing to IPI online CPUs can result in too-short grace periods.
|
|
||||||
Neither option is acceptable in production kernels.
|
|
||||||
|
|
||||||
<p>
|
|
||||||
The interaction between expedited grace periods and CPU hotplug operations
|
|
||||||
is carried out at several levels:
|
|
||||||
|
|
||||||
<ol>
|
|
||||||
<li> The number of CPUs that have ever been online is tracked
|
|
||||||
by the <tt>rcu_state</tt> structure's <tt>->ncpus</tt>
|
|
||||||
field.
|
|
||||||
The <tt>rcu_state</tt> structure's <tt>->ncpus_snap</tt>
|
|
||||||
field tracks the number of CPUs that have ever been online
|
|
||||||
at the beginning of an RCU expedited grace period.
|
|
||||||
Note that this number never decreases, at least in the absence
|
|
||||||
of a time machine.
|
|
||||||
<li> The identities of the CPUs that have ever been online are
|
|
||||||
tracked by the <tt>rcu_node</tt> structure's
|
|
||||||
<tt>->expmaskinitnext</tt> field.
|
|
||||||
The <tt>rcu_node</tt> structure's <tt>->expmaskinit</tt>
|
|
||||||
field tracks the identities of the CPUs that were online
|
|
||||||
at least once at the beginning of the most recent RCU
|
|
||||||
expedited grace period.
|
|
||||||
The <tt>rcu_state</tt> structure's <tt>->ncpus</tt> and
|
|
||||||
<tt>->ncpus_snap</tt> fields are used to detect when
|
|
||||||
new CPUs have come online for the first time, that is,
|
|
||||||
when the <tt>rcu_node</tt> structure's <tt>->expmaskinitnext</tt>
|
|
||||||
field has changed since the beginning of the last RCU
|
|
||||||
expedited grace period, which triggers an update of each
|
|
||||||
<tt>rcu_node</tt> structure's <tt>->expmaskinit</tt>
|
|
||||||
field from its <tt>->expmaskinitnext</tt> field.
|
|
||||||
<li> Each <tt>rcu_node</tt> structure's <tt>->expmaskinit</tt>
|
|
||||||
field is used to initialize that structure's
|
|
||||||
<tt>->expmask</tt> at the beginning of each RCU
|
|
||||||
expedited grace period.
|
|
||||||
This means that only those CPUs that have been online at least
|
|
||||||
once will be considered for a given grace period.
|
|
||||||
<li> Any CPU that goes offline will clear its bit in its leaf
|
|
||||||
<tt>rcu_node</tt> structure's <tt>->qsmaskinitnext</tt>
|
|
||||||
field, so any CPU with that bit clear can safely be ignored.
|
|
||||||
However, it is possible for a CPU coming online or going offline
|
|
||||||
to have this bit set for some time while <tt>cpu_online</tt>
|
|
||||||
returns <tt>false</tt>.
|
|
||||||
<li> For each non-idle CPU that RCU believes is currently online, the grace
|
|
||||||
period invokes <tt>smp_call_function_single()</tt>.
|
|
||||||
If this succeeds, the CPU was fully online.
|
|
||||||
Failure indicates that the CPU is in the process of coming online
|
|
||||||
or going offline, in which case it is necessary to wait for a
|
|
||||||
short time period and try again.
|
|
||||||
The purpose of this wait (or series of waits, as the case may be)
|
|
||||||
is to permit a concurrent CPU-hotplug operation to complete.
|
|
||||||
<li> In the case of RCU-sched, one of the last acts of an outgoing CPU
|
|
||||||
is to invoke <tt>rcu_report_dead()</tt>, which
|
|
||||||
reports a quiescent state for that CPU.
|
|
||||||
However, this is likely paranoia-induced redundancy. <!-- @@@ -->
|
|
||||||
</ol>
|
|
||||||
|
|
||||||
<table>
|
|
||||||
<tr><th> </th></tr>
|
|
||||||
<tr><th align="left">Quick Quiz:</th></tr>
|
|
||||||
<tr><td>
|
|
||||||
Why all the dancing around with multiple counters and masks
|
|
||||||
tracking CPUs that were once online?
|
|
||||||
Why not just have a single set of masks tracking the currently
|
|
||||||
online CPUs and be done with it?
|
|
||||||
</td></tr>
|
|
||||||
<tr><th align="left">Answer:</th></tr>
|
|
||||||
<tr><td bgcolor="#ffffff"><font color="ffffff">
|
|
||||||
Maintaining a single set of masks tracking the online CPUs <i>sounds</i>
|
|
||||||
easier, at least until you try working out all the race conditions
|
|
||||||
between grace-period initialization and CPU-hotplug operations.
|
|
||||||
For example, suppose initialization is progressing down the
|
|
||||||
tree while a CPU-offline operation is progressing up the tree.
|
|
||||||
This situation can result in bits set at the top of the tree
|
|
||||||
that have no counterparts at the bottom of the tree.
|
|
||||||
Those bits will never be cleared, which will result in
|
|
||||||
grace-period hangs.
|
|
||||||
In short, that way lies madness, to say nothing of a great many
|
|
||||||
bugs, hangs, and deadlocks.
|
|
||||||
|
|
||||||
<p><font color="ffffff">
|
|
||||||
In contrast, the current multi-mask multi-counter scheme ensures
|
|
||||||
that grace-period initialization will always see consistent masks
|
|
||||||
up and down the tree, which brings significant simplifications
|
|
||||||
over the single-mask method.
|
|
||||||
|
|
||||||
<p><font color="ffffff">
|
|
||||||
This is an instance of
|
|
||||||
<a href="http://www.cs.columbia.edu/~library/TR-repository/reports/reports-1992/cucs-039-92.ps.gz"><font color="ffffff">
|
|
||||||
deferring work in order to avoid synchronization</a>.
|
|
||||||
Lazily recording CPU-hotplug events at the beginning of the next
|
|
||||||
grace period greatly simplifies maintenance of the CPU-tracking
|
|
||||||
bitmasks in the <tt>rcu_node</tt> tree.
|
|
||||||
</font></td></tr>
|
|
||||||
<tr><td> </td></tr>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
<h2><a name="Expedited Grace Period Refinements">
|
|
||||||
Expedited Grace Period Refinements</a></h2>
|
|
||||||
|
|
||||||
<ol>
|
|
||||||
<li> <a href="#Idle-CPU Checks">Idle-CPU checks</a>.
|
|
||||||
<li> <a href="#Batching via Sequence Counter">
|
|
||||||
Batching via sequence counter</a>.
|
|
||||||
<li> <a href="#Funnel Locking and Wait/Wakeup">
|
|
||||||
Funnel locking and wait/wakeup</a>.
|
|
||||||
<li> <a href="#Use of Workqueues">Use of Workqueues</a>.
|
|
||||||
<li> <a href="#Stall Warnings">Stall warnings</a>.
|
|
||||||
<li> <a href="#Mid-Boot Operation">Mid-boot operation</a>.
|
|
||||||
</ol>
|
|
||||||
|
|
||||||
<h3><a name="Idle-CPU Checks">Idle-CPU Checks</a></h3>
|
|
||||||
|
|
||||||
<p>
|
|
||||||
Each expedited grace period checks for idle CPUs when initially forming
|
|
||||||
the mask of CPUs to be IPIed and again just before IPIing a CPU
|
|
||||||
(both checks are carried out by <tt>sync_rcu_exp_select_cpus()</tt>).
|
|
||||||
If the CPU is idle at any time between those two times, the CPU will
|
|
||||||
not be IPIed.
|
|
||||||
Instead, the task pushing the grace period forward will include the
|
|
||||||
idle CPUs in the mask passed to <tt>rcu_report_exp_cpu_mult()</tt>.
|
|
||||||
|
|
||||||
<p>
|
|
||||||
For RCU-sched, there is an additional check:
|
|
||||||
If the IPI has interrupted the idle loop, then
|
|
||||||
<tt>rcu_exp_handler()</tt> invokes <tt>rcu_report_exp_rdp()</tt>
|
|
||||||
to report the corresponding quiescent state.
|
|
||||||
|
|
||||||
<p>
|
|
||||||
For RCU-preempt, there is no specific check for idle in the
|
|
||||||
IPI handler (<tt>rcu_exp_handler()</tt>), but because
|
|
||||||
RCU read-side critical sections are not permitted within the
|
|
||||||
idle loop, if <tt>rcu_exp_handler()</tt> sees that the CPU is within
|
|
||||||
RCU read-side critical section, the CPU cannot possibly be idle.
|
|
||||||
Otherwise, <tt>rcu_exp_handler()</tt> invokes
|
|
||||||
<tt>rcu_report_exp_rdp()</tt> to report the corresponding quiescent
|
|
||||||
state, regardless of whether or not that quiescent state was due to
|
|
||||||
the CPU being idle.
|
|
||||||
|
|
||||||
<p>
|
|
||||||
In summary, RCU expedited grace periods check for idle when building
|
|
||||||
the bitmask of CPUs that must be IPIed, just before sending each IPI,
|
|
||||||
and (either explicitly or implicitly) within the IPI handler.
|
|
||||||
|
|
||||||
<h3><a name="Batching via Sequence Counter">
|
|
||||||
Batching via Sequence Counter</a></h3>
|
|
||||||
|
|
||||||
<p>
|
|
||||||
If each grace-period request was carried out separately, expedited
|
|
||||||
grace periods would have abysmal scalability and
|
|
||||||
problematic high-load characteristics.
|
|
||||||
Because each grace-period operation can serve an unlimited number of
|
|
||||||
updates, it is important to <i>batch</i> requests, so that a single
|
|
||||||
expedited grace-period operation will cover all requests in the
|
|
||||||
corresponding batch.
|
|
||||||
|
|
||||||
<p>
|
|
||||||
This batching is controlled by a sequence counter named
|
|
||||||
<tt>->expedited_sequence</tt> in the <tt>rcu_state</tt> structure.
|
|
||||||
This counter has an odd value when there is an expedited grace period
|
|
||||||
in progress and an even value otherwise, so that dividing the counter
|
|
||||||
value by two gives the number of completed grace periods.
|
|
||||||
During any given update request, the counter must transition from
|
|
||||||
even to odd and then back to even, thus indicating that a grace
|
|
||||||
period has elapsed.
|
|
||||||
Therefore, if the initial value of the counter is <tt>s</tt>,
|
|
||||||
the updater must wait until the counter reaches at least the
|
|
||||||
value <tt>(s+3)&~0x1</tt>.
|
|
||||||
This counter is managed by the following access functions:
|
|
||||||
|
|
||||||
<ol>
|
|
||||||
<li> <tt>rcu_exp_gp_seq_start()</tt>, which marks the start of
|
|
||||||
an expedited grace period.
|
|
||||||
<li> <tt>rcu_exp_gp_seq_end()</tt>, which marks the end of an
|
|
||||||
expedited grace period.
|
|
||||||
<li> <tt>rcu_exp_gp_seq_snap()</tt>, which obtains a snapshot of
|
|
||||||
the counter.
|
|
||||||
<li> <tt>rcu_exp_gp_seq_done()</tt>, which returns <tt>true</tt>
|
|
||||||
if a full expedited grace period has elapsed since the
|
|
||||||
corresponding call to <tt>rcu_exp_gp_seq_snap()</tt>.
|
|
||||||
</ol>
|
|
||||||
|
|
||||||
<p>
|
|
||||||
Again, only one request in a given batch need actually carry out
|
|
||||||
a grace-period operation, which means there must be an efficient
|
|
||||||
way to identify which of many concurrent requests will initiate
|
|
||||||
the grace period, and that there be an efficient way for the
|
|
||||||
remaining requests to wait for that grace period to complete.
|
|
||||||
However, that is the topic of the next section.
|
|
||||||
|
|
||||||
<h3><a name="Funnel Locking and Wait/Wakeup">
|
|
||||||
Funnel Locking and Wait/Wakeup</a></h3>
|
|
||||||
|
|
||||||
<p>
|
|
||||||
The natural way to sort out which of a batch of updaters will initiate
|
|
||||||
the expedited grace period is to use the <tt>rcu_node</tt> combining
|
|
||||||
tree, as implemented by the <tt>exp_funnel_lock()</tt> function.
|
|
||||||
The first updater corresponding to a given grace period arriving
|
|
||||||
at a given <tt>rcu_node</tt> structure records its desired grace-period
|
|
||||||
sequence number in the <tt>->exp_seq_rq</tt> field and moves up
|
|
||||||
to the next level in the tree.
|
|
||||||
Otherwise, if the <tt>->exp_seq_rq</tt> field already contains
|
|
||||||
the sequence number for the desired grace period or some later one,
|
|
||||||
the updater blocks on one of four wait queues in the
|
|
||||||
<tt>->exp_wq[]</tt> array, using the second-from-bottom
|
|
||||||
and third-from-bottom bits as an index.
|
|
||||||
An <tt>->exp_lock</tt> field in the <tt>rcu_node</tt> structure
|
|
||||||
synchronizes access to these fields.
|
|
||||||
|
|
||||||
<p>
|
|
||||||
An empty <tt>rcu_node</tt> tree is shown in the following diagram,
|
|
||||||
with the white cells representing the <tt>->exp_seq_rq</tt> field
|
|
||||||
and the red cells representing the elements of the
|
|
||||||
<tt>->exp_wq[]</tt> array.
|
|
||||||
|
|
||||||
<p><img src="Funnel0.svg" alt="Funnel0.svg" width="75%">
|
|
||||||
|
|
||||||
<p>
|
|
||||||
The next diagram shows the situation after the arrival of Task A
|
|
||||||
and Task B at the leftmost and rightmost leaf <tt>rcu_node</tt>
|
|
||||||
structures, respectively.
|
|
||||||
The current value of the <tt>rcu_state</tt> structure's
|
|
||||||
<tt>->expedited_sequence</tt> field is zero, so adding three and
|
|
||||||
clearing the bottom bit results in the value two, which both tasks
|
|
||||||
record in the <tt>->exp_seq_rq</tt> field of their respective
|
|
||||||
<tt>rcu_node</tt> structures:
|
|
||||||
|
|
||||||
<p><img src="Funnel1.svg" alt="Funnel1.svg" width="75%">
|
|
||||||
|
|
||||||
<p>
|
|
||||||
Each of Tasks A and B will move up to the root
|
|
||||||
<tt>rcu_node</tt> structure.
|
|
||||||
Suppose that Task A wins, recording its desired grace-period sequence
|
|
||||||
number and resulting in the state shown below:
|
|
||||||
|
|
||||||
<p><img src="Funnel2.svg" alt="Funnel2.svg" width="75%">
|
|
||||||
|
|
||||||
<p>
|
|
||||||
Task A now advances to initiate a new grace period, while Task B
|
|
||||||
moves up to the root <tt>rcu_node</tt> structure, and, seeing that
|
|
||||||
its desired sequence number is already recorded, blocks on
|
|
||||||
<tt>->exp_wq[1]</tt>.
|
|
||||||
|
|
||||||
<table>
|
|
||||||
<tr><th> </th></tr>
|
|
||||||
<tr><th align="left">Quick Quiz:</th></tr>
|
|
||||||
<tr><td>
|
|
||||||
Why <tt>->exp_wq[1]</tt>?
|
|
||||||
Given that the value of these tasks' desired sequence number is
|
|
||||||
two, shouldn't they instead block on <tt>->exp_wq[2]</tt>?
|
|
||||||
</td></tr>
|
|
||||||
<tr><th align="left">Answer:</th></tr>
|
|
||||||
<tr><td bgcolor="#ffffff"><font color="ffffff">
|
|
||||||
No.
|
|
||||||
|
|
||||||
<p><font color="ffffff">
|
|
||||||
Recall that the bottom bit of the desired sequence number indicates
|
|
||||||
whether or not a grace period is currently in progress.
|
|
||||||
It is therefore necessary to shift the sequence number right one
|
|
||||||
bit position to obtain the number of the grace period.
|
|
||||||
This results in <tt>->exp_wq[1]</tt>.
|
|
||||||
</font></td></tr>
|
|
||||||
<tr><td> </td></tr>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
<p>
|
|
||||||
If Tasks C and D also arrive at this point, they will compute the
|
|
||||||
same desired grace-period sequence number, and see that both leaf
|
|
||||||
<tt>rcu_node</tt> structures already have that value recorded.
|
|
||||||
They will therefore block on their respective <tt>rcu_node</tt>
|
|
||||||
structures' <tt>->exp_wq[1]</tt> fields, as shown below:
|
|
||||||
|
|
||||||
<p><img src="Funnel3.svg" alt="Funnel3.svg" width="75%">
|
|
||||||
|
|
||||||
<p>
|
|
||||||
Task A now acquires the <tt>rcu_state</tt> structure's
|
|
||||||
<tt>->exp_mutex</tt> and initiates the grace period, which
|
|
||||||
increments <tt>->expedited_sequence</tt>.
|
|
||||||
Therefore, if Tasks E and F arrive, they will compute
|
|
||||||
a desired sequence number of 4 and will record this value as
|
|
||||||
shown below:
|
|
||||||
|
|
||||||
<p><img src="Funnel4.svg" alt="Funnel4.svg" width="75%">
|
|
||||||
|
|
||||||
<p>
|
|
||||||
Tasks E and F will propagate up the <tt>rcu_node</tt>
|
|
||||||
combining tree, with Task F blocking on the root <tt>rcu_node</tt>
|
|
||||||
structure and Task E waiting for Task A to finish so that
|
|
||||||
it can start the next grace period.
|
|
||||||
The resulting state is as shown below:
|
|
||||||
|
|
||||||
<p><img src="Funnel5.svg" alt="Funnel5.svg" width="75%">
|
|
||||||
|
|
||||||
<p>
|
|
||||||
Once the grace period completes, Task A
|
|
||||||
starts waking up the tasks waiting for this grace period to complete,
|
|
||||||
increments the <tt>->expedited_sequence</tt>,
|
|
||||||
acquires the <tt>->exp_wake_mutex</tt> and then releases the
|
|
||||||
<tt>->exp_mutex</tt>.
|
|
||||||
This results in the following state:
|
|
||||||
|
|
||||||
<p><img src="Funnel6.svg" alt="Funnel6.svg" width="75%">
|
|
||||||
|
|
||||||
<p>
|
|
||||||
Task E can then acquire <tt>->exp_mutex</tt> and increment
|
|
||||||
<tt>->expedited_sequence</tt> to the value three.
|
|
||||||
If new tasks G and H arrive and move up the combining tree at the
|
|
||||||
same time, the state will be as follows:
|
|
||||||
|
|
||||||
<p><img src="Funnel7.svg" alt="Funnel7.svg" width="75%">
|
|
||||||
|
|
||||||
<p>
|
|
||||||
Note that three of the root <tt>rcu_node</tt> structure's
|
|
||||||
waitqueues are now occupied.
|
|
||||||
However, at some point, Task A will wake up the
|
|
||||||
tasks blocked on the <tt>->exp_wq</tt> waitqueues, resulting
|
|
||||||
in the following state:
|
|
||||||
|
|
||||||
<p><img src="Funnel8.svg" alt="Funnel8.svg" width="75%">
|
|
||||||
|
|
||||||
<p>
|
|
||||||
Execution will continue with Tasks E and H completing
|
|
||||||
their grace periods and carrying out their wakeups.
|
|
||||||
|
|
||||||
<table>
|
|
||||||
<tr><th> </th></tr>
|
|
||||||
<tr><th align="left">Quick Quiz:</th></tr>
|
|
||||||
<tr><td>
|
|
||||||
What happens if Task A takes so long to do its wakeups
|
|
||||||
that Task E's grace period completes?
|
|
||||||
</td></tr>
|
|
||||||
<tr><th align="left">Answer:</th></tr>
|
|
||||||
<tr><td bgcolor="#ffffff"><font color="ffffff">
|
|
||||||
Then Task E will block on the <tt>->exp_wake_mutex</tt>,
|
|
||||||
which will also prevent it from releasing <tt>->exp_mutex</tt>,
|
|
||||||
which in turn will prevent the next grace period from starting.
|
|
||||||
This last is important in preventing overflow of the
|
|
||||||
<tt>->exp_wq[]</tt> array.
|
|
||||||
</font></td></tr>
|
|
||||||
<tr><td> </td></tr>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
<h3><a name="Use of Workqueues">Use of Workqueues</a></h3>
|
|
||||||
|
|
||||||
<p>
|
|
||||||
In earlier implementations, the task requesting the expedited
|
|
||||||
grace period also drove it to completion.
|
|
||||||
This straightforward approach had the disadvantage of needing to
|
|
||||||
account for POSIX signals sent to user tasks,
|
|
||||||
so more recent implementations use the Linux kernel's
|
|
||||||
<a href="https://www.kernel.org/doc/Documentation/core-api/workqueue.rst">workqueues</a>.
|
|
||||||
|
|
||||||
<p>
|
|
||||||
The requesting task still does counter snapshotting and funnel-lock
|
|
||||||
processing, but the task reaching the top of the funnel lock
|
|
||||||
does a <tt>schedule_work()</tt> (from <tt>_synchronize_rcu_expedited()</tt>)
|
|
||||||
so that a workqueue kthread does the actual grace-period processing.
|
|
||||||
Because workqueue kthreads do not accept POSIX signals, grace-period-wait
|
|
||||||
processing need not allow for POSIX signals.
|
|
||||||
|
|
||||||
In addition, this approach allows wakeups for the previous expedited
|
|
||||||
grace period to be overlapped with processing for the next expedited
|
|
||||||
grace period.
|
|
||||||
Because there are only four sets of waitqueues, it is necessary to
|
|
||||||
ensure that the previous grace period's wakeups complete before the
|
|
||||||
next grace period's wakeups start.
|
|
||||||
This is handled by having the <tt>->exp_mutex</tt>
|
|
||||||
guard expedited grace-period processing and the
|
|
||||||
<tt>->exp_wake_mutex</tt> guard wakeups.
|
|
||||||
The key point is that the <tt>->exp_mutex</tt> is not released
|
|
||||||
until the first wakeup is complete, which means that the
|
|
||||||
<tt>->exp_wake_mutex</tt> has already been acquired at that point.
|
|
||||||
This approach ensures that the previous grace period's wakeups can
|
|
||||||
be carried out while the current grace period is in process, but
|
|
||||||
that these wakeups will complete before the next grace period starts.
|
|
||||||
This means that only three waitqueues are required, guaranteeing that
|
|
||||||
the four that are provided are sufficient.
|
|
||||||
|
|
||||||
<h3><a name="Stall Warnings">Stall Warnings</a></h3>
|
|
||||||
|
|
||||||
<p>
|
|
||||||
Expediting grace periods does nothing to speed things up when RCU
|
|
||||||
readers take too long, and therefore expedited grace periods check
|
|
||||||
for stalls just as normal grace periods do.
|
|
||||||
|
|
||||||
<table>
|
|
||||||
<tr><th> </th></tr>
|
|
||||||
<tr><th align="left">Quick Quiz:</th></tr>
|
|
||||||
<tr><td>
|
|
||||||
But why not just let the normal grace-period machinery
|
|
||||||
detect the stalls, given that a given reader must block
|
|
||||||
both normal and expedited grace periods?
|
|
||||||
</td></tr>
|
|
||||||
<tr><th align="left">Answer:</th></tr>
|
|
||||||
<tr><td bgcolor="#ffffff"><font color="ffffff">
|
|
||||||
Because it is quite possible that at a given time there
|
|
||||||
is no normal grace period in progress, in which case the
|
|
||||||
normal grace period cannot emit a stall warning.
|
|
||||||
</font></td></tr>
|
|
||||||
<tr><td> </td></tr>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
The <tt>synchronize_sched_expedited_wait()</tt> function loops waiting
|
|
||||||
for the expedited grace period to end, but with a timeout set to the
|
|
||||||
current RCU CPU stall-warning time.
|
|
||||||
If this time is exceeded, any CPUs or <tt>rcu_node</tt> structures
|
|
||||||
blocking the current grace period are printed.
|
|
||||||
Each stall warning results in another pass through the loop, but the
|
|
||||||
second and subsequent passes use longer stall times.
|
|
||||||
|
|
||||||
<h3><a name="Mid-Boot Operation">Mid-boot operation</a></h3>
|
|
||||||
|
|
||||||
<p>
|
|
||||||
The use of workqueues has the advantage that the expedited
|
|
||||||
grace-period code need not worry about POSIX signals.
|
|
||||||
Unfortunately, it has the
|
|
||||||
corresponding disadvantage that workqueues cannot be used until
|
|
||||||
they are initialized, which does not happen until some time after
|
|
||||||
the scheduler spawns the first task.
|
|
||||||
Given that there are parts of the kernel that really do want to
|
|
||||||
execute grace periods during this mid-boot “dead zone”,
|
|
||||||
expedited grace periods must do something else during this time.
|
|
||||||
|
|
||||||
<p>
|
|
||||||
What they do is to fall back to the old practice of requiring that the
|
|
||||||
requesting task drive the expedited grace period, as was the case
|
|
||||||
before the use of workqueues.
|
|
||||||
However, the requesting task is only required to drive the grace period
|
|
||||||
during the mid-boot dead zone.
|
|
||||||
Before mid-boot, a synchronous grace period is a no-op.
|
|
||||||
Some time after mid-boot, workqueues are used.
|
|
||||||
|
|
||||||
<p>
|
|
||||||
Non-expedited non-SRCU synchronous grace periods must also operate
|
|
||||||
normally during mid-boot.
|
|
||||||
This is handled by causing non-expedited grace periods to take the
|
|
||||||
expedited code path during mid-boot.
|
|
||||||
|
|
||||||
<p>
|
|
||||||
The current code assumes that there are no POSIX signals during
|
|
||||||
the mid-boot dead zone.
|
|
||||||
However, if an overwhelming need for POSIX signals somehow arises,
|
|
||||||
appropriate adjustments can be made to the expedited stall-warning code.
|
|
||||||
One such adjustment would reinstate the pre-workqueue stall-warning
|
|
||||||
checks, but only during the mid-boot dead zone.
|
|
||||||
|
|
||||||
<p>
|
|
||||||
With this refinement, synchronous grace periods can now be used from
|
|
||||||
task context pretty much any time during the life of the kernel.
|
|
||||||
That is, aside from some points in the suspend, hibernate, or shutdown
|
|
||||||
code path.
|
|
||||||
|
|
||||||
<h3><a name="Summary">
|
|
||||||
Summary</a></h3>
|
|
||||||
|
|
||||||
<p>
|
|
||||||
Expedited grace periods use a sequence-number approach to promote
|
|
||||||
batching, so that a single grace-period operation can serve numerous
|
|
||||||
requests.
|
|
||||||
A funnel lock is used to efficiently identify the one task out of
|
|
||||||
a concurrent group that will request the grace period.
|
|
||||||
All members of the group will block on waitqueues provided in
|
|
||||||
the <tt>rcu_node</tt> structure.
|
|
||||||
The actual grace-period processing is carried out by a workqueue.
|
|
||||||
|
|
||||||
<p>
|
|
||||||
CPU-hotplug operations are noted lazily in order to prevent the need
|
|
||||||
for tight synchronization between expedited grace periods and
|
|
||||||
CPU-hotplug operations.
|
|
||||||
The dyntick-idle counters are used to avoid sending IPIs to idle CPUs,
|
|
||||||
at least in the common case.
|
|
||||||
RCU-preempt and RCU-sched use different IPI handlers and different
|
|
||||||
code to respond to the state changes carried out by those handlers,
|
|
||||||
but otherwise use common code.
|
|
||||||
|
|
||||||
<p>
|
|
||||||
Quiescent states are tracked using the <tt>rcu_node</tt> tree,
|
|
||||||
and once all necessary quiescent states have been reported,
|
|
||||||
all tasks waiting on this expedited grace period are awakened.
|
|
||||||
A pair of mutexes are used to allow one grace period's wakeups
|
|
||||||
to proceed concurrently with the next grace period's processing.
|
|
||||||
|
|
||||||
<p>
|
|
||||||
This combination of mechanisms allows expedited grace periods to
|
|
||||||
run reasonably efficiently.
|
|
||||||
However, for non-time-critical tasks, normal grace periods should be
|
|
||||||
used instead because their longer duration permits much higher
|
|
||||||
degrees of batching, and thus much lower per-request overheads.
|
|
||||||
|
|
||||||
</body></html>
|
|
@@ -0,0 +1,521 @@
|
|||||||
|
=================================================
|
||||||
|
A Tour Through TREE_RCU's Expedited Grace Periods
|
||||||
|
=================================================
|
||||||
|
|
||||||
|
Introduction
|
||||||
|
============
|
||||||
|
|
||||||
|
This document describes RCU's expedited grace periods.
|
||||||
|
Unlike RCU's normal grace periods, which accept long latencies to attain
|
||||||
|
high efficiency and minimal disturbance, expedited grace periods accept
|
||||||
|
lower efficiency and significant disturbance to attain shorter latencies.
|
||||||
|
|
||||||
|
There are two flavors of RCU (RCU-preempt and RCU-sched), with an earlier
|
||||||
|
third RCU-bh flavor having been implemented in terms of the other two.
|
||||||
|
Each of the two implementations is covered in its own section.
|
||||||
|
|
||||||
|
Expedited Grace Period Design
|
||||||
|
=============================
|
||||||
|
|
||||||
|
The expedited RCU grace periods cannot be accused of being subtle,
|
||||||
|
given that they for all intents and purposes hammer every CPU that
|
||||||
|
has not yet provided a quiescent state for the current expedited
|
||||||
|
grace period.
|
||||||
|
The one saving grace is that the hammer has grown a bit smaller
|
||||||
|
over time: The old call to ``try_stop_cpus()`` has been
|
||||||
|
replaced with a set of calls to ``smp_call_function_single()``,
|
||||||
|
each of which results in an IPI to the target CPU.
|
||||||
|
The corresponding handler function checks the CPU's state, motivating
|
||||||
|
a faster quiescent state where possible, and triggering a report
|
||||||
|
of that quiescent state.
|
||||||
|
As always for RCU, once everything has spent some time in a quiescent
|
||||||
|
state, the expedited grace period has completed.
|
||||||
|
|
||||||
|
The details of the ``smp_call_function_single()`` handler's
|
||||||
|
operation depend on the RCU flavor, as described in the following
|
||||||
|
sections.
|
||||||
|
|
||||||
|
RCU-preempt Expedited Grace Periods
|
||||||
|
-----------------------------------
|
||||||
|
|
||||||
|
``CONFIG_PREEMPT=y`` kernels implement RCU-preempt.
|
||||||
|
The overall flow of the handling of a given CPU by an RCU-preempt
|
||||||
|
expedited grace period is shown in the following diagram:
|
||||||
|
|
||||||
|
.. kernel-figure:: ExpRCUFlow.svg
|
||||||
|
|
||||||
|
The solid arrows denote direct action, for example, a function call.
|
||||||
|
The dotted arrows denote indirect action, for example, an IPI
|
||||||
|
or a state that is reached after some time.
|
||||||
|
|
||||||
|
If a given CPU is offline or idle, ``synchronize_rcu_expedited()``
|
||||||
|
will ignore it because idle and offline CPUs are already residing
|
||||||
|
in quiescent states.
|
||||||
|
Otherwise, the expedited grace period will use
|
||||||
|
``smp_call_function_single()`` to send the CPU an IPI, which
|
||||||
|
is handled by ``rcu_exp_handler()``.
|
||||||
|
|
||||||
|
However, because this is preemptible RCU, ``rcu_exp_handler()``
|
||||||
|
can check to see if the CPU is currently running in an RCU read-side
|
||||||
|
critical section.
|
||||||
|
If not, the handler can immediately report a quiescent state.
|
||||||
|
Otherwise, it sets flags so that the outermost ``rcu_read_unlock()``
|
||||||
|
invocation will provide the needed quiescent-state report.
|
||||||
|
This flag-setting avoids the previous forced preemption of all
|
||||||
|
CPUs that might have RCU read-side critical sections.
|
||||||
|
In addition, this flag-setting is done so as to avoid increasing
|
||||||
|
the overhead of the common-case fastpath through the scheduler.
|
||||||
|
|
||||||
|
Again because this is preemptible RCU, an RCU read-side critical section
|
||||||
|
can be preempted.
|
||||||
|
When that happens, RCU will enqueue the task, which will then continue to
|
||||||
|
block the current expedited grace period until it resumes and finds its
|
||||||
|
outermost ``rcu_read_unlock()``.
|
||||||
|
The CPU will report a quiescent state just after enqueuing the task because
|
||||||
|
the CPU is no longer blocking the grace period.
|
||||||
|
It is instead the preempted task doing the blocking.
|
||||||
|
The list of blocked tasks is managed by ``rcu_preempt_ctxt_queue()``,
|
||||||
|
which is called from ``rcu_preempt_note_context_switch()``, which
|
||||||
|
in turn is called from ``rcu_note_context_switch()``, which in
|
||||||
|
turn is called from the scheduler.
|
||||||
|
|
||||||
|
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| **Quick Quiz**: |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| Why not just have the expedited grace period check the state of all |
|
||||||
|
| the CPUs? After all, that would avoid all those real-time-unfriendly |
|
||||||
|
| IPIs. |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| **Answer**: |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| Because we want the RCU read-side critical sections to run fast, |
|
||||||
|
| which means no memory barriers. Therefore, it is not possible to |
|
||||||
|
| safely check the state from some other CPU. And even if it was |
|
||||||
|
| possible to safely check the state, it would still be necessary to |
|
||||||
|
| IPI the CPU to safely interact with the upcoming |
|
||||||
|
| ``rcu_read_unlock()`` invocation, which means that the remote state |
|
||||||
|
| testing would not help the worst-case latency that real-time |
|
||||||
|
| applications care about. |
|
||||||
|
| |
|
||||||
|
| One way to prevent your real-time application from getting hit with |
|
||||||
|
| these IPIs is to build your kernel with ``CONFIG_NO_HZ_FULL=y``. RCU |
|
||||||
|
| would then perceive the CPU running your application as being idle, |
|
||||||
|
| and it would be able to safely detect that state without needing to |
|
||||||
|
| IPI the CPU. |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
|
||||||
|
Please note that this is just the overall flow: Additional complications
|
||||||
|
can arise due to races with CPUs going idle or offline, among other
|
||||||
|
things.
|
||||||
|
|
||||||
|
RCU-sched Expedited Grace Periods
|
||||||
|
---------------------------------
|
||||||
|
|
||||||
|
``CONFIG_PREEMPT=n`` kernels implement RCU-sched. The overall flow of
|
||||||
|
the handling of a given CPU by an RCU-sched expedited grace period is
|
||||||
|
shown in the following diagram:
|
||||||
|
|
||||||
|
.. kernel-figure:: ExpSchedFlow.svg
|
||||||
|
|
||||||
|
As with RCU-preempt, RCU-sched's ``synchronize_rcu_expedited()`` ignores
|
||||||
|
offline and idle CPUs, again because they are in remotely detectable
|
||||||
|
quiescent states. However, because the ``rcu_read_lock_sched()`` and
|
||||||
|
``rcu_read_unlock_sched()`` leave no trace of their invocation, in
|
||||||
|
general it is not possible to tell whether or not the current CPU is in
|
||||||
|
an RCU read-side critical section. The best that RCU-sched's
|
||||||
|
``rcu_exp_handler()`` can do is to check for idle, on the off-chance
|
||||||
|
that the CPU went idle while the IPI was in flight. If the CPU is idle,
|
||||||
|
then ``rcu_exp_handler()`` reports the quiescent state.
|
||||||
|
|
||||||
|
Otherwise, the handler forces a future context switch by setting the
|
||||||
|
NEED_RESCHED flag of the current task's thread flag and the CPU preempt
|
||||||
|
counter. At the time of the context switch, the CPU reports the
|
||||||
|
quiescent state. Should the CPU go offline first, it will report the
|
||||||
|
quiescent state at that time.
|
||||||
|
|
||||||
|
Expedited Grace Period and CPU Hotplug
|
||||||
|
--------------------------------------
|
||||||
|
|
||||||
|
The expedited nature of expedited grace periods requires a much tighter
|
||||||
|
interaction with CPU hotplug operations than is required for normal
|
||||||
|
grace periods. In addition, attempting to IPI offline CPUs will result
|
||||||
|
in splats, but failing to IPI online CPUs can result in too-short grace
|
||||||
|
periods. Neither option is acceptable in production kernels.
|
||||||
|
|
||||||
|
The interaction between expedited grace periods and CPU hotplug
|
||||||
|
operations is carried out at several levels:
|
||||||
|
|
||||||
|
#. The number of CPUs that have ever been online is tracked by the
|
||||||
|
``rcu_state`` structure's ``->ncpus`` field. The ``rcu_state``
|
||||||
|
structure's ``->ncpus_snap`` field tracks the number of CPUs that
|
||||||
|
have ever been online at the beginning of an RCU expedited grace
|
||||||
|
period. Note that this number never decreases, at least in the
|
||||||
|
absence of a time machine.
|
||||||
|
#. The identities of the CPUs that have ever been online are tracked by
|
||||||
|
the ``rcu_node`` structure's ``->expmaskinitnext`` field. The
|
||||||
|
``rcu_node`` structure's ``->expmaskinit`` field tracks the
|
||||||
|
identities of the CPUs that were online at least once at the
|
||||||
|
beginning of the most recent RCU expedited grace period. The
|
||||||
|
``rcu_state`` structure's ``->ncpus`` and ``->ncpus_snap`` fields are
|
||||||
|
used to detect when new CPUs have come online for the first time,
|
||||||
|
that is, when the ``rcu_node`` structure's ``->expmaskinitnext``
|
||||||
|
field has changed since the beginning of the last RCU expedited grace
|
||||||
|
period, which triggers an update of each ``rcu_node`` structure's
|
||||||
|
``->expmaskinit`` field from its ``->expmaskinitnext`` field.
|
||||||
|
#. Each ``rcu_node`` structure's ``->expmaskinit`` field is used to
|
||||||
|
initialize that structure's ``->expmask`` at the beginning of each
|
||||||
|
RCU expedited grace period. This means that only those CPUs that have
|
||||||
|
been online at least once will be considered for a given grace
|
||||||
|
period.
|
||||||
|
#. Any CPU that goes offline will clear its bit in its leaf ``rcu_node``
|
||||||
|
structure's ``->qsmaskinitnext`` field, so any CPU with that bit
|
||||||
|
clear can safely be ignored. However, it is possible for a CPU coming
|
||||||
|
online or going offline to have this bit set for some time while
|
||||||
|
``cpu_online`` returns ``false``.
|
||||||
|
#. For each non-idle CPU that RCU believes is currently online, the
|
||||||
|
grace period invokes ``smp_call_function_single()``. If this
|
||||||
|
succeeds, the CPU was fully online. Failure indicates that the CPU is
|
||||||
|
in the process of coming online or going offline, in which case it is
|
||||||
|
necessary to wait for a short time period and try again. The purpose
|
||||||
|
of this wait (or series of waits, as the case may be) is to permit a
|
||||||
|
concurrent CPU-hotplug operation to complete.
|
||||||
|
#. In the case of RCU-sched, one of the last acts of an outgoing CPU is
|
||||||
|
to invoke ``rcu_report_dead()``, which reports a quiescent state for
|
||||||
|
that CPU. However, this is likely paranoia-induced redundancy.
|
||||||
|
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| **Quick Quiz**: |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| Why all the dancing around with multiple counters and masks tracking |
|
||||||
|
| CPUs that were once online? Why not just have a single set of masks |
|
||||||
|
| tracking the currently online CPUs and be done with it? |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| **Answer**: |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| Maintaining a single set of masks tracking the online CPUs *sounds* |
|
||||||
|
| easier, at least until you try working out all the race conditions |
|
||||||
|
| between grace-period initialization and CPU-hotplug operations. For |
|
||||||
|
| example, suppose initialization is progressing down the tree while a |
|
||||||
|
| CPU-offline operation is progressing up the tree. This situation can |
|
||||||
|
| result in bits set at the top of the tree that have no counterparts |
|
||||||
|
| at the bottom of the tree. Those bits will never be cleared, which |
|
||||||
|
| will result in grace-period hangs. In short, that way lies madness, |
|
||||||
|
| to say nothing of a great many bugs, hangs, and deadlocks. |
|
||||||
|
| In contrast, the current multi-mask multi-counter scheme ensures that |
|
||||||
|
| grace-period initialization will always see consistent masks up and |
|
||||||
|
| down the tree, which brings significant simplifications over the |
|
||||||
|
| single-mask method. |
|
||||||
|
| |
|
||||||
|
| This is an instance of `deferring work in order to avoid |
|
||||||
|
| synchronization <http://www.cs.columbia.edu/~library/TR-repository/re |
|
||||||
|
| ports/reports-1992/cucs-039-92.ps.gz>`__. |
|
||||||
|
| Lazily recording CPU-hotplug events at the beginning of the next |
|
||||||
|
| grace period greatly simplifies maintenance of the CPU-tracking |
|
||||||
|
| bitmasks in the ``rcu_node`` tree. |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
|
||||||
|
Expedited Grace Period Refinements
|
||||||
|
----------------------------------
|
||||||
|
|
||||||
|
Idle-CPU Checks
|
||||||
|
~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Each expedited grace period checks for idle CPUs when initially forming
|
||||||
|
the mask of CPUs to be IPIed and again just before IPIing a CPU (both
|
||||||
|
checks are carried out by ``sync_rcu_exp_select_cpus()``). If the CPU is
|
||||||
|
idle at any time between those two times, the CPU will not be IPIed.
|
||||||
|
Instead, the task pushing the grace period forward will include the idle
|
||||||
|
CPUs in the mask passed to ``rcu_report_exp_cpu_mult()``.
|
||||||
|
|
||||||
|
For RCU-sched, there is an additional check: If the IPI has interrupted
|
||||||
|
the idle loop, then ``rcu_exp_handler()`` invokes
|
||||||
|
``rcu_report_exp_rdp()`` to report the corresponding quiescent state.
|
||||||
|
|
||||||
|
For RCU-preempt, there is no specific check for idle in the IPI handler
|
||||||
|
(``rcu_exp_handler()``), but because RCU read-side critical sections are
|
||||||
|
not permitted within the idle loop, if ``rcu_exp_handler()`` sees that
|
||||||
|
the CPU is within an RCU read-side critical section, the CPU cannot
|
||||||
|
possibly be idle. Otherwise, ``rcu_exp_handler()`` invokes
|
||||||
|
``rcu_report_exp_rdp()`` to report the corresponding quiescent state,
|
||||||
|
regardless of whether or not that quiescent state was due to the CPU
|
||||||
|
being idle.
|
||||||
|
|
||||||
|
In summary, RCU expedited grace periods check for idle when building the
|
||||||
|
bitmask of CPUs that must be IPIed, just before sending each IPI, and
|
||||||
|
(either explicitly or implicitly) within the IPI handler.
|
||||||
|
|
||||||
|
Batching via Sequence Counter
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
If each grace-period request was carried out separately, expedited grace
|
||||||
|
periods would have abysmal scalability and problematic high-load
|
||||||
|
characteristics. Because each grace-period operation can serve an
|
||||||
|
unlimited number of updates, it is important to *batch* requests, so
|
||||||
|
that a single expedited grace-period operation will cover all requests
|
||||||
|
in the corresponding batch.
|
||||||
|
|
||||||
|
This batching is controlled by a sequence counter named
|
||||||
|
``->expedited_sequence`` in the ``rcu_state`` structure. This counter
|
||||||
|
has an odd value when there is an expedited grace period in progress and
|
||||||
|
an even value otherwise, so that dividing the counter value by two gives
|
||||||
|
the number of completed grace periods. During any given update request,
|
||||||
|
the counter must transition from even to odd and then back to even, thus
|
||||||
|
indicating that a grace period has elapsed. Therefore, if the initial
|
||||||
|
value of the counter is ``s``, the updater must wait until the counter
|
||||||
|
reaches at least the value ``(s+3)&~0x1``. This counter is managed by
|
||||||
|
the following access functions:
|
||||||
|
|
||||||
|
#. ``rcu_exp_gp_seq_start()``, which marks the start of an expedited
|
||||||
|
grace period.
|
||||||
|
#. ``rcu_exp_gp_seq_end()``, which marks the end of an expedited grace
|
||||||
|
period.
|
||||||
|
#. ``rcu_exp_gp_seq_snap()``, which obtains a snapshot of the counter.
|
||||||
|
#. ``rcu_exp_gp_seq_done()``, which returns ``true`` if a full expedited
|
||||||
|
grace period has elapsed since the corresponding call to
|
||||||
|
``rcu_exp_gp_seq_snap()``.
|
||||||
|
|
||||||
|
Again, only one request in a given batch need actually carry out a
|
||||||
|
grace-period operation, which means there must be an efficient way to
|
||||||
|
identify which of many concurrent requests will initiate the grace
|
||||||
|
period, and that there be an efficient way for the remaining requests to
|
||||||
|
wait for that grace period to complete. However, that is the topic of
|
||||||
|
the next section.
|
||||||
|
|
||||||
|
Funnel Locking and Wait/Wakeup
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
The natural way to sort out which of a batch of updaters will initiate
|
||||||
|
the expedited grace period is to use the ``rcu_node`` combining tree, as
|
||||||
|
implemented by the ``exp_funnel_lock()`` function. The first updater
|
||||||
|
corresponding to a given grace period arriving at a given ``rcu_node``
|
||||||
|
structure records its desired grace-period sequence number in the
|
||||||
|
``->exp_seq_rq`` field and moves up to the next level in the tree.
|
||||||
|
Otherwise, if the ``->exp_seq_rq`` field already contains the sequence
|
||||||
|
number for the desired grace period or some later one, the updater
|
||||||
|
blocks on one of four wait queues in the ``->exp_wq[]`` array, using the
|
||||||
|
second-from-bottom and third-from-bottom bits as an index. An
|
||||||
|
``->exp_lock`` field in the ``rcu_node`` structure synchronizes access
|
||||||
|
to these fields.
|
||||||
|
|
||||||
|
An empty ``rcu_node`` tree is shown in the following diagram, with the
|
||||||
|
white cells representing the ``->exp_seq_rq`` field and the red cells
|
||||||
|
representing the elements of the ``->exp_wq[]`` array.
|
||||||
|
|
||||||
|
.. kernel-figure:: Funnel0.svg
|
||||||
|
|
||||||
|
The next diagram shows the situation after the arrival of Task A and
|
||||||
|
Task B at the leftmost and rightmost leaf ``rcu_node`` structures,
|
||||||
|
respectively. The current value of the ``rcu_state`` structure's
|
||||||
|
``->expedited_sequence`` field is zero, so adding three and clearing the
|
||||||
|
bottom bit results in the value two, which both tasks record in the
|
||||||
|
``->exp_seq_rq`` field of their respective ``rcu_node`` structures:
|
||||||
|
|
||||||
|
.. kernel-figure:: Funnel1.svg
|
||||||
|
|
||||||
|
Each of Tasks A and B will move up to the root ``rcu_node`` structure.
|
||||||
|
Suppose that Task A wins, recording its desired grace-period sequence
|
||||||
|
number and resulting in the state shown below:
|
||||||
|
|
||||||
|
.. kernel-figure:: Funnel2.svg
|
||||||
|
|
||||||
|
Task A now advances to initiate a new grace period, while Task B moves
|
||||||
|
up to the root ``rcu_node`` structure, and, seeing that its desired
|
||||||
|
sequence number is already recorded, blocks on ``->exp_wq[1]``.
|
||||||
|
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| **Quick Quiz**: |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| Why ``->exp_wq[1]``? Given that the value of these tasks' desired |
|
||||||
|
| sequence number is two, so shouldn't they instead block on |
|
||||||
|
| ``->exp_wq[2]``? |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| **Answer**: |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| No. |
|
||||||
|
| Recall that the bottom bit of the desired sequence number indicates |
|
||||||
|
| whether or not a grace period is currently in progress. It is |
|
||||||
|
| therefore necessary to shift the sequence number right one bit |
|
||||||
|
| position to obtain the number of the grace period. This results in |
|
||||||
|
| ``->exp_wq[1]``. |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
|
||||||
|
If Tasks C and D also arrive at this point, they will compute the same
|
||||||
|
desired grace-period sequence number, and see that both leaf
|
||||||
|
``rcu_node`` structures already have that value recorded. They will
|
||||||
|
therefore block on their respective ``rcu_node`` structures'
|
||||||
|
``->exp_wq[1]`` fields, as shown below:
|
||||||
|
|
||||||
|
.. kernel-figure:: Funnel3.svg
|
||||||
|
|
||||||
|
Task A now acquires the ``rcu_state`` structure's ``->exp_mutex`` and
|
||||||
|
initiates the grace period, which increments ``->expedited_sequence``.
|
||||||
|
Therefore, if Tasks E and F arrive, they will compute a desired sequence
|
||||||
|
number of 4 and will record this value as shown below:
|
||||||
|
|
||||||
|
.. kernel-figure:: Funnel4.svg
|
||||||
|
|
||||||
|
Tasks E and F will propagate up the ``rcu_node`` combining tree, with
|
||||||
|
Task F blocking on the root ``rcu_node`` structure and Task E waiting for
|
||||||
|
Task A to finish so that it can start the next grace period. The
|
||||||
|
resulting state is as shown below:
|
||||||
|
|
||||||
|
.. kernel-figure:: Funnel5.svg
|
||||||
|
|
||||||
|
Once the grace period completes, Task A starts waking up the tasks
|
||||||
|
waiting for this grace period to complete, increments the
|
||||||
|
``->expedited_sequence``, acquires the ``->exp_wake_mutex`` and then
|
||||||
|
releases the ``->exp_mutex``. This results in the following state:
|
||||||
|
|
||||||
|
.. kernel-figure:: Funnel6.svg
|
||||||
|
|
||||||
|
Task E can then acquire ``->exp_mutex`` and increment
|
||||||
|
``->expedited_sequence`` to the value three. If new tasks G and H arrive
|
||||||
|
and move up the combining tree at the same time, the state will be as
|
||||||
|
follows:
|
||||||
|
|
||||||
|
.. kernel-figure:: Funnel7.svg
|
||||||
|
|
||||||
|
Note that three of the root ``rcu_node`` structure's waitqueues are now
|
||||||
|
occupied. However, at some point, Task A will wake up the tasks blocked
|
||||||
|
on the ``->exp_wq`` waitqueues, resulting in the following state:
|
||||||
|
|
||||||
|
.. kernel-figure:: Funnel8.svg
|
||||||
|
|
||||||
|
Execution will continue with Tasks E and H completing their grace
|
||||||
|
periods and carrying out their wakeups.
|
||||||
|
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| **Quick Quiz**: |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| What happens if Task A takes so long to do its wakeups that Task E's |
|
||||||
|
| grace period completes? |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| **Answer**: |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| Then Task E will block on the ``->exp_wake_mutex``, which will also |
|
||||||
|
| prevent it from releasing ``->exp_mutex``, which in turn will prevent |
|
||||||
|
| the next grace period from starting. This last is important in |
|
||||||
|
| preventing overflow of the ``->exp_wq[]`` array. |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
|
||||||
|
Use of Workqueues
|
||||||
|
~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
In earlier implementations, the task requesting the expedited grace
|
||||||
|
period also drove it to completion. This straightforward approach had
|
||||||
|
the disadvantage of needing to account for POSIX signals sent to user
|
||||||
|
tasks, so more recent implementations use the Linux kernel's
|
||||||
|
`workqueues <https://www.kernel.org/doc/Documentation/core-api/workqueue.rst>`__.
|
||||||
|
|
||||||
|
The requesting task still does counter snapshotting and funnel-lock
|
||||||
|
processing, but the task reaching the top of the funnel lock does a
|
||||||
|
``schedule_work()`` (from ``_synchronize_rcu_expedited()``) so that a
|
||||||
|
workqueue kthread does the actual grace-period processing. Because
|
||||||
|
workqueue kthreads do not accept POSIX signals, grace-period-wait
|
||||||
|
processing need not allow for POSIX signals. In addition, this approach
|
||||||
|
allows wakeups for the previous expedited grace period to be overlapped
|
||||||
|
with processing for the next expedited grace period. Because there are
|
||||||
|
only four sets of waitqueues, it is necessary to ensure that the
|
||||||
|
previous grace period's wakeups complete before the next grace period's
|
||||||
|
wakeups start. This is handled by having the ``->exp_mutex`` guard
|
||||||
|
expedited grace-period processing and the ``->exp_wake_mutex`` guard
|
||||||
|
wakeups. The key point is that the ``->exp_mutex`` is not released until
|
||||||
|
the first wakeup is complete, which means that the ``->exp_wake_mutex``
|
||||||
|
has already been acquired at that point. This approach ensures that the
|
||||||
|
previous grace period's wakeups can be carried out while the current
|
||||||
|
grace period is in process, but that these wakeups will complete before
|
||||||
|
the next grace period starts. This means that only three waitqueues are
|
||||||
|
required, guaranteeing that the four that are provided are sufficient.
|
||||||
|
|
||||||
|
Stall Warnings
|
||||||
|
~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Expediting grace periods does nothing to speed things up when RCU
|
||||||
|
readers take too long, and therefore expedited grace periods check for
|
||||||
|
stalls just as normal grace periods do.
|
||||||
|
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| **Quick Quiz**: |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| But why not just let the normal grace-period machinery detect the |
|
||||||
|
| stalls, given that a given reader must block both normal and |
|
||||||
|
| expedited grace periods? |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| **Answer**: |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| Because it is quite possible that at a given time there is no normal |
|
||||||
|
| grace period in progress, in which case the normal grace period |
|
||||||
|
| cannot emit a stall warning. |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
|
||||||
|
The ``synchronize_sched_expedited_wait()`` function loops waiting for
|
||||||
|
the expedited grace period to end, but with a timeout set to the current
|
||||||
|
RCU CPU stall-warning time. If this time is exceeded, any CPUs or
|
||||||
|
``rcu_node`` structures blocking the current grace period are printed.
|
||||||
|
Each stall warning results in another pass through the loop, but the
|
||||||
|
second and subsequent passes use longer stall times.
|
||||||
|
|
||||||
|
Mid-boot operation
|
||||||
|
~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
The use of workqueues has the advantage that the expedited grace-period
|
||||||
|
code need not worry about POSIX signals. Unfortunately, it has the
|
||||||
|
corresponding disadvantage that workqueues cannot be used until they are
|
||||||
|
initialized, which does not happen until some time after the scheduler
|
||||||
|
spawns the first task. Given that there are parts of the kernel that
|
||||||
|
really do want to execute grace periods during this mid-boot “dead
|
||||||
|
zone”, expedited grace periods must do something else during this time.
|
||||||
|
|
||||||
|
What they do is to fall back to the old practice of requiring that the
|
||||||
|
requesting task drive the expedited grace period, as was the case before
|
||||||
|
the use of workqueues. However, the requesting task is only required to
|
||||||
|
drive the grace period during the mid-boot dead zone. Before mid-boot, a
|
||||||
|
synchronous grace period is a no-op. Some time after mid-boot,
|
||||||
|
workqueues are used.
|
||||||
|
|
||||||
|
Non-expedited non-SRCU synchronous grace periods must also operate
|
||||||
|
normally during mid-boot. This is handled by causing non-expedited grace
|
||||||
|
periods to take the expedited code path during mid-boot.
|
||||||
|
|
||||||
|
The current code assumes that there are no POSIX signals during the
|
||||||
|
mid-boot dead zone. However, if an overwhelming need for POSIX signals
|
||||||
|
somehow arises, appropriate adjustments can be made to the expedited
|
||||||
|
stall-warning code. One such adjustment would reinstate the
|
||||||
|
pre-workqueue stall-warning checks, but only during the mid-boot dead
|
||||||
|
zone.
|
||||||
|
|
||||||
|
With this refinement, synchronous grace periods can now be used from
|
||||||
|
task context pretty much any time during the life of the kernel. That
|
||||||
|
is, aside from some points in the suspend, hibernate, or shutdown code
|
||||||
|
path.
|
||||||
|
|
||||||
|
Summary
|
||||||
|
~~~~~~~
|
||||||
|
|
||||||
|
Expedited grace periods use a sequence-number approach to promote
|
||||||
|
batching, so that a single grace-period operation can serve numerous
|
||||||
|
requests. A funnel lock is used to efficiently identify the one task out
|
||||||
|
of a concurrent group that will request the grace period. All members of
|
||||||
|
the group will block on waitqueues provided in the ``rcu_node``
|
||||||
|
structure. The actual grace-period processing is carried out by a
|
||||||
|
workqueue.
|
||||||
|
|
||||||
|
CPU-hotplug operations are noted lazily in order to prevent the need for
|
||||||
|
tight synchronization between expedited grace periods and CPU-hotplug
|
||||||
|
operations. The dyntick-idle counters are used to avoid sending IPIs to
|
||||||
|
idle CPUs, at least in the common case. RCU-preempt and RCU-sched use
|
||||||
|
different IPI handlers and different code to respond to the state
|
||||||
|
changes carried out by those handlers, but otherwise use common code.
|
||||||
|
|
||||||
|
Quiescent states are tracked using the ``rcu_node`` tree, and once all
|
||||||
|
necessary quiescent states have been reported, all tasks waiting on this
|
||||||
|
expedited grace period are awakened. A pair of mutexes are used to allow
|
||||||
|
one grace period's wakeups to proceed concurrently with the next grace
|
||||||
|
period's processing.
|
||||||
|
|
||||||
|
This combination of mechanisms allows expedited grace periods to run
|
||||||
|
reasonably efficiently. However, for non-time-critical tasks, normal
|
||||||
|
grace periods should be used instead because their longer duration
|
||||||
|
permits much higher degrees of batching, and thus much lower per-request
|
||||||
|
overheads.
|
@@ -1,9 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
|
||||||
"http://www.w3.org/TR/html4/loose.dtd">
|
|
||||||
<html>
|
|
||||||
<head><title>A Diagram of TREE_RCU's Grace-Period Memory Ordering</title>
|
|
||||||
<meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=iso-8859-1">
|
|
||||||
|
|
||||||
<p><img src="TreeRCU-gp.svg" alt="TreeRCU-gp.svg">
|
|
||||||
|
|
||||||
</body></html>
|
|
@@ -1,704 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
|
||||||
"http://www.w3.org/TR/html4/loose.dtd">
|
|
||||||
<html>
|
|
||||||
<head><title>A Tour Through TREE_RCU's Grace-Period Memory Ordering</title>
|
|
||||||
<meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=iso-8859-1">
|
|
||||||
|
|
||||||
<p>August 8, 2017</p>
|
|
||||||
<p>This article was contributed by Paul E. McKenney</p>
|
|
||||||
|
|
||||||
<h3>Introduction</h3>
|
|
||||||
|
|
||||||
<p>This document gives a rough visual overview of how Tree RCU's
|
|
||||||
grace-period memory ordering guarantee is provided.
|
|
||||||
|
|
||||||
<ol>
|
|
||||||
<li> <a href="#What Is Tree RCU's Grace Period Memory Ordering Guarantee?">
|
|
||||||
What Is Tree RCU's Grace Period Memory Ordering Guarantee?</a>
|
|
||||||
<li> <a href="#Tree RCU Grace Period Memory Ordering Building Blocks">
|
|
||||||
Tree RCU Grace Period Memory Ordering Building Blocks</a>
|
|
||||||
<li> <a href="#Tree RCU Grace Period Memory Ordering Components">
|
|
||||||
Tree RCU Grace Period Memory Ordering Components</a>
|
|
||||||
<li> <a href="#Putting It All Together">Putting It All Together</a>
|
|
||||||
</ol>
|
|
||||||
|
|
||||||
<h3><a name="What Is Tree RCU's Grace Period Memory Ordering Guarantee?">
|
|
||||||
What Is Tree RCU's Grace Period Memory Ordering Guarantee?</a></h3>
|
|
||||||
|
|
||||||
<p>RCU grace periods provide extremely strong memory-ordering guarantees
|
|
||||||
for non-idle non-offline code.
|
|
||||||
Any code that happens after the end of a given RCU grace period is guaranteed
|
|
||||||
to see the effects of all accesses prior to the beginning of that grace
|
|
||||||
period that are within RCU read-side critical sections.
|
|
||||||
Similarly, any code that happens before the beginning of a given RCU grace
|
|
||||||
period is guaranteed to see the effects of all accesses following the end
|
|
||||||
of that grace period that are within RCU read-side critical sections.
|
|
||||||
|
|
||||||
<p>Note well that RCU-sched read-side critical sections include any region
|
|
||||||
of code for which preemption is disabled.
|
|
||||||
Given that each individual machine instruction can be thought of as
|
|
||||||
an extremely small region of preemption-disabled code, one can think of
|
|
||||||
<tt>synchronize_rcu()</tt> as <tt>smp_mb()</tt> on steroids.
|
|
||||||
|
|
||||||
<p>RCU updaters use this guarantee by splitting their updates into
|
|
||||||
two phases, one of which is executed before the grace period and
|
|
||||||
the other of which is executed after the grace period.
|
|
||||||
In the most common use case, phase one removes an element from
|
|
||||||
a linked RCU-protected data structure, and phase two frees that element.
|
|
||||||
For this to work, any readers that have witnessed state prior to the
|
|
||||||
phase-one update (in the common case, removal) must not witness state
|
|
||||||
following the phase-two update (in the common case, freeing).
|
|
||||||
|
|
||||||
<p>The RCU implementation provides this guarantee using a network
|
|
||||||
of lock-based critical sections, memory barriers, and per-CPU
|
|
||||||
processing, as is described in the following sections.
|
|
||||||
|
|
||||||
<h3><a name="Tree RCU Grace Period Memory Ordering Building Blocks">
|
|
||||||
Tree RCU Grace Period Memory Ordering Building Blocks</a></h3>
|
|
||||||
|
|
||||||
<p>The workhorse for RCU's grace-period memory ordering is the
|
|
||||||
critical section for the <tt>rcu_node</tt> structure's
|
|
||||||
<tt>->lock</tt>.
|
|
||||||
These critical sections use helper functions for lock acquisition, including
|
|
||||||
<tt>raw_spin_lock_rcu_node()</tt>,
|
|
||||||
<tt>raw_spin_lock_irq_rcu_node()</tt>, and
|
|
||||||
<tt>raw_spin_lock_irqsave_rcu_node()</tt>.
|
|
||||||
Their lock-release counterparts are
|
|
||||||
<tt>raw_spin_unlock_rcu_node()</tt>,
|
|
||||||
<tt>raw_spin_unlock_irq_rcu_node()</tt>, and
|
|
||||||
<tt>raw_spin_unlock_irqrestore_rcu_node()</tt>,
|
|
||||||
respectively.
|
|
||||||
For completeness, a
|
|
||||||
<tt>raw_spin_trylock_rcu_node()</tt>
|
|
||||||
is also provided.
|
|
||||||
The key point is that the lock-acquisition functions, including
|
|
||||||
<tt>raw_spin_trylock_rcu_node()</tt>, all invoke
|
|
||||||
<tt>smp_mb__after_unlock_lock()</tt> immediately after successful
|
|
||||||
acquisition of the lock.
|
|
||||||
|
|
||||||
<p>Therefore, for any given <tt>rcu_node</tt> structure, any access
|
|
||||||
happening before one of the above lock-release functions will be seen
|
|
||||||
by all CPUs as happening before any access happening after a later
|
|
||||||
one of the above lock-acquisition functions.
|
|
||||||
Furthermore, any access happening before one of the
|
|
||||||
above lock-release functions on any given CPU will be seen by all
|
|
||||||
CPUs as happening before any access happening after a later one
|
|
||||||
of the above lock-acquisition functions executing on that same CPU,
|
|
||||||
even if the lock-release and lock-acquisition functions are operating
|
|
||||||
on different <tt>rcu_node</tt> structures.
|
|
||||||
Tree RCU uses these two ordering guarantees to form an ordering
|
|
||||||
network among all CPUs that were in any way involved in the grace
|
|
||||||
period, including any CPUs that came online or went offline during
|
|
||||||
the grace period in question.
|
|
||||||
|
|
||||||
<p>The following litmus test exhibits the ordering effects of these
|
|
||||||
lock-acquisition and lock-release functions:
|
|
||||||
|
|
||||||
<pre>
|
|
||||||
1 int x, y, z;
|
|
||||||
2
|
|
||||||
3 void task0(void)
|
|
||||||
4 {
|
|
||||||
5 raw_spin_lock_rcu_node(rnp);
|
|
||||||
6 WRITE_ONCE(x, 1);
|
|
||||||
7 r1 = READ_ONCE(y);
|
|
||||||
8 raw_spin_unlock_rcu_node(rnp);
|
|
||||||
9 }
|
|
||||||
10
|
|
||||||
11 void task1(void)
|
|
||||||
12 {
|
|
||||||
13 raw_spin_lock_rcu_node(rnp);
|
|
||||||
14 WRITE_ONCE(y, 1);
|
|
||||||
15 r2 = READ_ONCE(z);
|
|
||||||
16 raw_spin_unlock_rcu_node(rnp);
|
|
||||||
17 }
|
|
||||||
18
|
|
||||||
19 void task2(void)
|
|
||||||
20 {
|
|
||||||
21 WRITE_ONCE(z, 1);
|
|
||||||
22 smp_mb();
|
|
||||||
23 r3 = READ_ONCE(x);
|
|
||||||
24 }
|
|
||||||
25
|
|
||||||
26 WARN_ON(r1 == 0 && r2 == 0 && r3 == 0);
|
|
||||||
</pre>
|
|
||||||
|
|
||||||
<p>The <tt>WARN_ON()</tt> is evaluated at “the end of time”,
|
|
||||||
after all changes have propagated throughout the system.
|
|
||||||
Without the <tt>smp_mb__after_unlock_lock()</tt> provided by the
|
|
||||||
acquisition functions, this <tt>WARN_ON()</tt> could trigger, for example
|
|
||||||
on PowerPC.
|
|
||||||
The <tt>smp_mb__after_unlock_lock()</tt> invocations prevent this
|
|
||||||
<tt>WARN_ON()</tt> from triggering.
|
|
||||||
|
|
||||||
<p>This approach must be extended to include idle CPUs, which need
|
|
||||||
RCU's grace-period memory ordering guarantee to extend to any
|
|
||||||
RCU read-side critical sections preceding and following the current
|
|
||||||
idle sojourn.
|
|
||||||
This case is handled by calls to the strongly ordered
|
|
||||||
<tt>atomic_add_return()</tt> read-modify-write atomic operation that
|
|
||||||
is invoked within <tt>rcu_dynticks_eqs_enter()</tt> at idle-entry
|
|
||||||
time and within <tt>rcu_dynticks_eqs_exit()</tt> at idle-exit time.
|
|
||||||
The grace-period kthread invokes <tt>rcu_dynticks_snap()</tt> and
|
|
||||||
<tt>rcu_dynticks_in_eqs_since()</tt> (both of which invoke
|
|
||||||
an <tt>atomic_add_return()</tt> of zero) to detect idle CPUs.
|
|
||||||
|
|
||||||
<table>
|
|
||||||
<tr><th> </th></tr>
|
|
||||||
<tr><th align="left">Quick Quiz:</th></tr>
|
|
||||||
<tr><td>
|
|
||||||
But what about CPUs that remain offline for the entire
|
|
||||||
grace period?
|
|
||||||
</td></tr>
|
|
||||||
<tr><th align="left">Answer:</th></tr>
|
|
||||||
<tr><td bgcolor="#ffffff"><font color="ffffff">
|
|
||||||
Such CPUs will be offline at the beginning of the grace period,
|
|
||||||
so the grace period won't expect quiescent states from them.
|
|
||||||
Races between grace-period start and CPU-hotplug operations
|
|
||||||
are mediated by the CPU's leaf <tt>rcu_node</tt> structure's
|
|
||||||
<tt>->lock</tt> as described above.
|
|
||||||
</font></td></tr>
|
|
||||||
<tr><td> </td></tr>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
<p>The approach must be extended to handle one final case, that
|
|
||||||
of waking a task blocked in <tt>synchronize_rcu()</tt>.
|
|
||||||
This task might be affinitied to a CPU that is not yet aware that
|
|
||||||
the grace period has ended, and thus might not yet be subject to
|
|
||||||
the grace period's memory ordering.
|
|
||||||
Therefore, there is an <tt>smp_mb()</tt> after the return from
|
|
||||||
<tt>wait_for_completion()</tt> in the <tt>synchronize_rcu()</tt>
|
|
||||||
code path.
|
|
||||||
|
|
||||||
<table>
|
|
||||||
<tr><th> </th></tr>
|
|
||||||
<tr><th align="left">Quick Quiz:</th></tr>
|
|
||||||
<tr><td>
|
|
||||||
What? Where???
|
|
||||||
I don't see any <tt>smp_mb()</tt> after the return from
|
|
||||||
<tt>wait_for_completion()</tt>!!!
|
|
||||||
</td></tr>
|
|
||||||
<tr><th align="left">Answer:</th></tr>
|
|
||||||
<tr><td bgcolor="#ffffff"><font color="ffffff">
|
|
||||||
That would be because I spotted the need for that
|
|
||||||
<tt>smp_mb()</tt> during the creation of this documentation,
|
|
||||||
and it is therefore unlikely to hit mainline before v4.14.
|
|
||||||
Kudos to Lance Roy, Will Deacon, Peter Zijlstra, and
|
|
||||||
Jonathan Cameron for asking questions that sensitized me
|
|
||||||
to the rather elaborate sequence of events that demonstrate
|
|
||||||
the need for this memory barrier.
|
|
||||||
</font></td></tr>
|
|
||||||
<tr><td> </td></tr>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
<p>Tree RCU's grace-period memory-ordering guarantees rely most
|
|
||||||
heavily on the <tt>rcu_node</tt> structure's <tt>->lock</tt>
|
|
||||||
field, so much so that it is necessary to abbreviate this pattern
|
|
||||||
in the diagrams in the next section.
|
|
||||||
For example, consider the <tt>rcu_prepare_for_idle()</tt> function
|
|
||||||
shown below, which is one of several functions that enforce ordering
|
|
||||||
of newly arrived RCU callbacks against future grace periods:
|
|
||||||
|
|
||||||
<pre>
|
|
||||||
1 static void rcu_prepare_for_idle(void)
|
|
||||||
2 {
|
|
||||||
3 bool needwake;
|
|
||||||
4 struct rcu_data *rdp;
|
|
||||||
5 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
|
|
||||||
6 struct rcu_node *rnp;
|
|
||||||
7 struct rcu_state *rsp;
|
|
||||||
8 int tne;
|
|
||||||
9
|
|
||||||
10 if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
|
|
||||||
11 rcu_is_nocb_cpu(smp_processor_id()))
|
|
||||||
12 return;
|
|
||||||
13 tne = READ_ONCE(tick_nohz_active);
|
|
||||||
14 if (tne != rdtp->tick_nohz_enabled_snap) {
|
|
||||||
15 if (rcu_cpu_has_callbacks(NULL))
|
|
||||||
16 invoke_rcu_core();
|
|
||||||
17 rdtp->tick_nohz_enabled_snap = tne;
|
|
||||||
18 return;
|
|
||||||
19 }
|
|
||||||
20 if (!tne)
|
|
||||||
21 return;
|
|
||||||
22 if (rdtp->all_lazy &&
|
|
||||||
23 rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
|
|
||||||
24 rdtp->all_lazy = false;
|
|
||||||
25 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
|
|
||||||
26 invoke_rcu_core();
|
|
||||||
27 return;
|
|
||||||
28 }
|
|
||||||
29 if (rdtp->last_accelerate == jiffies)
|
|
||||||
30 return;
|
|
||||||
31 rdtp->last_accelerate = jiffies;
|
|
||||||
32 for_each_rcu_flavor(rsp) {
|
|
||||||
33 rdp = this_cpu_ptr(rsp->rda);
|
|
||||||
34 if (rcu_segcblist_pend_cbs(&rdp->cblist))
|
|
||||||
35 continue;
|
|
||||||
36 rnp = rdp->mynode;
|
|
||||||
37 raw_spin_lock_rcu_node(rnp);
|
|
||||||
38 needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
|
|
||||||
39 raw_spin_unlock_rcu_node(rnp);
|
|
||||||
40 if (needwake)
|
|
||||||
41 rcu_gp_kthread_wake(rsp);
|
|
||||||
42 }
|
|
||||||
43 }
|
|
||||||
</pre>
|
|
||||||
|
|
||||||
<p>But the only part of <tt>rcu_prepare_for_idle()</tt> that really
|
|
||||||
matters for this discussion are lines 37–39.
|
|
||||||
We will therefore abbreviate this function as follows:
|
|
||||||
|
|
||||||
</p><p><img src="rcu_node-lock.svg" alt="rcu_node-lock.svg">
|
|
||||||
|
|
||||||
<p>The box represents the <tt>rcu_node</tt> structure's <tt>->lock</tt>
|
|
||||||
critical section, with the double line on top representing the additional
|
|
||||||
<tt>smp_mb__after_unlock_lock()</tt>.
|
|
||||||
|
|
||||||
<h3><a name="Tree RCU Grace Period Memory Ordering Components">
|
|
||||||
Tree RCU Grace Period Memory Ordering Components</a></h3>
|
|
||||||
|
|
||||||
<p>Tree RCU's grace-period memory-ordering guarantee is provided by
|
|
||||||
a number of RCU components:
|
|
||||||
|
|
||||||
<ol>
|
|
||||||
<li> <a href="#Callback Registry">Callback Registry</a>
|
|
||||||
<li> <a href="#Grace-Period Initialization">Grace-Period Initialization</a>
|
|
||||||
<li> <a href="#Self-Reported Quiescent States">
|
|
||||||
Self-Reported Quiescent States</a>
|
|
||||||
<li> <a href="#Dynamic Tick Interface">Dynamic Tick Interface</a>
|
|
||||||
<li> <a href="#CPU-Hotplug Interface">CPU-Hotplug Interface</a>
|
|
||||||
<li> <a href="#Forcing Quiescent States">Forcing Quiescent States</a>
|
|
||||||
<li> <a href="#Grace-Period Cleanup">Grace-Period Cleanup</a>
|
|
||||||
<li> <a href="#Callback Invocation">Callback Invocation</a>
|
|
||||||
</ol>
|
|
||||||
|
|
||||||
<p>Each of the following sections looks at the corresponding component
|
|
||||||
in detail.
|
|
||||||
|
|
||||||
<h4><a name="Callback Registry">Callback Registry</a></h4>
|
|
||||||
|
|
||||||
<p>If RCU's grace-period guarantee is to mean anything at all, any
|
|
||||||
access that happens before a given invocation of <tt>call_rcu()</tt>
|
|
||||||
must also happen before the corresponding grace period.
|
|
||||||
The implementation of this portion of RCU's grace period guarantee
|
|
||||||
is shown in the following figure:
|
|
||||||
|
|
||||||
</p><p><img src="TreeRCU-callback-registry.svg" alt="TreeRCU-callback-registry.svg">
|
|
||||||
|
|
||||||
<p>Because <tt>call_rcu()</tt> normally acts only on CPU-local state,
|
|
||||||
it provides no ordering guarantees, either for itself or for
|
|
||||||
phase one of the update (which again will usually be removal of
|
|
||||||
an element from an RCU-protected data structure).
|
|
||||||
It simply enqueues the <tt>rcu_head</tt> structure on a per-CPU list,
|
|
||||||
which cannot become associated with a grace period until a later
|
|
||||||
call to <tt>rcu_accelerate_cbs()</tt>, as shown in the diagram above.
|
|
||||||
|
|
||||||
<p>One set of code paths shown on the left invokes
|
|
||||||
<tt>rcu_accelerate_cbs()</tt> via
|
|
||||||
<tt>note_gp_changes()</tt>, either directly from <tt>call_rcu()</tt> (if
|
|
||||||
the current CPU is inundated with queued <tt>rcu_head</tt> structures)
|
|
||||||
or more likely from an <tt>RCU_SOFTIRQ</tt> handler.
|
|
||||||
Another code path in the middle is taken only in kernels built with
|
|
||||||
<tt>CONFIG_RCU_FAST_NO_HZ=y</tt>, which invokes
|
|
||||||
<tt>rcu_accelerate_cbs()</tt> via <tt>rcu_prepare_for_idle()</tt>.
|
|
||||||
The final code path on the right is taken only in kernels built with
|
|
||||||
<tt>CONFIG_HOTPLUG_CPU=y</tt>, which invokes
|
|
||||||
<tt>rcu_accelerate_cbs()</tt> via
|
|
||||||
<tt>rcu_advance_cbs()</tt>, <tt>rcu_migrate_callbacks()</tt>,
|
|
||||||
<tt>rcutree_migrate_callbacks()</tt>, and <tt>takedown_cpu()</tt>,
|
|
||||||
which in turn is invoked on a surviving CPU after the outgoing
|
|
||||||
CPU has been completely offlined.
|
|
||||||
|
|
||||||
<p>There are a few other code paths within grace-period processing
|
|
||||||
that opportunistically invoke <tt>rcu_accelerate_cbs()</tt>.
|
|
||||||
However, either way, all of the CPU's recently queued <tt>rcu_head</tt>
|
|
||||||
structures are associated with a future grace-period number under
|
|
||||||
the protection of the CPU's leaf <tt>rcu_node</tt> structure's
|
|
||||||
<tt>->lock</tt>.
|
|
||||||
In all cases, there is full ordering against any prior critical section
|
|
||||||
for that same <tt>rcu_node</tt> structure's <tt>->lock</tt>, and
|
|
||||||
also full ordering against any of the current task's or CPU's prior critical
|
|
||||||
sections for any <tt>rcu_node</tt> structure's <tt>->lock</tt>.
|
|
||||||
|
|
||||||
<p>The next section will show how this ordering ensures that any
|
|
||||||
accesses prior to the <tt>call_rcu()</tt> (particularly including phase
|
|
||||||
one of the update)
|
|
||||||
happen before the start of the corresponding grace period.
|
|
||||||
|
|
||||||
<table>
|
|
||||||
<tr><th> </th></tr>
|
|
||||||
<tr><th align="left">Quick Quiz:</th></tr>
|
|
||||||
<tr><td>
|
|
||||||
But what about <tt>synchronize_rcu()</tt>?
|
|
||||||
</td></tr>
|
|
||||||
<tr><th align="left">Answer:</th></tr>
|
|
||||||
<tr><td bgcolor="#ffffff"><font color="ffffff">
|
|
||||||
The <tt>synchronize_rcu()</tt> passes <tt>call_rcu()</tt>
|
|
||||||
to <tt>wait_rcu_gp()</tt>, which invokes it.
|
|
||||||
So either way, it eventually comes down to <tt>call_rcu()</tt>.
|
|
||||||
</font></td></tr>
|
|
||||||
<tr><td> </td></tr>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
<h4><a name="Grace-Period Initialization">Grace-Period Initialization</a></h4>
|
|
||||||
|
|
||||||
<p>Grace-period initialization is carried out by
|
|
||||||
the grace-period kernel thread, which makes several passes over the
|
|
||||||
<tt>rcu_node</tt> tree within the <tt>rcu_gp_init()</tt> function.
|
|
||||||
This means that showing the full flow of ordering through the
|
|
||||||
grace-period computation will require duplicating this tree.
|
|
||||||
If you find this confusing, please note that the state of the
|
|
||||||
<tt>rcu_node</tt> changes over time, just like Heraclitus's river.
|
|
||||||
However, to keep the <tt>rcu_node</tt> river tractable, the
|
|
||||||
grace-period kernel thread's traversals are presented in multiple
|
|
||||||
parts, starting in this section with the various phases of
|
|
||||||
grace-period initialization.
|
|
||||||
|
|
||||||
<p>The first ordering-related grace-period initialization action is to
|
|
||||||
advance the <tt>rcu_state</tt> structure's <tt>->gp_seq</tt>
|
|
||||||
grace-period-number counter, as shown below:
|
|
||||||
|
|
||||||
</p><p><img src="TreeRCU-gp-init-1.svg" alt="TreeRCU-gp-init-1.svg" width="75%">
|
|
||||||
|
|
||||||
<p>The actual increment is carried out using <tt>smp_store_release()</tt>,
|
|
||||||
which helps reject false-positive RCU CPU stall detection.
|
|
||||||
Note that only the root <tt>rcu_node</tt> structure is touched.
|
|
||||||
|
|
||||||
<p>The first pass through the <tt>rcu_node</tt> tree updates bitmasks
|
|
||||||
based on CPUs having come online or gone offline since the start of
|
|
||||||
the previous grace period.
|
|
||||||
In the common case where the number of online CPUs for this <tt>rcu_node</tt>
|
|
||||||
structure has not transitioned to or from zero,
|
|
||||||
this pass will scan only the leaf <tt>rcu_node</tt> structures.
|
|
||||||
However, if the number of online CPUs for a given leaf <tt>rcu_node</tt>
|
|
||||||
structure has transitioned from zero,
|
|
||||||
<tt>rcu_init_new_rnp()</tt> will be invoked for the first incoming CPU.
|
|
||||||
Similarly, if the number of online CPUs for a given leaf <tt>rcu_node</tt>
|
|
||||||
structure has transitioned to zero,
|
|
||||||
<tt>rcu_cleanup_dead_rnp()</tt> will be invoked for the last outgoing CPU.
|
|
||||||
The diagram below shows the path of ordering if the leftmost
|
|
||||||
<tt>rcu_node</tt> structure onlines its first CPU and if the next
|
|
||||||
<tt>rcu_node</tt> structure has no online CPUs
|
|
||||||
(or, alternatively if the leftmost <tt>rcu_node</tt> structure offlines
|
|
||||||
its last CPU and if the next <tt>rcu_node</tt> structure has no online CPUs).
|
|
||||||
|
|
||||||
</p><p><img src="TreeRCU-gp-init-2.svg" alt="TreeRCU-gp-init-2.svg" width="75%">
|
|
||||||
|
|
||||||
<p>The final <tt>rcu_gp_init()</tt> pass through the <tt>rcu_node</tt>
|
|
||||||
tree traverses breadth-first, setting each <tt>rcu_node</tt> structure's
|
|
||||||
<tt>->gp_seq</tt> field to the newly advanced value from the
|
|
||||||
<tt>rcu_state</tt> structure, as shown in the following diagram.
|
|
||||||
|
|
||||||
</p><p><img src="TreeRCU-gp-init-3.svg" alt="TreeRCU-gp-init-3.svg" width="75%">
|
|
||||||
|
|
||||||
<p>This change will also cause each CPU's next call to
|
|
||||||
<tt>__note_gp_changes()</tt>
|
|
||||||
to notice that a new grace period has started, as described in the next
|
|
||||||
section.
|
|
||||||
But because the grace-period kthread started the grace period at the
|
|
||||||
root (with the advancing of the <tt>rcu_state</tt> structure's
|
|
||||||
<tt>->gp_seq</tt> field) before setting each leaf <tt>rcu_node</tt>
|
|
||||||
structure's <tt>->gp_seq</tt> field, each CPU's observation of
|
|
||||||
the start of the grace period will happen after the actual start
|
|
||||||
of the grace period.
|
|
||||||
|
|
||||||
<table>
|
|
||||||
<tr><th> </th></tr>
|
|
||||||
<tr><th align="left">Quick Quiz:</th></tr>
|
|
||||||
<tr><td>
|
|
||||||
But what about the CPU that started the grace period?
|
|
||||||
Why wouldn't it see the start of the grace period right when
|
|
||||||
it started that grace period?
|
|
||||||
</td></tr>
|
|
||||||
<tr><th align="left">Answer:</th></tr>
|
|
||||||
<tr><td bgcolor="#ffffff"><font color="ffffff">
|
|
||||||
In some deep philosophical and overly anthropomorphized
|
|
||||||
sense, yes, the CPU starting the grace period is immediately
|
|
||||||
aware of having done so.
|
|
||||||
However, if we instead assume that RCU is not self-aware,
|
|
||||||
then even the CPU starting the grace period does not really
|
|
||||||
become aware of the start of this grace period until its
|
|
||||||
first call to <tt>__note_gp_changes()</tt>.
|
|
||||||
On the other hand, this CPU potentially gets early notification
|
|
||||||
because it invokes <tt>__note_gp_changes()</tt> during its
|
|
||||||
last <tt>rcu_gp_init()</tt> pass through its leaf
|
|
||||||
<tt>rcu_node</tt> structure.
|
|
||||||
</font></td></tr>
|
|
||||||
<tr><td> </td></tr>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
<h4><a name="Self-Reported Quiescent States">
|
|
||||||
Self-Reported Quiescent States</a></h4>
|
|
||||||
|
|
||||||
<p>When all entities that might block the grace period have reported
|
|
||||||
quiescent states (or as described in a later section, had quiescent
|
|
||||||
states reported on their behalf), the grace period can end.
|
|
||||||
Online non-idle CPUs report their own quiescent states, as shown
|
|
||||||
in the following diagram:
|
|
||||||
|
|
||||||
</p><p><img src="TreeRCU-qs.svg" alt="TreeRCU-qs.svg" width="75%">
|
|
||||||
|
|
||||||
<p>This is for the last CPU to report a quiescent state, which signals
|
|
||||||
the end of the grace period.
|
|
||||||
Earlier quiescent states would push up the <tt>rcu_node</tt> tree
|
|
||||||
only until they encountered an <tt>rcu_node</tt> structure that
|
|
||||||
is waiting for additional quiescent states.
|
|
||||||
However, ordering is nevertheless preserved because some later quiescent
|
|
||||||
state will acquire that <tt>rcu_node</tt> structure's <tt>->lock</tt>.
|
|
||||||
|
|
||||||
<p>Any number of events can lead up to a CPU invoking
|
|
||||||
<tt>note_gp_changes()</tt> (or alternatively, directly invoking
|
|
||||||
<tt>__note_gp_changes()</tt>), at which point that CPU will notice
|
|
||||||
the start of a new grace period while holding its leaf
|
|
||||||
<tt>rcu_node</tt> lock.
|
|
||||||
Therefore, all execution shown in this diagram happens after the
|
|
||||||
start of the grace period.
|
|
||||||
In addition, this CPU will consider any RCU read-side critical
|
|
||||||
section that started before the invocation of <tt>__note_gp_changes()</tt>
|
|
||||||
to have started before the grace period, and thus a critical
|
|
||||||
section that the grace period must wait on.
|
|
||||||
|
|
||||||
<table>
|
|
||||||
<tr><th> </th></tr>
|
|
||||||
<tr><th align="left">Quick Quiz:</th></tr>
|
|
||||||
<tr><td>
|
|
||||||
But an RCU read-side critical section might have started
|
|
||||||
after the beginning of the grace period
|
|
||||||
(the advancing of <tt>->gp_seq</tt> from earlier), so why should
|
|
||||||
the grace period wait on such a critical section?
|
|
||||||
</td></tr>
|
|
||||||
<tr><th align="left">Answer:</th></tr>
|
|
||||||
<tr><td bgcolor="#ffffff"><font color="ffffff">
|
|
||||||
It is indeed not necessary for the grace period to wait on such
|
|
||||||
a critical section.
|
|
||||||
However, it is permissible to wait on it.
|
|
||||||
And it is furthermore important to wait on it, as this
|
|
||||||
lazy approach is far more scalable than a “big bang”
|
|
||||||
all-at-once grace-period start could possibly be.
|
|
||||||
</font></td></tr>
|
|
||||||
<tr><td> </td></tr>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
<p>If the CPU does a context switch, a quiescent state will be
|
|
||||||
noted by <tt>rcu_note_context_switch()</tt> on the left.
|
|
||||||
On the other hand, if the CPU takes a scheduler-clock interrupt
|
|
||||||
while executing in usermode, a quiescent state will be noted by
|
|
||||||
<tt>rcu_sched_clock_irq()</tt> on the right.
|
|
||||||
Either way, the passage through a quiescent state will be noted
|
|
||||||
in a per-CPU variable.
|
|
||||||
|
|
||||||
<p>The next time an <tt>RCU_SOFTIRQ</tt> handler executes on
|
|
||||||
this CPU (for example, after the next scheduler-clock
|
|
||||||
interrupt), <tt>rcu_core()</tt> will invoke
|
|
||||||
<tt>rcu_check_quiescent_state()</tt>, which will notice the
|
|
||||||
recorded quiescent state, and invoke
|
|
||||||
<tt>rcu_report_qs_rdp()</tt>.
|
|
||||||
If <tt>rcu_report_qs_rdp()</tt> verifies that the quiescent state
|
|
||||||
really does apply to the current grace period, it invokes
|
|
||||||
<tt>rcu_report_rnp()</tt> which traverses up the <tt>rcu_node</tt>
|
|
||||||
tree as shown at the bottom of the diagram, clearing bits from
|
|
||||||
each <tt>rcu_node</tt> structure's <tt>->qsmask</tt> field,
|
|
||||||
and propagating up the tree when the result is zero.
|
|
||||||
|
|
||||||
<p>Note that traversal passes upwards out of a given <tt>rcu_node</tt>
|
|
||||||
structure only if the current CPU is reporting the last quiescent
|
|
||||||
state for the subtree headed by that <tt>rcu_node</tt> structure.
|
|
||||||
A key point is that if a CPU's traversal stops at a given <tt>rcu_node</tt>
|
|
||||||
structure, then there will be a later traversal by another CPU
|
|
||||||
(or perhaps the same one) that proceeds upwards
|
|
||||||
from that point, and the <tt>rcu_node</tt> <tt>->lock</tt>
|
|
||||||
guarantees that the first CPU's quiescent state happens before the
|
|
||||||
remainder of the second CPU's traversal.
|
|
||||||
Applying this line of thought repeatedly shows that all CPUs'
|
|
||||||
quiescent states happen before the last CPU traverses through
|
|
||||||
the root <tt>rcu_node</tt> structure, the “last CPU”
|
|
||||||
being the one that clears the last bit in the root <tt>rcu_node</tt>
|
|
||||||
structure's <tt>->qsmask</tt> field.
|
|
||||||
|
|
||||||
<h4><a name="Dynamic Tick Interface">Dynamic Tick Interface</a></h4>
|
|
||||||
|
|
||||||
<p>Due to energy-efficiency considerations, RCU is forbidden from
|
|
||||||
disturbing idle CPUs.
|
|
||||||
CPUs are therefore required to notify RCU when entering or leaving idle
|
|
||||||
state, which they do via fully ordered value-returning atomic operations
|
|
||||||
on a per-CPU variable.
|
|
||||||
The ordering effects are as shown below:
|
|
||||||
|
|
||||||
</p><p><img src="TreeRCU-dyntick.svg" alt="TreeRCU-dyntick.svg" width="50%">
|
|
||||||
|
|
||||||
<p>The RCU grace-period kernel thread samples the per-CPU idleness
|
|
||||||
variable while holding the corresponding CPU's leaf <tt>rcu_node</tt>
|
|
||||||
structure's <tt>->lock</tt>.
|
|
||||||
This means that any RCU read-side critical sections that precede the
|
|
||||||
idle period (the oval near the top of the diagram above) will happen
|
|
||||||
before the end of the current grace period.
|
|
||||||
Similarly, the beginning of the current grace period will happen before
|
|
||||||
any RCU read-side critical sections that follow the
|
|
||||||
idle period (the oval near the bottom of the diagram above).
|
|
||||||
|
|
||||||
<p>Plumbing this into the full grace-period execution is described
|
|
||||||
<a href="#Forcing Quiescent States">below</a>.
|
|
||||||
|
|
||||||
<h4><a name="CPU-Hotplug Interface">CPU-Hotplug Interface</a></h4>
|
|
||||||
|
|
||||||
<p>RCU is also forbidden from disturbing offline CPUs, which might well
|
|
||||||
be powered off and removed from the system completely.
|
|
||||||
CPUs are therefore required to notify RCU of their comings and goings
|
|
||||||
as part of the corresponding CPU hotplug operations.
|
|
||||||
The ordering effects are shown below:
|
|
||||||
|
|
||||||
</p><p><img src="TreeRCU-hotplug.svg" alt="TreeRCU-hotplug.svg" width="50%">
|
|
||||||
|
|
||||||
<p>Because CPU hotplug operations are much less frequent than idle transitions,
|
|
||||||
they are heavier weight, and thus acquire the CPU's leaf <tt>rcu_node</tt>
|
|
||||||
structure's <tt>->lock</tt> and update this structure's
|
|
||||||
<tt>->qsmaskinitnext</tt>.
|
|
||||||
The RCU grace-period kernel thread samples this mask to detect CPUs
|
|
||||||
having gone offline since the beginning of this grace period.
|
|
||||||
|
|
||||||
<p>Plumbing this into the full grace-period execution is described
|
|
||||||
<a href="#Forcing Quiescent States">below</a>.
|
|
||||||
|
|
||||||
<h4><a name="Forcing Quiescent States">Forcing Quiescent States</a></h4>
|
|
||||||
|
|
||||||
<p>As noted above, idle and offline CPUs cannot report their own
|
|
||||||
quiescent states, and therefore the grace-period kernel thread
|
|
||||||
must do the reporting on their behalf.
|
|
||||||
This process is called “forcing quiescent states”, it is
|
|
||||||
repeated every few jiffies, and its ordering effects are shown below:
|
|
||||||
|
|
||||||
</p><p><img src="TreeRCU-gp-fqs.svg" alt="TreeRCU-gp-fqs.svg" width="100%">
|
|
||||||
|
|
||||||
<p>Each pass of quiescent state forcing is guaranteed to traverse the
|
|
||||||
leaf <tt>rcu_node</tt> structures, and if there are no new quiescent
|
|
||||||
states due to recently idled and/or offlined CPUs, then only the
|
|
||||||
leaves are traversed.
|
|
||||||
However, if there is a newly offlined CPU as illustrated on the left
|
|
||||||
or a newly idled CPU as illustrated on the right, the corresponding
|
|
||||||
quiescent state will be driven up towards the root.
|
|
||||||
As with self-reported quiescent states, the upwards driving stops
|
|
||||||
once it reaches an <tt>rcu_node</tt> structure that has quiescent
|
|
||||||
states outstanding from other CPUs.
|
|
||||||
|
|
||||||
<table>
|
|
||||||
<tr><th> </th></tr>
|
|
||||||
<tr><th align="left">Quick Quiz:</th></tr>
|
|
||||||
<tr><td>
|
|
||||||
The leftmost drive to root stopped before it reached
|
|
||||||
the root <tt>rcu_node</tt> structure, which means that
|
|
||||||
there are still CPUs subordinate to that structure on
|
|
||||||
which the current grace period is waiting.
|
|
||||||
Given that, how is it possible that the rightmost drive
|
|
||||||
to root ended the grace period?
|
|
||||||
</td></tr>
|
|
||||||
<tr><th align="left">Answer:</th></tr>
|
|
||||||
<tr><td bgcolor="#ffffff"><font color="ffffff">
|
|
||||||
Good analysis!
|
|
||||||
It is in fact impossible in the absence of bugs in RCU.
|
|
||||||
But this diagram is complex enough as it is, so simplicity
|
|
||||||
overrode accuracy.
|
|
||||||
You can think of it as poetic license, or you can think of
|
|
||||||
it as misdirection that is resolved in the
|
|
||||||
<a href="#Putting It All Together">stitched-together diagram</a>.
|
|
||||||
</font></td></tr>
|
|
||||||
<tr><td> </td></tr>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
<h4><a name="Grace-Period Cleanup">Grace-Period Cleanup</a></h4>
|
|
||||||
|
|
||||||
<p>Grace-period cleanup first scans the <tt>rcu_node</tt> tree
|
|
||||||
breadth-first advancing all the <tt>->gp_seq</tt> fields, then it
|
|
||||||
advances the <tt>rcu_state</tt> structure's <tt>->gp_seq</tt> field.
|
|
||||||
The ordering effects are shown below:
|
|
||||||
|
|
||||||
</p><p><img src="TreeRCU-gp-cleanup.svg" alt="TreeRCU-gp-cleanup.svg" width="75%">
|
|
||||||
|
|
||||||
<p>As indicated by the oval at the bottom of the diagram, once
|
|
||||||
grace-period cleanup is complete, the next grace period can begin.
|
|
||||||
|
|
||||||
<table>
|
|
||||||
<tr><th> </th></tr>
|
|
||||||
<tr><th align="left">Quick Quiz:</th></tr>
|
|
||||||
<tr><td>
|
|
||||||
But when precisely does the grace period end?
|
|
||||||
</td></tr>
|
|
||||||
<tr><th align="left">Answer:</th></tr>
|
|
||||||
<tr><td bgcolor="#ffffff"><font color="ffffff">
|
|
||||||
There is no useful single point at which the grace period
|
|
||||||
can be said to end.
|
|
||||||
The earliest reasonable candidate is as soon as the last
|
|
||||||
CPU has reported its quiescent state, but it may be some
|
|
||||||
milliseconds before RCU becomes aware of this.
|
|
||||||
The latest reasonable candidate is once the <tt>rcu_state</tt>
|
|
||||||
structure's <tt>->gp_seq</tt> field has been updated,
|
|
||||||
but it is quite possible that some CPUs have already completed
|
|
||||||
phase two of their updates by that time.
|
|
||||||
In short, if you are going to work with RCU, you need to
|
|
||||||
learn to embrace uncertainty.
|
|
||||||
</font></td></tr>
|
|
||||||
<tr><td> </td></tr>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
|
|
||||||
<h4><a name="Callback Invocation">Callback Invocation</a></h4>
|
|
||||||
|
|
||||||
<p>Once a given CPU's leaf <tt>rcu_node</tt> structure's
|
|
||||||
<tt>->gp_seq</tt> field has been updated, that CPU can begin
|
|
||||||
invoking its RCU callbacks that were waiting for this grace period
|
|
||||||
to end.
|
|
||||||
These callbacks are identified by <tt>rcu_advance_cbs()</tt>,
|
|
||||||
which is usually invoked by <tt>__note_gp_changes()</tt>.
|
|
||||||
As shown in the diagram below, this invocation can be triggered by
|
|
||||||
the scheduling-clock interrupt (<tt>rcu_sched_clock_irq()</tt> on
|
|
||||||
the left) or by idle entry (<tt>rcu_cleanup_after_idle()</tt> on
|
|
||||||
the right, but only for kernels built with
|
|
||||||
<tt>CONFIG_RCU_FAST_NO_HZ=y</tt>).
|
|
||||||
Either way, <tt>RCU_SOFTIRQ</tt> is raised, which results in
|
|
||||||
<tt>rcu_do_batch()</tt> invoking the callbacks, which in turn
|
|
||||||
allows those callbacks to carry out (either directly or indirectly
|
|
||||||
via wakeup) the needed phase-two processing for each update.
|
|
||||||
|
|
||||||
</p><p><img src="TreeRCU-callback-invocation.svg" alt="TreeRCU-callback-invocation.svg" width="60%">
|
|
||||||
|
|
||||||
<p>Please note that callback invocation can also be prompted by any
|
|
||||||
number of corner-case code paths, for example, when a CPU notes that
|
|
||||||
it has excessive numbers of callbacks queued.
|
|
||||||
In all cases, the CPU acquires its leaf <tt>rcu_node</tt> structure's
|
|
||||||
<tt>->lock</tt> before invoking callbacks, which preserves the
|
|
||||||
required ordering against the newly completed grace period.
|
|
||||||
|
|
||||||
<p>However, if the callback function communicates to other CPUs,
|
|
||||||
for example, doing a wakeup, then it is that function's responsibility
|
|
||||||
to maintain ordering.
|
|
||||||
For example, if the callback function wakes up a task that runs on
|
|
||||||
some other CPU, proper ordering must be in place in both the callback
|
|
||||||
function and the task being awakened.
|
|
||||||
To see why this is important, consider the top half of the
|
|
||||||
<a href="#Grace-Period Cleanup">grace-period cleanup</a> diagram.
|
|
||||||
The callback might be running on a CPU corresponding to the leftmost
|
|
||||||
leaf <tt>rcu_node</tt> structure, and awaken a task that is to run on
|
|
||||||
a CPU corresponding to the rightmost leaf <tt>rcu_node</tt> structure,
|
|
||||||
and the grace-period kernel thread might not yet have reached the
|
|
||||||
rightmost leaf.
|
|
||||||
In this case, the grace period's memory ordering might not yet have
|
|
||||||
reached that CPU, so again the callback function and the awakened
|
|
||||||
task must supply proper ordering.
|
|
||||||
|
|
||||||
<h3><a name="Putting It All Together">Putting It All Together</a></h3>
|
|
||||||
|
|
||||||
<p>A stitched-together diagram is
|
|
||||||
<a href="Tree-RCU-Diagram.html">here</a>.
|
|
||||||
|
|
||||||
<h3><a name="Legal Statement">
|
|
||||||
Legal Statement</a></h3>
|
|
||||||
|
|
||||||
<p>This work represents the view of the author and does not necessarily
|
|
||||||
represent the view of IBM.
|
|
||||||
|
|
||||||
</p><p>Linux is a registered trademark of Linus Torvalds.
|
|
||||||
|
|
||||||
</p><p>Other company, product, and service names may be trademarks or
|
|
||||||
service marks of others.
|
|
||||||
|
|
||||||
</body></html>
|
|
@@ -0,0 +1,624 @@
|
|||||||
|
======================================================
|
||||||
|
A Tour Through TREE_RCU's Grace-Period Memory Ordering
|
||||||
|
======================================================
|
||||||
|
|
||||||
|
August 8, 2017
|
||||||
|
|
||||||
|
This article was contributed by Paul E. McKenney
|
||||||
|
|
||||||
|
Introduction
|
||||||
|
============
|
||||||
|
|
||||||
|
This document gives a rough visual overview of how Tree RCU's
|
||||||
|
grace-period memory ordering guarantee is provided.
|
||||||
|
|
||||||
|
What Is Tree RCU's Grace Period Memory Ordering Guarantee?
|
||||||
|
==========================================================
|
||||||
|
|
||||||
|
RCU grace periods provide extremely strong memory-ordering guarantees
|
||||||
|
for non-idle non-offline code.
|
||||||
|
Any code that happens after the end of a given RCU grace period is guaranteed
|
||||||
|
to see the effects of all accesses prior to the beginning of that grace
|
||||||
|
period that are within RCU read-side critical sections.
|
||||||
|
Similarly, any code that happens before the beginning of a given RCU grace
|
||||||
|
period is guaranteed to see the effects of all accesses following the end
|
||||||
|
of that grace period that are within RCU read-side critical sections.
|
||||||
|
|
||||||
|
Note well that RCU-sched read-side critical sections include any region
|
||||||
|
of code for which preemption is disabled.
|
||||||
|
Given that each individual machine instruction can be thought of as
|
||||||
|
an extremely small region of preemption-disabled code, one can think of
|
||||||
|
``synchronize_rcu()`` as ``smp_mb()`` on steroids.
|
||||||
|
|
||||||
|
RCU updaters use this guarantee by splitting their updates into
|
||||||
|
two phases, one of which is executed before the grace period and
|
||||||
|
the other of which is executed after the grace period.
|
||||||
|
In the most common use case, phase one removes an element from
|
||||||
|
a linked RCU-protected data structure, and phase two frees that element.
|
||||||
|
For this to work, any readers that have witnessed state prior to the
|
||||||
|
phase-one update (in the common case, removal) must not witness state
|
||||||
|
following the phase-two update (in the common case, freeing).
|
||||||
|
|
||||||
|
The RCU implementation provides this guarantee using a network
|
||||||
|
of lock-based critical sections, memory barriers, and per-CPU
|
||||||
|
processing, as is described in the following sections.
|
||||||
|
|
||||||
|
Tree RCU Grace Period Memory Ordering Building Blocks
|
||||||
|
=====================================================
|
||||||
|
|
||||||
|
The workhorse for RCU's grace-period memory ordering is the
|
||||||
|
critical section for the ``rcu_node`` structure's
|
||||||
|
``->lock``. These critical sections use helper functions for lock
|
||||||
|
acquisition, including ``raw_spin_lock_rcu_node()``,
|
||||||
|
``raw_spin_lock_irq_rcu_node()``, and ``raw_spin_lock_irqsave_rcu_node()``.
|
||||||
|
Their lock-release counterparts are ``raw_spin_unlock_rcu_node()``,
|
||||||
|
``raw_spin_unlock_irq_rcu_node()``, and
|
||||||
|
``raw_spin_unlock_irqrestore_rcu_node()``, respectively.
|
||||||
|
For completeness, a ``raw_spin_trylock_rcu_node()`` is also provided.
|
||||||
|
The key point is that the lock-acquisition functions, including
|
||||||
|
``raw_spin_trylock_rcu_node()``, all invoke ``smp_mb__after_unlock_lock()``
|
||||||
|
immediately after successful acquisition of the lock.
|
||||||
|
|
||||||
|
Therefore, for any given ``rcu_node`` structure, any access
|
||||||
|
happening before one of the above lock-release functions will be seen
|
||||||
|
by all CPUs as happening before any access happening after a later
|
||||||
|
one of the above lock-acquisition functions.
|
||||||
|
Furthermore, any access happening before one of the
|
||||||
|
above lock-release functions on any given CPU will be seen by all
|
||||||
|
CPUs as happening before any access happening after a later one
|
||||||
|
of the above lock-acquisition functions executing on that same CPU,
|
||||||
|
even if the lock-release and lock-acquisition functions are operating
|
||||||
|
on different ``rcu_node`` structures.
|
||||||
|
Tree RCU uses these two ordering guarantees to form an ordering
|
||||||
|
network among all CPUs that were in any way involved in the grace
|
||||||
|
period, including any CPUs that came online or went offline during
|
||||||
|
the grace period in question.
|
||||||
|
|
||||||
|
The following litmus test exhibits the ordering effects of these
|
||||||
|
lock-acquisition and lock-release functions::
|
||||||
|
|
||||||
|
1 int x, y, z;
|
||||||
|
2
|
||||||
|
3 void task0(void)
|
||||||
|
4 {
|
||||||
|
5 raw_spin_lock_rcu_node(rnp);
|
||||||
|
6 WRITE_ONCE(x, 1);
|
||||||
|
7 r1 = READ_ONCE(y);
|
||||||
|
8 raw_spin_unlock_rcu_node(rnp);
|
||||||
|
9 }
|
||||||
|
10
|
||||||
|
11 void task1(void)
|
||||||
|
12 {
|
||||||
|
13 raw_spin_lock_rcu_node(rnp);
|
||||||
|
14 WRITE_ONCE(y, 1);
|
||||||
|
15 r2 = READ_ONCE(z);
|
||||||
|
16 raw_spin_unlock_rcu_node(rnp);
|
||||||
|
17 }
|
||||||
|
18
|
||||||
|
19 void task2(void)
|
||||||
|
20 {
|
||||||
|
21 WRITE_ONCE(z, 1);
|
||||||
|
22 smp_mb();
|
||||||
|
23 r3 = READ_ONCE(x);
|
||||||
|
24 }
|
||||||
|
25
|
||||||
|
26 WARN_ON(r1 == 0 && r2 == 0 && r3 == 0);
|
||||||
|
|
||||||
|
The ``WARN_ON()`` is evaluated at “the end of time”,
|
||||||
|
after all changes have propagated throughout the system.
|
||||||
|
Without the ``smp_mb__after_unlock_lock()`` provided by the
|
||||||
|
acquisition functions, this ``WARN_ON()`` could trigger, for example
|
||||||
|
on PowerPC.
|
||||||
|
The ``smp_mb__after_unlock_lock()`` invocations prevent this
|
||||||
|
``WARN_ON()`` from triggering.
|
||||||
|
|
||||||
|
This approach must be extended to include idle CPUs, which need
|
||||||
|
RCU's grace-period memory ordering guarantee to extend to any
|
||||||
|
RCU read-side critical sections preceding and following the current
|
||||||
|
idle sojourn.
|
||||||
|
This case is handled by calls to the strongly ordered
|
||||||
|
``atomic_add_return()`` read-modify-write atomic operation that
|
||||||
|
is invoked within ``rcu_dynticks_eqs_enter()`` at idle-entry
|
||||||
|
time and within ``rcu_dynticks_eqs_exit()`` at idle-exit time.
|
||||||
|
The grace-period kthread invokes ``rcu_dynticks_snap()`` and
|
||||||
|
``rcu_dynticks_in_eqs_since()`` (both of which invoke
|
||||||
|
an ``atomic_add_return()`` of zero) to detect idle CPUs.
|
||||||
|
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| **Quick Quiz**: |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| But what about CPUs that remain offline for the entire grace period? |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| **Answer**: |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| Such CPUs will be offline at the beginning of the grace period, so |
|
||||||
|
| the grace period won't expect quiescent states from them. Races |
|
||||||
|
| between grace-period start and CPU-hotplug operations are mediated |
|
||||||
|
| by the CPU's leaf ``rcu_node`` structure's ``->lock`` as described |
|
||||||
|
| above. |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
|
||||||
|
The approach must be extended to handle one final case, that of waking a
|
||||||
|
task blocked in ``synchronize_rcu()``. This task might be affinitied to
|
||||||
|
a CPU that is not yet aware that the grace period has ended, and thus
|
||||||
|
might not yet be subject to the grace period's memory ordering.
|
||||||
|
Therefore, there is an ``smp_mb()`` after the return from
|
||||||
|
``wait_for_completion()`` in the ``synchronize_rcu()`` code path.
|
||||||
|
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| **Quick Quiz**: |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| What? Where??? I don't see any ``smp_mb()`` after the return from |
|
||||||
|
| ``wait_for_completion()``!!! |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| **Answer**: |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| That would be because I spotted the need for that ``smp_mb()`` during |
|
||||||
|
| the creation of this documentation, and it is therefore unlikely to |
|
||||||
|
| hit mainline before v4.14. Kudos to Lance Roy, Will Deacon, Peter |
|
||||||
|
| Zijlstra, and Jonathan Cameron for asking questions that sensitized |
|
||||||
|
| me to the rather elaborate sequence of events that demonstrate the |
|
||||||
|
| need for this memory barrier. |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
|
||||||
|
Tree RCU's grace-period memory-ordering guarantees rely most heavily on
|
||||||
|
the ``rcu_node`` structure's ``->lock`` field, so much so that it is
|
||||||
|
necessary to abbreviate this pattern in the diagrams in the next
|
||||||
|
section. For example, consider the ``rcu_prepare_for_idle()`` function
|
||||||
|
shown below, which is one of several functions that enforce ordering of
|
||||||
|
newly arrived RCU callbacks against future grace periods:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
1 static void rcu_prepare_for_idle(void)
|
||||||
|
2 {
|
||||||
|
3 bool needwake;
|
||||||
|
4 struct rcu_data *rdp;
|
||||||
|
5 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
|
||||||
|
6 struct rcu_node *rnp;
|
||||||
|
7 struct rcu_state *rsp;
|
||||||
|
8 int tne;
|
||||||
|
9
|
||||||
|
10 if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
|
||||||
|
11 rcu_is_nocb_cpu(smp_processor_id()))
|
||||||
|
12 return;
|
||||||
|
13 tne = READ_ONCE(tick_nohz_active);
|
||||||
|
14 if (tne != rdtp->tick_nohz_enabled_snap) {
|
||||||
|
15 if (rcu_cpu_has_callbacks(NULL))
|
||||||
|
16 invoke_rcu_core();
|
||||||
|
17 rdtp->tick_nohz_enabled_snap = tne;
|
||||||
|
18 return;
|
||||||
|
19 }
|
||||||
|
20 if (!tne)
|
||||||
|
21 return;
|
||||||
|
22 if (rdtp->all_lazy &&
|
||||||
|
23 rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
|
||||||
|
24 rdtp->all_lazy = false;
|
||||||
|
25 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
|
||||||
|
26 invoke_rcu_core();
|
||||||
|
27 return;
|
||||||
|
28 }
|
||||||
|
29 if (rdtp->last_accelerate == jiffies)
|
||||||
|
30 return;
|
||||||
|
31 rdtp->last_accelerate = jiffies;
|
||||||
|
32 for_each_rcu_flavor(rsp) {
|
||||||
|
33 rdp = this_cpu_ptr(rsp->rda);
|
||||||
|
34 if (rcu_segcblist_pend_cbs(&rdp->cblist))
|
||||||
|
35 continue;
|
||||||
|
36 rnp = rdp->mynode;
|
||||||
|
37 raw_spin_lock_rcu_node(rnp);
|
||||||
|
38 needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
|
||||||
|
39 raw_spin_unlock_rcu_node(rnp);
|
||||||
|
40 if (needwake)
|
||||||
|
41 rcu_gp_kthread_wake(rsp);
|
||||||
|
42 }
|
||||||
|
43 }
|
||||||
|
|
||||||
|
But the only part of ``rcu_prepare_for_idle()`` that really matters for
|
||||||
|
this discussion is lines 37–39. We will therefore abbreviate this
|
||||||
|
function as follows:
|
||||||
|
|
||||||
|
.. kernel-figure:: rcu_node-lock.svg
|
||||||
|
|
||||||
|
The box represents the ``rcu_node`` structure's ``->lock`` critical
|
||||||
|
section, with the double line on top representing the additional
|
||||||
|
``smp_mb__after_unlock_lock()``.
|
||||||
|
|
||||||
|
Tree RCU Grace Period Memory Ordering Components
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Tree RCU's grace-period memory-ordering guarantee is provided by a
|
||||||
|
number of RCU components:
|
||||||
|
|
||||||
|
#. `Callback Registry`_
|
||||||
|
#. `Grace-Period Initialization`_
|
||||||
|
#. `Self-Reported Quiescent States`_
|
||||||
|
#. `Dynamic Tick Interface`_
|
||||||
|
#. `CPU-Hotplug Interface`_
|
||||||
|
#. `Forcing Quiescent States`_
|
||||||
|
#. `Grace-Period Cleanup`_
|
||||||
|
#. `Callback Invocation`_
|
||||||
|
|
||||||
|
Each of the following sections looks at the corresponding component in
|
||||||
|
detail.
|
||||||
|
|
||||||
|
Callback Registry
|
||||||
|
^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
If RCU's grace-period guarantee is to mean anything at all, any access
|
||||||
|
that happens before a given invocation of ``call_rcu()`` must also
|
||||||
|
happen before the corresponding grace period. The implementation of this
|
||||||
|
portion of RCU's grace period guarantee is shown in the following
|
||||||
|
figure:
|
||||||
|
|
||||||
|
.. kernel-figure:: TreeRCU-callback-registry.svg
|
||||||
|
|
||||||
|
Because ``call_rcu()`` normally acts only on CPU-local state, it
|
||||||
|
provides no ordering guarantees, either for itself or for phase one of
|
||||||
|
the update (which again will usually be removal of an element from an
|
||||||
|
RCU-protected data structure). It simply enqueues the ``rcu_head``
|
||||||
|
structure on a per-CPU list, which cannot become associated with a grace
|
||||||
|
period until a later call to ``rcu_accelerate_cbs()``, as shown in the
|
||||||
|
diagram above.
|
||||||
|
|
||||||
|
One set of code paths shown on the left invokes ``rcu_accelerate_cbs()``
|
||||||
|
via ``note_gp_changes()``, either directly from ``call_rcu()`` (if the
|
||||||
|
current CPU is inundated with queued ``rcu_head`` structures) or more
|
||||||
|
likely from an ``RCU_SOFTIRQ`` handler. Another code path in the middle
|
||||||
|
is taken only in kernels built with ``CONFIG_RCU_FAST_NO_HZ=y``, which
|
||||||
|
invokes ``rcu_accelerate_cbs()`` via ``rcu_prepare_for_idle()``. The
|
||||||
|
final code path on the right is taken only in kernels built with
|
||||||
|
``CONFIG_HOTPLUG_CPU=y``, which invokes ``rcu_accelerate_cbs()`` via
|
||||||
|
``rcu_advance_cbs()``, ``rcu_migrate_callbacks()``,
|
||||||
|
``rcutree_migrate_callbacks()``, and ``takedown_cpu()``, which in turn
|
||||||
|
is invoked on a surviving CPU after the outgoing CPU has been completely
|
||||||
|
offlined.
|
||||||
|
|
||||||
|
There are a few other code paths within grace-period processing that
|
||||||
|
opportunistically invoke ``rcu_accelerate_cbs()``. However, either way,
|
||||||
|
all of the CPU's recently queued ``rcu_head`` structures are associated
|
||||||
|
with a future grace-period number under the protection of the CPU's leaf
|
||||||
|
``rcu_node`` structure's ``->lock``. In all cases, there is full
|
||||||
|
ordering against any prior critical section for that same ``rcu_node``
|
||||||
|
structure's ``->lock``, and also full ordering against any of the
|
||||||
|
current task's or CPU's prior critical sections for any ``rcu_node``
|
||||||
|
structure's ``->lock``.
|
||||||
|
|
||||||
|
The next section will show how this ordering ensures that any accesses
|
||||||
|
prior to the ``call_rcu()`` (particularly including phase one of the
|
||||||
|
update) happen before the start of the corresponding grace period.
|
||||||
|
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| **Quick Quiz**: |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| But what about ``synchronize_rcu()``? |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| **Answer**: |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| The ``synchronize_rcu()`` passes ``call_rcu()`` to ``wait_rcu_gp()``, |
|
||||||
|
| which invokes it. So either way, it eventually comes down to |
|
||||||
|
| ``call_rcu()``. |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
|
||||||
|
Grace-Period Initialization
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Grace-period initialization is carried out by the grace-period kernel
|
||||||
|
thread, which makes several passes over the ``rcu_node`` tree within the
|
||||||
|
``rcu_gp_init()`` function. This means that showing the full flow of
|
||||||
|
ordering through the grace-period computation will require duplicating
|
||||||
|
this tree. If you find this confusing, please note that the state of the
|
||||||
|
``rcu_node`` changes over time, just like Heraclitus's river. However,
|
||||||
|
to keep the ``rcu_node`` river tractable, the grace-period kernel
|
||||||
|
thread's traversals are presented in multiple parts, starting in this
|
||||||
|
section with the various phases of grace-period initialization.
|
||||||
|
|
||||||
|
The first ordering-related grace-period initialization action is to
|
||||||
|
advance the ``rcu_state`` structure's ``->gp_seq`` grace-period-number
|
||||||
|
counter, as shown below:
|
||||||
|
|
||||||
|
.. kernel-figure:: TreeRCU-gp-init-1.svg
|
||||||
|
|
||||||
|
The actual increment is carried out using ``smp_store_release()``, which
|
||||||
|
helps reject false-positive RCU CPU stall detection. Note that only the
|
||||||
|
root ``rcu_node`` structure is touched.
|
||||||
|
|
||||||
|
The first pass through the ``rcu_node`` tree updates bitmasks based on
|
||||||
|
CPUs having come online or gone offline since the start of the previous
|
||||||
|
grace period. In the common case where the number of online CPUs for
|
||||||
|
this ``rcu_node`` structure has not transitioned to or from zero, this
|
||||||
|
pass will scan only the leaf ``rcu_node`` structures. However, if the
|
||||||
|
number of online CPUs for a given leaf ``rcu_node`` structure has
|
||||||
|
transitioned from zero, ``rcu_init_new_rnp()`` will be invoked for the
|
||||||
|
first incoming CPU. Similarly, if the number of online CPUs for a given
|
||||||
|
leaf ``rcu_node`` structure has transitioned to zero,
|
||||||
|
``rcu_cleanup_dead_rnp()`` will be invoked for the last outgoing CPU.
|
||||||
|
The diagram below shows the path of ordering if the leftmost
|
||||||
|
``rcu_node`` structure onlines its first CPU and if the next
|
||||||
|
``rcu_node`` structure has no online CPUs (or, alternatively if the
|
||||||
|
leftmost ``rcu_node`` structure offlines its last CPU and if the next
|
||||||
|
``rcu_node`` structure has no online CPUs).
|
||||||
|
|
||||||
|
.. kernel-figure:: TreeRCU-gp-init-2.svg
|
||||||
|
|
||||||
|
The final ``rcu_gp_init()`` pass through the ``rcu_node`` tree traverses
|
||||||
|
breadth-first, setting each ``rcu_node`` structure's ``->gp_seq`` field
|
||||||
|
to the newly advanced value from the ``rcu_state`` structure, as shown
|
||||||
|
in the following diagram.
|
||||||
|
|
||||||
|
.. kernel-figure:: TreeRCU-gp-init-3.svg
|
||||||
|
|
||||||
|
This change will also cause each CPU's next call to
|
||||||
|
``__note_gp_changes()`` to notice that a new grace period has started,
|
||||||
|
as described in the next section. But because the grace-period kthread
|
||||||
|
started the grace period at the root (with the advancing of the
|
||||||
|
``rcu_state`` structure's ``->gp_seq`` field) before setting each leaf
|
||||||
|
``rcu_node`` structure's ``->gp_seq`` field, each CPU's observation of
|
||||||
|
the start of the grace period will happen after the actual start of the
|
||||||
|
grace period.
|
||||||
|
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| **Quick Quiz**: |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| But what about the CPU that started the grace period? Why wouldn't it |
|
||||||
|
| see the start of the grace period right when it started that grace |
|
||||||
|
| period? |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| **Answer**: |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| In some deep philosophical and overly anthropomorphic sense, yes, the |
|
||||||
|
| CPU starting the grace period is immediately aware of having done so. |
|
||||||
|
| However, if we instead assume that RCU is not self-aware, then even |
|
||||||
|
| the CPU starting the grace period does not really become aware of the |
|
||||||
|
| start of this grace period until its first call to |
|
||||||
|
| ``__note_gp_changes()``. On the other hand, this CPU potentially gets |
|
||||||
|
| early notification because it invokes ``__note_gp_changes()`` during |
|
||||||
|
| its last ``rcu_gp_init()`` pass through its leaf ``rcu_node`` |
|
||||||
|
| structure. |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
|
||||||
|
Self-Reported Quiescent States
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
When all entities that might block the grace period have reported
|
||||||
|
quiescent states (or as described in a later section, had quiescent
|
||||||
|
states reported on their behalf), the grace period can end. Online
|
||||||
|
non-idle CPUs report their own quiescent states, as shown in the
|
||||||
|
following diagram:
|
||||||
|
|
||||||
|
.. kernel-figure:: TreeRCU-qs.svg
|
||||||
|
|
||||||
|
This is for the last CPU to report a quiescent state, which signals the
|
||||||
|
end of the grace period. Earlier quiescent states would push up the
|
||||||
|
``rcu_node`` tree only until they encountered an ``rcu_node`` structure
|
||||||
|
that is waiting for additional quiescent states. However, ordering is
|
||||||
|
nevertheless preserved because some later quiescent state will acquire
|
||||||
|
that ``rcu_node`` structure's ``->lock``.
|
||||||
|
|
||||||
|
Any number of events can lead up to a CPU invoking ``note_gp_changes()``
|
||||||
|
(or alternatively, directly invoking ``__note_gp_changes()``), at which
|
||||||
|
point that CPU will notice the start of a new grace period while holding
|
||||||
|
its leaf ``rcu_node`` lock. Therefore, all execution shown in this
|
||||||
|
diagram happens after the start of the grace period. In addition, this
|
||||||
|
CPU will consider any RCU read-side critical section that started before
|
||||||
|
the invocation of ``__note_gp_changes()`` to have started before the
|
||||||
|
grace period, and thus a critical section that the grace period must
|
||||||
|
wait on.
|
||||||
|
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| **Quick Quiz**: |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| But an RCU read-side critical section might have started after the |
|
||||||
|
| beginning of the grace period (the advancing of ``->gp_seq`` from |
|
||||||
|
| earlier), so why should the grace period wait on such a critical |
|
||||||
|
| section? |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| **Answer**: |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| It is indeed not necessary for the grace period to wait on such a |
|
||||||
|
| critical section. However, it is permissible to wait on it. And it is |
|
||||||
|
| furthermore important to wait on it, as this lazy approach is far |
|
||||||
|
| more scalable than a “big bang” all-at-once grace-period start could |
|
||||||
|
| possibly be. |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
|
||||||
|
If the CPU does a context switch, a quiescent state will be noted by
|
||||||
|
``rcu_note_context_switch()`` on the left. On the other hand, if the CPU
|
||||||
|
takes a scheduler-clock interrupt while executing in usermode, a
|
||||||
|
quiescent state will be noted by ``rcu_sched_clock_irq()`` on the right.
|
||||||
|
Either way, the passage through a quiescent state will be noted in a
|
||||||
|
per-CPU variable.
|
||||||
|
|
||||||
|
The next time an ``RCU_SOFTIRQ`` handler executes on this CPU (for
|
||||||
|
example, after the next scheduler-clock interrupt), ``rcu_core()`` will
|
||||||
|
invoke ``rcu_check_quiescent_state()``, which will notice the recorded
|
||||||
|
quiescent state, and invoke ``rcu_report_qs_rdp()``. If
|
||||||
|
``rcu_report_qs_rdp()`` verifies that the quiescent state really does
|
||||||
|
apply to the current grace period, it invokes ``rcu_report_qs_rnp()`` which
|
||||||
|
traverses up the ``rcu_node`` tree as shown at the bottom of the
|
||||||
|
diagram, clearing bits from each ``rcu_node`` structure's ``->qsmask``
|
||||||
|
field, and propagating up the tree when the result is zero.
|
||||||
|
|
||||||
|
Note that traversal passes upwards out of a given ``rcu_node`` structure
|
||||||
|
only if the current CPU is reporting the last quiescent state for the
|
||||||
|
subtree headed by that ``rcu_node`` structure. A key point is that if a
|
||||||
|
CPU's traversal stops at a given ``rcu_node`` structure, then there will
|
||||||
|
be a later traversal by another CPU (or perhaps the same one) that
|
||||||
|
proceeds upwards from that point, and the ``rcu_node`` ``->lock``
|
||||||
|
guarantees that the first CPU's quiescent state happens before the
|
||||||
|
remainder of the second CPU's traversal. Applying this line of thought
|
||||||
|
repeatedly shows that all CPUs' quiescent states happen before the last
|
||||||
|
CPU traverses through the root ``rcu_node`` structure, the “last CPU”
|
||||||
|
being the one that clears the last bit in the root ``rcu_node``
|
||||||
|
structure's ``->qsmask`` field.
|
||||||
|
|
||||||
|
Dynamic Tick Interface
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Due to energy-efficiency considerations, RCU is forbidden from
|
||||||
|
disturbing idle CPUs. CPUs are therefore required to notify RCU when
|
||||||
|
entering or leaving idle state, which they do via fully ordered
|
||||||
|
value-returning atomic operations on a per-CPU variable. The ordering
|
||||||
|
effects are as shown below:
|
||||||
|
|
||||||
|
.. kernel-figure:: TreeRCU-dyntick.svg
|
||||||
|
|
||||||
|
The RCU grace-period kernel thread samples the per-CPU idleness variable
|
||||||
|
while holding the corresponding CPU's leaf ``rcu_node`` structure's
|
||||||
|
``->lock``. This means that any RCU read-side critical sections that
|
||||||
|
precede the idle period (the oval near the top of the diagram above)
|
||||||
|
will happen before the end of the current grace period. Similarly, the
|
||||||
|
beginning of the current grace period will happen before any RCU
|
||||||
|
read-side critical sections that follow the idle period (the oval near
|
||||||
|
the bottom of the diagram above).
|
||||||
|
|
||||||
|
Plumbing this into the full grace-period execution is described
|
||||||
|
`below <#Forcing%20Quiescent%20States>`__.
|
||||||
|
|
||||||
|
CPU-Hotplug Interface
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
RCU is also forbidden from disturbing offline CPUs, which might well be
|
||||||
|
powered off and removed from the system completely. CPUs are therefore
|
||||||
|
required to notify RCU of their comings and goings as part of the
|
||||||
|
corresponding CPU hotplug operations. The ordering effects are shown
|
||||||
|
below:
|
||||||
|
|
||||||
|
.. kernel-figure:: TreeRCU-hotplug.svg
|
||||||
|
|
||||||
|
Because CPU hotplug operations are much less frequent than idle
|
||||||
|
transitions, they are heavier weight, and thus acquire the CPU's leaf
|
||||||
|
``rcu_node`` structure's ``->lock`` and update this structure's
|
||||||
|
``->qsmaskinitnext``. The RCU grace-period kernel thread samples this
|
||||||
|
mask to detect CPUs having gone offline since the beginning of this
|
||||||
|
grace period.
|
||||||
|
|
||||||
|
Plumbing this into the full grace-period execution is described
|
||||||
|
`below <#Forcing%20Quiescent%20States>`__.
|
||||||
|
|
||||||
|
Forcing Quiescent States
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
As noted above, idle and offline CPUs cannot report their own quiescent
|
||||||
|
states, and therefore the grace-period kernel thread must do the
|
||||||
|
reporting on their behalf. This process is called “forcing quiescent
|
||||||
|
states”, it is repeated every few jiffies, and its ordering effects are
|
||||||
|
shown below:
|
||||||
|
|
||||||
|
.. kernel-figure:: TreeRCU-gp-fqs.svg
|
||||||
|
|
||||||
|
Each pass of quiescent state forcing is guaranteed to traverse the leaf
|
||||||
|
``rcu_node`` structures, and if there are no new quiescent states due to
|
||||||
|
recently idled and/or offlined CPUs, then only the leaves are traversed.
|
||||||
|
However, if there is a newly offlined CPU as illustrated on the left or
|
||||||
|
a newly idled CPU as illustrated on the right, the corresponding
|
||||||
|
quiescent state will be driven up towards the root. As with
|
||||||
|
self-reported quiescent states, the upwards driving stops once it
|
||||||
|
reaches an ``rcu_node`` structure that has quiescent states outstanding
|
||||||
|
from other CPUs.
|
||||||
|
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| **Quick Quiz**: |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| The leftmost drive to root stopped before it reached the root |
|
||||||
|
| ``rcu_node`` structure, which means that there are still CPUs |
|
||||||
|
| subordinate to that structure on which the current grace period is |
|
||||||
|
| waiting. Given that, how is it possible that the rightmost drive to |
|
||||||
|
| root ended the grace period? |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| **Answer**: |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| Good analysis! It is in fact impossible in the absence of bugs in |
|
||||||
|
| RCU. But this diagram is complex enough as it is, so simplicity |
|
||||||
|
| overrode accuracy. You can think of it as poetic license, or you can |
|
||||||
|
| think of it as misdirection that is resolved in the |
|
||||||
|
| `stitched-together diagram <#Putting%20It%20All%20Together>`__. |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
|
||||||
|
Grace-Period Cleanup
|
||||||
|
^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Grace-period cleanup first scans the ``rcu_node`` tree breadth-first
|
||||||
|
advancing all the ``->gp_seq`` fields, then it advances the
|
||||||
|
``rcu_state`` structure's ``->gp_seq`` field. The ordering effects are
|
||||||
|
shown below:
|
||||||
|
|
||||||
|
.. kernel-figure:: TreeRCU-gp-cleanup.svg
|
||||||
|
|
||||||
|
As indicated by the oval at the bottom of the diagram, once grace-period
|
||||||
|
cleanup is complete, the next grace period can begin.
|
||||||
|
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| **Quick Quiz**: |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| But when precisely does the grace period end? |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| **Answer**: |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
| There is no useful single point at which the grace period can be said |
|
||||||
|
| to end. The earliest reasonable candidate is as soon as the last CPU |
|
||||||
|
| has reported its quiescent state, but it may be some milliseconds |
|
||||||
|
| before RCU becomes aware of this. The latest reasonable candidate is |
|
||||||
|
| once the ``rcu_state`` structure's ``->gp_seq`` field has been |
|
||||||
|
| updated, but it is quite possible that some CPUs have already |
|
||||||
|
| completed phase two of their updates by that time. In short, if you |
|
||||||
|
| are going to work with RCU, you need to learn to embrace uncertainty. |
|
||||||
|
+-----------------------------------------------------------------------+
|
||||||
|
|
||||||
|
Callback Invocation
|
||||||
|
^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Once a given CPU's leaf ``rcu_node`` structure's ``->gp_seq`` field has
|
||||||
|
been updated, that CPU can begin invoking its RCU callbacks that were
|
||||||
|
waiting for this grace period to end. These callbacks are identified by
|
||||||
|
``rcu_advance_cbs()``, which is usually invoked by
|
||||||
|
``__note_gp_changes()``. As shown in the diagram below, this invocation
|
||||||
|
can be triggered by the scheduling-clock interrupt
|
||||||
|
(``rcu_sched_clock_irq()`` on the left) or by idle entry
|
||||||
|
(``rcu_cleanup_after_idle()`` on the right, but only for kernels built
|
||||||
|
with ``CONFIG_RCU_FAST_NO_HZ=y``). Either way, ``RCU_SOFTIRQ`` is
|
||||||
|
raised, which results in ``rcu_do_batch()`` invoking the callbacks,
|
||||||
|
which in turn allows those callbacks to carry out (either directly or
|
||||||
|
indirectly via wakeup) the needed phase-two processing for each update.
|
||||||
|
|
||||||
|
.. kernel-figure:: TreeRCU-callback-invocation.svg
|
||||||
|
|
||||||
|
Please note that callback invocation can also be prompted by any number
|
||||||
|
of corner-case code paths, for example, when a CPU notes that it has
|
||||||
|
excessive numbers of callbacks queued. In all cases, the CPU acquires
|
||||||
|
its leaf ``rcu_node`` structure's ``->lock`` before invoking callbacks,
|
||||||
|
which preserves the required ordering against the newly completed grace
|
||||||
|
period.
|
||||||
|
|
||||||
|
However, if the callback function communicates to other CPUs, for
|
||||||
|
example, doing a wakeup, then it is that function's responsibility to
|
||||||
|
maintain ordering. For example, if the callback function wakes up a task
|
||||||
|
that runs on some other CPU, proper ordering must be in place in both the
|
||||||
|
callback function and the task being awakened. To see why this is
|
||||||
|
important, consider the top half of the `grace-period
|
||||||
|
cleanup <#Grace-Period%20Cleanup>`__ diagram. The callback might be
|
||||||
|
running on a CPU corresponding to the leftmost leaf ``rcu_node``
|
||||||
|
structure, and awaken a task that is to run on a CPU corresponding to
|
||||||
|
the rightmost leaf ``rcu_node`` structure, and the grace-period kernel
|
||||||
|
thread might not yet have reached the rightmost leaf. In this case, the
|
||||||
|
grace period's memory ordering might not yet have reached that CPU, so
|
||||||
|
again the callback function and the awakened task must supply proper
|
||||||
|
ordering.
|
||||||
|
|
||||||
|
Putting It All Together
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
A stitched-together diagram is here:
|
||||||
|
|
||||||
|
.. kernel-figure:: TreeRCU-gp.svg
|
||||||
|
|
||||||
|
Legal Statement
|
||||||
|
~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
This work represents the view of the author and does not necessarily
|
||||||
|
represent the view of IBM.
|
||||||
|
|
||||||
|
Linux is a registered trademark of Linus Torvalds.
|
||||||
|
|
||||||
|
Other company, product, and service names may be trademarks or service
|
||||||
|
marks of others.
|
@@ -3880,7 +3880,7 @@
|
|||||||
font-style="normal"
|
font-style="normal"
|
||||||
y="-4418.6582"
|
y="-4418.6582"
|
||||||
x="3745.7725"
|
x="3745.7725"
|
||||||
xml:space="preserve">rcu_node_context_switch()</text>
|
xml:space="preserve">rcu_note_context_switch()</text>
|
||||||
</g>
|
</g>
|
||||||
<g
|
<g
|
||||||
transform="translate(1881.1886,54048.57)"
|
transform="translate(1881.1886,54048.57)"
|
||||||
|
Before Width: | Height: | Size: 209 KiB After Width: | Height: | Size: 209 KiB |
@@ -753,7 +753,7 @@
|
|||||||
font-style="normal"
|
font-style="normal"
|
||||||
y="-4418.6582"
|
y="-4418.6582"
|
||||||
x="3745.7725"
|
x="3745.7725"
|
||||||
xml:space="preserve">rcu_node_context_switch()</text>
|
xml:space="preserve">rcu_note_context_switch()</text>
|
||||||
</g>
|
</g>
|
||||||
<g
|
<g
|
||||||
transform="translate(3131.2648,-585.6713)"
|
transform="translate(3131.2648,-585.6713)"
|
||||||
|
Before Width: | Height: | Size: 43 KiB After Width: | Height: | Size: 43 KiB |
File diff suppressed because it is too large
Load Diff
2704
Documentation/RCU/Design/Requirements/Requirements.rst
Normal file
2704
Documentation/RCU/Design/Requirements/Requirements.rst
Normal file
File diff suppressed because it is too large
Load Diff
@@ -5,12 +5,17 @@ RCU concepts
|
|||||||
============
|
============
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 1
|
:maxdepth: 3
|
||||||
|
|
||||||
rcu
|
rcu
|
||||||
listRCU
|
listRCU
|
||||||
UP
|
UP
|
||||||
|
|
||||||
|
Design/Memory-Ordering/Tree-RCU-Memory-Ordering
|
||||||
|
Design/Expedited-Grace-Periods/Expedited-Grace-Periods
|
||||||
|
Design/Requirements/Requirements
|
||||||
|
Design/Data-Structures/Data-Structures
|
||||||
|
|
||||||
.. only:: subproject and html
|
.. only:: subproject and html
|
||||||
|
|
||||||
Indices
|
Indices
|
||||||
|
@@ -96,7 +96,17 @@ other flavors of rcu_dereference(). On the other hand, it is illegal
|
|||||||
to use rcu_dereference_protected() if either the RCU-protected pointer
|
to use rcu_dereference_protected() if either the RCU-protected pointer
|
||||||
or the RCU-protected data that it points to can change concurrently.
|
or the RCU-protected data that it points to can change concurrently.
|
||||||
|
|
||||||
There are currently only "universal" versions of the rcu_assign_pointer()
|
Like rcu_dereference(), when lockdep is enabled, RCU list and hlist
|
||||||
and RCU list-/tree-traversal primitives, which do not (yet) check for
|
traversal primitives check for being called from within an RCU read-side
|
||||||
being in an RCU read-side critical section. In the future, separate
|
critical section. However, a lockdep expression can be passed to them
|
||||||
versions of these primitives might be created.
|
as an additional optional argument. With this lockdep expression, these
|
||||||
|
traversal primitives will complain only if the lockdep expression is
|
||||||
|
false and they are called from outside any RCU read-side critical section.
|
||||||
|
|
||||||
|
For example, the workqueue for_each_pwq() macro is intended to be used
|
||||||
|
either within an RCU read-side critical section or with wq->mutex held.
|
||||||
|
It is thus implemented as follows:
|
||||||
|
|
||||||
|
#define for_each_pwq(pwq, wq)
|
||||||
|
list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node,
|
||||||
|
lock_is_held(&(wq->mutex).dep_map))
|
||||||
|
@@ -290,7 +290,7 @@ rcu_dereference()
|
|||||||
at any time, including immediately after the rcu_dereference().
|
at any time, including immediately after the rcu_dereference().
|
||||||
And, again like rcu_assign_pointer(), rcu_dereference() is
|
And, again like rcu_assign_pointer(), rcu_dereference() is
|
||||||
typically used indirectly, via the _rcu list-manipulation
|
typically used indirectly, via the _rcu list-manipulation
|
||||||
primitives, such as list_for_each_entry_rcu().
|
primitives, such as list_for_each_entry_rcu() [2].
|
||||||
|
|
||||||
[1] The variant rcu_dereference_protected() can be used outside
|
[1] The variant rcu_dereference_protected() can be used outside
|
||||||
of an RCU read-side critical section as long as the usage is
|
of an RCU read-side critical section as long as the usage is
|
||||||
@@ -302,9 +302,17 @@ rcu_dereference()
|
|||||||
must prohibit. The rcu_dereference_protected() variant takes
|
must prohibit. The rcu_dereference_protected() variant takes
|
||||||
a lockdep expression to indicate which locks must be acquired
|
a lockdep expression to indicate which locks must be acquired
|
||||||
by the caller. If the indicated protection is not provided,
|
by the caller. If the indicated protection is not provided,
|
||||||
a lockdep splat is emitted. See RCU/Design/Requirements/Requirements.html
|
a lockdep splat is emitted. See Documentation/RCU/Design/Requirements/Requirements.rst
|
||||||
and the API's code comments for more details and example usage.
|
and the API's code comments for more details and example usage.
|
||||||
|
|
||||||
|
[2] If the list_for_each_entry_rcu() instance might be used by
|
||||||
|
update-side code as well as by RCU readers, then an additional
|
||||||
|
lockdep expression can be added to its list of arguments.
|
||||||
|
For example, given an additional "lock_is_held(&mylock)" argument,
|
||||||
|
the RCU lockdep code would complain only if this instance was
|
||||||
|
invoked outside of an RCU read-side critical section and without
|
||||||
|
the protection of mylock.
|
||||||
|
|
||||||
The following diagram shows how each API communicates among the
|
The following diagram shows how each API communicates among the
|
||||||
reader, updater, and reclaimer.
|
reader, updater, and reclaimer.
|
||||||
|
|
||||||
@@ -630,7 +638,7 @@ been able to write-acquire the lock otherwise. The smp_mb__after_spinlock()
|
|||||||
promotes synchronize_rcu() to a full memory barrier in compliance with
|
promotes synchronize_rcu() to a full memory barrier in compliance with
|
||||||
the "Memory-Barrier Guarantees" listed in:
|
the "Memory-Barrier Guarantees" listed in:
|
||||||
|
|
||||||
Documentation/RCU/Design/Requirements/Requirements.html.
|
Documentation/RCU/Design/Requirements/Requirements.rst
|
||||||
|
|
||||||
It is possible to nest rcu_read_lock(), since reader-writer locks may
|
It is possible to nest rcu_read_lock(), since reader-writer locks may
|
||||||
be recursively acquired. Note also that rcu_read_lock() is immune
|
be recursively acquired. Note also that rcu_read_lock() is immune
|
||||||
|
@@ -56,7 +56,7 @@ setid capabilities from the application completely and refactor the process
|
|||||||
spawning semantics in the application (e.g. by using a privileged helper program
|
spawning semantics in the application (e.g. by using a privileged helper program
|
||||||
to do process spawning and UID/GID transitions). Unfortunately, there are a
|
to do process spawning and UID/GID transitions). Unfortunately, there are a
|
||||||
number of semantics around process spawning that would be affected by this, such
|
number of semantics around process spawning that would be affected by this, such
|
||||||
as fork() calls where the program doesn???t immediately call exec() after the
|
as fork() calls where the program doesn't immediately call exec() after the
|
||||||
fork(), parent processes specifying custom environment variables or command line
|
fork(), parent processes specifying custom environment variables or command line
|
||||||
args for spawned child processes, or inheritance of file handles across a
|
args for spawned child processes, or inheritance of file handles across a
|
||||||
fork()/exec(). Because of this, as solution that uses a privileged helper in
|
fork()/exec(). Because of this, as solution that uses a privileged helper in
|
||||||
@@ -72,7 +72,7 @@ own user namespace, and only approved UIDs/GIDs could be mapped back to the
|
|||||||
initial system user namespace, affectively preventing privilege escalation.
|
initial system user namespace, affectively preventing privilege escalation.
|
||||||
Unfortunately, it is not generally feasible to use user namespaces in isolation,
|
Unfortunately, it is not generally feasible to use user namespaces in isolation,
|
||||||
without pairing them with other namespace types, which is not always an option.
|
without pairing them with other namespace types, which is not always an option.
|
||||||
Linux checks for capabilities based off of the user namespace that ???owns??? some
|
Linux checks for capabilities based off of the user namespace that "owns" some
|
||||||
entity. For example, Linux has the notion that network namespaces are owned by
|
entity. For example, Linux has the notion that network namespaces are owned by
|
||||||
the user namespace in which they were created. A consequence of this is that
|
the user namespace in which they were created. A consequence of this is that
|
||||||
capability checks for access to a given network namespace are done by checking
|
capability checks for access to a given network namespace are done by checking
|
||||||
|
@@ -1120,8 +1120,9 @@ PAGE_SIZE multiple when read back.
|
|||||||
|
|
||||||
Best-effort memory protection. If the memory usage of a
|
Best-effort memory protection. If the memory usage of a
|
||||||
cgroup is within its effective low boundary, the cgroup's
|
cgroup is within its effective low boundary, the cgroup's
|
||||||
memory won't be reclaimed unless memory can be reclaimed
|
memory won't be reclaimed unless there is no reclaimable
|
||||||
from unprotected cgroups. Above the effective low boundary (or
|
memory available in unprotected cgroups.
|
||||||
|
Above the effective low boundary (or
|
||||||
effective min boundary if it is higher), pages are reclaimed
|
effective min boundary if it is higher), pages are reclaimed
|
||||||
proportionally to the overage, reducing reclaim pressure for
|
proportionally to the overage, reducing reclaim pressure for
|
||||||
smaller overages.
|
smaller overages.
|
||||||
@@ -1288,7 +1289,12 @@ PAGE_SIZE multiple when read back.
|
|||||||
inactive_anon, active_anon, inactive_file, active_file, unevictable
|
inactive_anon, active_anon, inactive_file, active_file, unevictable
|
||||||
Amount of memory, swap-backed and filesystem-backed,
|
Amount of memory, swap-backed and filesystem-backed,
|
||||||
on the internal memory management lists used by the
|
on the internal memory management lists used by the
|
||||||
page reclaim algorithm
|
page reclaim algorithm.
|
||||||
|
|
||||||
|
As these represent internal list state (e.g. shmem pages are on anon
|
||||||
|
memory management lists), inactive_foo + active_foo may not be equal to
|
||||||
|
the value for the foo counter, since the foo counter is type-based, not
|
||||||
|
list-based.
|
||||||
|
|
||||||
slab_reclaimable
|
slab_reclaimable
|
||||||
Part of "slab" that might be reclaimed, such as
|
Part of "slab" that might be reclaimed, such as
|
||||||
@@ -1334,7 +1340,7 @@ PAGE_SIZE multiple when read back.
|
|||||||
|
|
||||||
pgdeactivate
|
pgdeactivate
|
||||||
|
|
||||||
Amount of pages moved to the inactive LRU lis
|
Amount of pages moved to the inactive LRU list
|
||||||
|
|
||||||
pglazyfree
|
pglazyfree
|
||||||
|
|
||||||
@@ -1920,7 +1926,7 @@ Cpuset Interface Files
|
|||||||
|
|
||||||
It accepts only the following input values when written to.
|
It accepts only the following input values when written to.
|
||||||
|
|
||||||
"root" - a paritition root
|
"root" - a partition root
|
||||||
"member" - a non-root member of a partition
|
"member" - a non-root member of a partition
|
||||||
|
|
||||||
When set to be a partition root, the current cgroup is the
|
When set to be a partition root, the current cgroup is the
|
||||||
|
@@ -1,11 +1,11 @@
|
|||||||
=============================================================
|
=========================================
|
||||||
Usage of the new open sourced rbu (Remote BIOS Update) driver
|
Dell Remote BIOS Update driver (dell_rbu)
|
||||||
=============================================================
|
=========================================
|
||||||
|
|
||||||
Purpose
|
Purpose
|
||||||
=======
|
=======
|
||||||
|
|
||||||
Document demonstrating the use of the Dell Remote BIOS Update driver.
|
Document demonstrating the use of the Dell Remote BIOS Update driver
|
||||||
for updating BIOS images on Dell servers and desktops.
|
for updating BIOS images on Dell servers and desktops.
|
||||||
|
|
||||||
Scope
|
Scope
|
||||||
@@ -37,7 +37,7 @@ maintains a link list of packets for reading them back.
|
|||||||
|
|
||||||
If the dell_rbu driver is unloaded all the allocated memory is freed.
|
If the dell_rbu driver is unloaded all the allocated memory is freed.
|
||||||
|
|
||||||
The rbu driver needs to have an application (as mentioned above)which will
|
The rbu driver needs to have an application (as mentioned above) which will
|
||||||
inform the BIOS to enable the update in the next system reboot.
|
inform the BIOS to enable the update in the next system reboot.
|
||||||
|
|
||||||
The user should not unload the rbu driver after downloading the BIOS image
|
The user should not unload the rbu driver after downloading the BIOS image
|
||||||
@@ -71,7 +71,7 @@ be downloaded. It is done as below::
|
|||||||
echo XXXX > /sys/devices/platform/dell_rbu/packet_size
|
echo XXXX > /sys/devices/platform/dell_rbu/packet_size
|
||||||
|
|
||||||
In the packet update mechanism, the user needs to create a new file having
|
In the packet update mechanism, the user needs to create a new file having
|
||||||
packets of data arranged back to back. It can be done as follows
|
packets of data arranged back to back. It can be done as follows:
|
||||||
The user creates packets header, gets the chunk of the BIOS image and
|
The user creates packets header, gets the chunk of the BIOS image and
|
||||||
places it next to the packetheader; now, the packetheader + BIOS image chunk
|
places it next to the packetheader; now, the packetheader + BIOS image chunk
|
||||||
added together should match the specified packet_size. This makes one
|
added together should match the specified packet_size. This makes one
|
||||||
@@ -114,7 +114,7 @@ The entries can be recreated by doing the following::
|
|||||||
|
|
||||||
echo init > /sys/devices/platform/dell_rbu/image_type
|
echo init > /sys/devices/platform/dell_rbu/image_type
|
||||||
|
|
||||||
.. note:: echoing init in image_type does not change it original value.
|
.. note:: echoing init in image_type does not change its original value.
|
||||||
|
|
||||||
Also the driver provides /sys/devices/platform/dell_rbu/data readonly file to
|
Also the driver provides /sys/devices/platform/dell_rbu/data readonly file to
|
||||||
read back the image downloaded.
|
read back the image downloaded.
|
@@ -31,218 +31,233 @@ configured "bad blocks" will be treated as bad, or bypassed.
|
|||||||
This allows the pre-writing of test data and metadata prior to
|
This allows the pre-writing of test data and metadata prior to
|
||||||
simulating a "failure" event where bad sectors start to appear.
|
simulating a "failure" event where bad sectors start to appear.
|
||||||
|
|
||||||
Table parameters:
|
Table parameters
|
||||||
-----------------
|
----------------
|
||||||
<device_path> <offset> <blksz>
|
<device_path> <offset> <blksz>
|
||||||
|
|
||||||
Mandatory parameters:
|
Mandatory parameters:
|
||||||
<device_path>: path to the block device.
|
<device_path>:
|
||||||
<offset>: offset to data area from start of device_path
|
Path to the block device.
|
||||||
<blksz>: block size in bytes
|
|
||||||
|
<offset>:
|
||||||
|
Offset to data area from start of device_path
|
||||||
|
|
||||||
|
<blksz>:
|
||||||
|
Block size in bytes
|
||||||
|
|
||||||
(minimum 512, maximum 1073741824, must be a power of 2)
|
(minimum 512, maximum 1073741824, must be a power of 2)
|
||||||
|
|
||||||
Usage instructions:
|
Usage instructions
|
||||||
-------------------
|
------------------
|
||||||
|
|
||||||
First, find the size (in 512-byte sectors) of the device to be used:
|
First, find the size (in 512-byte sectors) of the device to be used::
|
||||||
|
|
||||||
$ sudo blockdev --getsz /dev/vdb1
|
$ sudo blockdev --getsz /dev/vdb1
|
||||||
33552384
|
33552384
|
||||||
|
|
||||||
Create the dm-dust device:
|
Create the dm-dust device:
|
||||||
(For a device with a block size of 512 bytes)
|
(For a device with a block size of 512 bytes)
|
||||||
$ sudo dmsetup create dust1 --table '0 33552384 dust /dev/vdb1 0 512'
|
|
||||||
|
::
|
||||||
|
|
||||||
|
$ sudo dmsetup create dust1 --table '0 33552384 dust /dev/vdb1 0 512'
|
||||||
|
|
||||||
(For a device with a block size of 4096 bytes)
|
(For a device with a block size of 4096 bytes)
|
||||||
$ sudo dmsetup create dust1 --table '0 33552384 dust /dev/vdb1 0 4096'
|
|
||||||
|
::
|
||||||
|
|
||||||
|
$ sudo dmsetup create dust1 --table '0 33552384 dust /dev/vdb1 0 4096'
|
||||||
|
|
||||||
Check the status of the read behavior ("bypass" indicates that all I/O
|
Check the status of the read behavior ("bypass" indicates that all I/O
|
||||||
will be passed through to the underlying device):
|
will be passed through to the underlying device)::
|
||||||
$ sudo dmsetup status dust1
|
|
||||||
0 33552384 dust 252:17 bypass
|
|
||||||
|
|
||||||
$ sudo dd if=/dev/mapper/dust1 of=/dev/null bs=512 count=128 iflag=direct
|
$ sudo dmsetup status dust1
|
||||||
128+0 records in
|
0 33552384 dust 252:17 bypass
|
||||||
128+0 records out
|
|
||||||
|
|
||||||
$ sudo dd if=/dev/zero of=/dev/mapper/dust1 bs=512 count=128 oflag=direct
|
$ sudo dd if=/dev/mapper/dust1 of=/dev/null bs=512 count=128 iflag=direct
|
||||||
128+0 records in
|
128+0 records in
|
||||||
128+0 records out
|
128+0 records out
|
||||||
|
|
||||||
Adding and removing bad blocks:
|
$ sudo dd if=/dev/zero of=/dev/mapper/dust1 bs=512 count=128 oflag=direct
|
||||||
-------------------------------
|
128+0 records in
|
||||||
|
128+0 records out
|
||||||
|
|
||||||
|
Adding and removing bad blocks
|
||||||
|
------------------------------
|
||||||
|
|
||||||
At any time (i.e.: whether the device has the "bad block" emulation
|
At any time (i.e.: whether the device has the "bad block" emulation
|
||||||
enabled or disabled), bad blocks may be added or removed from the
|
enabled or disabled), bad blocks may be added or removed from the
|
||||||
device via the "addbadblock" and "removebadblock" messages:
|
device via the "addbadblock" and "removebadblock" messages::
|
||||||
|
|
||||||
$ sudo dmsetup message dust1 0 addbadblock 60
|
$ sudo dmsetup message dust1 0 addbadblock 60
|
||||||
kernel: device-mapper: dust: badblock added at block 60
|
kernel: device-mapper: dust: badblock added at block 60
|
||||||
|
|
||||||
$ sudo dmsetup message dust1 0 addbadblock 67
|
$ sudo dmsetup message dust1 0 addbadblock 67
|
||||||
kernel: device-mapper: dust: badblock added at block 67
|
kernel: device-mapper: dust: badblock added at block 67
|
||||||
|
|
||||||
$ sudo dmsetup message dust1 0 addbadblock 72
|
$ sudo dmsetup message dust1 0 addbadblock 72
|
||||||
kernel: device-mapper: dust: badblock added at block 72
|
kernel: device-mapper: dust: badblock added at block 72
|
||||||
|
|
||||||
These bad blocks will be stored in the "bad block list".
|
These bad blocks will be stored in the "bad block list".
|
||||||
While the device is in "bypass" mode, reads and writes will succeed:
|
While the device is in "bypass" mode, reads and writes will succeed::
|
||||||
|
|
||||||
$ sudo dmsetup status dust1
|
$ sudo dmsetup status dust1
|
||||||
0 33552384 dust 252:17 bypass
|
0 33552384 dust 252:17 bypass
|
||||||
|
|
||||||
Enabling block read failures:
|
Enabling block read failures
|
||||||
-----------------------------
|
----------------------------
|
||||||
|
|
||||||
To enable the "fail read on bad block" behavior, send the "enable" message:
|
To enable the "fail read on bad block" behavior, send the "enable" message::
|
||||||
|
|
||||||
$ sudo dmsetup message dust1 0 enable
|
$ sudo dmsetup message dust1 0 enable
|
||||||
kernel: device-mapper: dust: enabling read failures on bad sectors
|
kernel: device-mapper: dust: enabling read failures on bad sectors
|
||||||
|
|
||||||
$ sudo dmsetup status dust1
|
$ sudo dmsetup status dust1
|
||||||
0 33552384 dust 252:17 fail_read_on_bad_block
|
0 33552384 dust 252:17 fail_read_on_bad_block
|
||||||
|
|
||||||
With the device in "fail read on bad block" mode, attempting to read a
|
With the device in "fail read on bad block" mode, attempting to read a
|
||||||
block will encounter an "Input/output error":
|
block will encounter an "Input/output error"::
|
||||||
|
|
||||||
$ sudo dd if=/dev/mapper/dust1 of=/dev/null bs=512 count=1 skip=67 iflag=direct
|
$ sudo dd if=/dev/mapper/dust1 of=/dev/null bs=512 count=1 skip=67 iflag=direct
|
||||||
dd: error reading '/dev/mapper/dust1': Input/output error
|
dd: error reading '/dev/mapper/dust1': Input/output error
|
||||||
0+0 records in
|
0+0 records in
|
||||||
0+0 records out
|
0+0 records out
|
||||||
0 bytes copied, 0.00040651 s, 0.0 kB/s
|
0 bytes copied, 0.00040651 s, 0.0 kB/s
|
||||||
|
|
||||||
...and writing to the bad blocks will remove the blocks from the list,
|
...and writing to the bad blocks will remove the blocks from the list,
|
||||||
therefore emulating the "remap" behavior of hard disk drives:
|
therefore emulating the "remap" behavior of hard disk drives::
|
||||||
|
|
||||||
$ sudo dd if=/dev/zero of=/dev/mapper/dust1 bs=512 count=128 oflag=direct
|
$ sudo dd if=/dev/zero of=/dev/mapper/dust1 bs=512 count=128 oflag=direct
|
||||||
128+0 records in
|
128+0 records in
|
||||||
128+0 records out
|
128+0 records out
|
||||||
|
|
||||||
kernel: device-mapper: dust: block 60 removed from badblocklist by write
|
kernel: device-mapper: dust: block 60 removed from badblocklist by write
|
||||||
kernel: device-mapper: dust: block 67 removed from badblocklist by write
|
kernel: device-mapper: dust: block 67 removed from badblocklist by write
|
||||||
kernel: device-mapper: dust: block 72 removed from badblocklist by write
|
kernel: device-mapper: dust: block 72 removed from badblocklist by write
|
||||||
kernel: device-mapper: dust: block 87 removed from badblocklist by write
|
kernel: device-mapper: dust: block 87 removed from badblocklist by write
|
||||||
|
|
||||||
Bad block add/remove error handling:
|
Bad block add/remove error handling
|
||||||
------------------------------------
|
-----------------------------------
|
||||||
|
|
||||||
Attempting to add a bad block that already exists in the list will
|
Attempting to add a bad block that already exists in the list will
|
||||||
result in an "Invalid argument" error, as well as a helpful message:
|
result in an "Invalid argument" error, as well as a helpful message::
|
||||||
|
|
||||||
$ sudo dmsetup message dust1 0 addbadblock 88
|
$ sudo dmsetup message dust1 0 addbadblock 88
|
||||||
device-mapper: message ioctl on dust1 failed: Invalid argument
|
device-mapper: message ioctl on dust1 failed: Invalid argument
|
||||||
kernel: device-mapper: dust: block 88 already in badblocklist
|
kernel: device-mapper: dust: block 88 already in badblocklist
|
||||||
|
|
||||||
Attempting to remove a bad block that doesn't exist in the list will
|
Attempting to remove a bad block that doesn't exist in the list will
|
||||||
result in an "Invalid argument" error, as well as a helpful message:
|
result in an "Invalid argument" error, as well as a helpful message::
|
||||||
|
|
||||||
$ sudo dmsetup message dust1 0 removebadblock 87
|
$ sudo dmsetup message dust1 0 removebadblock 87
|
||||||
device-mapper: message ioctl on dust1 failed: Invalid argument
|
device-mapper: message ioctl on dust1 failed: Invalid argument
|
||||||
kernel: device-mapper: dust: block 87 not found in badblocklist
|
kernel: device-mapper: dust: block 87 not found in badblocklist
|
||||||
|
|
||||||
Counting the number of bad blocks in the bad block list:
|
Counting the number of bad blocks in the bad block list
|
||||||
--------------------------------------------------------
|
-------------------------------------------------------
|
||||||
|
|
||||||
To count the number of bad blocks configured in the device, run the
|
To count the number of bad blocks configured in the device, run the
|
||||||
following message command:
|
following message command::
|
||||||
|
|
||||||
$ sudo dmsetup message dust1 0 countbadblocks
|
$ sudo dmsetup message dust1 0 countbadblocks
|
||||||
|
|
||||||
A message will print with the number of bad blocks currently
|
A message will print with the number of bad blocks currently
|
||||||
configured on the device:
|
configured on the device::
|
||||||
|
|
||||||
kernel: device-mapper: dust: countbadblocks: 895 badblock(s) found
|
kernel: device-mapper: dust: countbadblocks: 895 badblock(s) found
|
||||||
|
|
||||||
Querying for specific bad blocks:
|
Querying for specific bad blocks
|
||||||
---------------------------------
|
--------------------------------
|
||||||
|
|
||||||
To find out if a specific block is in the bad block list, run the
|
To find out if a specific block is in the bad block list, run the
|
||||||
following message command:
|
following message command::
|
||||||
|
|
||||||
$ sudo dmsetup message dust1 0 queryblock 72
|
$ sudo dmsetup message dust1 0 queryblock 72
|
||||||
|
|
||||||
The following message will print if the block is in the list:
|
The following message will print if the block is in the list::
|
||||||
device-mapper: dust: queryblock: block 72 found in badblocklist
|
|
||||||
|
|
||||||
The following message will print if the block is in the list:
|
device-mapper: dust: queryblock: block 72 found in badblocklist
|
||||||
device-mapper: dust: queryblock: block 72 not found in badblocklist
|
|
||||||
|
The following message will print if the block is not in the list::
|
||||||
|
|
||||||
|
device-mapper: dust: queryblock: block 72 not found in badblocklist
|
||||||
|
|
||||||
The "queryblock" message command will work in both the "enabled"
|
The "queryblock" message command will work in both the "enabled"
|
||||||
and "disabled" modes, allowing the verification of whether a block
|
and "disabled" modes, allowing the verification of whether a block
|
||||||
will be treated as "bad" without having to issue I/O to the device,
|
will be treated as "bad" without having to issue I/O to the device,
|
||||||
or having to "enable" the bad block emulation.
|
or having to "enable" the bad block emulation.
|
||||||
|
|
||||||
Clearing the bad block list:
|
Clearing the bad block list
|
||||||
----------------------------
|
---------------------------
|
||||||
|
|
||||||
To clear the bad block list (without needing to individually run
|
To clear the bad block list (without needing to individually run
|
||||||
a "removebadblock" message command for every block), run the
|
a "removebadblock" message command for every block), run the
|
||||||
following message command:
|
following message command::
|
||||||
|
|
||||||
$ sudo dmsetup message dust1 0 clearbadblocks
|
$ sudo dmsetup message dust1 0 clearbadblocks
|
||||||
|
|
||||||
After clearing the bad block list, the following message will appear:
|
After clearing the bad block list, the following message will appear::
|
||||||
|
|
||||||
kernel: device-mapper: dust: clearbadblocks: badblocks cleared
|
kernel: device-mapper: dust: clearbadblocks: badblocks cleared
|
||||||
|
|
||||||
If there were no bad blocks to clear, the following message will
|
If there were no bad blocks to clear, the following message will
|
||||||
appear:
|
appear::
|
||||||
|
|
||||||
kernel: device-mapper: dust: clearbadblocks: no badblocks found
|
kernel: device-mapper: dust: clearbadblocks: no badblocks found
|
||||||
|
|
||||||
Message commands list:
|
Message commands list
|
||||||
----------------------
|
---------------------
|
||||||
|
|
||||||
Below is a list of the messages that can be sent to a dust device:
|
Below is a list of the messages that can be sent to a dust device:
|
||||||
|
|
||||||
Operations on blocks (requires a <blknum> argument):
|
Operations on blocks (requires a <blknum> argument)::
|
||||||
|
|
||||||
addbadblock <blknum>
|
addbadblock <blknum>
|
||||||
queryblock <blknum>
|
queryblock <blknum>
|
||||||
removebadblock <blknum>
|
removebadblock <blknum>
|
||||||
|
|
||||||
...where <blknum> is a block number within range of the device
|
...where <blknum> is a block number within range of the device
|
||||||
(corresponding to the block size of the device.)
|
(corresponding to the block size of the device.)
|
||||||
|
|
||||||
Single argument message commands:
|
Single argument message commands::
|
||||||
|
|
||||||
countbadblocks
|
countbadblocks
|
||||||
clearbadblocks
|
clearbadblocks
|
||||||
disable
|
disable
|
||||||
enable
|
enable
|
||||||
quiet
|
quiet
|
||||||
|
|
||||||
Device removal:
|
Device removal
|
||||||
---------------
|
--------------
|
||||||
|
|
||||||
When finished, remove the device via the "dmsetup remove" command:
|
When finished, remove the device via the "dmsetup remove" command::
|
||||||
|
|
||||||
$ sudo dmsetup remove dust1
|
$ sudo dmsetup remove dust1
|
||||||
|
|
||||||
Quiet mode:
|
Quiet mode
|
||||||
-----------
|
----------
|
||||||
|
|
||||||
On test runs with many bad blocks, it may be desirable to avoid
|
On test runs with many bad blocks, it may be desirable to avoid
|
||||||
excessive logging (from bad blocks added, removed, or "remapped").
|
excessive logging (from bad blocks added, removed, or "remapped").
|
||||||
This can be done by enabling "quiet mode" via the following message:
|
This can be done by enabling "quiet mode" via the following message::
|
||||||
|
|
||||||
$ sudo dmsetup message dust1 0 quiet
|
$ sudo dmsetup message dust1 0 quiet
|
||||||
|
|
||||||
This will suppress log messages from add / remove / removed by write
|
This will suppress log messages from add / remove / removed by write
|
||||||
operations. Log messages from "countbadblocks" or "queryblock"
|
operations. Log messages from "countbadblocks" or "queryblock"
|
||||||
message commands will still print in quiet mode.
|
message commands will still print in quiet mode.
|
||||||
|
|
||||||
The status of quiet mode can be seen by running "dmsetup status":
|
The status of quiet mode can be seen by running "dmsetup status"::
|
||||||
|
|
||||||
$ sudo dmsetup status dust1
|
$ sudo dmsetup status dust1
|
||||||
0 33552384 dust 252:17 fail_read_on_bad_block quiet
|
0 33552384 dust 252:17 fail_read_on_bad_block quiet
|
||||||
|
|
||||||
To disable quiet mode, send the "quiet" message again:
|
To disable quiet mode, send the "quiet" message again::
|
||||||
|
|
||||||
$ sudo dmsetup message dust1 0 quiet
|
$ sudo dmsetup message dust1 0 quiet
|
||||||
|
|
||||||
$ sudo dmsetup status dust1
|
$ sudo dmsetup status dust1
|
||||||
0 33552384 dust 252:17 fail_read_on_bad_block verbose
|
0 33552384 dust 252:17 fail_read_on_bad_block verbose
|
||||||
|
|
||||||
(The presence of "verbose" indicates normal logging.)
|
(The presence of "verbose" indicates normal logging.)
|
||||||
|
|
@@ -177,6 +177,11 @@ bitmap_flush_interval:number
|
|||||||
The bitmap flush interval in milliseconds. The metadata buffers
|
The bitmap flush interval in milliseconds. The metadata buffers
|
||||||
are synchronized when this interval expires.
|
are synchronized when this interval expires.
|
||||||
|
|
||||||
|
fix_padding
|
||||||
|
Use a smaller padding of the tag area that is more
|
||||||
|
space-efficient. If this option is not present, large padding is
|
||||||
|
used - that is for compatibility with older kernels.
|
||||||
|
|
||||||
|
|
||||||
The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can
|
The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can
|
||||||
be changed when reloading the target (load an inactive table and swap the
|
be changed when reloading the target (load an inactive table and swap the
|
||||||
|
@@ -417,3 +417,5 @@ Version History
|
|||||||
deadlock/potential data corruption. Update superblock when
|
deadlock/potential data corruption. Update superblock when
|
||||||
specific devices are requested via rebuild. Fix RAID leg
|
specific devices are requested via rebuild. Fix RAID leg
|
||||||
rebuild errors.
|
rebuild errors.
|
||||||
|
1.15.0 Fix size extensions not being synchronized in case of new MD bitmap
|
||||||
|
pages allocated; also fix those not occurring after previous reductions
|
||||||
|
@@ -9,6 +9,7 @@ Device Mapper
|
|||||||
cache
|
cache
|
||||||
delay
|
delay
|
||||||
dm-crypt
|
dm-crypt
|
||||||
|
dm-dust
|
||||||
dm-flakey
|
dm-flakey
|
||||||
dm-init
|
dm-init
|
||||||
dm-integrity
|
dm-integrity
|
||||||
|
@@ -265,8 +265,11 @@ time with the option "mds=". The valid arguments for this option are:
|
|||||||
|
|
||||||
============ =============================================================
|
============ =============================================================
|
||||||
|
|
||||||
Not specifying this option is equivalent to "mds=full".
|
Not specifying this option is equivalent to "mds=full". For processors
|
||||||
|
that are affected by both TAA (TSX Asynchronous Abort) and MDS,
|
||||||
|
specifying just "mds=off" without an accompanying "tsx_async_abort=off"
|
||||||
|
will have no effect as the same mitigation is used for both
|
||||||
|
vulnerabilities.
|
||||||
|
|
||||||
Mitigation selection guide
|
Mitigation selection guide
|
||||||
--------------------------
|
--------------------------
|
||||||
|
@@ -174,7 +174,10 @@ the option "tsx_async_abort=". The valid arguments for this option are:
|
|||||||
CPU is not vulnerable to cross-thread TAA attacks.
|
CPU is not vulnerable to cross-thread TAA attacks.
|
||||||
============ =============================================================
|
============ =============================================================
|
||||||
|
|
||||||
Not specifying this option is equivalent to "tsx_async_abort=full".
|
Not specifying this option is equivalent to "tsx_async_abort=full". For
|
||||||
|
processors that are affected by both TAA and MDS, specifying just
|
||||||
|
"tsx_async_abort=off" without an accompanying "mds=off" will have no
|
||||||
|
effect as the same mitigation is used for both vulnerabilities.
|
||||||
|
|
||||||
The kernel command line also allows control of the TSX feature using the
|
The kernel command line also allows control of the TSX feature using the
|
||||||
parameter "tsx=" on CPUs which support TSX control. MSR_IA32_TSX_CTRL is used
|
parameter "tsx=" on CPUs which support TSX control. MSR_IA32_TSX_CTRL is used
|
||||||
|
@@ -57,60 +57,61 @@ configure specific aspects of kernel behavior to your liking.
|
|||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 1
|
:maxdepth: 1
|
||||||
|
|
||||||
initrd
|
|
||||||
cgroup-v2
|
|
||||||
cgroup-v1/index
|
|
||||||
serial-console
|
|
||||||
braille-console
|
|
||||||
parport
|
|
||||||
md
|
|
||||||
module-signing
|
|
||||||
rapidio
|
|
||||||
sysrq
|
|
||||||
unicode
|
|
||||||
vga-softcursor
|
|
||||||
binfmt-misc
|
|
||||||
mono
|
|
||||||
java
|
|
||||||
ras
|
|
||||||
bcache
|
|
||||||
blockdev/index
|
|
||||||
ext4
|
|
||||||
binderfs
|
|
||||||
cifs/index
|
|
||||||
xfs
|
|
||||||
jfs
|
|
||||||
ufs
|
|
||||||
pm/index
|
|
||||||
thunderbolt
|
|
||||||
LSM/index
|
|
||||||
mm/index
|
|
||||||
namespaces/index
|
|
||||||
perf-security
|
|
||||||
acpi/index
|
acpi/index
|
||||||
aoe/index
|
aoe/index
|
||||||
|
auxdisplay/index
|
||||||
|
bcache
|
||||||
|
binderfs
|
||||||
|
binfmt-misc
|
||||||
|
blockdev/index
|
||||||
|
braille-console
|
||||||
btmrvl
|
btmrvl
|
||||||
|
cgroup-v1/index
|
||||||
|
cgroup-v2
|
||||||
|
cifs/index
|
||||||
clearing-warn-once
|
clearing-warn-once
|
||||||
cpu-load
|
cpu-load
|
||||||
cputopology
|
cputopology
|
||||||
|
dell_rbu
|
||||||
device-mapper/index
|
device-mapper/index
|
||||||
efi-stub
|
efi-stub
|
||||||
|
ext4
|
||||||
gpio/index
|
gpio/index
|
||||||
highuid
|
highuid
|
||||||
hw_random
|
hw_random
|
||||||
|
initrd
|
||||||
iostats
|
iostats
|
||||||
|
java
|
||||||
|
jfs
|
||||||
kernel-per-CPU-kthreads
|
kernel-per-CPU-kthreads
|
||||||
laptops/index
|
laptops/index
|
||||||
auxdisplay/index
|
|
||||||
lcd-panel-cgram
|
lcd-panel-cgram
|
||||||
ldm
|
ldm
|
||||||
lockup-watchdogs
|
lockup-watchdogs
|
||||||
|
LSM/index
|
||||||
|
md
|
||||||
|
mm/index
|
||||||
|
module-signing
|
||||||
|
mono
|
||||||
|
namespaces/index
|
||||||
numastat
|
numastat
|
||||||
|
parport
|
||||||
|
perf-security
|
||||||
|
pm/index
|
||||||
pnp
|
pnp
|
||||||
|
rapidio
|
||||||
|
ras
|
||||||
rtc
|
rtc
|
||||||
|
serial-console
|
||||||
svga
|
svga
|
||||||
wimax/index
|
sysrq
|
||||||
|
thunderbolt
|
||||||
|
ufs
|
||||||
|
unicode
|
||||||
|
vga-softcursor
|
||||||
video-output
|
video-output
|
||||||
|
wimax/index
|
||||||
|
xfs
|
||||||
|
|
||||||
.. only:: subproject and html
|
.. only:: subproject and html
|
||||||
|
|
||||||
|
@@ -46,81 +46,91 @@ each snapshot of your disk statistics.
|
|||||||
In 2.4, the statistics fields are those after the device name. In
|
In 2.4, the statistics fields are those after the device name. In
|
||||||
the above example, the first field of statistics would be 446216.
|
the above example, the first field of statistics would be 446216.
|
||||||
By contrast, in 2.6+ if you look at ``/sys/block/hda/stat``, you'll
|
By contrast, in 2.6+ if you look at ``/sys/block/hda/stat``, you'll
|
||||||
find just the eleven fields, beginning with 446216. If you look at
|
find just the 15 fields, beginning with 446216. If you look at
|
||||||
``/proc/diskstats``, the eleven fields will be preceded by the major and
|
``/proc/diskstats``, the 15 fields will be preceded by the major and
|
||||||
minor device numbers, and device name. Each of these formats provides
|
minor device numbers, and device name. Each of these formats provides
|
||||||
eleven fields of statistics, each meaning exactly the same things.
|
15 fields of statistics, each meaning exactly the same things.
|
||||||
All fields except field 9 are cumulative since boot. Field 9 should
|
All fields except field 9 are cumulative since boot. Field 9 should
|
||||||
go to zero as I/Os complete; all others only increase (unless they
|
go to zero as I/Os complete; all others only increase (unless they
|
||||||
overflow and wrap). Yes, these are (32-bit or 64-bit) unsigned long
|
overflow and wrap). Wrapping might eventually occur on a very busy
|
||||||
(native word size) numbers, and on a very busy or long-lived system they
|
or long-lived system; so applications should be prepared to deal with
|
||||||
may wrap. Applications should be prepared to deal with that; unless
|
it. Regarding wrapping, the types of the fields are either unsigned
|
||||||
your observations are measured in large numbers of minutes or hours,
|
int (32 bit) or unsigned long (32-bit or 64-bit, depending on your
|
||||||
they should not wrap twice before you notice them.
|
machine) as noted per-field below. Unless your observations are very
|
||||||
|
spread in time, these fields should not wrap twice before you notice it.
|
||||||
|
|
||||||
Each set of stats only applies to the indicated device; if you want
|
Each set of stats only applies to the indicated device; if you want
|
||||||
system-wide stats you'll have to find all the devices and sum them all up.
|
system-wide stats you'll have to find all the devices and sum them all up.
|
||||||
|
|
||||||
Field 1 -- # of reads completed
|
Field 1 -- # of reads completed (unsigned long)
|
||||||
This is the total number of reads completed successfully.
|
This is the total number of reads completed successfully.
|
||||||
|
|
||||||
Field 2 -- # of reads merged, field 6 -- # of writes merged
|
Field 2 -- # of reads merged, field 6 -- # of writes merged (unsigned long)
|
||||||
Reads and writes which are adjacent to each other may be merged for
|
Reads and writes which are adjacent to each other may be merged for
|
||||||
efficiency. Thus two 4K reads may become one 8K read before it is
|
efficiency. Thus two 4K reads may become one 8K read before it is
|
||||||
ultimately handed to the disk, and so it will be counted (and queued)
|
ultimately handed to the disk, and so it will be counted (and queued)
|
||||||
as only one I/O. This field lets you know how often this was done.
|
as only one I/O. This field lets you know how often this was done.
|
||||||
|
|
||||||
Field 3 -- # of sectors read
|
Field 3 -- # of sectors read (unsigned long)
|
||||||
This is the total number of sectors read successfully.
|
This is the total number of sectors read successfully.
|
||||||
|
|
||||||
Field 4 -- # of milliseconds spent reading
|
Field 4 -- # of milliseconds spent reading (unsigned int)
|
||||||
This is the total number of milliseconds spent by all reads (as
|
This is the total number of milliseconds spent by all reads (as
|
||||||
measured from __make_request() to end_that_request_last()).
|
measured from __make_request() to end_that_request_last()).
|
||||||
|
|
||||||
Field 5 -- # of writes completed
|
Field 5 -- # of writes completed (unsigned long)
|
||||||
This is the total number of writes completed successfully.
|
This is the total number of writes completed successfully.
|
||||||
|
|
||||||
Field 6 -- # of writes merged
|
Field 6 -- # of writes merged (unsigned long)
|
||||||
See the description of field 2.
|
See the description of field 2.
|
||||||
|
|
||||||
Field 7 -- # of sectors written
|
Field 7 -- # of sectors written (unsigned long)
|
||||||
This is the total number of sectors written successfully.
|
This is the total number of sectors written successfully.
|
||||||
|
|
||||||
Field 8 -- # of milliseconds spent writing
|
Field 8 -- # of milliseconds spent writing (unsigned int)
|
||||||
This is the total number of milliseconds spent by all writes (as
|
This is the total number of milliseconds spent by all writes (as
|
||||||
measured from __make_request() to end_that_request_last()).
|
measured from __make_request() to end_that_request_last()).
|
||||||
|
|
||||||
Field 9 -- # of I/Os currently in progress
|
Field 9 -- # of I/Os currently in progress (unsigned int)
|
||||||
The only field that should go to zero. Incremented as requests are
|
The only field that should go to zero. Incremented as requests are
|
||||||
given to appropriate struct request_queue and decremented as they finish.
|
given to appropriate struct request_queue and decremented as they finish.
|
||||||
|
|
||||||
Field 10 -- # of milliseconds spent doing I/Os
|
Field 10 -- # of milliseconds spent doing I/Os (unsigned int)
|
||||||
This field increases so long as field 9 is nonzero.
|
This field increases so long as field 9 is nonzero.
|
||||||
|
|
||||||
Since 5.0 this field counts jiffies when at least one request was
|
Since 5.0 this field counts jiffies when at least one request was
|
||||||
started or completed. If a request runs more than 2 jiffies then some
|
started or completed. If a request runs more than 2 jiffies then some
|
||||||
I/O time will not be accounted unless there are other requests.
|
I/O time will not be accounted unless there are other requests.
|
||||||
|
|
||||||
Field 11 -- weighted # of milliseconds spent doing I/Os
|
Field 11 -- weighted # of milliseconds spent doing I/Os (unsigned int)
|
||||||
This field is incremented at each I/O start, I/O completion, I/O
|
This field is incremented at each I/O start, I/O completion, I/O
|
||||||
merge, or read of these stats by the number of I/Os in progress
|
merge, or read of these stats by the number of I/Os in progress
|
||||||
(field 9) times the number of milliseconds spent doing I/O since the
|
(field 9) times the number of milliseconds spent doing I/O since the
|
||||||
last update of this field. This can provide an easy measure of both
|
last update of this field. This can provide an easy measure of both
|
||||||
I/O completion time and the backlog that may be accumulating.
|
I/O completion time and the backlog that may be accumulating.
|
||||||
|
|
||||||
Field 12 -- # of discards completed
|
Field 12 -- # of discards completed (unsigned long)
|
||||||
This is the total number of discards completed successfully.
|
This is the total number of discards completed successfully.
|
||||||
|
|
||||||
Field 13 -- # of discards merged
|
Field 13 -- # of discards merged (unsigned long)
|
||||||
See the description of field 2.
|
See the description of field 2.
|
||||||
|
|
||||||
Field 14 -- # of sectors discarded
|
Field 14 -- # of sectors discarded (unsigned long)
|
||||||
This is the total number of sectors discarded successfully.
|
This is the total number of sectors discarded successfully.
|
||||||
|
|
||||||
Field 15 -- # of milliseconds spent discarding
|
Field 15 -- # of milliseconds spent discarding (unsigned int)
|
||||||
This is the total number of milliseconds spent by all discards (as
|
This is the total number of milliseconds spent by all discards (as
|
||||||
measured from __make_request() to end_that_request_last()).
|
measured from __make_request() to end_that_request_last()).
|
||||||
|
|
||||||
|
Field 16 -- # of flush requests completed
|
||||||
|
This is the total number of flush requests completed successfully.
|
||||||
|
|
||||||
|
Block layer combines flush requests and executes at most one at a time.
|
||||||
|
This counts flush requests executed by disk. Not tracked for partitions.
|
||||||
|
|
||||||
|
Field 17 -- # of milliseconds spent flushing
|
||||||
|
This is the total number of milliseconds spent by all flush requests.
|
||||||
|
|
||||||
To avoid introducing performance bottlenecks, no locks are held while
|
To avoid introducing performance bottlenecks, no locks are held while
|
||||||
modifying these counters. This implies that minor inaccuracies may be
|
modifying these counters. This implies that minor inaccuracies may be
|
||||||
introduced when changes collide, so (for instance) adding up all the
|
introduced when changes collide, so (for instance) adding up all the
|
||||||
|
@@ -113,7 +113,7 @@
|
|||||||
the GPE dispatcher.
|
the GPE dispatcher.
|
||||||
This facility can be used to prevent such uncontrolled
|
This facility can be used to prevent such uncontrolled
|
||||||
GPE floodings.
|
GPE floodings.
|
||||||
Format: <int>
|
Format: <byte>
|
||||||
|
|
||||||
acpi_no_auto_serialize [HW,ACPI]
|
acpi_no_auto_serialize [HW,ACPI]
|
||||||
Disable auto-serialization of AML methods
|
Disable auto-serialization of AML methods
|
||||||
@@ -437,8 +437,6 @@
|
|||||||
no delay (0).
|
no delay (0).
|
||||||
Format: integer
|
Format: integer
|
||||||
|
|
||||||
bootmem_debug [KNL] Enable bootmem allocator debug messages.
|
|
||||||
|
|
||||||
bert_disable [ACPI]
|
bert_disable [ACPI]
|
||||||
Disable BERT OS support on buggy BIOSes.
|
Disable BERT OS support on buggy BIOSes.
|
||||||
|
|
||||||
@@ -983,12 +981,10 @@
|
|||||||
|
|
||||||
earlycon= [KNL] Output early console device and options.
|
earlycon= [KNL] Output early console device and options.
|
||||||
|
|
||||||
[ARM64] The early console is determined by the
|
When used with no options, the early console is
|
||||||
stdout-path property in device tree's chosen node,
|
determined by stdout-path property in device tree's
|
||||||
or determined by the ACPI SPCR table.
|
chosen node or the ACPI SPCR table if supported by
|
||||||
|
the platform.
|
||||||
[X86] When used with no options the early console is
|
|
||||||
determined by the ACPI SPCR table.
|
|
||||||
|
|
||||||
cdns,<addr>[,options]
|
cdns,<addr>[,options]
|
||||||
Start an early, polled-mode console on a Cadence
|
Start an early, polled-mode console on a Cadence
|
||||||
@@ -1101,7 +1097,7 @@
|
|||||||
mapped with the correct attributes.
|
mapped with the correct attributes.
|
||||||
|
|
||||||
linflex,<addr>
|
linflex,<addr>
|
||||||
Use early console provided by Freescale LinFlex UART
|
Use early console provided by Freescale LINFlexD UART
|
||||||
serial driver for NXP S32V234 SoCs. A valid base
|
serial driver for NXP S32V234 SoCs. A valid base
|
||||||
address must be provided, and the serial port must
|
address must be provided, and the serial port must
|
||||||
already be set up and configured.
|
already be set up and configured.
|
||||||
@@ -1168,7 +1164,8 @@
|
|||||||
Format: {"off" | "on" | "skip[mbr]"}
|
Format: {"off" | "on" | "skip[mbr]"}
|
||||||
|
|
||||||
efi= [EFI]
|
efi= [EFI]
|
||||||
Format: { "old_map", "nochunk", "noruntime", "debug" }
|
Format: { "old_map", "nochunk", "noruntime", "debug",
|
||||||
|
"nosoftreserve" }
|
||||||
old_map [X86-64]: switch to the old ioremap-based EFI
|
old_map [X86-64]: switch to the old ioremap-based EFI
|
||||||
runtime services mapping. 32-bit still uses this one by
|
runtime services mapping. 32-bit still uses this one by
|
||||||
default.
|
default.
|
||||||
@@ -1177,6 +1174,12 @@
|
|||||||
firmware implementations.
|
firmware implementations.
|
||||||
noruntime : disable EFI runtime services support
|
noruntime : disable EFI runtime services support
|
||||||
debug: enable misc debug output
|
debug: enable misc debug output
|
||||||
|
nosoftreserve: The EFI_MEMORY_SP (Specific Purpose)
|
||||||
|
attribute may cause the kernel to reserve the
|
||||||
|
memory range for a memory mapping driver to
|
||||||
|
claim. Specify efi=nosoftreserve to disable this
|
||||||
|
reservation and treat the memory by its base type
|
||||||
|
(i.e. EFI_CONVENTIONAL_MEMORY / "System RAM").
|
||||||
|
|
||||||
efi_no_storage_paranoia [EFI; X86]
|
efi_no_storage_paranoia [EFI; X86]
|
||||||
Using this parameter you can use more than 50% of
|
Using this parameter you can use more than 50% of
|
||||||
@@ -1189,15 +1192,21 @@
|
|||||||
updating original EFI memory map.
|
updating original EFI memory map.
|
||||||
Region of memory which aa attribute is added to is
|
Region of memory which aa attribute is added to is
|
||||||
from ss to ss+nn.
|
from ss to ss+nn.
|
||||||
|
|
||||||
If efi_fake_mem=2G@4G:0x10000,2G@0x10a0000000:0x10000
|
If efi_fake_mem=2G@4G:0x10000,2G@0x10a0000000:0x10000
|
||||||
is specified, EFI_MEMORY_MORE_RELIABLE(0x10000)
|
is specified, EFI_MEMORY_MORE_RELIABLE(0x10000)
|
||||||
attribute is added to range 0x100000000-0x180000000 and
|
attribute is added to range 0x100000000-0x180000000 and
|
||||||
0x10a0000000-0x1120000000.
|
0x10a0000000-0x1120000000.
|
||||||
|
|
||||||
|
If efi_fake_mem=8G@9G:0x40000 is specified, the
|
||||||
|
EFI_MEMORY_SP(0x40000) attribute is added to
|
||||||
|
range 0x240000000-0x43fffffff.
|
||||||
|
|
||||||
Using this parameter you can do debugging of EFI memmap
|
Using this parameter you can do debugging of EFI memmap
|
||||||
related feature. For example, you can do debugging of
|
related features. For example, you can do debugging of
|
||||||
Address Range Mirroring feature even if your box
|
Address Range Mirroring feature even if your box
|
||||||
doesn't support it.
|
doesn't support it, or mark specific memory as
|
||||||
|
"soft reserved".
|
||||||
|
|
||||||
efivar_ssdt= [EFI; X86] Name of an EFI variable that contains an SSDT
|
efivar_ssdt= [EFI; X86] Name of an EFI variable that contains an SSDT
|
||||||
that is to be dynamically loaded by Linux. If there are
|
that is to be dynamically loaded by Linux. If there are
|
||||||
@@ -2473,6 +2482,12 @@
|
|||||||
SMT on vulnerable CPUs
|
SMT on vulnerable CPUs
|
||||||
off - Unconditionally disable MDS mitigation
|
off - Unconditionally disable MDS mitigation
|
||||||
|
|
||||||
|
On TAA-affected machines, mds=off can be prevented by
|
||||||
|
an active TAA mitigation as both vulnerabilities are
|
||||||
|
mitigated with the same mechanism so in order to disable
|
||||||
|
this mitigation, you need to specify tsx_async_abort=off
|
||||||
|
too.
|
||||||
|
|
||||||
Not specifying this option is equivalent to
|
Not specifying this option is equivalent to
|
||||||
mds=full.
|
mds=full.
|
||||||
|
|
||||||
@@ -3110,9 +3125,9 @@
|
|||||||
[X86,PV_OPS] Disable paravirtualized VMware scheduler
|
[X86,PV_OPS] Disable paravirtualized VMware scheduler
|
||||||
clock and use the default one.
|
clock and use the default one.
|
||||||
|
|
||||||
no-steal-acc [X86,KVM] Disable paravirtualized steal time accounting.
|
no-steal-acc [X86,KVM,ARM64] Disable paravirtualized steal time
|
||||||
steal time is computed, but won't influence scheduler
|
accounting. Steal time is computed, but won't
|
||||||
behaviour
|
influence scheduler behaviour
|
||||||
|
|
||||||
nolapic [X86-32,APIC] Do not enable or use the local APIC.
|
nolapic [X86-32,APIC] Do not enable or use the local APIC.
|
||||||
|
|
||||||
@@ -3525,8 +3540,15 @@
|
|||||||
hpiosize=nn[KMG] The fixed amount of bus space which is
|
hpiosize=nn[KMG] The fixed amount of bus space which is
|
||||||
reserved for hotplug bridge's IO window.
|
reserved for hotplug bridge's IO window.
|
||||||
Default size is 256 bytes.
|
Default size is 256 bytes.
|
||||||
|
hpmmiosize=nn[KMG] The fixed amount of bus space which is
|
||||||
|
reserved for hotplug bridge's MMIO window.
|
||||||
|
Default size is 2 megabytes.
|
||||||
|
hpmmioprefsize=nn[KMG] The fixed amount of bus space which is
|
||||||
|
reserved for hotplug bridge's MMIO_PREF window.
|
||||||
|
Default size is 2 megabytes.
|
||||||
hpmemsize=nn[KMG] The fixed amount of bus space which is
|
hpmemsize=nn[KMG] The fixed amount of bus space which is
|
||||||
reserved for hotplug bridge's memory window.
|
reserved for hotplug bridge's MMIO and
|
||||||
|
MMIO_PREF window.
|
||||||
Default size is 2 megabytes.
|
Default size is 2 megabytes.
|
||||||
hpbussize=nn The minimum amount of additional bus numbers
|
hpbussize=nn The minimum amount of additional bus numbers
|
||||||
reserved for buses below a hotplug bridge.
|
reserved for buses below a hotplug bridge.
|
||||||
@@ -3573,6 +3595,8 @@
|
|||||||
even if the platform doesn't give the OS permission to
|
even if the platform doesn't give the OS permission to
|
||||||
use them. This may cause conflicts if the platform
|
use them. This may cause conflicts if the platform
|
||||||
also tries to use these services.
|
also tries to use these services.
|
||||||
|
dpc-native Use native PCIe service for DPC only. May
|
||||||
|
cause conflicts if firmware uses AER or DPC.
|
||||||
compat Disable native PCIe services (PME, AER, DPC, PCIe
|
compat Disable native PCIe services (PME, AER, DPC, PCIe
|
||||||
hotplug).
|
hotplug).
|
||||||
|
|
||||||
@@ -4937,6 +4961,11 @@
|
|||||||
vulnerable to cross-thread TAA attacks.
|
vulnerable to cross-thread TAA attacks.
|
||||||
off - Unconditionally disable TAA mitigation
|
off - Unconditionally disable TAA mitigation
|
||||||
|
|
||||||
|
On MDS-affected machines, tsx_async_abort=off can be
|
||||||
|
prevented by an active MDS mitigation as both vulnerabilities
|
||||||
|
are mitigated with the same mechanism so in order to disable
|
||||||
|
this mitigation, you need to specify mds=off too.
|
||||||
|
|
||||||
Not specifying this option is equivalent to
|
Not specifying this option is equivalent to
|
||||||
tsx_async_abort=full. On CPUs which are MDS affected
|
tsx_async_abort=full. On CPUs which are MDS affected
|
||||||
and deploy MDS mitigation, TAA mitigation is not
|
and deploy MDS mitigation, TAA mitigation is not
|
||||||
@@ -5096,13 +5125,13 @@
|
|||||||
Flags is a set of characters, each corresponding
|
Flags is a set of characters, each corresponding
|
||||||
to a common usb-storage quirk flag as follows:
|
to a common usb-storage quirk flag as follows:
|
||||||
a = SANE_SENSE (collect more than 18 bytes
|
a = SANE_SENSE (collect more than 18 bytes
|
||||||
of sense data);
|
of sense data, not on uas);
|
||||||
b = BAD_SENSE (don't collect more than 18
|
b = BAD_SENSE (don't collect more than 18
|
||||||
bytes of sense data);
|
bytes of sense data, not on uas);
|
||||||
c = FIX_CAPACITY (decrease the reported
|
c = FIX_CAPACITY (decrease the reported
|
||||||
device capacity by one sector);
|
device capacity by one sector);
|
||||||
d = NO_READ_DISC_INFO (don't use
|
d = NO_READ_DISC_INFO (don't use
|
||||||
READ_DISC_INFO command);
|
READ_DISC_INFO command, not on uas);
|
||||||
e = NO_READ_CAPACITY_16 (don't use
|
e = NO_READ_CAPACITY_16 (don't use
|
||||||
READ_CAPACITY_16 command);
|
READ_CAPACITY_16 command);
|
||||||
f = NO_REPORT_OPCODES (don't use report opcodes
|
f = NO_REPORT_OPCODES (don't use report opcodes
|
||||||
@@ -5117,17 +5146,18 @@
|
|||||||
j = NO_REPORT_LUNS (don't use report luns
|
j = NO_REPORT_LUNS (don't use report luns
|
||||||
command, uas only);
|
command, uas only);
|
||||||
l = NOT_LOCKABLE (don't try to lock and
|
l = NOT_LOCKABLE (don't try to lock and
|
||||||
unlock ejectable media);
|
unlock ejectable media, not on uas);
|
||||||
m = MAX_SECTORS_64 (don't transfer more
|
m = MAX_SECTORS_64 (don't transfer more
|
||||||
than 64 sectors = 32 KB at a time);
|
than 64 sectors = 32 KB at a time,
|
||||||
|
not on uas);
|
||||||
n = INITIAL_READ10 (force a retry of the
|
n = INITIAL_READ10 (force a retry of the
|
||||||
initial READ(10) command);
|
initial READ(10) command, not on uas);
|
||||||
o = CAPACITY_OK (accept the capacity
|
o = CAPACITY_OK (accept the capacity
|
||||||
reported by the device);
|
reported by the device, not on uas);
|
||||||
p = WRITE_CACHE (the device cache is ON
|
p = WRITE_CACHE (the device cache is ON
|
||||||
by default);
|
by default, not on uas);
|
||||||
r = IGNORE_RESIDUE (the device reports
|
r = IGNORE_RESIDUE (the device reports
|
||||||
bogus residue values);
|
bogus residue values, not on uas);
|
||||||
s = SINGLE_LUN (the device has only one
|
s = SINGLE_LUN (the device has only one
|
||||||
Logical Unit);
|
Logical Unit);
|
||||||
t = NO_ATA_1X (don't allow ATA(12) and ATA(16)
|
t = NO_ATA_1X (don't allow ATA(12) and ATA(16)
|
||||||
@@ -5136,7 +5166,8 @@
|
|||||||
w = NO_WP_DETECT (don't test whether the
|
w = NO_WP_DETECT (don't test whether the
|
||||||
medium is write-protected).
|
medium is write-protected).
|
||||||
y = ALWAYS_SYNC (issue a SYNCHRONIZE_CACHE
|
y = ALWAYS_SYNC (issue a SYNCHRONIZE_CACHE
|
||||||
even if the device claims no cache)
|
even if the device claims no cache,
|
||||||
|
not on uas)
|
||||||
Example: quirks=0419:aaf5:rl,0421:0433:rc
|
Example: quirks=0419:aaf5:rl,0421:0433:rc
|
||||||
|
|
||||||
user_debug= [KNL,ARM]
|
user_debug= [KNL,ARM]
|
||||||
|
@@ -17,36 +17,54 @@ The "format" directory describes format of the config (event ID) and config1
|
|||||||
(AXI filtering) fields of the perf_event_attr structure, see /sys/bus/event_source/
|
(AXI filtering) fields of the perf_event_attr structure, see /sys/bus/event_source/
|
||||||
devices/imx8_ddr0/format/. The "events" directory describes the events types
|
devices/imx8_ddr0/format/. The "events" directory describes the events types
|
||||||
hardware supported that can be used with perf tool, see /sys/bus/event_source/
|
hardware supported that can be used with perf tool, see /sys/bus/event_source/
|
||||||
devices/imx8_ddr0/events/.
|
devices/imx8_ddr0/events/. The "caps" directory describes filter features implemented
|
||||||
e.g.::
|
in DDR PMU, see /sys/bus/event_source/devices/imx8_ddr0/caps/.
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
perf stat -a -e imx8_ddr0/cycles/ cmd
|
perf stat -a -e imx8_ddr0/cycles/ cmd
|
||||||
perf stat -a -e imx8_ddr0/read/,imx8_ddr0/write/ cmd
|
perf stat -a -e imx8_ddr0/read/,imx8_ddr0/write/ cmd
|
||||||
|
|
||||||
AXI filtering is only used by CSV modes 0x41 (axid-read) and 0x42 (axid-write)
|
AXI filtering is only used by CSV modes 0x41 (axid-read) and 0x42 (axid-write)
|
||||||
to count reads or writes that match the filter setting. The filter setting varies
|
to count reads or writes that match the filter setting. The filter setting varies
|
||||||
across different DRAM controller implementations, which are distinguished by quirks
|
across different DRAM controller implementations, which are distinguished by quirks
|
||||||
in the driver.
|
in the driver. You can also read this information from userspace: "filter" in the "caps" directory
|
||||||
|
indicates whether the PMU supports the AXI ID filter; "enhanced_filter" indicates
|
||||||
|
whether the PMU supports the enhanced AXI ID filter. A value of 0 means unsupported, and
|
||||||
|
a value of 1 means supported.
|
||||||
|
|
||||||
* With DDR_CAP_AXI_ID_FILTER quirk.
|
* With DDR_CAP_AXI_ID_FILTER quirk(filter: 1, enhanced_filter: 0).
|
||||||
Filter is defined with two configuration parts:
|
Filter is defined with two configuration parts:
|
||||||
--AXI_ID defines AxID matching value.
|
--AXI_ID defines AxID matching value.
|
||||||
--AXI_MASKING defines which bits of AxID are meaningful for the matching.
|
--AXI_MASKING defines which bits of AxID are meaningful for the matching.
|
||||||
0:corresponding bit is masked.
|
|
||||||
1: corresponding bit is not masked, i.e. used to do the matching.
|
- 0: corresponding bit is masked.
|
||||||
|
- 1: corresponding bit is not masked, i.e. used to do the matching.
|
||||||
|
|
||||||
AXI_ID and AXI_MASKING are mapped on DPCR1 register in performance counter.
|
AXI_ID and AXI_MASKING are mapped on DPCR1 register in performance counter.
|
||||||
When non-masked bits are matching corresponding AXI_ID bits then counter is
|
When non-masked bits are matching corresponding AXI_ID bits then counter is
|
||||||
incremented. Perf counter is incremented if
|
incremented. Perf counter is incremented if
|
||||||
(AxID & AXI_MASKING) == (AXI_ID & AXI_MASKING)
|
(AxID & AXI_MASKING) == (AXI_ID & AXI_MASKING)
|
||||||
|
|
||||||
This filter doesn't support filtering different AXI IDs for axid-read and axid-write
|
This filter doesn't support filtering different AXI IDs for axid-read and axid-write
|
||||||
events at the same time, as this filter is shared between counters.
|
events at the same time, as this filter is shared between counters.
|
||||||
e.g.::
|
|
||||||
perf stat -a -e imx8_ddr0/axid-read,axi_mask=0xMMMM,axi_id=0xDDDD/ cmd
|
|
||||||
perf stat -a -e imx8_ddr0/axid-write,axi_mask=0xMMMM,axi_id=0xDDDD/ cmd
|
|
||||||
|
|
||||||
NOTE: axi_mask is inverted in userspace(i.e. set bits are bits to mask), and
|
.. code-block:: bash
|
||||||
it will be reverted in driver automatically. so that the user can just specify
|
|
||||||
axi_id to monitor a specific id, rather than having to specify axi_mask.
|
perf stat -a -e imx8_ddr0/axid-read,axi_mask=0xMMMM,axi_id=0xDDDD/ cmd
|
||||||
e.g.::
|
perf stat -a -e imx8_ddr0/axid-write,axi_mask=0xMMMM,axi_id=0xDDDD/ cmd
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
axi_mask is inverted in userspace(i.e. set bits are bits to mask), and
|
||||||
|
it will be reverted in the driver automatically, so that the user can just specify
|
||||||
|
axi_id to monitor a specific id, rather than having to specify axi_mask.
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
perf stat -a -e imx8_ddr0/axid-read,axi_id=0x12/ cmd, which will monitor ARID=0x12
|
perf stat -a -e imx8_ddr0/axid-read,axi_id=0x12/ cmd, which will monitor ARID=0x12
|
||||||
|
|
||||||
|
* With DDR_CAP_AXI_ID_FILTER_ENHANCED quirk(filter: 1, enhanced_filter: 1).
|
||||||
|
This is an extension to the DDR_CAP_AXI_ID_FILTER quirk which permits
|
||||||
|
counting the number of bytes (as opposed to the number of bursts) from DDR
|
||||||
|
read and write transactions concurrently with another set of data counters.
|
||||||
|
@@ -8,6 +8,7 @@ Performance monitor support
|
|||||||
:maxdepth: 1
|
:maxdepth: 1
|
||||||
|
|
||||||
hisi-pmu
|
hisi-pmu
|
||||||
|
imx-ddr
|
||||||
qcom_l2_pmu
|
qcom_l2_pmu
|
||||||
qcom_l3_pmu
|
qcom_l3_pmu
|
||||||
arm-ccn
|
arm-ccn
|
||||||
|
@@ -3,24 +3,26 @@ Cavium ThunderX2 SoC Performance Monitoring Unit (PMU UNCORE)
|
|||||||
=============================================================
|
=============================================================
|
||||||
|
|
||||||
The ThunderX2 SoC PMU consists of independent, system-wide, per-socket
|
The ThunderX2 SoC PMU consists of independent, system-wide, per-socket
|
||||||
PMUs such as the Level 3 Cache (L3C) and DDR4 Memory Controller (DMC).
|
PMUs such as the Level 3 Cache (L3C), DDR4 Memory Controller (DMC) and
|
||||||
|
Cavium Coherent Processor Interconnect (CCPI2).
|
||||||
|
|
||||||
The DMC has 8 interleaved channels and the L3C has 16 interleaved tiles.
|
The DMC has 8 interleaved channels and the L3C has 16 interleaved tiles.
|
||||||
Events are counted for the default channel (i.e. channel 0) and prorated
|
Events are counted for the default channel (i.e. channel 0) and prorated
|
||||||
to the total number of channels/tiles.
|
to the total number of channels/tiles.
|
||||||
|
|
||||||
The DMC and L3C support up to 4 counters. Counters are independently
|
The DMC and L3C support up to 4 counters, while the CCPI2 supports up to 8
|
||||||
programmable and can be started and stopped individually. Each counter
|
counters. Counters are independently programmable to different events and
|
||||||
can be set to a different event. Counters are 32-bit and do not support
|
can be started and stopped individually. None of the counters support an
|
||||||
an overflow interrupt; they are read every 2 seconds.
|
overflow interrupt. DMC and L3C counters are 32-bit and read every 2 seconds.
|
||||||
|
The CCPI2 counters are 64-bit and assumed not to overflow in normal operation.
|
||||||
|
|
||||||
PMU UNCORE (perf) driver:
|
PMU UNCORE (perf) driver:
|
||||||
|
|
||||||
The thunderx2_pmu driver registers per-socket perf PMUs for the DMC and
|
The thunderx2_pmu driver registers per-socket perf PMUs for the DMC and
|
||||||
L3C devices. Each PMU can be used to count up to 4 events
|
L3C devices. Each PMU can be used to count up to 4 (DMC/L3C) or up to 8
|
||||||
simultaneously. The PMUs provide a description of their available events
|
(CCPI2) events simultaneously. The PMUs provide a description of their
|
||||||
and configuration options under sysfs, see
|
available events and configuration options under sysfs, see
|
||||||
/sys/devices/uncore_<l3c_S/dmc_S/>; S is the socket id.
|
/sys/devices/uncore_<l3c_S/dmc_S/ccpi2_S/>; S is the socket id.
|
||||||
|
|
||||||
The driver does not support sampling, therefore "perf record" will not
|
The driver does not support sampling, therefore "perf record" will not
|
||||||
work. Per-task perf sessions are also not supported.
|
work. Per-task perf sessions are also not supported.
|
||||||
|
@@ -330,9 +330,12 @@ There can be multiple csrows and multiple channels.
|
|||||||
|
|
||||||
.. [#f4] Nowadays, the term DIMM (Dual In-line Memory Module) is widely
|
.. [#f4] Nowadays, the term DIMM (Dual In-line Memory Module) is widely
|
||||||
used to refer to a memory module, although there are other memory
|
used to refer to a memory module, although there are other memory
|
||||||
packaging alternatives, like SO-DIMM, SIMM, etc. Along this document,
|
packaging alternatives, like SO-DIMM, SIMM, etc. The UEFI
|
||||||
and inside the EDAC system, the term "dimm" is used for all memory
|
specification (Version 2.7) defines a memory module in the Common
|
||||||
modules, even when they use a different kind of packaging.
|
Platform Error Record (CPER) section to be an SMBIOS Memory Device
|
||||||
|
(Type 17). Throughout this document, and inside the EDAC subsystem, the term
|
||||||
|
"dimm" is used for all memory modules, even when they use a
|
||||||
|
different kind of packaging.
|
||||||
|
|
||||||
Memory controllers allow for several csrows, with 8 csrows being a
|
Memory controllers allow for several csrows, with 8 csrows being a
|
||||||
typical value. Yet, the actual number of csrows depends on the layout of
|
typical value. Yet, the actual number of csrows depends on the layout of
|
||||||
@@ -349,12 +352,14 @@ controllers. The following example will assume 2 channels:
|
|||||||
| | ``ch0`` | ``ch1`` |
|
| | ``ch0`` | ``ch1`` |
|
||||||
+============+===========+===========+
|
+============+===========+===========+
|
||||||
| ``csrow0`` | DIMM_A0 | DIMM_B0 |
|
| ``csrow0`` | DIMM_A0 | DIMM_B0 |
|
||||||
+------------+ | |
|
| | rank0 | rank0 |
|
||||||
| ``csrow1`` | | |
|
+------------+ - | - |
|
||||||
|
| ``csrow1`` | rank1 | rank1 |
|
||||||
+------------+-----------+-----------+
|
+------------+-----------+-----------+
|
||||||
| ``csrow2`` | DIMM_A1 | DIMM_B1 |
|
| ``csrow2`` | DIMM_A1 | DIMM_B1 |
|
||||||
+------------+ | |
|
| | rank0 | rank0 |
|
||||||
| ``csrow3`` | | |
|
+------------+ - | - |
|
||||||
|
| ``csrow3`` | rank1 | rank1 |
|
||||||
+------------+-----------+-----------+
|
+------------+-----------+-----------+
|
||||||
|
|
||||||
In the above example, there are 4 physical slots on the motherboard
|
In the above example, there are 4 physical slots on the motherboard
|
||||||
@@ -374,11 +379,13 @@ which the memory DIMM is placed. Thus, when 1 DIMM is placed in each
|
|||||||
Channel, the csrows cross both DIMMs.
|
Channel, the csrows cross both DIMMs.
|
||||||
|
|
||||||
Memory DIMMs come single or dual "ranked". A rank is a populated csrow.
|
Memory DIMMs come single or dual "ranked". A rank is a populated csrow.
|
||||||
Thus, 2 single ranked DIMMs, placed in slots DIMM_A0 and DIMM_B0 above
|
In the example above 2 dual ranked DIMMs are similarly placed. Thus,
|
||||||
will have just one csrow (csrow0). csrow1 will be empty. On the other
|
both csrow0 and csrow1 are populated. On the other hand, when 2 single
|
||||||
hand, when 2 dual ranked DIMMs are similarly placed, then both csrow0
|
ranked DIMMs are placed in slots DIMM_A0 and DIMM_B0, then they will
|
||||||
and csrow1 will be populated. The pattern repeats itself for csrow2 and
|
have just one csrow (csrow0) and csrow1 will be empty. The pattern
|
||||||
csrow3.
|
repeats itself for csrow2 and csrow3. Also note that some memory
|
||||||
|
controllers don't have any logic to identify the memory module, see
|
||||||
|
``rankX`` directories below.
|
||||||
|
|
||||||
The representation of the above is reflected in the directory
|
The representation of the above is reflected in the directory
|
||||||
tree in EDAC's sysfs interface. Starting in directory
|
tree in EDAC's sysfs interface. Starting in directory
|
||||||
|
@@ -834,8 +834,8 @@ printk_ratelimit:
|
|||||||
=================
|
=================
|
||||||
|
|
||||||
Some warning messages are rate limited. printk_ratelimit specifies
|
Some warning messages are rate limited. printk_ratelimit specifies
|
||||||
the minimum length of time between these messages (in jiffies), by
|
the minimum length of time between these messages (in seconds).
|
||||||
default we allow one every 5 seconds.
|
The default value is 5 seconds.
|
||||||
|
|
||||||
A value of 0 will disable rate limiting.
|
A value of 0 will disable rate limiting.
|
||||||
|
|
||||||
@@ -848,6 +848,8 @@ seconds, we do allow a burst of messages to pass through.
|
|||||||
printk_ratelimit_burst specifies the number of messages we can
|
printk_ratelimit_burst specifies the number of messages we can
|
||||||
send before ratelimiting kicks in.
|
send before ratelimiting kicks in.
|
||||||
|
|
||||||
|
The default value is 10 messages.
|
||||||
|
|
||||||
|
|
||||||
printk_devkmsg:
|
printk_devkmsg:
|
||||||
===============
|
===============
|
||||||
@@ -1104,7 +1106,7 @@ During initialization the kernel sets this value such that even if the
|
|||||||
maximum number of threads is created, the thread structures occupy only
|
maximum number of threads is created, the thread structures occupy only
|
||||||
a part (1/8th) of the available RAM pages.
|
a part (1/8th) of the available RAM pages.
|
||||||
|
|
||||||
The minimum value that can be written to threads-max is 20.
|
The minimum value that can be written to threads-max is 1.
|
||||||
|
|
||||||
The maximum value that can be written to threads-max is given by the
|
The maximum value that can be written to threads-max is given by the
|
||||||
constant FUTEX_TID_MASK (0x3fffffff).
|
constant FUTEX_TID_MASK (0x3fffffff).
|
||||||
@@ -1112,10 +1114,6 @@ constant FUTEX_TID_MASK (0x3fffffff).
|
|||||||
If a value outside of this range is written to threads-max an error
|
If a value outside of this range is written to threads-max an error
|
||||||
EINVAL occurs.
|
EINVAL occurs.
|
||||||
|
|
||||||
The value written is checked against the available RAM pages. If the
|
|
||||||
thread structures would occupy too much (more than 1/8th) of the
|
|
||||||
available RAM pages threads-max is reduced accordingly.
|
|
||||||
|
|
||||||
|
|
||||||
unknown_nmi_panic:
|
unknown_nmi_panic:
|
||||||
==================
|
==================
|
||||||
|
@@ -103,7 +103,7 @@ the Microchip website: http://www.microchip.com.
|
|||||||
|
|
||||||
* Datasheet
|
* Datasheet
|
||||||
|
|
||||||
http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-11121-32-bit-Cortex-A5-Microcontroller-SAMA5D3_Datasheet.pdf
|
http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-11121-32-bit-Cortex-A5-Microcontroller-SAMA5D3_Datasheet_B.pdf
|
||||||
|
|
||||||
* ARM Cortex-A5 + NEON based SoCs
|
* ARM Cortex-A5 + NEON based SoCs
|
||||||
- sama5d4 family
|
- sama5d4 family
|
||||||
@@ -167,7 +167,7 @@ the Microchip website: http://www.microchip.com.
|
|||||||
|
|
||||||
* Datasheet
|
* Datasheet
|
||||||
|
|
||||||
http://ww1.microchip.com/downloads/en/DeviceDoc/60001527A.pdf
|
http://ww1.microchip.com/downloads/en/DeviceDoc/SAM-E70-S70-V70-V71-Family-Data-Sheet-DS60001527D.pdf
|
||||||
|
|
||||||
|
|
||||||
Linux kernel information
|
Linux kernel information
|
||||||
|
@@ -213,6 +213,9 @@ Before jumping into the kernel, the following conditions must be met:
|
|||||||
|
|
||||||
- ICC_SRE_EL3.Enable (bit 3) must be initialised to 0b1.
|
- ICC_SRE_EL3.Enable (bit 3) must be initialised to 0b1.
|
||||||
- ICC_SRE_EL3.SRE (bit 0) must be initialised to 0b1.
|
- ICC_SRE_EL3.SRE (bit 0) must be initialised to 0b1.
|
||||||
|
- ICC_CTLR_EL3.PMHE (bit 6) must be set to the same value across
|
||||||
|
all CPUs the kernel is executing on, and must stay constant
|
||||||
|
for the lifetime of the kernel.
|
||||||
|
|
||||||
- If the kernel is entered at EL1:
|
- If the kernel is entered at EL1:
|
||||||
|
|
||||||
|
@@ -168,8 +168,15 @@ infrastructure:
|
|||||||
+------------------------------+---------+---------+
|
+------------------------------+---------+---------+
|
||||||
|
|
||||||
|
|
||||||
3) MIDR_EL1 - Main ID Register
|
3) ID_AA64PFR1_EL1 - Processor Feature Register 1
|
||||||
|
+------------------------------+---------+---------+
|
||||||
|
| Name | bits | visible |
|
||||||
|
+------------------------------+---------+---------+
|
||||||
|
| SSBS | [7-4] | y |
|
||||||
|
+------------------------------+---------+---------+
|
||||||
|
|
||||||
|
|
||||||
|
4) MIDR_EL1 - Main ID Register
|
||||||
+------------------------------+---------+---------+
|
+------------------------------+---------+---------+
|
||||||
| Name | bits | visible |
|
| Name | bits | visible |
|
||||||
+------------------------------+---------+---------+
|
+------------------------------+---------+---------+
|
||||||
@@ -188,11 +195,15 @@ infrastructure:
|
|||||||
as available on the CPU where it is fetched and is not a system
|
as available on the CPU where it is fetched and is not a system
|
||||||
wide safe value.
|
wide safe value.
|
||||||
|
|
||||||
4) ID_AA64ISAR1_EL1 - Instruction set attribute register 1
|
5) ID_AA64ISAR1_EL1 - Instruction set attribute register 1
|
||||||
|
|
||||||
+------------------------------+---------+---------+
|
+------------------------------+---------+---------+
|
||||||
| Name | bits | visible |
|
| Name | bits | visible |
|
||||||
+------------------------------+---------+---------+
|
+------------------------------+---------+---------+
|
||||||
|
| SB | [39-36] | y |
|
||||||
|
+------------------------------+---------+---------+
|
||||||
|
| FRINTTS | [35-32] | y |
|
||||||
|
+------------------------------+---------+---------+
|
||||||
| GPI | [31-28] | y |
|
| GPI | [31-28] | y |
|
||||||
+------------------------------+---------+---------+
|
+------------------------------+---------+---------+
|
||||||
| GPA | [27-24] | y |
|
| GPA | [27-24] | y |
|
||||||
@@ -210,7 +221,7 @@ infrastructure:
|
|||||||
| DPB | [3-0] | y |
|
| DPB | [3-0] | y |
|
||||||
+------------------------------+---------+---------+
|
+------------------------------+---------+---------+
|
||||||
|
|
||||||
5) ID_AA64MMFR2_EL1 - Memory model feature register 2
|
6) ID_AA64MMFR2_EL1 - Memory model feature register 2
|
||||||
|
|
||||||
+------------------------------+---------+---------+
|
+------------------------------+---------+---------+
|
||||||
| Name | bits | visible |
|
| Name | bits | visible |
|
||||||
@@ -218,7 +229,7 @@ infrastructure:
|
|||||||
| AT | [35-32] | y |
|
| AT | [35-32] | y |
|
||||||
+------------------------------+---------+---------+
|
+------------------------------+---------+---------+
|
||||||
|
|
||||||
6) ID_AA64ZFR0_EL1 - SVE feature ID register 0
|
7) ID_AA64ZFR0_EL1 - SVE feature ID register 0
|
||||||
|
|
||||||
+------------------------------+---------+---------+
|
+------------------------------+---------+---------+
|
||||||
| Name | bits | visible |
|
| Name | bits | visible |
|
||||||
|
@@ -119,10 +119,6 @@ HWCAP_LRCPC
|
|||||||
HWCAP_DCPOP
|
HWCAP_DCPOP
|
||||||
Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0001.
|
Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0001.
|
||||||
|
|
||||||
HWCAP2_DCPODP
|
|
||||||
|
|
||||||
Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0010.
|
|
||||||
|
|
||||||
HWCAP_SHA3
|
HWCAP_SHA3
|
||||||
Functionality implied by ID_AA64ISAR0_EL1.SHA3 == 0b0001.
|
Functionality implied by ID_AA64ISAR0_EL1.SHA3 == 0b0001.
|
||||||
|
|
||||||
@@ -141,6 +137,41 @@ HWCAP_SHA512
|
|||||||
HWCAP_SVE
|
HWCAP_SVE
|
||||||
Functionality implied by ID_AA64PFR0_EL1.SVE == 0b0001.
|
Functionality implied by ID_AA64PFR0_EL1.SVE == 0b0001.
|
||||||
|
|
||||||
|
HWCAP_ASIMDFHM
|
||||||
|
Functionality implied by ID_AA64ISAR0_EL1.FHM == 0b0001.
|
||||||
|
|
||||||
|
HWCAP_DIT
|
||||||
|
Functionality implied by ID_AA64PFR0_EL1.DIT == 0b0001.
|
||||||
|
|
||||||
|
HWCAP_USCAT
|
||||||
|
Functionality implied by ID_AA64MMFR2_EL1.AT == 0b0001.
|
||||||
|
|
||||||
|
HWCAP_ILRCPC
|
||||||
|
Functionality implied by ID_AA64ISAR1_EL1.LRCPC == 0b0010.
|
||||||
|
|
||||||
|
HWCAP_FLAGM
|
||||||
|
Functionality implied by ID_AA64ISAR0_EL1.TS == 0b0001.
|
||||||
|
|
||||||
|
HWCAP_SSBS
|
||||||
|
Functionality implied by ID_AA64PFR1_EL1.SSBS == 0b0010.
|
||||||
|
|
||||||
|
HWCAP_SB
|
||||||
|
Functionality implied by ID_AA64ISAR1_EL1.SB == 0b0001.
|
||||||
|
|
||||||
|
HWCAP_PACA
|
||||||
|
Functionality implied by ID_AA64ISAR1_EL1.APA == 0b0001 or
|
||||||
|
ID_AA64ISAR1_EL1.API == 0b0001, as described by
|
||||||
|
Documentation/arm64/pointer-authentication.rst.
|
||||||
|
|
||||||
|
HWCAP_PACG
|
||||||
|
Functionality implied by ID_AA64ISAR1_EL1.GPA == 0b0001 or
|
||||||
|
ID_AA64ISAR1_EL1.GPI == 0b0001, as described by
|
||||||
|
Documentation/arm64/pointer-authentication.rst.
|
||||||
|
|
||||||
|
HWCAP2_DCPODP
|
||||||
|
|
||||||
|
Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0010.
|
||||||
|
|
||||||
HWCAP2_SVE2
|
HWCAP2_SVE2
|
||||||
|
|
||||||
Functionality implied by ID_AA64ZFR0_EL1.SVEVer == 0b0001.
|
Functionality implied by ID_AA64ZFR0_EL1.SVEVer == 0b0001.
|
||||||
@@ -165,38 +196,10 @@ HWCAP2_SVESM4
|
|||||||
|
|
||||||
Functionality implied by ID_AA64ZFR0_EL1.SM4 == 0b0001.
|
Functionality implied by ID_AA64ZFR0_EL1.SM4 == 0b0001.
|
||||||
|
|
||||||
HWCAP_ASIMDFHM
|
|
||||||
Functionality implied by ID_AA64ISAR0_EL1.FHM == 0b0001.
|
|
||||||
|
|
||||||
HWCAP_DIT
|
|
||||||
Functionality implied by ID_AA64PFR0_EL1.DIT == 0b0001.
|
|
||||||
|
|
||||||
HWCAP_USCAT
|
|
||||||
Functionality implied by ID_AA64MMFR2_EL1.AT == 0b0001.
|
|
||||||
|
|
||||||
HWCAP_ILRCPC
|
|
||||||
Functionality implied by ID_AA64ISAR1_EL1.LRCPC == 0b0010.
|
|
||||||
|
|
||||||
HWCAP_FLAGM
|
|
||||||
Functionality implied by ID_AA64ISAR0_EL1.TS == 0b0001.
|
|
||||||
|
|
||||||
HWCAP2_FLAGM2
|
HWCAP2_FLAGM2
|
||||||
|
|
||||||
Functionality implied by ID_AA64ISAR0_EL1.TS == 0b0010.
|
Functionality implied by ID_AA64ISAR0_EL1.TS == 0b0010.
|
||||||
|
|
||||||
HWCAP_SSBS
|
|
||||||
Functionality implied by ID_AA64PFR1_EL1.SSBS == 0b0010.
|
|
||||||
|
|
||||||
HWCAP_PACA
|
|
||||||
Functionality implied by ID_AA64ISAR1_EL1.APA == 0b0001 or
|
|
||||||
ID_AA64ISAR1_EL1.API == 0b0001, as described by
|
|
||||||
Documentation/arm64/pointer-authentication.rst.
|
|
||||||
|
|
||||||
HWCAP_PACG
|
|
||||||
Functionality implied by ID_AA64ISAR1_EL1.GPA == 0b0001 or
|
|
||||||
ID_AA64ISAR1_EL1.GPI == 0b0001, as described by
|
|
||||||
Documentation/arm64/pointer-authentication.rst.
|
|
||||||
|
|
||||||
HWCAP2_FRINT
|
HWCAP2_FRINT
|
||||||
|
|
||||||
Functionality implied by ID_AA64ISAR1_EL1.FRINTTS == 0b0001.
|
Functionality implied by ID_AA64ISAR1_EL1.FRINTTS == 0b0001.
|
||||||
|
@@ -70,8 +70,12 @@ stable kernels.
|
|||||||
+----------------+-----------------+-----------------+-----------------------------+
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
| ARM | Cortex-A57 | #834220 | ARM64_ERRATUM_834220 |
|
| ARM | Cortex-A57 | #834220 | ARM64_ERRATUM_834220 |
|
||||||
+----------------+-----------------+-----------------+-----------------------------+
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
|
| ARM | Cortex-A57 | #1319537 | ARM64_ERRATUM_1319367 |
|
||||||
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
| ARM | Cortex-A72 | #853709 | N/A |
|
| ARM | Cortex-A72 | #853709 | N/A |
|
||||||
+----------------+-----------------+-----------------+-----------------------------+
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
|
| ARM | Cortex-A72 | #1319367 | ARM64_ERRATUM_1319367 |
|
||||||
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
| ARM | Cortex-A73 | #858921 | ARM64_ERRATUM_858921 |
|
| ARM | Cortex-A73 | #858921 | ARM64_ERRATUM_858921 |
|
||||||
+----------------+-----------------+-----------------+-----------------------------+
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
| ARM | Cortex-A55 | #1024718 | ARM64_ERRATUM_1024718 |
|
| ARM | Cortex-A55 | #1024718 | ARM64_ERRATUM_1024718 |
|
||||||
@@ -88,6 +92,8 @@ stable kernels.
|
|||||||
+----------------+-----------------+-----------------+-----------------------------+
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
| ARM | Neoverse-N1 | #1349291 | N/A |
|
| ARM | Neoverse-N1 | #1349291 | N/A |
|
||||||
+----------------+-----------------+-----------------+-----------------------------+
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
|
| ARM | Neoverse-N1 | #1542419 | ARM64_ERRATUM_1542419 |
|
||||||
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
| ARM | MMU-500 | #841119,826419 | N/A |
|
| ARM | MMU-500 | #841119,826419 | N/A |
|
||||||
+----------------+-----------------+-----------------+-----------------------------+
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
+----------------+-----------------+-----------------+-----------------------------+
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
|
216
Documentation/asm-annotations.rst
Normal file
216
Documentation/asm-annotations.rst
Normal file
@@ -0,0 +1,216 @@
|
|||||||
|
Assembler Annotations
|
||||||
|
=====================
|
||||||
|
|
||||||
|
Copyright (c) 2017-2019 Jiri Slaby
|
||||||
|
|
||||||
|
This document describes the new macros for annotation of data and code in
|
||||||
|
assembly. In particular, it contains information about ``SYM_FUNC_START``,
|
||||||
|
``SYM_FUNC_END``, ``SYM_CODE_START``, and similar.
|
||||||
|
|
||||||
|
Rationale
|
||||||
|
---------
|
||||||
|
Some code like entries, trampolines, or boot code needs to be written in
|
||||||
|
assembly. The same as in C, such code is grouped into functions and
|
||||||
|
accompanied with data. Standard assemblers do not force users into precisely
|
||||||
|
marking these pieces as code, data, or even specifying their length.
|
||||||
|
Nevertheless, assemblers provide developers with such annotations to aid
|
||||||
|
debuggers throughout assembly. On top of that, developers also want to mark
|
||||||
|
some functions as *global* in order to be visible outside of their translation
|
||||||
|
units.
|
||||||
|
|
||||||
|
Over time, the Linux kernel has adopted macros from various projects (like
|
||||||
|
``binutils``) to facilitate such annotations. So for historic reasons,
|
||||||
|
developers have been using ``ENTRY``, ``END``, ``ENDPROC``, and other
|
||||||
|
annotations in assembly. Due to the lack of their documentation, the macros
|
||||||
|
are used in rather wrong contexts at some locations. Clearly, ``ENTRY`` was
|
||||||
|
intended to denote the beginning of global symbols (be it data or code).
|
||||||
|
``END`` used to mark the end of data or end of special functions with
|
||||||
|
*non-standard* calling convention. In contrast, ``ENDPROC`` should annotate
|
||||||
|
only ends of *standard* functions.
|
||||||
|
|
||||||
|
When these macros are used correctly, they help assemblers generate a nice
|
||||||
|
object with both sizes and types set correctly. For example, the result of
|
||||||
|
``arch/x86/lib/putuser.S``::
|
||||||
|
|
||||||
|
Num: Value Size Type Bind Vis Ndx Name
|
||||||
|
25: 0000000000000000 33 FUNC GLOBAL DEFAULT 1 __put_user_1
|
||||||
|
29: 0000000000000030 37 FUNC GLOBAL DEFAULT 1 __put_user_2
|
||||||
|
32: 0000000000000060 36 FUNC GLOBAL DEFAULT 1 __put_user_4
|
||||||
|
35: 0000000000000090 37 FUNC GLOBAL DEFAULT 1 __put_user_8
|
||||||
|
|
||||||
|
This is not only important for debugging purposes. When there are properly
|
||||||
|
annotated objects like this, tools can be run on them to generate more useful
|
||||||
|
information. In particular, on properly annotated objects, ``objtool`` can be
|
||||||
|
run to check and fix the object if needed. Currently, ``objtool`` can report
|
||||||
|
missing frame pointer setup/destruction in functions. It can also
|
||||||
|
automatically generate annotations for :doc:`ORC unwinder <x86/orc-unwinder>`
|
||||||
|
for most code. Both of these are especially important to support reliable
|
||||||
|
stack traces which are in turn necessary for :doc:`Kernel live patching
|
||||||
|
<livepatch/livepatch>`.
|
||||||
|
|
||||||
|
Caveat and Discussion
|
||||||
|
---------------------
|
||||||
|
As one might realize, there were only three macros previously. That is indeed
|
||||||
|
insufficient to cover all the combinations of cases:
|
||||||
|
|
||||||
|
* standard/non-standard function
|
||||||
|
* code/data
|
||||||
|
* global/local symbol
|
||||||
|
|
||||||
|
There was a discussion_ and instead of extending the current ``ENTRY/END*``
|
||||||
|
macros, it was decided that brand new macros should be introduced instead::
|
||||||
|
|
||||||
|
So how about using macro names that actually show the purpose, instead
|
||||||
|
of importing all the crappy, historic, essentially randomly chosen
|
||||||
|
debug symbol macro names from the binutils and older kernels?
|
||||||
|
|
||||||
|
.. _discussion: https://lkml.kernel.org/r/20170217104757.28588-1-jslaby@suse.cz
|
||||||
|
|
||||||
|
Macros Description
|
||||||
|
------------------
|
||||||
|
|
||||||
|
The new macros are prefixed with the ``SYM_`` prefix and can be divided into
|
||||||
|
three main groups:
|
||||||
|
|
||||||
|
1. ``SYM_FUNC_*`` -- to annotate C-like functions. This means functions with
|
||||||
|
standard C calling conventions, i.e. the stack contains a return address at
|
||||||
|
the predefined place and a return from the function can happen in a
|
||||||
|
standard way. When frame pointers are enabled, save/restore of frame
|
||||||
|
pointer shall happen at the start/end of a function, respectively, too.
|
||||||
|
|
||||||
|
Checking tools like ``objtool`` should ensure such marked functions conform
|
||||||
|
to these rules. The tools can also easily annotate these functions with
|
||||||
|
debugging information (like *ORC data*) automatically.
|
||||||
|
|
||||||
|
2. ``SYM_CODE_*`` -- special functions called with special stack. Be it
|
||||||
|
interrupt handlers with special stack content, trampolines, or startup
|
||||||
|
functions.
|
||||||
|
|
||||||
|
Checking tools mostly ignore checking of these functions. But some debug
|
||||||
|
information still can be generated automatically. For correct debug data,
|
||||||
|
this code needs hints like ``UNWIND_HINT_REGS`` provided by developers.
|
||||||
|
|
||||||
|
3. ``SYM_DATA*`` -- obviously data belonging to ``.data`` sections and not to
|
||||||
|
``.text``. Data do not contain instructions, so they have to be treated
|
||||||
|
specially by the tools: they should not treat the bytes as instructions,
|
||||||
|
nor assign any debug information to them.
|
||||||
|
|
||||||
|
Instruction Macros
|
||||||
|
~~~~~~~~~~~~~~~~~~
|
||||||
|
This section covers ``SYM_FUNC_*`` and ``SYM_CODE_*`` enumerated above.
|
||||||
|
|
||||||
|
* ``SYM_FUNC_START`` and ``SYM_FUNC_START_LOCAL`` are supposed to be **the
|
||||||
|
most frequent markings**. They are used for functions with standard calling
|
||||||
|
conventions -- global and local. Like in C, they both align the functions to
|
||||||
|
architecture specific ``__ALIGN`` bytes. There are also ``_NOALIGN`` variants
|
||||||
|
for special cases where developers do not want this implicit alignment.
|
||||||
|
|
||||||
|
``SYM_FUNC_START_WEAK`` and ``SYM_FUNC_START_WEAK_NOALIGN`` markings are
|
||||||
|
also offered as an assembler counterpart to the *weak* attribute known from
|
||||||
|
C.
|
||||||
|
|
||||||
|
All of these **shall** be coupled with ``SYM_FUNC_END``. First, it marks
|
||||||
|
the sequence of instructions as a function and computes its size to the
|
||||||
|
generated object file. Second, it also eases checking and processing such
|
||||||
|
object files as the tools can trivially find exact function boundaries.
|
||||||
|
|
||||||
|
So in most cases, developers should write something like in the following
|
||||||
|
example, having some asm instructions in between the macros, of course::
|
||||||
|
|
||||||
|
SYM_FUNC_START(memset)
|
||||||
|
... asm insns ...
|
||||||
|
SYM_FUNC_END(memset)
|
||||||
|
|
||||||
|
In fact, this kind of annotation corresponds to the now deprecated ``ENTRY``
|
||||||
|
and ``ENDPROC`` macros.
|
||||||
|
|
||||||
|
* ``SYM_FUNC_START_ALIAS`` and ``SYM_FUNC_START_LOCAL_ALIAS`` serve for those
|
||||||
|
who decided to have two or more names for one function. The typical use is::
|
||||||
|
|
||||||
|
SYM_FUNC_START_ALIAS(__memset)
|
||||||
|
SYM_FUNC_START(memset)
|
||||||
|
... asm insns ...
|
||||||
|
SYM_FUNC_END(memset)
|
||||||
|
SYM_FUNC_END_ALIAS(__memset)
|
||||||
|
|
||||||
|
In this example, one can call ``__memset`` or ``memset`` with the same
|
||||||
|
result, except the debug information for the instructions is generated to
|
||||||
|
the object file only once -- for the non-``ALIAS`` case.
|
||||||
|
|
||||||
|
* ``SYM_CODE_START`` and ``SYM_CODE_START_LOCAL`` should be used only in
|
||||||
|
special cases -- if you know what you are doing. This is used exclusively
|
||||||
|
for interrupt handlers and similar where the calling convention is not the C
|
||||||
|
one. ``_NOALIGN`` variants exist too. The use is the same as for the ``FUNC``
|
||||||
|
category above::
|
||||||
|
|
||||||
|
SYM_CODE_START_LOCAL(bad_put_user)
|
||||||
|
... asm insns ...
|
||||||
|
SYM_CODE_END(bad_put_user)
|
||||||
|
|
||||||
|
Again, every ``SYM_CODE_START*`` **shall** be coupled with ``SYM_CODE_END``.
|
||||||
|
|
||||||
|
To some extent, this category corresponds to deprecated ``ENTRY`` and
|
||||||
|
``END``. Except ``END`` had several other meanings too.
|
||||||
|
|
||||||
|
* ``SYM_INNER_LABEL*`` is used to denote a label inside some
|
||||||
|
``SYM_{CODE,FUNC}_START`` and ``SYM_{CODE,FUNC}_END``. They are very similar
|
||||||
|
to C labels, except they can be made global. An example of use::
|
||||||
|
|
||||||
|
SYM_CODE_START(ftrace_caller)
|
||||||
|
/* save_mcount_regs fills in first two parameters */
|
||||||
|
...
|
||||||
|
|
||||||
|
SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL)
|
||||||
|
/* Load the ftrace_ops into the 3rd parameter */
|
||||||
|
...
|
||||||
|
|
||||||
|
SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
|
||||||
|
call ftrace_stub
|
||||||
|
...
|
||||||
|
retq
|
||||||
|
SYM_CODE_END(ftrace_caller)
|
||||||
|
|
||||||
|
Data Macros
|
||||||
|
~~~~~~~~~~~
|
||||||
|
Similar to instructions, there are a couple of macros to describe data in the
|
||||||
|
assembly.
|
||||||
|
|
||||||
|
* ``SYM_DATA_START`` and ``SYM_DATA_START_LOCAL`` mark the start of some data
|
||||||
|
and shall be used in conjunction with either ``SYM_DATA_END``, or
|
||||||
|
``SYM_DATA_END_LABEL``. The latter also adds a label to the end, so that
|
||||||
|
people can use ``lstack`` and (local) ``lstack_end`` in the following
|
||||||
|
example::
|
||||||
|
|
||||||
|
SYM_DATA_START_LOCAL(lstack)
|
||||||
|
.skip 4096
|
||||||
|
SYM_DATA_END_LABEL(lstack, SYM_L_LOCAL, lstack_end)
|
||||||
|
|
||||||
|
* ``SYM_DATA`` and ``SYM_DATA_LOCAL`` are variants for simple, mostly one-line
|
||||||
|
data::
|
||||||
|
|
||||||
|
SYM_DATA(HEAP, .long rm_heap)
|
||||||
|
SYM_DATA(heap_end, .long rm_stack)
|
||||||
|
|
||||||
|
In the end, they expand to ``SYM_DATA_START`` with ``SYM_DATA_END``
|
||||||
|
internally.
|
||||||
|
|
||||||
|
Support Macros
|
||||||
|
~~~~~~~~~~~~~~
|
||||||
|
All the above reduce themselves to some invocation of ``SYM_START``,
|
||||||
|
``SYM_END``, or ``SYM_ENTRY`` at last. Normally, developers should avoid using
|
||||||
|
these.
|
||||||
|
|
||||||
|
Further, in the above examples, one could see ``SYM_L_LOCAL``. There are also
|
||||||
|
``SYM_L_GLOBAL`` and ``SYM_L_WEAK``. All are intended to denote linkage of a
|
||||||
|
symbol marked by them. They are used either in ``_LABEL`` variants of the
|
||||||
|
earlier macros, or in ``SYM_START``.
|
||||||
|
|
||||||
|
|
||||||
|
Overriding Macros
|
||||||
|
~~~~~~~~~~~~~~~~~
|
||||||
|
Architectures can also override any of the macros in their own
|
||||||
|
``asm/linkage.h``, including macros specifying the type of a symbol
|
||||||
|
(``SYM_T_FUNC``, ``SYM_T_OBJECT``, and ``SYM_T_NONE``). As every macro
|
||||||
|
described in this file is surrounded by ``#ifdef`` + ``#endif``, it is enough
|
||||||
|
to define the macros differently in the aforementioned architecture-dependent
|
||||||
|
header.
|
@@ -41,6 +41,8 @@ discard I/Os requests number of discard I/Os processed
|
|||||||
discard merges requests number of discard I/Os merged with in-queue I/O
|
discard merges requests number of discard I/Os merged with in-queue I/O
|
||||||
discard sectors sectors number of sectors discarded
|
discard sectors sectors number of sectors discarded
|
||||||
discard ticks milliseconds total wait time for discard requests
|
discard ticks milliseconds total wait time for discard requests
|
||||||
|
flush I/Os requests number of flush I/Os processed
|
||||||
|
flush ticks milliseconds total wait time for flush requests
|
||||||
=============== ============= =================================================
|
=============== ============= =================================================
|
||||||
|
|
||||||
read I/Os, write I/Os, discard I/Os
|
read I/Os, write I/Os, discard I/Os
|
||||||
@@ -48,6 +50,14 @@ read I/Os, write I/Os, discard I/0s
|
|||||||
|
|
||||||
These values increment when an I/O request completes.
|
These values increment when an I/O request completes.
|
||||||
|
|
||||||
|
flush I/Os
|
||||||
|
==========
|
||||||
|
|
||||||
|
These values increment when a flush I/O request completes.
|
||||||
|
|
||||||
|
Block layer combines flush requests and executes at most one at a time.
|
||||||
|
This counts flush requests executed by disk. Not tracked for partitions.
|
||||||
|
|
||||||
read merges, write merges, discard merges
|
read merges, write merges, discard merges
|
||||||
=========================================
|
=========================================
|
||||||
|
|
||||||
@@ -62,8 +72,8 @@ discarded from this block device. The "sectors" in question are the
|
|||||||
standard UNIX 512-byte sectors, not any device- or filesystem-specific
|
standard UNIX 512-byte sectors, not any device- or filesystem-specific
|
||||||
block size. The counters are incremented when the I/O completes.
|
block size. The counters are incremented when the I/O completes.
|
||||||
|
|
||||||
read ticks, write ticks, discard ticks
|
read ticks, write ticks, discard ticks, flush ticks
|
||||||
======================================
|
===================================================
|
||||||
|
|
||||||
These values count the number of milliseconds that I/O requests have
|
These values count the number of milliseconds that I/O requests have
|
||||||
waited on this block device. If there are multiple I/O requests waiting,
|
waited on this block device. If there are multiple I/O requests waiting,
|
||||||
|
@@ -47,6 +47,15 @@ Program types
|
|||||||
prog_flow_dissector
|
prog_flow_dissector
|
||||||
|
|
||||||
|
|
||||||
|
Testing BPF
|
||||||
|
===========
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
|
||||||
|
s390
|
||||||
|
|
||||||
|
|
||||||
.. Links:
|
.. Links:
|
||||||
.. _Documentation/networking/filter.txt: ../networking/filter.txt
|
.. _Documentation/networking/filter.txt: ../networking/filter.txt
|
||||||
.. _man-pages: https://www.kernel.org/doc/man-pages/
|
.. _man-pages: https://www.kernel.org/doc/man-pages/
|
||||||
|
@@ -142,3 +142,6 @@ BPF flow dissector doesn't support exporting all the metadata that in-kernel
|
|||||||
C-based implementation can export. Notable example is single VLAN (802.1Q)
|
C-based implementation can export. Notable example is single VLAN (802.1Q)
|
||||||
and double VLAN (802.1AD) tags. Please refer to the ``struct bpf_flow_keys``
|
and double VLAN (802.1AD) tags. Please refer to the ``struct bpf_flow_keys``
|
||||||
for a set of information that can currently be exported from the BPF context.
|
for a set of information that can currently be exported from the BPF context.
|
||||||
|
|
||||||
|
When BPF flow dissector is attached to the root network namespace (machine-wide
|
||||||
|
policy), users can't override it in their child network namespaces.
|
||||||
|
205
Documentation/bpf/s390.rst
Normal file
205
Documentation/bpf/s390.rst
Normal file
@@ -0,0 +1,205 @@
|
|||||||
|
===================
|
||||||
|
Testing BPF on s390
|
||||||
|
===================
|
||||||
|
|
||||||
|
1. Introduction
|
||||||
|
***************
|
||||||
|
|
||||||
|
IBM Z are mainframe computers, which are descendants of IBM System/360 from
|
||||||
|
year 1964. They are supported by the Linux kernel under the name "s390". This
|
||||||
|
document describes how to test BPF in an s390 QEMU guest.
|
||||||
|
|
||||||
|
2. One-time setup
|
||||||
|
*****************
|
||||||
|
|
||||||
|
The following is required to build and run the test suite:
|
||||||
|
|
||||||
|
* s390 GCC
|
||||||
|
* s390 development headers and libraries
|
||||||
|
* Clang with BPF support
|
||||||
|
* QEMU with s390 support
|
||||||
|
* Disk image with s390 rootfs
|
||||||
|
|
||||||
|
Debian supports installing compiler and libraries for s390 out of the box.
|
||||||
|
Users of other distros may use debootstrap in order to set up a Debian chroot::
|
||||||
|
|
||||||
|
sudo debootstrap \
|
||||||
|
--variant=minbase \
|
||||||
|
--include=sudo \
|
||||||
|
testing \
|
||||||
|
./s390-toolchain
|
||||||
|
sudo mount --rbind /dev ./s390-toolchain/dev
|
||||||
|
sudo mount --rbind /proc ./s390-toolchain/proc
|
||||||
|
sudo mount --rbind /sys ./s390-toolchain/sys
|
||||||
|
sudo chroot ./s390-toolchain
|
||||||
|
|
||||||
|
Once on Debian, the build prerequisites can be installed as follows::
|
||||||
|
|
||||||
|
sudo dpkg --add-architecture s390x
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install \
|
||||||
|
bc \
|
||||||
|
bison \
|
||||||
|
cmake \
|
||||||
|
debootstrap \
|
||||||
|
dwarves \
|
||||||
|
flex \
|
||||||
|
g++ \
|
||||||
|
gcc \
|
||||||
|
g++-s390x-linux-gnu \
|
||||||
|
gcc-s390x-linux-gnu \
|
||||||
|
gdb-multiarch \
|
||||||
|
git \
|
||||||
|
make \
|
||||||
|
python3 \
|
||||||
|
qemu-system-misc \
|
||||||
|
qemu-utils \
|
||||||
|
rsync \
|
||||||
|
libcap-dev:s390x \
|
||||||
|
libelf-dev:s390x \
|
||||||
|
libncurses-dev
|
||||||
|
|
||||||
|
Latest Clang targeting BPF can be installed as follows::
|
||||||
|
|
||||||
|
git clone https://github.com/llvm/llvm-project.git
|
||||||
|
ln -s ../../clang llvm-project/llvm/tools/
|
||||||
|
mkdir llvm-project-build
|
||||||
|
cd llvm-project-build
|
||||||
|
cmake \
|
||||||
|
-DLLVM_TARGETS_TO_BUILD=BPF \
|
||||||
|
-DCMAKE_BUILD_TYPE=Release \
|
||||||
|
-DCMAKE_INSTALL_PREFIX=/opt/clang-bpf \
|
||||||
|
../llvm-project/llvm
|
||||||
|
make
|
||||||
|
sudo make install
|
||||||
|
export PATH=/opt/clang-bpf/bin:$PATH
|
||||||
|
|
||||||
|
The disk image can be prepared using a loopback mount and debootstrap::
|
||||||
|
|
||||||
|
qemu-img create -f raw ./s390.img 1G
|
||||||
|
sudo losetup -f ./s390.img
|
||||||
|
sudo mkfs.ext4 /dev/loopX
|
||||||
|
mkdir ./s390.rootfs
|
||||||
|
sudo mount /dev/loopX ./s390.rootfs
|
||||||
|
sudo debootstrap \
|
||||||
|
--foreign \
|
||||||
|
--arch=s390x \
|
||||||
|
--variant=minbase \
|
||||||
|
--include=" \
|
||||||
|
iproute2, \
|
||||||
|
iputils-ping, \
|
||||||
|
isc-dhcp-client, \
|
||||||
|
kmod, \
|
||||||
|
libcap2, \
|
||||||
|
libelf1, \
|
||||||
|
netcat, \
|
||||||
|
procps" \
|
||||||
|
testing \
|
||||||
|
./s390.rootfs
|
||||||
|
sudo umount ./s390.rootfs
|
||||||
|
sudo losetup -d /dev/loopX
|
||||||
|
|
||||||
|
3. Compilation
|
||||||
|
**************
|
||||||
|
|
||||||
|
In addition to the usual Kconfig options required to run the BPF test suite, it
|
||||||
|
is also helpful to select::
|
||||||
|
|
||||||
|
CONFIG_NET_9P=y
|
||||||
|
CONFIG_9P_FS=y
|
||||||
|
CONFIG_NET_9P_VIRTIO=y
|
||||||
|
CONFIG_VIRTIO_PCI=y
|
||||||
|
|
||||||
|
as that would enable a very easy way to share files with the s390 virtual
|
||||||
|
machine.
|
||||||
|
|
||||||
|
Compiling kernel, modules and testsuite, as well as preparing gdb scripts to
|
||||||
|
simplify debugging, can be done using the following commands::
|
||||||
|
|
||||||
|
make ARCH=s390 CROSS_COMPILE=s390x-linux-gnu- menuconfig
|
||||||
|
make ARCH=s390 CROSS_COMPILE=s390x-linux-gnu- bzImage modules scripts_gdb
|
||||||
|
make ARCH=s390 CROSS_COMPILE=s390x-linux-gnu- \
|
||||||
|
-C tools/testing/selftests \
|
||||||
|
TARGETS=bpf \
|
||||||
|
INSTALL_PATH=$PWD/tools/testing/selftests/kselftest_install \
|
||||||
|
install
|
||||||
|
|
||||||
|
4. Running the test suite
|
||||||
|
*************************
|
||||||
|
|
||||||
|
The virtual machine can be started as follows::
|
||||||
|
|
||||||
|
qemu-system-s390x \
|
||||||
|
-cpu max,zpci=on \
|
||||||
|
-smp 2 \
|
||||||
|
-m 4G \
|
||||||
|
-kernel linux/arch/s390/boot/compressed/vmlinux \
|
||||||
|
-drive file=./s390.img,if=virtio,format=raw \
|
||||||
|
-nographic \
|
||||||
|
-append 'root=/dev/vda rw console=ttyS1' \
|
||||||
|
-virtfs local,path=./linux,security_model=none,mount_tag=linux \
|
||||||
|
-object rng-random,filename=/dev/urandom,id=rng0 \
|
||||||
|
-device virtio-rng-ccw,rng=rng0 \
|
||||||
|
-netdev user,id=net0 \
|
||||||
|
-device virtio-net-ccw,netdev=net0
|
||||||
|
|
||||||
|
When using this on a real IBM Z, ``-enable-kvm`` may be added for better
|
||||||
|
performance. When starting the virtual machine for the first time, disk image
|
||||||
|
setup must be finalized using the following command::
|
||||||
|
|
||||||
|
/debootstrap/debootstrap --second-stage
|
||||||
|
|
||||||
|
Directory with the code built on the host as well as ``/proc`` and ``/sys``
|
||||||
|
need to be mounted as follows::
|
||||||
|
|
||||||
|
mkdir -p /linux
|
||||||
|
mount -t 9p linux /linux
|
||||||
|
mount -t proc proc /proc
|
||||||
|
mount -t sysfs sys /sys
|
||||||
|
|
||||||
|
After that, the test suite can be run using the following commands::
|
||||||
|
|
||||||
|
cd /linux/tools/testing/selftests/kselftest_install
|
||||||
|
./run_kselftest.sh
|
||||||
|
|
||||||
|
As usual, tests can also be run individually::
|
||||||
|
|
||||||
|
cd /linux/tools/testing/selftests/bpf
|
||||||
|
./test_verifier
|
||||||
|
|
||||||
|
5. Debugging
|
||||||
|
************
|
||||||
|
|
||||||
|
It is possible to debug the s390 kernel using QEMU GDB stub, which is activated
|
||||||
|
by passing ``-s`` to QEMU.
|
||||||
|
|
||||||
|
It is preferable to turn KASLR off, so that gdb would know where to find the
|
||||||
|
kernel image in memory, by building the kernel with::
|
||||||
|
|
||||||
|
RANDOMIZE_BASE=n
|
||||||
|
|
||||||
|
GDB can then be attached using the following command::
|
||||||
|
|
||||||
|
gdb-multiarch -ex 'target remote localhost:1234' ./vmlinux
|
||||||
|
|
||||||
|
6. Network
|
||||||
|
**********
|
||||||
|
|
||||||
|
In case one needs to use the network in the virtual machine in order to e.g.
|
||||||
|
install additional packages, it can be configured using::
|
||||||
|
|
||||||
|
dhclient eth0
|
||||||
|
|
||||||
|
7. Links
|
||||||
|
********
|
||||||
|
|
||||||
|
This document is a compilation of techniques, whose more comprehensive
|
||||||
|
descriptions can be found by following these links:
|
||||||
|
|
||||||
|
- `Debootstrap <https://wiki.debian.org/EmDebian/CrossDebootstrap>`_
|
||||||
|
- `Multiarch <https://wiki.debian.org/Multiarch/HOWTO>`_
|
||||||
|
- `Building LLVM <https://llvm.org/docs/CMake.html>`_
|
||||||
|
- `Cross-compiling the kernel <https://wiki.gentoo.org/wiki/Embedded_Handbook/General/Cross-compiling_the_kernel>`_
|
||||||
|
- `QEMU s390x Guest Support <https://wiki.qemu.org/Documentation/Platforms/S390X>`_
|
||||||
|
- `Plan 9 folder sharing over Virtio <https://wiki.qemu.org/Documentation/9psetup>`_
|
||||||
|
- `Using GDB with QEMU <https://wiki.osdev.org/Kernel_Debugging#Use_GDB_with_QEMU>`_
|
@@ -37,7 +37,8 @@ needs_sphinx = '1.3'
|
|||||||
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||||
# ones.
|
# ones.
|
||||||
extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain',
|
extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain',
|
||||||
'kfigure', 'sphinx.ext.ifconfig', 'automarkup']
|
'kfigure', 'sphinx.ext.ifconfig', 'automarkup',
|
||||||
|
'maintainers_include']
|
||||||
|
|
||||||
# The name of the math extension changed on Sphinx 1.4
|
# The name of the math extension changed on Sphinx 1.4
|
||||||
if (major == 1 and minor > 3) or (major > 1):
|
if (major == 1 and minor > 3) or (major > 1):
|
||||||
|
@@ -23,7 +23,7 @@ begins with the creation of a pool using one of:
|
|||||||
.. kernel-doc:: lib/genalloc.c
|
.. kernel-doc:: lib/genalloc.c
|
||||||
:functions: devm_gen_pool_create
|
:functions: devm_gen_pool_create
|
||||||
|
|
||||||
A call to :c:func:`gen_pool_create` will create a pool. The granularity of
|
A call to gen_pool_create() will create a pool. The granularity of
|
||||||
allocations is set with min_alloc_order; it is a log-base-2 number like
|
allocations is set with min_alloc_order; it is a log-base-2 number like
|
||||||
those used by the page allocator, but it refers to bytes rather than pages.
|
those used by the page allocator, but it refers to bytes rather than pages.
|
||||||
So, if min_alloc_order is passed as 3, then all allocations will be a
|
So, if min_alloc_order is passed as 3, then all allocations will be a
|
||||||
@@ -32,7 +32,7 @@ required to track the memory in the pool. The nid parameter specifies
|
|||||||
which NUMA node should be used for the allocation of the housekeeping
|
which NUMA node should be used for the allocation of the housekeeping
|
||||||
structures; it can be -1 if the caller doesn't care.
|
structures; it can be -1 if the caller doesn't care.
|
||||||
|
|
||||||
The "managed" interface :c:func:`devm_gen_pool_create` ties the pool to a
|
The "managed" interface devm_gen_pool_create() ties the pool to a
|
||||||
specific device. Among other things, it will automatically clean up the
|
specific device. Among other things, it will automatically clean up the
|
||||||
pool when the given device is destroyed.
|
pool when the given device is destroyed.
|
||||||
|
|
||||||
@@ -53,32 +53,32 @@ to the pool. That can be done with one of:
|
|||||||
:functions: gen_pool_add
|
:functions: gen_pool_add
|
||||||
|
|
||||||
.. kernel-doc:: lib/genalloc.c
|
.. kernel-doc:: lib/genalloc.c
|
||||||
:functions: gen_pool_add_virt
|
:functions: gen_pool_add_owner
|
||||||
|
|
||||||
A call to :c:func:`gen_pool_add` will place the size bytes of memory
|
A call to gen_pool_add() will place the size bytes of memory
|
||||||
starting at addr (in the kernel's virtual address space) into the given
|
starting at addr (in the kernel's virtual address space) into the given
|
||||||
pool, once again using nid as the node ID for ancillary memory allocations.
|
pool, once again using nid as the node ID for ancillary memory allocations.
|
||||||
The :c:func:`gen_pool_add_virt` variant associates an explicit physical
|
The gen_pool_add_virt() variant associates an explicit physical
|
||||||
address with the memory; this is only necessary if the pool will be used
|
address with the memory; this is only necessary if the pool will be used
|
||||||
for DMA allocations.
|
for DMA allocations.
|
||||||
|
|
||||||
The functions for allocating memory from the pool (and putting it back)
|
The functions for allocating memory from the pool (and putting it back)
|
||||||
are:
|
are:
|
||||||
|
|
||||||
.. kernel-doc:: lib/genalloc.c
|
.. kernel-doc:: include/linux/genalloc.h
|
||||||
:functions: gen_pool_alloc
|
:functions: gen_pool_alloc
|
||||||
|
|
||||||
.. kernel-doc:: lib/genalloc.c
|
.. kernel-doc:: lib/genalloc.c
|
||||||
:functions: gen_pool_dma_alloc
|
:functions: gen_pool_dma_alloc
|
||||||
|
|
||||||
.. kernel-doc:: lib/genalloc.c
|
.. kernel-doc:: lib/genalloc.c
|
||||||
:functions: gen_pool_free
|
:functions: gen_pool_free_owner
|
||||||
|
|
||||||
As one would expect, :c:func:`gen_pool_alloc` will allocate size bytes
|
As one would expect, gen_pool_alloc() will allocate size bytes
|
||||||
from the given pool. The :c:func:`gen_pool_dma_alloc` variant allocates
|
from the given pool. The gen_pool_dma_alloc() variant allocates
|
||||||
memory for use with DMA operations, returning the associated physical
|
memory for use with DMA operations, returning the associated physical
|
||||||
address in the space pointed to by dma. This will only work if the memory
|
address in the space pointed to by dma. This will only work if the memory
|
||||||
was added with :c:func:`gen_pool_add_virt`. Note that this function
|
was added with gen_pool_add_virt(). Note that this function
|
||||||
departs from the usual genpool pattern of using unsigned long values to
|
departs from the usual genpool pattern of using unsigned long values to
|
||||||
represent kernel addresses; it returns a void * instead.
|
represent kernel addresses; it returns a void * instead.
|
||||||
|
|
||||||
@@ -89,14 +89,14 @@ return. If that sort of control is needed, the following functions will be
|
|||||||
of interest:
|
of interest:
|
||||||
|
|
||||||
.. kernel-doc:: lib/genalloc.c
|
.. kernel-doc:: lib/genalloc.c
|
||||||
:functions: gen_pool_alloc_algo
|
:functions: gen_pool_alloc_algo_owner
|
||||||
|
|
||||||
.. kernel-doc:: lib/genalloc.c
|
.. kernel-doc:: lib/genalloc.c
|
||||||
:functions: gen_pool_set_algo
|
:functions: gen_pool_set_algo
|
||||||
|
|
||||||
Allocations with :c:func:`gen_pool_alloc_algo` specify an algorithm to be
|
Allocations with gen_pool_alloc_algo() specify an algorithm to be
|
||||||
used to choose the memory to be allocated; the default algorithm can be set
|
used to choose the memory to be allocated; the default algorithm can be set
|
||||||
with :c:func:`gen_pool_set_algo`. The data value is passed to the
|
with gen_pool_set_algo(). The data value is passed to the
|
||||||
algorithm; most ignore it, but it is occasionally needed. One can,
|
algorithm; most ignore it, but it is occasionally needed. One can,
|
||||||
naturally, write a special-purpose algorithm, but there is a fair set
|
naturally, write a special-purpose algorithm, but there is a fair set
|
||||||
already available:
|
already available:
|
||||||
@@ -129,7 +129,7 @@ writing of special-purpose memory allocators in the future.
|
|||||||
:functions: gen_pool_for_each_chunk
|
:functions: gen_pool_for_each_chunk
|
||||||
|
|
||||||
.. kernel-doc:: lib/genalloc.c
|
.. kernel-doc:: lib/genalloc.c
|
||||||
:functions: addr_in_gen_pool
|
:functions: gen_pool_has_addr
|
||||||
|
|
||||||
.. kernel-doc:: lib/genalloc.c
|
.. kernel-doc:: lib/genalloc.c
|
||||||
:functions: gen_pool_avail
|
:functions: gen_pool_avail
|
||||||
|
@@ -26,7 +26,7 @@ Rationale
|
|||||||
=========
|
=========
|
||||||
|
|
||||||
The original implementation of interrupt handling in Linux uses the
|
The original implementation of interrupt handling in Linux uses the
|
||||||
:c:func:`__do_IRQ` super-handler, which is able to deal with every type of
|
__do_IRQ() super-handler, which is able to deal with every type of
|
||||||
interrupt logic.
|
interrupt logic.
|
||||||
|
|
||||||
Originally, Russell King identified different types of handlers to build
|
Originally, Russell King identified different types of handlers to build
|
||||||
@@ -43,7 +43,7 @@ During the implementation we identified another type:
|
|||||||
|
|
||||||
- Fast EOI type
|
- Fast EOI type
|
||||||
|
|
||||||
In the SMP world of the :c:func:`__do_IRQ` super-handler another type was
|
In the SMP world of the __do_IRQ() super-handler another type was
|
||||||
identified:
|
identified:
|
||||||
|
|
||||||
- Per CPU type
|
- Per CPU type
|
||||||
@@ -83,7 +83,7 @@ IRQ-flow implementation for 'level type' interrupts and add a
|
|||||||
(sub)architecture specific 'edge type' implementation.
|
(sub)architecture specific 'edge type' implementation.
|
||||||
|
|
||||||
To make the transition to the new model easier and prevent the breakage
|
To make the transition to the new model easier and prevent the breakage
|
||||||
of existing implementations, the :c:func:`__do_IRQ` super-handler is still
|
of existing implementations, the __do_IRQ() super-handler is still
|
||||||
available. This leads to a kind of duality for the time being. Over time
|
available. This leads to a kind of duality for the time being. Over time
|
||||||
the new model should be used in more and more architectures, as it
|
the new model should be used in more and more architectures, as it
|
||||||
enables smaller and cleaner IRQ subsystems. It's deprecated for three
|
enables smaller and cleaner IRQ subsystems. It's deprecated for three
|
||||||
@@ -116,7 +116,7 @@ status information and pointers to the interrupt flow method and the
|
|||||||
interrupt chip structure which are assigned to this interrupt.
|
interrupt chip structure which are assigned to this interrupt.
|
||||||
|
|
||||||
Whenever an interrupt triggers, the low-level architecture code calls
|
Whenever an interrupt triggers, the low-level architecture code calls
|
||||||
into the generic interrupt code by calling :c:func:`desc->handle_irq`. This
|
into the generic interrupt code by calling desc->handle_irq(). This
|
||||||
high-level IRQ handling function only uses desc->irq_data.chip
|
high-level IRQ handling function only uses desc->irq_data.chip
|
||||||
primitives referenced by the assigned chip descriptor structure.
|
primitives referenced by the assigned chip descriptor structure.
|
||||||
|
|
||||||
@@ -125,27 +125,29 @@ High-level Driver API
|
|||||||
|
|
||||||
The high-level Driver API consists of the following functions:
|
The high-level Driver API consists of the following functions:
|
||||||
|
|
||||||
- :c:func:`request_irq`
|
- request_irq()
|
||||||
|
|
||||||
- :c:func:`free_irq`
|
- request_threaded_irq()
|
||||||
|
|
||||||
- :c:func:`disable_irq`
|
- free_irq()
|
||||||
|
|
||||||
- :c:func:`enable_irq`
|
- disable_irq()
|
||||||
|
|
||||||
- :c:func:`disable_irq_nosync` (SMP only)
|
- enable_irq()
|
||||||
|
|
||||||
- :c:func:`synchronize_irq` (SMP only)
|
- disable_irq_nosync() (SMP only)
|
||||||
|
|
||||||
- :c:func:`irq_set_irq_type`
|
- synchronize_irq() (SMP only)
|
||||||
|
|
||||||
- :c:func:`irq_set_irq_wake`
|
- irq_set_irq_type()
|
||||||
|
|
||||||
- :c:func:`irq_set_handler_data`
|
- irq_set_irq_wake()
|
||||||
|
|
||||||
- :c:func:`irq_set_chip`
|
- irq_set_handler_data()
|
||||||
|
|
||||||
- :c:func:`irq_set_chip_data`
|
- irq_set_chip()
|
||||||
|
|
||||||
|
- irq_set_chip_data()
|
||||||
|
|
||||||
See the autogenerated function documentation for details.
|
See the autogenerated function documentation for details.
|
||||||
|
|
||||||
@@ -154,19 +156,19 @@ High-level IRQ flow handlers
|
|||||||
|
|
||||||
The generic layer provides a set of pre-defined irq-flow methods:
|
The generic layer provides a set of pre-defined irq-flow methods:
|
||||||
|
|
||||||
- :c:func:`handle_level_irq`
|
- handle_level_irq()
|
||||||
|
|
||||||
- :c:func:`handle_edge_irq`
|
- handle_edge_irq()
|
||||||
|
|
||||||
- :c:func:`handle_fasteoi_irq`
|
- handle_fasteoi_irq()
|
||||||
|
|
||||||
- :c:func:`handle_simple_irq`
|
- handle_simple_irq()
|
||||||
|
|
||||||
- :c:func:`handle_percpu_irq`
|
- handle_percpu_irq()
|
||||||
|
|
||||||
- :c:func:`handle_edge_eoi_irq`
|
- handle_edge_eoi_irq()
|
||||||
|
|
||||||
- :c:func:`handle_bad_irq`
|
- handle_bad_irq()
|
||||||
|
|
||||||
The interrupt flow handlers (either pre-defined or architecture
|
The interrupt flow handlers (either pre-defined or architecture
|
||||||
specific) are assigned to specific interrupts by the architecture either
|
specific) are assigned to specific interrupts by the architecture either
|
||||||
@@ -325,14 +327,14 @@ Delayed interrupt disable
|
|||||||
|
|
||||||
This per interrupt selectable feature, which was introduced by Russell
|
This per interrupt selectable feature, which was introduced by Russell
|
||||||
King in the ARM interrupt implementation, does not mask an interrupt at
|
King in the ARM interrupt implementation, does not mask an interrupt at
|
||||||
the hardware level when :c:func:`disable_irq` is called. The interrupt is kept
|
the hardware level when disable_irq() is called. The interrupt is kept
|
||||||
enabled and is masked in the flow handler when an interrupt event
|
enabled and is masked in the flow handler when an interrupt event
|
||||||
happens. This prevents losing edge interrupts on hardware which does not
|
happens. This prevents losing edge interrupts on hardware which does not
|
||||||
store an edge interrupt event while the interrupt is disabled at the
|
store an edge interrupt event while the interrupt is disabled at the
|
||||||
hardware level. When an interrupt arrives while the IRQ_DISABLED flag
|
hardware level. When an interrupt arrives while the IRQ_DISABLED flag
|
||||||
is set, then the interrupt is masked at the hardware level and the
|
is set, then the interrupt is masked at the hardware level and the
|
||||||
IRQ_PENDING bit is set. When the interrupt is re-enabled by
|
IRQ_PENDING bit is set. When the interrupt is re-enabled by
|
||||||
:c:func:`enable_irq` the pending bit is checked and if it is set, the interrupt
|
enable_irq() the pending bit is checked and if it is set, the interrupt
|
||||||
is resent either via hardware or by a software resend mechanism. (It's
|
is resent either via hardware or by a software resend mechanism. (It's
|
||||||
necessary to enable CONFIG_HARDIRQS_SW_RESEND when you want to use
|
necessary to enable CONFIG_HARDIRQS_SW_RESEND when you want to use
|
||||||
the delayed interrupt disable feature and your hardware is not capable
|
the delayed interrupt disable feature and your hardware is not capable
|
||||||
@@ -369,7 +371,7 @@ handler(s) to use these basic units of low-level functionality.
|
|||||||
__do_IRQ entry point
|
__do_IRQ entry point
|
||||||
====================
|
====================
|
||||||
|
|
||||||
The original implementation :c:func:`__do_IRQ` was an alternative entry point
|
The original implementation __do_IRQ() was an alternative entry point
|
||||||
for all types of interrupts. It no longer exists.
|
for all types of interrupts. It no longer exists.
|
||||||
|
|
||||||
This handler turned out to be not suitable for all interrupt hardware
|
This handler turned out to be not suitable for all interrupt hardware
|
||||||
|
@@ -57,7 +57,13 @@ The Linux kernel provides more basic utility functions.
|
|||||||
Bit Operations
|
Bit Operations
|
||||||
--------------
|
--------------
|
||||||
|
|
||||||
.. kernel-doc:: include/asm-generic/bitops-instrumented.h
|
.. kernel-doc:: include/asm-generic/bitops/instrumented-atomic.h
|
||||||
|
:internal:
|
||||||
|
|
||||||
|
.. kernel-doc:: include/asm-generic/bitops/instrumented-non-atomic.h
|
||||||
|
:internal:
|
||||||
|
|
||||||
|
.. kernel-doc:: include/asm-generic/bitops/instrumented-lock.h
|
||||||
:internal:
|
:internal:
|
||||||
|
|
||||||
Bitmap Operations
|
Bitmap Operations
|
||||||
|
@@ -88,10 +88,11 @@ Selecting memory allocator
|
|||||||
==========================
|
==========================
|
||||||
|
|
||||||
The most straightforward way to allocate memory is to use a function
|
The most straightforward way to allocate memory is to use a function
|
||||||
from the :c:func:`kmalloc` family. And, to be on the safe size it's
|
from the kmalloc() family. And, to be on the safe side it's best to use
|
||||||
best to use routines that set memory to zero, like
|
routines that set memory to zero, like kzalloc(). If you need to
|
||||||
:c:func:`kzalloc`. If you need to allocate memory for an array, there
|
allocate memory for an array, there are kmalloc_array() and kcalloc()
|
||||||
are :c:func:`kmalloc_array` and :c:func:`kcalloc` helpers.
|
helpers. The helpers struct_size(), array_size() and array3_size() can
|
||||||
|
be used to safely calculate object sizes without overflowing.
|
||||||
|
|
||||||
The maximal size of a chunk that can be allocated with `kmalloc` is
|
The maximal size of a chunk that can be allocated with `kmalloc` is
|
||||||
limited. The actual limit depends on the hardware and the kernel
|
limited. The actual limit depends on the hardware and the kernel
|
||||||
@@ -102,29 +103,26 @@ The address of a chunk allocated with `kmalloc` is aligned to at least
|
|||||||
ARCH_KMALLOC_MINALIGN bytes. For sizes which are a power of two, the
|
ARCH_KMALLOC_MINALIGN bytes. For sizes which are a power of two, the
|
||||||
alignment is also guaranteed to be at least the respective size.
|
alignment is also guaranteed to be at least the respective size.
|
||||||
|
|
||||||
For large allocations you can use :c:func:`vmalloc` and
|
For large allocations you can use vmalloc() and vzalloc(), or directly
|
||||||
:c:func:`vzalloc`, or directly request pages from the page
|
request pages from the page allocator. The memory allocated by `vmalloc`
|
||||||
allocator. The memory allocated by `vmalloc` and related functions is
|
and related functions is not physically contiguous.
|
||||||
not physically contiguous.
|
|
||||||
|
|
||||||
If you are not sure whether the allocation size is too large for
|
If you are not sure whether the allocation size is too large for
|
||||||
`kmalloc`, it is possible to use :c:func:`kvmalloc` and its
|
`kmalloc`, it is possible to use kvmalloc() and its derivatives. It will
|
||||||
derivatives. It will try to allocate memory with `kmalloc` and if the
|
try to allocate memory with `kmalloc` and if the allocation fails it
|
||||||
allocation fails it will be retried with `vmalloc`. There are
|
will be retried with `vmalloc`. There are restrictions on which GFP
|
||||||
restrictions on which GFP flags can be used with `kvmalloc`; please
|
flags can be used with `kvmalloc`; please see kvmalloc_node() reference
|
||||||
see :c:func:`kvmalloc_node` reference documentation. Note that
|
documentation. Note that `kvmalloc` may return memory that is not
|
||||||
`kvmalloc` may return memory that is not physically contiguous.
|
physically contiguous.
|
||||||
|
|
||||||
If you need to allocate many identical objects you can use the slab
|
If you need to allocate many identical objects you can use the slab
|
||||||
cache allocator. The cache should be set up with
|
cache allocator. The cache should be set up with kmem_cache_create() or
|
||||||
:c:func:`kmem_cache_create` or :c:func:`kmem_cache_create_usercopy`
|
kmem_cache_create_usercopy() before it can be used. The second function
|
||||||
before it can be used. The second function should be used if a part of
|
should be used if a part of the cache might be copied to the userspace.
|
||||||
the cache might be copied to the userspace. After the cache is
|
After the cache is created kmem_cache_alloc() and its convenience
|
||||||
created :c:func:`kmem_cache_alloc` and its convenience wrappers can
|
wrappers can allocate memory from that cache.
|
||||||
allocate memory from that cache.
|
|
||||||
|
|
||||||
When the allocated memory is no longer needed it must be freed. You
|
When the allocated memory is no longer needed it must be freed. You can
|
||||||
can use :c:func:`kvfree` for the memory allocated with `kmalloc`,
|
use kvfree() for the memory allocated with `kmalloc`, `vmalloc` and
|
||||||
`vmalloc` and `kvmalloc`. The slab caches should be freed with
|
`kvmalloc`. The slab caches should be freed with kmem_cache_free(). And
|
||||||
:c:func:`kmem_cache_free`. And don't forget to destroy the cache with
|
don't forget to destroy the cache with kmem_cache_destroy().
|
||||||
:c:func:`kmem_cache_destroy`.
|
|
||||||
|
@@ -11,7 +11,7 @@ User Space Memory Access
|
|||||||
.. kernel-doc:: arch/x86/lib/usercopy_32.c
|
.. kernel-doc:: arch/x86/lib/usercopy_32.c
|
||||||
:export:
|
:export:
|
||||||
|
|
||||||
.. kernel-doc:: mm/util.c
|
.. kernel-doc:: mm/gup.c
|
||||||
:functions: get_user_pages_fast
|
:functions: get_user_pages_fast
|
||||||
|
|
||||||
.. _mm-api-gfp-flags:
|
.. _mm-api-gfp-flags:
|
||||||
|
@@ -79,6 +79,18 @@ has the added benefit of providing a unique identifier. On 64-bit machines
|
|||||||
the first 32 bits are zeroed. The kernel will print ``(ptrval)`` until it
|
the first 32 bits are zeroed. The kernel will print ``(ptrval)`` until it
|
||||||
gathers enough entropy. If you *really* want the address see %px below.
|
gathers enough entropy. If you *really* want the address see %px below.
|
||||||
|
|
||||||
|
Error Pointers
|
||||||
|
--------------
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
%pe -ENOSPC
|
||||||
|
|
||||||
|
For printing error pointers (i.e. a pointer for which IS_ERR() is true)
|
||||||
|
as a symbolic error name. Error values for which no symbolic name is
|
||||||
|
known are printed in decimal, while a non-ERR_PTR passed as the
|
||||||
|
argument to %pe gets treated as ordinary %p.
|
||||||
|
|
||||||
Symbols/Function Pointers
|
Symbols/Function Pointers
|
||||||
-------------------------
|
-------------------------
|
||||||
|
|
||||||
@@ -86,8 +98,6 @@ Symbols/Function Pointers
|
|||||||
|
|
||||||
%pS versatile_init+0x0/0x110
|
%pS versatile_init+0x0/0x110
|
||||||
%ps versatile_init
|
%ps versatile_init
|
||||||
%pF versatile_init+0x0/0x110
|
|
||||||
%pf versatile_init
|
|
||||||
%pSR versatile_init+0x9/0x110
|
%pSR versatile_init+0x9/0x110
|
||||||
(with __builtin_extract_return_addr() translation)
|
(with __builtin_extract_return_addr() translation)
|
||||||
%pB prev_fn_of_versatile_init+0x88/0x88
|
%pB prev_fn_of_versatile_init+0x88/0x88
|
||||||
@@ -97,14 +107,6 @@ The ``S`` and ``s`` specifiers are used for printing a pointer in symbolic
|
|||||||
format. They result in the symbol name with (S) or without (s)
|
format. They result in the symbol name with (S) or without (s)
|
||||||
offsets. If KALLSYMS is disabled then the symbol address is printed instead.
|
offsets. If KALLSYMS is disabled then the symbol address is printed instead.
|
||||||
|
|
||||||
Note, that the ``F`` and ``f`` specifiers are identical to ``S`` (``s``)
|
|
||||||
and thus deprecated. We have ``F`` and ``f`` because on ia64, ppc64 and
|
|
||||||
parisc64 function pointers are indirect and, in fact, are function
|
|
||||||
descriptors, which require additional dereferencing before we can lookup
|
|
||||||
the symbol. As of now, ``S`` and ``s`` perform dereferencing on those
|
|
||||||
platforms (when needed), so ``F`` and ``f`` exist for compatibility
|
|
||||||
reasons only.
|
|
||||||
|
|
||||||
The ``B`` specifier results in the symbol name with offsets and should be
|
The ``B`` specifier results in the symbol name with offsets and should be
|
||||||
used when printing stack backtraces. The specifier takes into
|
used when printing stack backtraces. The specifier takes into
|
||||||
consideration the effect of compiler optimisations which may occur
|
consideration the effect of compiler optimisations which may occur
|
||||||
@@ -135,6 +137,20 @@ equivalent to %lx (or %lu). %px is preferred because it is more uniquely
|
|||||||
grep'able. If in the future we need to modify the way the kernel handles
|
grep'able. If in the future we need to modify the way the kernel handles
|
||||||
printing pointers we will be better equipped to find the call sites.
|
printing pointers we will be better equipped to find the call sites.
|
||||||
|
|
||||||
|
Pointer Differences
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
%td 2560
|
||||||
|
%tx a00
|
||||||
|
|
||||||
|
For printing the pointer differences, use the %t modifier for ptrdiff_t.
|
||||||
|
|
||||||
|
Example::
|
||||||
|
|
||||||
|
printk("test: difference between pointers: %td\n", ptr2 - ptr1);
|
||||||
|
|
||||||
Struct Resources
|
Struct Resources
|
||||||
----------------
|
----------------
|
||||||
|
|
||||||
@@ -428,6 +444,30 @@ Examples::
|
|||||||
|
|
||||||
Passed by reference.
|
Passed by reference.
|
||||||
|
|
||||||
|
Fwnode handles
|
||||||
|
--------------
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
%pfw[fP]
|
||||||
|
|
||||||
|
For printing information on fwnode handles. The default is to print the full
|
||||||
|
node name, including the path. The modifiers are functionally equivalent to
|
||||||
|
%pOF above.
|
||||||
|
|
||||||
|
- f - full name of the node, including the path
|
||||||
|
- P - the name of the node including an address (if there is one)
|
||||||
|
|
||||||
|
Examples (ACPI)::
|
||||||
|
|
||||||
|
%pfwf \_SB.PCI0.CIO2.port@1.endpoint@0 - Full node name
|
||||||
|
%pfwP endpoint@0 - Node name
|
||||||
|
|
||||||
|
Examples (OF)::
|
||||||
|
|
||||||
|
%pfwf /ocp@68000000/i2c@48072000/camera@10/port/endpoint - Full name
|
||||||
|
%pfwP endpoint - Node name
|
||||||
|
|
||||||
Time and date (struct rtc_time)
|
Time and date (struct rtc_time)
|
||||||
-------------------------------
|
-------------------------------
|
||||||
|
|
||||||
|
@@ -35,7 +35,7 @@ atomics & refcounters only provide atomicity and
|
|||||||
program order (po) relation (on the same CPU). It guarantees that
|
program order (po) relation (on the same CPU). It guarantees that
|
||||||
each ``atomic_*()`` and ``refcount_*()`` operation is atomic and instructions
|
each ``atomic_*()`` and ``refcount_*()`` operation is atomic and instructions
|
||||||
are executed in program order on a single CPU.
|
are executed in program order on a single CPU.
|
||||||
This is implemented using :c:func:`READ_ONCE`/:c:func:`WRITE_ONCE` and
|
This is implemented using READ_ONCE()/WRITE_ONCE() and
|
||||||
compare-and-swap primitives.
|
compare-and-swap primitives.
|
||||||
|
|
||||||
A strong (full) memory ordering guarantees that all prior loads and
|
A strong (full) memory ordering guarantees that all prior loads and
|
||||||
@@ -44,7 +44,7 @@ before any po-later instruction is executed on the same CPU.
|
|||||||
It also guarantees that all po-earlier stores on the same CPU
|
It also guarantees that all po-earlier stores on the same CPU
|
||||||
and all propagated stores from other CPUs must propagate to all
|
and all propagated stores from other CPUs must propagate to all
|
||||||
other CPUs before any po-later instruction is executed on the original
|
other CPUs before any po-later instruction is executed on the original
|
||||||
CPU (A-cumulative property). This is implemented using :c:func:`smp_mb`.
|
CPU (A-cumulative property). This is implemented using smp_mb().
|
||||||
|
|
||||||
A RELEASE memory ordering guarantees that all prior loads and
|
A RELEASE memory ordering guarantees that all prior loads and
|
||||||
stores (all po-earlier instructions) on the same CPU are completed
|
stores (all po-earlier instructions) on the same CPU are completed
|
||||||
@@ -52,14 +52,14 @@ before the operation. It also guarantees that all po-earlier
|
|||||||
stores on the same CPU and all propagated stores from other CPUs
|
stores on the same CPU and all propagated stores from other CPUs
|
||||||
must propagate to all other CPUs before the release operation
|
must propagate to all other CPUs before the release operation
|
||||||
(A-cumulative property). This is implemented using
|
(A-cumulative property). This is implemented using
|
||||||
:c:func:`smp_store_release`.
|
smp_store_release().
|
||||||
|
|
||||||
An ACQUIRE memory ordering guarantees that all post loads and
|
An ACQUIRE memory ordering guarantees that all post loads and
|
||||||
stores (all po-later instructions) on the same CPU are
|
stores (all po-later instructions) on the same CPU are
|
||||||
completed after the acquire operation. It also guarantees that all
|
completed after the acquire operation. It also guarantees that all
|
||||||
po-later stores on the same CPU must propagate to all other CPUs
|
po-later stores on the same CPU must propagate to all other CPUs
|
||||||
after the acquire operation executes. This is implemented using
|
after the acquire operation executes. This is implemented using
|
||||||
:c:func:`smp_acquire__after_ctrl_dep`.
|
smp_acquire__after_ctrl_dep().
|
||||||
|
|
||||||
A control dependency (on success) for refcounters guarantees that
|
A control dependency (on success) for refcounters guarantees that
|
||||||
if a reference for an object was successfully obtained (reference
|
if a reference for an object was successfully obtained (reference
|
||||||
@@ -78,8 +78,8 @@ case 1) - non-"Read/Modify/Write" (RMW) ops
|
|||||||
|
|
||||||
Function changes:
|
Function changes:
|
||||||
|
|
||||||
* :c:func:`atomic_set` --> :c:func:`refcount_set`
|
* atomic_set() --> refcount_set()
|
||||||
* :c:func:`atomic_read` --> :c:func:`refcount_read`
|
* atomic_read() --> refcount_read()
|
||||||
|
|
||||||
Memory ordering guarantee changes:
|
Memory ordering guarantee changes:
|
||||||
|
|
||||||
@@ -91,8 +91,8 @@ case 2) - increment-based ops that return no value
|
|||||||
|
|
||||||
Function changes:
|
Function changes:
|
||||||
|
|
||||||
* :c:func:`atomic_inc` --> :c:func:`refcount_inc`
|
* atomic_inc() --> refcount_inc()
|
||||||
* :c:func:`atomic_add` --> :c:func:`refcount_add`
|
* atomic_add() --> refcount_add()
|
||||||
|
|
||||||
Memory ordering guarantee changes:
|
Memory ordering guarantee changes:
|
||||||
|
|
||||||
@@ -103,7 +103,7 @@ case 3) - decrement-based RMW ops that return no value
|
|||||||
|
|
||||||
Function changes:
|
Function changes:
|
||||||
|
|
||||||
* :c:func:`atomic_dec` --> :c:func:`refcount_dec`
|
* atomic_dec() --> refcount_dec()
|
||||||
|
|
||||||
Memory ordering guarantee changes:
|
Memory ordering guarantee changes:
|
||||||
|
|
||||||
@@ -115,8 +115,8 @@ case 4) - increment-based RMW ops that return a value
|
|||||||
|
|
||||||
Function changes:
|
Function changes:
|
||||||
|
|
||||||
* :c:func:`atomic_inc_not_zero` --> :c:func:`refcount_inc_not_zero`
|
* atomic_inc_not_zero() --> refcount_inc_not_zero()
|
||||||
* no atomic counterpart --> :c:func:`refcount_add_not_zero`
|
* no atomic counterpart --> refcount_add_not_zero()
|
||||||
|
|
||||||
Memory ordering guarantees changes:
|
Memory ordering guarantees changes:
|
||||||
|
|
||||||
@@ -131,8 +131,8 @@ case 5) - generic dec/sub decrement-based RMW ops that return a value
|
|||||||
|
|
||||||
Function changes:
|
Function changes:
|
||||||
|
|
||||||
* :c:func:`atomic_dec_and_test` --> :c:func:`refcount_dec_and_test`
|
* atomic_dec_and_test() --> refcount_dec_and_test()
|
||||||
* :c:func:`atomic_sub_and_test` --> :c:func:`refcount_sub_and_test`
|
* atomic_sub_and_test() --> refcount_sub_and_test()
|
||||||
|
|
||||||
Memory ordering guarantees changes:
|
Memory ordering guarantees changes:
|
||||||
|
|
||||||
@@ -144,14 +144,14 @@ case 6) other decrement-based RMW ops that return a value
|
|||||||
|
|
||||||
Function changes:
|
Function changes:
|
||||||
|
|
||||||
* no atomic counterpart --> :c:func:`refcount_dec_if_one`
|
* no atomic counterpart --> refcount_dec_if_one()
|
||||||
* ``atomic_add_unless(&var, -1, 1)`` --> ``refcount_dec_not_one(&var)``
|
* ``atomic_add_unless(&var, -1, 1)`` --> ``refcount_dec_not_one(&var)``
|
||||||
|
|
||||||
Memory ordering guarantees changes:
|
Memory ordering guarantees changes:
|
||||||
|
|
||||||
* fully ordered --> RELEASE ordering + control dependency
|
* fully ordered --> RELEASE ordering + control dependency
|
||||||
|
|
||||||
.. note:: :c:func:`atomic_add_unless` only provides full order on success.
|
.. note:: atomic_add_unless() only provides full order on success.
|
||||||
|
|
||||||
|
|
||||||
case 7) - lock-based RMW
|
case 7) - lock-based RMW
|
||||||
@@ -159,10 +159,10 @@ case 7) - lock-based RMW
|
|||||||
|
|
||||||
Function changes:
|
Function changes:
|
||||||
|
|
||||||
* :c:func:`atomic_dec_and_lock` --> :c:func:`refcount_dec_and_lock`
|
* atomic_dec_and_lock() --> refcount_dec_and_lock()
|
||||||
* :c:func:`atomic_dec_and_mutex_lock` --> :c:func:`refcount_dec_and_mutex_lock`
|
* atomic_dec_and_mutex_lock() --> refcount_dec_and_mutex_lock()
|
||||||
|
|
||||||
Memory ordering guarantees changes:
|
Memory ordering guarantees changes:
|
||||||
|
|
||||||
* fully ordered --> RELEASE ordering + control dependency + hold
|
* fully ordered --> RELEASE ordering + control dependency + hold
|
||||||
:c:func:`spin_lock` on success
|
spin_lock() on success
|
||||||
|
@@ -152,3 +152,6 @@ in-tree modules::
|
|||||||
- notice the warning of modpost telling about a missing import
|
- notice the warning of modpost telling about a missing import
|
||||||
- run `make nsdeps` to add the import to the correct code location
|
- run `make nsdeps` to add the import to the correct code location
|
||||||
|
|
||||||
|
You can also run nsdeps for external module builds. A typical usage is::
|
||||||
|
|
||||||
|
$ make -C <path_to_kernel_src> M=$PWD nsdeps
|
||||||
|
@@ -5,7 +5,7 @@ Block Cipher Algorithm Definitions
|
|||||||
:doc: Block Cipher Algorithm Definitions
|
:doc: Block Cipher Algorithm Definitions
|
||||||
|
|
||||||
.. kernel-doc:: include/linux/crypto.h
|
.. kernel-doc:: include/linux/crypto.h
|
||||||
:functions: crypto_alg ablkcipher_alg blkcipher_alg cipher_alg compress_alg
|
:functions: crypto_alg cipher_alg compress_alg
|
||||||
|
|
||||||
Symmetric Key Cipher API
|
Symmetric Key Cipher API
|
||||||
------------------------
|
------------------------
|
||||||
@@ -33,30 +33,3 @@ Single Block Cipher API
|
|||||||
|
|
||||||
.. kernel-doc:: include/linux/crypto.h
|
.. kernel-doc:: include/linux/crypto.h
|
||||||
:functions: crypto_alloc_cipher crypto_free_cipher crypto_has_cipher crypto_cipher_blocksize crypto_cipher_setkey crypto_cipher_encrypt_one crypto_cipher_decrypt_one
|
:functions: crypto_alloc_cipher crypto_free_cipher crypto_has_cipher crypto_cipher_blocksize crypto_cipher_setkey crypto_cipher_encrypt_one crypto_cipher_decrypt_one
|
||||||
|
|
||||||
Asynchronous Block Cipher API - Deprecated
|
|
||||||
------------------------------------------
|
|
||||||
|
|
||||||
.. kernel-doc:: include/linux/crypto.h
|
|
||||||
:doc: Asynchronous Block Cipher API
|
|
||||||
|
|
||||||
.. kernel-doc:: include/linux/crypto.h
|
|
||||||
:functions: crypto_free_ablkcipher crypto_has_ablkcipher crypto_ablkcipher_ivsize crypto_ablkcipher_blocksize crypto_ablkcipher_setkey crypto_ablkcipher_reqtfm crypto_ablkcipher_encrypt crypto_ablkcipher_decrypt
|
|
||||||
|
|
||||||
Asynchronous Cipher Request Handle - Deprecated
|
|
||||||
-----------------------------------------------
|
|
||||||
|
|
||||||
.. kernel-doc:: include/linux/crypto.h
|
|
||||||
:doc: Asynchronous Cipher Request Handle
|
|
||||||
|
|
||||||
.. kernel-doc:: include/linux/crypto.h
|
|
||||||
:functions: crypto_ablkcipher_reqsize ablkcipher_request_set_tfm ablkcipher_request_alloc ablkcipher_request_free ablkcipher_request_set_callback ablkcipher_request_set_crypt
|
|
||||||
|
|
||||||
Synchronous Block Cipher API - Deprecated
|
|
||||||
-----------------------------------------
|
|
||||||
|
|
||||||
.. kernel-doc:: include/linux/crypto.h
|
|
||||||
:doc: Synchronous Block Cipher API
|
|
||||||
|
|
||||||
.. kernel-doc:: include/linux/crypto.h
|
|
||||||
:functions: crypto_alloc_blkcipher crypto_free_blkcipher crypto_has_blkcipher crypto_blkcipher_name crypto_blkcipher_ivsize crypto_blkcipher_blocksize crypto_blkcipher_setkey crypto_blkcipher_encrypt crypto_blkcipher_encrypt_iv crypto_blkcipher_decrypt crypto_blkcipher_decrypt_iv crypto_blkcipher_set_iv crypto_blkcipher_get_iv
|
|
||||||
|
@@ -201,10 +201,6 @@ the aforementioned cipher types:
|
|||||||
- CRYPTO_ALG_TYPE_AEAD Authenticated Encryption with Associated Data
|
- CRYPTO_ALG_TYPE_AEAD Authenticated Encryption with Associated Data
|
||||||
(MAC)
|
(MAC)
|
||||||
|
|
||||||
- CRYPTO_ALG_TYPE_BLKCIPHER Synchronous multi-block cipher
|
|
||||||
|
|
||||||
- CRYPTO_ALG_TYPE_ABLKCIPHER Asynchronous multi-block cipher
|
|
||||||
|
|
||||||
- CRYPTO_ALG_TYPE_KPP Key-agreement Protocol Primitive (KPP) such as
|
- CRYPTO_ALG_TYPE_KPP Key-agreement Protocol Primitive (KPP) such as
|
||||||
an ECDH or DH implementation
|
an ECDH or DH implementation
|
||||||
|
|
||||||
|
@@ -63,8 +63,6 @@ request by using:
|
|||||||
When your driver receives a crypto_request, you must transfer it to
|
When your driver receives a crypto_request, you must transfer it to
|
||||||
the crypto engine via one of:
|
the crypto engine via one of:
|
||||||
|
|
||||||
* crypto_transfer_ablkcipher_request_to_engine()
|
|
||||||
|
|
||||||
* crypto_transfer_aead_request_to_engine()
|
* crypto_transfer_aead_request_to_engine()
|
||||||
|
|
||||||
* crypto_transfer_akcipher_request_to_engine()
|
* crypto_transfer_akcipher_request_to_engine()
|
||||||
@@ -75,8 +73,6 @@ the crypto engine via one of:
|
|||||||
|
|
||||||
At the end of the request process, a call to one of the following functions is needed:
|
At the end of the request process, a call to one of the following functions is needed:
|
||||||
|
|
||||||
* crypto_finalize_ablkcipher_request()
|
|
||||||
|
|
||||||
* crypto_finalize_aead_request()
|
* crypto_finalize_aead_request()
|
||||||
|
|
||||||
* crypto_finalize_akcipher_request()
|
* crypto_finalize_akcipher_request()
|
||||||
|
@@ -128,25 +128,20 @@ process requests that are unaligned. This implies, however, additional
|
|||||||
overhead as the kernel crypto API needs to perform the realignment of
|
overhead as the kernel crypto API needs to perform the realignment of
|
||||||
the data which may imply moving of data.
|
the data which may imply moving of data.
|
||||||
|
|
||||||
Cipher Definition With struct blkcipher_alg and ablkcipher_alg
|
Cipher Definition With struct skcipher_alg
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
Struct blkcipher_alg defines a synchronous block cipher whereas struct
|
Struct skcipher_alg defines a multi-block cipher, or more generally, a
|
||||||
ablkcipher_alg defines an asynchronous block cipher.
|
length-preserving symmetric cipher algorithm.
|
||||||
|
|
||||||
Please refer to the single block cipher description for schematics of
|
Scatterlist handling
|
||||||
the block cipher usage.
|
~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
Specifics Of Asynchronous Multi-Block Cipher
|
Some drivers will want to use the Generic ScatterWalk in case the
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
hardware needs to be fed separate chunks of the scatterlist which
|
||||||
|
contains the plaintext and will contain the ciphertext. Please refer
|
||||||
There are a couple of specifics to the asynchronous interface.
|
to the ScatterWalk interface offered by the Linux kernel scatter /
|
||||||
|
gather list implementation.
|
||||||
First of all, some of the drivers will want to use the Generic
|
|
||||||
ScatterWalk in case the hardware needs to be fed separate chunks of the
|
|
||||||
scatterlist which contains the plaintext and will contain the
|
|
||||||
ciphertext. Please refer to the ScatterWalk interface offered by the
|
|
||||||
Linux kernel scatter / gather list implementation.
|
|
||||||
|
|
||||||
Hashing [HASH]
|
Hashing [HASH]
|
||||||
--------------
|
--------------
|
||||||
|
@@ -24,6 +24,7 @@ whole; patches welcome!
|
|||||||
gdb-kernel-debugging
|
gdb-kernel-debugging
|
||||||
kgdb
|
kgdb
|
||||||
kselftest
|
kselftest
|
||||||
|
kunit/index
|
||||||
|
|
||||||
|
|
||||||
.. only:: subproject and html
|
.. only:: subproject and html
|
||||||
|
@@ -218,3 +218,66 @@ brk handler is used to print bug reports.
|
|||||||
A potential expansion of this mode is a hardware tag-based mode, which would
|
A potential expansion of this mode is a hardware tag-based mode, which would
|
||||||
use hardware memory tagging support instead of compiler instrumentation and
|
use hardware memory tagging support instead of compiler instrumentation and
|
||||||
manual shadow memory manipulation.
|
manual shadow memory manipulation.
|
||||||
|
|
||||||
|
What memory accesses are sanitised by KASAN?
|
||||||
|
--------------------------------------------
|
||||||
|
|
||||||
|
The kernel maps memory in a number of different parts of the address
|
||||||
|
space. This poses something of a problem for KASAN, which requires
|
||||||
|
that all addresses accessed by instrumented code have a valid shadow
|
||||||
|
region.
|
||||||
|
|
||||||
|
The range of kernel virtual addresses is large: there is not enough
|
||||||
|
real memory to support a real shadow region for every address that
|
||||||
|
could be accessed by the kernel.
|
||||||
|
|
||||||
|
By default
|
||||||
|
~~~~~~~~~~
|
||||||
|
|
||||||
|
By default, architectures only map real memory over the shadow region
|
||||||
|
for the linear mapping (and potentially other small areas). For all
|
||||||
|
other areas - such as vmalloc and vmemmap space - a single read-only
|
||||||
|
page is mapped over the shadow area. This read-only shadow page
|
||||||
|
declares all memory accesses as permitted.
|
||||||
|
|
||||||
|
This presents a problem for modules: they do not live in the linear
|
||||||
|
mapping, but in a dedicated module space. By hooking in to the module
|
||||||
|
allocator, KASAN can temporarily map real shadow memory to cover
|
||||||
|
them. This allows detection of invalid accesses to module globals, for
|
||||||
|
example.
|
||||||
|
|
||||||
|
This also creates an incompatibility with ``VMAP_STACK``: if the stack
|
||||||
|
lives in vmalloc space, it will be shadowed by the read-only page, and
|
||||||
|
the kernel will fault when trying to set up the shadow data for stack
|
||||||
|
variables.
|
||||||
|
|
||||||
|
CONFIG_KASAN_VMALLOC
|
||||||
|
~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
With ``CONFIG_KASAN_VMALLOC``, KASAN can cover vmalloc space at the
|
||||||
|
cost of greater memory usage. Currently this is only supported on x86.
|
||||||
|
|
||||||
|
This works by hooking into vmalloc and vmap, and dynamically
|
||||||
|
allocating real shadow memory to back the mappings.
|
||||||
|
|
||||||
|
Most mappings in vmalloc space are small, requiring less than a full
|
||||||
|
page of shadow space. Allocating a full shadow page per mapping would
|
||||||
|
therefore be wasteful. Furthermore, to ensure that different mappings
|
||||||
|
use different shadow pages, mappings would have to be aligned to
|
||||||
|
``KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE``.
|
||||||
|
|
||||||
|
Instead, we share backing space across multiple mappings. We allocate
|
||||||
|
a backing page when a mapping in vmalloc space uses a particular page
|
||||||
|
of the shadow region. This page can be shared by other vmalloc
|
||||||
|
mappings later on.
|
||||||
|
|
||||||
|
We hook in to the vmap infrastructure to lazily clean up unused shadow
|
||||||
|
memory.
|
||||||
|
|
||||||
|
To avoid the difficulties around swapping mappings around, we expect
|
||||||
|
that the part of the shadow region that covers the vmalloc space will
|
||||||
|
not be covered by the early shadow page, but will be left
|
||||||
|
unmapped. This will require changes in arch-specific code.
|
||||||
|
|
||||||
|
This allows ``VMAP_STACK`` support on x86, and can simplify support of
|
||||||
|
architectures that do not have a fixed module region.
|
||||||
|
@@ -34,6 +34,7 @@ Profiling data will only become accessible once debugfs has been mounted::
|
|||||||
|
|
||||||
Coverage collection
|
Coverage collection
|
||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
The following program demonstrates coverage collection from within a test
|
The following program demonstrates coverage collection from within a test
|
||||||
program using kcov:
|
program using kcov:
|
||||||
|
|
||||||
@@ -128,6 +129,7 @@ only need to enable coverage (disable happens automatically on thread end).
|
|||||||
|
|
||||||
Comparison operands collection
|
Comparison operands collection
|
||||||
------------------------------
|
------------------------------
|
||||||
|
|
||||||
Comparison operands collection is similar to coverage collection:
|
Comparison operands collection is similar to coverage collection:
|
||||||
|
|
||||||
.. code-block:: c
|
.. code-block:: c
|
||||||
@@ -202,3 +204,130 @@ Comparison operands collection is similar to coverage collection:
|
|||||||
|
|
||||||
Note that the kcov modes (coverage collection or comparison operands) are
|
Note that the kcov modes (coverage collection or comparison operands) are
|
||||||
mutually exclusive.
|
mutually exclusive.
|
||||||
|
|
||||||
|
Remote coverage collection
|
||||||
|
--------------------------
|
||||||
|
|
||||||
|
With KCOV_ENABLE coverage is collected only for syscalls that are issued
|
||||||
|
from the current process. With KCOV_REMOTE_ENABLE it's possible to collect
|
||||||
|
coverage for arbitrary parts of the kernel code, provided that those parts
|
||||||
|
are annotated with kcov_remote_start()/kcov_remote_stop().
|
||||||
|
|
||||||
|
This allows collecting coverage from two types of kernel background
|
||||||
|
threads: the global ones, that are spawned during kernel boot in a limited
|
||||||
|
number of instances (e.g. one USB hub_event() worker thread is spawned per
|
||||||
|
USB HCD); and the local ones, that are spawned when a user interacts with
|
||||||
|
some kernel interface (e.g. vhost workers).
|
||||||
|
|
||||||
|
To enable collecting coverage from a global background thread, a unique
|
||||||
|
global handle must be assigned and passed to the corresponding
|
||||||
|
kcov_remote_start() call. Then a userspace process can pass a list of such
|
||||||
|
handles to the KCOV_REMOTE_ENABLE ioctl in the handles array field of the
|
||||||
|
kcov_remote_arg struct. This will attach the used kcov device to the code
|
||||||
|
sections, that are referenced by those handles.
|
||||||
|
|
||||||
|
Since there might be many local background threads spawned from different
|
||||||
|
userspace processes, we can't use a single global handle per annotation.
|
||||||
|
Instead, the userspace process passes a non-zero handle through the
|
||||||
|
common_handle field of the kcov_remote_arg struct. This common handle gets
|
||||||
|
saved to the kcov_handle field in the current task_struct and needs to be
|
||||||
|
passed to the newly spawned threads via custom annotations. Those threads
|
||||||
|
should in turn be annotated with kcov_remote_start()/kcov_remote_stop().
|
||||||
|
|
||||||
|
Internally kcov stores handles as u64 integers. The top byte of a handle
|
||||||
|
is used to denote the id of a subsystem that this handle belongs to, and
|
||||||
|
the lower 4 bytes are used to denote the id of a thread instance within
|
||||||
|
that subsystem. A reserved value 0 is used as a subsystem id for common
|
||||||
|
handles as they don't belong to a particular subsystem. The bytes 4-6 are
|
||||||
|
currently reserved and must be zero. In the future the number of bytes
|
||||||
|
used for the subsystem or handle ids might be increased.
|
||||||
|
|
||||||
|
When a particular userspace process collects coverage via a common
|
||||||
|
handle, kcov will collect coverage for each code section that is annotated
|
||||||
|
to use the common handle obtained as kcov_handle from the current
|
||||||
|
task_struct. However, non-common handles allow collecting coverage
|
||||||
|
selectively from different subsystems.
|
||||||
|
|
||||||
|
.. code-block:: c
|
||||||
|
|
||||||
|
struct kcov_remote_arg {
|
||||||
|
unsigned trace_mode;
|
||||||
|
unsigned area_size;
|
||||||
|
unsigned num_handles;
|
||||||
|
uint64_t common_handle;
|
||||||
|
uint64_t handles[0];
|
||||||
|
};
|
||||||
|
|
||||||
|
#define KCOV_INIT_TRACE _IOR('c', 1, unsigned long)
|
||||||
|
#define KCOV_DISABLE _IO('c', 101)
|
||||||
|
#define KCOV_REMOTE_ENABLE _IOW('c', 102, struct kcov_remote_arg)
|
||||||
|
|
||||||
|
#define COVER_SIZE (64 << 10)
|
||||||
|
|
||||||
|
#define KCOV_TRACE_PC 0
|
||||||
|
|
||||||
|
#define KCOV_SUBSYSTEM_COMMON (0x00ull << 56)
|
||||||
|
#define KCOV_SUBSYSTEM_USB (0x01ull << 56)
|
||||||
|
|
||||||
|
#define KCOV_SUBSYSTEM_MASK (0xffull << 56)
|
||||||
|
#define KCOV_INSTANCE_MASK (0xffffffffull)
|
||||||
|
|
||||||
|
static inline __u64 kcov_remote_handle(__u64 subsys, __u64 inst)
|
||||||
|
{
|
||||||
|
if (subsys & ~KCOV_SUBSYSTEM_MASK || inst & ~KCOV_INSTANCE_MASK)
|
||||||
|
return 0;
|
||||||
|
return subsys | inst;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define KCOV_COMMON_ID 0x42
|
||||||
|
#define KCOV_USB_BUS_NUM 1
|
||||||
|
|
||||||
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
int fd;
|
||||||
|
unsigned long *cover, n, i;
|
||||||
|
struct kcov_remote_arg *arg;
|
||||||
|
|
||||||
|
fd = open("/sys/kernel/debug/kcov", O_RDWR);
|
||||||
|
if (fd == -1)
|
||||||
|
perror("open"), exit(1);
|
||||||
|
if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE))
|
||||||
|
perror("ioctl"), exit(1);
|
||||||
|
cover = (unsigned long*)mmap(NULL, COVER_SIZE * sizeof(unsigned long),
|
||||||
|
PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
||||||
|
if ((void*)cover == MAP_FAILED)
|
||||||
|
perror("mmap"), exit(1);
|
||||||
|
|
||||||
|
/* Enable coverage collection via common handle and from USB bus #1. */
|
||||||
|
arg = calloc(1, sizeof(*arg) + sizeof(uint64_t));
|
||||||
|
if (!arg)
|
||||||
|
perror("calloc"), exit(1);
|
||||||
|
arg->trace_mode = KCOV_TRACE_PC;
|
||||||
|
arg->area_size = COVER_SIZE;
|
||||||
|
arg->num_handles = 1;
|
||||||
|
arg->common_handle = kcov_remote_handle(KCOV_SUBSYSTEM_COMMON,
|
||||||
|
KCOV_COMMON_ID);
|
||||||
|
arg->handles[0] = kcov_remote_handle(KCOV_SUBSYSTEM_USB,
|
||||||
|
KCOV_USB_BUS_NUM);
|
||||||
|
if (ioctl(fd, KCOV_REMOTE_ENABLE, arg))
|
||||||
|
perror("ioctl"), free(arg), exit(1);
|
||||||
|
free(arg);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Here the user needs to trigger execution of a kernel code section
|
||||||
|
* that is either annotated with the common handle, or to trigger some
|
||||||
|
* activity on USB bus #1.
|
||||||
|
*/
|
||||||
|
sleep(2);
|
||||||
|
|
||||||
|
n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);
|
||||||
|
for (i = 0; i < n; i++)
|
||||||
|
printf("0x%lx\n", cover[i + 1]);
|
||||||
|
if (ioctl(fd, KCOV_DISABLE, 0))
|
||||||
|
perror("ioctl"), exit(1);
|
||||||
|
if (munmap(cover, COVER_SIZE * sizeof(unsigned long)))
|
||||||
|
perror("munmap"), exit(1);
|
||||||
|
if (close(fd))
|
||||||
|
perror("close"), exit(1);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
@@ -69,7 +69,7 @@ the kernel command line.
|
|||||||
|
|
||||||
Memory may be allocated or freed before kmemleak is initialised and
|
Memory may be allocated or freed before kmemleak is initialised and
|
||||||
these actions are stored in an early log buffer. The size of this buffer
|
these actions are stored in an early log buffer. The size of this buffer
|
||||||
is configured via the CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE option.
|
is configured via the CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE option.
|
||||||
|
|
||||||
If CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF is enabled, kmemleak is
|
If CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF is enabled, kmemleak is
|
||||||
disabled by default. Passing ``kmemleak=on`` on the kernel command
|
disabled by default. Passing ``kmemleak=on`` on the kernel command
|
||||||
|
16
Documentation/dev-tools/kunit/api/index.rst
Normal file
16
Documentation/dev-tools/kunit/api/index.rst
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
=============
|
||||||
|
API Reference
|
||||||
|
=============
|
||||||
|
.. toctree::
|
||||||
|
|
||||||
|
test
|
||||||
|
|
||||||
|
This section documents the KUnit kernel testing API. It is divided into the
|
||||||
|
following sections:
|
||||||
|
|
||||||
|
================================= ==============================================
|
||||||
|
:doc:`test` documents all of the standard testing API
|
||||||
|
excluding mocking or mocking related features.
|
||||||
|
================================= ==============================================
|
11
Documentation/dev-tools/kunit/api/test.rst
Normal file
11
Documentation/dev-tools/kunit/api/test.rst
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
========
|
||||||
|
Test API
|
||||||
|
========
|
||||||
|
|
||||||
|
This file documents all of the standard testing API excluding mocking or mocking
|
||||||
|
related features.
|
||||||
|
|
||||||
|
.. kernel-doc:: include/kunit/test.h
|
||||||
|
:internal:
|
62
Documentation/dev-tools/kunit/faq.rst
Normal file
62
Documentation/dev-tools/kunit/faq.rst
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
==========================
|
||||||
|
Frequently Asked Questions
|
||||||
|
==========================
|
||||||
|
|
||||||
|
How is this different from Autotest, kselftest, etc?
|
||||||
|
====================================================
|
||||||
|
KUnit is a unit testing framework. Autotest, kselftest (and some others) are
|
||||||
|
not.
|
||||||
|
|
||||||
|
A `unit test <https://martinfowler.com/bliki/UnitTest.html>`_ is supposed to
|
||||||
|
test a single unit of code in isolation, hence the name. A unit test should be
|
||||||
|
the finest granularity of testing and as such should allow all possible code
|
||||||
|
paths to be tested in the code under test; this is only possible if the code
|
||||||
|
under test is very small and does not have any external dependencies outside of
|
||||||
|
the test's control like hardware.
|
||||||
|
|
||||||
|
There are no testing frameworks currently available for the kernel that do not
|
||||||
|
require installing the kernel on a test machine or in a VM and all require
|
||||||
|
tests to be written in userspace and run on the kernel under test; this is true
|
||||||
|
for Autotest, kselftest, and some others, disqualifying any of them from being
|
||||||
|
considered unit testing frameworks.
|
||||||
|
|
||||||
|
Does KUnit support running on architectures other than UML?
|
||||||
|
===========================================================
|
||||||
|
|
||||||
|
Yes, well, mostly.
|
||||||
|
|
||||||
|
For the most part, the KUnit core framework (what you use to write the tests)
|
||||||
|
can compile to any architecture; it compiles like just another part of the
|
||||||
|
kernel and runs when the kernel boots. However, there is some infrastructure,
|
||||||
|
like the KUnit Wrapper (``tools/testing/kunit/kunit.py``) that does not support
|
||||||
|
other architectures.
|
||||||
|
|
||||||
|
In short, this means that, yes, you can run KUnit on other architectures, but
|
||||||
|
it might require more work than using KUnit on UML.
|
||||||
|
|
||||||
|
For more information, see :ref:`kunit-on-non-uml`.
|
||||||
|
|
||||||
|
What is the difference between a unit test and these other kinds of tests?
|
||||||
|
==========================================================================
|
||||||
|
Most existing tests for the Linux kernel would be categorized as an integration
|
||||||
|
test, or an end-to-end test.
|
||||||
|
|
||||||
|
- A unit test is supposed to test a single unit of code in isolation, hence the
|
||||||
|
name. A unit test should be the finest granularity of testing and as such
|
||||||
|
should allow all possible code paths to be tested in the code under test; this
|
||||||
|
is only possible if the code under test is very small and does not have any
|
||||||
|
external dependencies outside of the test's control like hardware.
|
||||||
|
- An integration test tests the interaction between a minimal set of components,
|
||||||
|
usually just two or three. For example, someone might write an integration
|
||||||
|
test to test the interaction between a driver and a piece of hardware, or to
|
||||||
|
test the interaction between the userspace libraries the kernel provides and
|
||||||
|
the kernel itself; however, one of these tests would probably not test the
|
||||||
|
entire kernel along with hardware interactions and interactions with the
|
||||||
|
userspace.
|
||||||
|
- An end-to-end test usually tests the entire system from the perspective of the
|
||||||
|
code under test. For example, someone might write an end-to-end test for the
|
||||||
|
kernel by installing a production configuration of the kernel on production
|
||||||
|
hardware with a production userspace and then trying to exercise some behavior
|
||||||
|
that depends on interactions between the hardware, the kernel, and userspace.
|
79
Documentation/dev-tools/kunit/index.rst
Normal file
79
Documentation/dev-tools/kunit/index.rst
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
=========================================
|
||||||
|
KUnit - Unit Testing for the Linux Kernel
|
||||||
|
=========================================
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 2
|
||||||
|
|
||||||
|
start
|
||||||
|
usage
|
||||||
|
api/index
|
||||||
|
faq
|
||||||
|
|
||||||
|
What is KUnit?
|
||||||
|
==============
|
||||||
|
|
||||||
|
KUnit is a lightweight unit testing and mocking framework for the Linux kernel.
|
||||||
|
These tests are able to be run locally on a developer's workstation without a VM
|
||||||
|
or special hardware.
|
||||||
|
|
||||||
|
KUnit is heavily inspired by JUnit, Python's unittest.mock, and
|
||||||
|
Googletest/Googlemock for C++. KUnit provides facilities for defining unit test
|
||||||
|
cases, grouping related test cases into test suites, providing common
|
||||||
|
infrastructure for running tests, and much more.
|
||||||
|
|
||||||
|
Get started now: :doc:`start`
|
||||||
|
|
||||||
|
Why KUnit?
|
||||||
|
==========
|
||||||
|
|
||||||
|
A unit test is supposed to test a single unit of code in isolation, hence the
|
||||||
|
name. A unit test should be the finest granularity of testing and as such should
|
||||||
|
allow all possible code paths to be tested in the code under test; this is only
|
||||||
|
possible if the code under test is very small and does not have any external
|
||||||
|
dependencies outside of the test's control like hardware.
|
||||||
|
|
||||||
|
Outside of KUnit, there are no testing frameworks currently
|
||||||
|
available for the kernel that do not require installing the kernel on a test
|
||||||
|
machine or in a VM and all require tests to be written in userspace running on
|
||||||
|
the kernel; this is true for Autotest, and kselftest, disqualifying
|
||||||
|
any of them from being considered unit testing frameworks.
|
||||||
|
|
||||||
|
KUnit addresses the problem of being able to run tests without needing a virtual
|
||||||
|
machine or actual hardware with User Mode Linux. User Mode Linux is a Linux
|
||||||
|
architecture, like ARM or x86; however, unlike other architectures it compiles
|
||||||
|
to a standalone program that can be run like any other program directly inside
|
||||||
|
of a host operating system; to be clear, it does not require any virtualization
|
||||||
|
support; it is just a regular program.
|
||||||
|
|
||||||
|
KUnit is fast. Excluding build time, from invocation to completion KUnit can run
|
||||||
|
several dozen tests in only 10 to 20 seconds; this might not sound like a big
|
||||||
|
deal to some people, but having such fast and easy to run tests fundamentally
|
||||||
|
changes the way you go about testing and even writing code in the first place.
|
||||||
|
Linus himself said in his `git talk at Google
|
||||||
|
<https://gist.github.com/lorn/1272686/revisions#diff-53c65572127855f1b003db4064a94573R874>`_:
|
||||||
|
|
||||||
|
"... a lot of people seem to think that performance is about doing the
|
||||||
|
same thing, just doing it faster, and that is not true. That is not what
|
||||||
|
performance is all about. If you can do something really fast, really
|
||||||
|
well, people will start using it differently."
|
||||||
|
|
||||||
|
In this context Linus was talking about branching and merging,
|
||||||
|
but this point also applies to testing. If your tests are slow, unreliable, are
|
||||||
|
difficult to write, and require a special setup or special hardware to run,
|
||||||
|
then you wait a lot longer to write tests, and you wait a lot longer to run
|
||||||
|
tests; this means that tests are likely to break, unlikely to test a lot of
|
||||||
|
things, and are unlikely to be rerun once they pass. If your tests are really
|
||||||
|
fast, you run them all the time, every time you make a change, and every time
|
||||||
|
someone sends you some code. Why trust that someone ran all their tests
|
||||||
|
correctly on every change when you can just run them yourself in less time than
|
||||||
|
it takes to read their test log?
|
||||||
|
|
||||||
|
How do I use it?
|
||||||
|
================
|
||||||
|
|
||||||
|
* :doc:`start` - for new users of KUnit
|
||||||
|
* :doc:`usage` - for a more detailed explanation of KUnit features
|
||||||
|
* :doc:`api/index` - for the list of KUnit APIs used for testing
|
180
Documentation/dev-tools/kunit/start.rst
Normal file
180
Documentation/dev-tools/kunit/start.rst
Normal file
@@ -0,0 +1,180 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
===============
|
||||||
|
Getting Started
|
||||||
|
===============
|
||||||
|
|
||||||
|
Installing dependencies
|
||||||
|
=======================
|
||||||
|
KUnit has the same dependencies as the Linux kernel. As long as you can build
|
||||||
|
the kernel, you can run KUnit.
|
||||||
|
|
||||||
|
KUnit Wrapper
|
||||||
|
=============
|
||||||
|
Included with KUnit is a simple Python wrapper that helps format the output to
|
||||||
|
easily use and read KUnit output. It handles building and running the kernel, as
|
||||||
|
well as formatting the output.
|
||||||
|
|
||||||
|
The wrapper can be run with:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
./tools/testing/kunit/kunit.py run
|
||||||
|
|
||||||
|
Creating a kunitconfig
|
||||||
|
======================
|
||||||
|
The Python script is a thin wrapper around Kbuild. As such, it needs to be
|
||||||
|
configured with a ``kunitconfig`` file. This file essentially contains the
|
||||||
|
regular Kernel config, with the specific test targets as well.
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
git clone -b master https://kunit.googlesource.com/kunitconfig $PATH_TO_KUNITCONFIG_REPO
|
||||||
|
cd $PATH_TO_LINUX_REPO
|
||||||
|
ln -s $PATH_TO_KUNITCONFIG_REPO/kunitconfig kunitconfig
|
||||||
|
|
||||||
|
You may want to add kunitconfig to your local gitignore.
|
||||||
|
|
||||||
|
Verifying KUnit Works
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
To make sure that everything is set up correctly, simply invoke the Python
|
||||||
|
wrapper from your kernel repo:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
./tools/testing/kunit/kunit.py run
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
You may want to run ``make mrproper`` first.
|
||||||
|
|
||||||
|
If everything worked correctly, you should see the following:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
Generating .config ...
|
||||||
|
Building KUnit Kernel ...
|
||||||
|
Starting KUnit Kernel ...
|
||||||
|
|
||||||
|
followed by a list of tests that are run. All of them should be passing.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
Because it is building a lot of sources for the first time, the ``Building
|
||||||
|
KUnit Kernel`` step may take a while.
|
||||||
|
|
||||||
|
Writing your first test
|
||||||
|
=======================
|
||||||
|
|
||||||
|
In your kernel repo let's add some code that we can test. Create a file
|
||||||
|
``drivers/misc/example.h`` with the contents:
|
||||||
|
|
||||||
|
.. code-block:: c
|
||||||
|
|
||||||
|
int misc_example_add(int left, int right);
|
||||||
|
|
||||||
|
create a file ``drivers/misc/example.c``:
|
||||||
|
|
||||||
|
.. code-block:: c
|
||||||
|
|
||||||
|
#include <linux/errno.h>
|
||||||
|
|
||||||
|
#include "example.h"
|
||||||
|
|
||||||
|
int misc_example_add(int left, int right)
|
||||||
|
{
|
||||||
|
return left + right;
|
||||||
|
}
|
||||||
|
|
||||||
|
Now add the following lines to ``drivers/misc/Kconfig``:
|
||||||
|
|
||||||
|
.. code-block:: kconfig
|
||||||
|
|
||||||
|
config MISC_EXAMPLE
|
||||||
|
bool "My example"
|
||||||
|
|
||||||
|
and the following lines to ``drivers/misc/Makefile``:
|
||||||
|
|
||||||
|
.. code-block:: make
|
||||||
|
|
||||||
|
obj-$(CONFIG_MISC_EXAMPLE) += example.o
|
||||||
|
|
||||||
|
Now we are ready to write the test. The test will be in
|
||||||
|
``drivers/misc/example-test.c``:
|
||||||
|
|
||||||
|
.. code-block:: c
|
||||||
|
|
||||||
|
#include <kunit/test.h>
|
||||||
|
#include "example.h"
|
||||||
|
|
||||||
|
/* Define the test cases. */
|
||||||
|
|
||||||
|
static void misc_example_add_test_basic(struct kunit *test)
|
||||||
|
{
|
||||||
|
KUNIT_EXPECT_EQ(test, 1, misc_example_add(1, 0));
|
||||||
|
KUNIT_EXPECT_EQ(test, 2, misc_example_add(1, 1));
|
||||||
|
KUNIT_EXPECT_EQ(test, 0, misc_example_add(-1, 1));
|
||||||
|
KUNIT_EXPECT_EQ(test, INT_MAX, misc_example_add(0, INT_MAX));
|
||||||
|
KUNIT_EXPECT_EQ(test, -1, misc_example_add(INT_MAX, INT_MIN));
|
||||||
|
}
|
||||||
|
|
||||||
|
static void misc_example_test_failure(struct kunit *test)
|
||||||
|
{
|
||||||
|
KUNIT_FAIL(test, "This test never passes.");
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct kunit_case misc_example_test_cases[] = {
|
||||||
|
KUNIT_CASE(misc_example_add_test_basic),
|
||||||
|
KUNIT_CASE(misc_example_test_failure),
|
||||||
|
{}
|
||||||
|
};
|
||||||
|
|
||||||
|
static struct kunit_suite misc_example_test_suite = {
|
||||||
|
.name = "misc-example",
|
||||||
|
.test_cases = misc_example_test_cases,
|
||||||
|
};
|
||||||
|
kunit_test_suite(misc_example_test_suite);
|
||||||
|
|
||||||
|
Now add the following to ``drivers/misc/Kconfig``:
|
||||||
|
|
||||||
|
.. code-block:: kconfig
|
||||||
|
|
||||||
|
config MISC_EXAMPLE_TEST
|
||||||
|
bool "Test for my example"
|
||||||
|
depends on MISC_EXAMPLE && KUNIT
|
||||||
|
|
||||||
|
and the following to ``drivers/misc/Makefile``:
|
||||||
|
|
||||||
|
.. code-block:: make
|
||||||
|
|
||||||
|
obj-$(CONFIG_MISC_EXAMPLE_TEST) += example-test.o
|
||||||
|
|
||||||
|
Now add it to your ``kunitconfig``:
|
||||||
|
|
||||||
|
.. code-block:: none
|
||||||
|
|
||||||
|
CONFIG_MISC_EXAMPLE=y
|
||||||
|
CONFIG_MISC_EXAMPLE_TEST=y
|
||||||
|
|
||||||
|
Now you can run the test:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
./tools/testing/kunit/kunit.py run
|
||||||
|
|
||||||
|
You should see the following failure:
|
||||||
|
|
||||||
|
.. code-block:: none
|
||||||
|
|
||||||
|
...
|
||||||
|
[16:08:57] [PASSED] misc-example:misc_example_add_test_basic
|
||||||
|
[16:08:57] [FAILED] misc-example:misc_example_test_failure
|
||||||
|
[16:08:57] EXPECTATION FAILED at drivers/misc/example-test.c:17
|
||||||
|
[16:08:57] This test never passes.
|
||||||
|
...
|
||||||
|
|
||||||
|
Congrats! You just wrote your first KUnit test!
|
||||||
|
|
||||||
|
Next Steps
|
||||||
|
==========
|
||||||
|
* Check out the :doc:`usage` page for a more
|
||||||
|
in-depth explanation of KUnit.
|
576
Documentation/dev-tools/kunit/usage.rst
Normal file
576
Documentation/dev-tools/kunit/usage.rst
Normal file
@@ -0,0 +1,576 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
===========
|
||||||
|
Using KUnit
|
||||||
|
===========
|
||||||
|
|
||||||
|
The purpose of this document is to describe what KUnit is, how it works, how it
|
||||||
|
is intended to be used, and all the concepts and terminology that are needed to
|
||||||
|
understand it. This guide assumes a working knowledge of the Linux kernel and
|
||||||
|
some basic knowledge of testing.
|
||||||
|
|
||||||
|
For a high level introduction to KUnit, including setting up KUnit for your
|
||||||
|
project, see :doc:`start`.
|
||||||
|
|
||||||
|
Organization of this document
|
||||||
|
=============================
|
||||||
|
|
||||||
|
This document is organized into two main sections: Testing and Isolating
|
||||||
|
Behavior. The first covers what a unit test is and how to use KUnit to write
|
||||||
|
them. The second covers how to use KUnit to isolate code and make it possible
|
||||||
|
to unit test code that was otherwise un-unit-testable.
|
||||||
|
|
||||||
|
Testing
|
||||||
|
=======
|
||||||
|
|
||||||
|
What is KUnit?
|
||||||
|
--------------
|
||||||
|
|
||||||
|
"K" is short for "kernel" so "KUnit" is the "(Linux) Kernel Unit Testing
|
||||||
|
Framework." KUnit is intended first and foremost for writing unit tests; it is
|
||||||
|
general enough that it can be used to write integration tests; however, this is
|
||||||
|
a secondary goal. KUnit has no ambition of being the only testing framework for
|
||||||
|
the kernel; for example, it does not intend to be an end-to-end testing
|
||||||
|
framework.
|
||||||
|
|
||||||
|
What is Unit Testing?
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
A `unit test <https://martinfowler.com/bliki/UnitTest.html>`_ is a test that
|
||||||
|
tests code at the smallest possible scope, a *unit* of code. In the C
|
||||||
|
programming language that's a function.
|
||||||
|
|
||||||
|
Unit tests should be written for all the publicly exposed functions in a
|
||||||
|
compilation unit; so that is all the functions that are exported in either a
|
||||||
|
*class* (defined below) or all functions which are **not** static.
|
||||||
|
|
||||||
|
Writing Tests
|
||||||
|
-------------
|
||||||
|
|
||||||
|
Test Cases
|
||||||
|
~~~~~~~~~~
|
||||||
|
|
||||||
|
The fundamental unit in KUnit is the test case. A test case is a function with
|
||||||
|
the signature ``void (*)(struct kunit *test)``. It calls a function to be tested
|
||||||
|
and then sets *expectations* for what should happen. For example:
|
||||||
|
|
||||||
|
.. code-block:: c
|
||||||
|
|
||||||
|
void example_test_success(struct kunit *test)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
void example_test_failure(struct kunit *test)
|
||||||
|
{
|
||||||
|
KUNIT_FAIL(test, "This test never passes.");
|
||||||
|
}
|
||||||
|
|
||||||
|
In the above example ``example_test_success`` always passes because it does
|
||||||
|
nothing; no expectations are set, so all expectations pass. On the other hand
|
||||||
|
``example_test_failure`` always fails because it calls ``KUNIT_FAIL``, which is
|
||||||
|
a special expectation that logs a message and causes the test case to fail.
|
||||||
|
|
||||||
|
Expectations
|
||||||
|
~~~~~~~~~~~~
|
||||||
|
An *expectation* is a way to specify that you expect a piece of code to do
|
||||||
|
something in a test. An expectation is called like a function. A test is made
|
||||||
|
by setting expectations about the behavior of a piece of code under test; when
|
||||||
|
one or more of the expectations fail, the test case fails and information about
|
||||||
|
the failure is logged. For example:
|
||||||
|
|
||||||
|
.. code-block:: c
|
||||||
|
|
||||||
|
void add_test_basic(struct kunit *test)
|
||||||
|
{
|
||||||
|
KUNIT_EXPECT_EQ(test, 1, add(1, 0));
|
||||||
|
KUNIT_EXPECT_EQ(test, 2, add(1, 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
In the above example ``add_test_basic`` makes a number of assertions about the
|
||||||
|
behavior of a function called ``add``; the first parameter is always of type
|
||||||
|
``struct kunit *``, which contains information about the current test context;
|
||||||
|
the second parameter, in this case, is what the value is expected to be; the
|
||||||
|
last value is what the value actually is. If ``add`` passes all of these
|
||||||
|
expectations, the test case, ``add_test_basic`` will pass; if any one of these
|
||||||
|
expectations fail, the test case will fail.
|
||||||
|
|
||||||
|
It is important to understand that a test case *fails* when any expectation is
|
||||||
|
violated; however, the test will continue running, potentially trying other
|
||||||
|
expectations until the test case ends or is otherwise terminated. This is as
|
||||||
|
opposed to *assertions* which are discussed later.
|
||||||
|
|
||||||
|
To learn about more expectations supported by KUnit, see :doc:`api/test`.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
A single test case should be pretty short, pretty easy to understand,
|
||||||
|
and focused on a single behavior.
|
||||||
|
|
||||||
|
For example, if we wanted to properly test the add function above, we would
|
||||||
|
create additional test cases which would each test a different property that an
|
||||||
|
add function should have like this:
|
||||||
|
|
||||||
|
.. code-block:: c
|
||||||
|
|
||||||
|
void add_test_basic(struct kunit *test)
|
||||||
|
{
|
||||||
|
KUNIT_EXPECT_EQ(test, 1, add(1, 0));
|
||||||
|
KUNIT_EXPECT_EQ(test, 2, add(1, 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
void add_test_negative(struct kunit *test)
|
||||||
|
{
|
||||||
|
KUNIT_EXPECT_EQ(test, 0, add(-1, 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
void add_test_max(struct kunit *test)
|
||||||
|
{
|
||||||
|
KUNIT_EXPECT_EQ(test, INT_MAX, add(0, INT_MAX));
|
||||||
|
KUNIT_EXPECT_EQ(test, -1, add(INT_MAX, INT_MIN));
|
||||||
|
}
|
||||||
|
|
||||||
|
void add_test_overflow(struct kunit *test)
|
||||||
|
{
|
||||||
|
KUNIT_EXPECT_EQ(test, INT_MIN, add(INT_MAX, 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
Notice how it is immediately obvious what all the properties that we are testing
|
||||||
|
for are.
|
||||||
|
|
||||||
|
Assertions
|
||||||
|
~~~~~~~~~~
|
||||||
|
|
||||||
|
KUnit also has the concept of an *assertion*. An assertion is just like an
|
||||||
|
expectation except the assertion immediately terminates the test case if it is
|
||||||
|
not satisfied.
|
||||||
|
|
||||||
|
For example:
|
||||||
|
|
||||||
|
.. code-block:: c
|
||||||
|
|
||||||
|
static void mock_test_do_expect_default_return(struct kunit *test)
|
||||||
|
{
|
||||||
|
struct mock_test_context *ctx = test->priv;
|
||||||
|
struct mock *mock = ctx->mock;
|
||||||
|
int param0 = 5, param1 = -5;
|
||||||
|
const char *two_param_types[] = {"int", "int"};
|
||||||
|
const void *two_params[] = {&param0, &param1};
|
||||||
|
const void *ret;
|
||||||
|
|
||||||
|
ret = mock->do_expect(mock,
|
||||||
|
"test_printk", test_printk,
|
||||||
|
two_param_types, two_params,
|
||||||
|
ARRAY_SIZE(two_params));
|
||||||
|
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ret);
|
||||||
|
KUNIT_EXPECT_EQ(test, -4, *((int *) ret));
|
||||||
|
}
|
||||||
|
|
||||||
|
In this example, the method under test should return a pointer to a value, so
|
||||||
|
if the pointer returned by the method is null or an errno, we don't want to
|
||||||
|
bother continuing the test since the following expectation could crash the test
|
||||||
|
case. ``KUNIT_ASSERT_NOT_ERR_OR_NULL(...)`` allows us to bail out of the test case if
|
||||||
|
the appropriate conditions have not been satisfied to complete the test.
|
||||||
|
|
||||||
|
Test Suites
|
||||||
|
~~~~~~~~~~~
|
||||||
|
|
||||||
|
Now obviously one unit test isn't very helpful; the power comes from having
|
||||||
|
many test cases covering all of your behaviors. Consequently it is common to
|
||||||
|
have many *similar* tests; in order to reduce duplication in these closely
|
||||||
|
related tests most unit testing frameworks provide the concept of a *test
|
||||||
|
suite*; in KUnit it is likewise called a *test suite*. It is just a collection of
|
||||||
|
test cases for a unit of code with a set up function that gets invoked before
|
||||||
|
every test case and then a tear down function that gets invoked after every
|
||||||
|
test case completes.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
.. code-block:: c
|
||||||
|
|
||||||
|
static struct kunit_case example_test_cases[] = {
|
||||||
|
KUNIT_CASE(example_test_foo),
|
||||||
|
KUNIT_CASE(example_test_bar),
|
||||||
|
KUNIT_CASE(example_test_baz),
|
||||||
|
{}
|
||||||
|
};
|
||||||
|
|
||||||
|
static struct kunit_suite example_test_suite = {
|
||||||
|
.name = "example",
|
||||||
|
.init = example_test_init,
|
||||||
|
.exit = example_test_exit,
|
||||||
|
.test_cases = example_test_cases,
|
||||||
|
};
|
||||||
|
kunit_test_suite(example_test_suite);
|
||||||
|
|
||||||
|
In the above example the test suite, ``example_test_suite``, would run the test
|
||||||
|
cases ``example_test_foo``, ``example_test_bar``, and ``example_test_baz``,
|
||||||
|
each would have ``example_test_init`` called immediately before it and would
|
||||||
|
have ``example_test_exit`` called immediately after it.
|
||||||
|
``kunit_test_suite(example_test_suite)`` registers the test suite with the
|
||||||
|
KUnit test framework.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
A test case will only be run if it is associated with a test suite.
|
||||||
|
|
||||||
|
For more information on these types of things see the :doc:`api/test`.
|
||||||
|
|
||||||
|
Isolating Behavior
|
||||||
|
==================
|
||||||
|
|
||||||
|
The most important aspect of unit testing that other forms of testing do not
|
||||||
|
provide is the ability to limit the amount of code under test to a single unit.
|
||||||
|
In practice, this is only possible by being able to control what code gets run
|
||||||
|
when the unit under test calls a function and this is usually accomplished
|
||||||
|
through some sort of indirection where a function is exposed as part of an API
|
||||||
|
such that the definition of that function can be changed without affecting the
|
||||||
|
rest of the code base. In the kernel this primarily comes from two constructs,
|
||||||
|
classes, structs that contain function pointers that are provided by the
|
||||||
|
implementer, and architecture specific functions which have definitions selected
|
||||||
|
at compile time.
|
||||||
|
|
||||||
|
Classes
|
||||||
|
-------
|
||||||
|
|
||||||
|
Classes are not a construct that is built into the C programming language;
|
||||||
|
however, it is an easily derived concept. Accordingly, pretty much every project
|
||||||
|
that does not use a standardized object oriented library (like GNOME's GObject)
|
||||||
|
has their own slightly different way of doing object oriented programming; the
|
||||||
|
Linux kernel is no exception.
|
||||||
|
|
||||||
|
The central concept in kernel object oriented programming is the class. In the
|
||||||
|
kernel, a *class* is a struct that contains function pointers. This creates a
|
||||||
|
contract between *implementers* and *users* since it forces them to use the
|
||||||
|
same function signature without having to call the function directly. In order
|
||||||
|
for it to truly be a class, the function pointers must specify that a pointer
|
||||||
|
to the class, known as a *class handle*, be one of the parameters; this makes
|
||||||
|
it possible for the member functions (also known as *methods*) to have access
|
||||||
|
to member variables (more commonly known as *fields*) allowing the same
|
||||||
|
implementation to have multiple *instances*.
|
||||||
|
|
||||||
|
Typically a class can be *overridden* by *child classes* by embedding the
|
||||||
|
*parent class* in the child class. Then when a method provided by the child
|
||||||
|
class is called, the child implementation knows that the pointer passed to it is
|
||||||
|
of a parent contained within the child; because of this, the child can compute
|
||||||
|
the pointer to itself because the pointer to the parent is always a fixed offset
|
||||||
|
from the pointer to the child; this offset is the offset of the parent contained
|
||||||
|
in the child struct. For example:
|
||||||
|
|
||||||
|
.. code-block:: c
|
||||||
|
|
||||||
|
struct shape {
|
||||||
|
int (*area)(struct shape *this);
|
||||||
|
};
|
||||||
|
|
||||||
|
struct rectangle {
|
||||||
|
struct shape parent;
|
||||||
|
int length;
|
||||||
|
int width;
|
||||||
|
};
|
||||||
|
|
||||||
|
int rectangle_area(struct shape *this)
|
||||||
|
{
|
||||||
|
struct rectangle *self = container_of(this, struct rectangle, parent);
|
||||||
|
|
||||||
|
return self->length * self->width;
|
||||||
|
}
|
||||||
|
|
||||||
|
void rectangle_new(struct rectangle *self, int length, int width)
|
||||||
|
{
|
||||||
|
self->parent.area = rectangle_area;
|
||||||
|
self->length = length;
|
||||||
|
self->width = width;
|
||||||
|
}
|
||||||
|
|
||||||
|
In this example (as in most kernel code) the operation of computing the pointer
|
||||||
|
to the child from the pointer to the parent is done by ``container_of``.
|
||||||
|
|
||||||
|
Faking Classes
|
||||||
|
~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
In order to unit test a piece of code that calls a method in a class, the
|
||||||
|
behavior of the method must be controllable, otherwise the test ceases to be a
|
||||||
|
unit test and becomes an integration test.
|
||||||
|
|
||||||
|
A fake just provides an implementation of a piece of code that is different than
|
||||||
|
what runs in a production instance, but behaves identically from the standpoint
|
||||||
|
of the callers; this is usually done to replace a dependency that is hard to
|
||||||
|
deal with, or is slow.
|
||||||
|
|
||||||
|
A good example for this might be implementing a fake EEPROM that just stores the
|
||||||
|
"contents" in an internal buffer. For example, let's assume we have a class that
|
||||||
|
represents an EEPROM:
|
||||||
|
|
||||||
|
.. code-block:: c
|
||||||
|
|
||||||
|
struct eeprom {
|
||||||
|
ssize_t (*read)(struct eeprom *this, size_t offset, char *buffer, size_t count);
|
||||||
|
ssize_t (*write)(struct eeprom *this, size_t offset, const char *buffer, size_t count);
|
||||||
|
};
|
||||||
|
|
||||||
|
And we want to test some code that buffers writes to the EEPROM:
|
||||||
|
|
||||||
|
.. code-block:: c
|
||||||
|
|
||||||
|
struct eeprom_buffer {
|
||||||
|
ssize_t (*write)(struct eeprom_buffer *this, const char *buffer, size_t count);
|
||||||
|
int (*flush)(struct eeprom_buffer *this);
|
||||||
|
size_t flush_count; /* Flushes when buffer exceeds flush_count. */
|
||||||
|
};
|
||||||
|
|
||||||
|
struct eeprom_buffer *new_eeprom_buffer(struct eeprom *eeprom);
|
||||||
|
void destroy_eeprom_buffer(struct eeprom_buffer *eeprom_buffer);
|
||||||
|
|
||||||
|
We can easily test this code by *faking out* the underlying EEPROM:
|
||||||
|
|
||||||
|
.. code-block:: c
|
||||||
|
|
||||||
|
struct fake_eeprom {
|
||||||
|
struct eeprom parent;
|
||||||
|
char contents[FAKE_EEPROM_CONTENTS_SIZE];
|
||||||
|
};
|
||||||
|
|
||||||
|
ssize_t fake_eeprom_read(struct eeprom *parent, size_t offset, char *buffer, size_t count)
|
||||||
|
{
|
||||||
|
struct fake_eeprom *this = container_of(parent, struct fake_eeprom, parent);
|
||||||
|
|
||||||
|
count = min(count, FAKE_EEPROM_CONTENTS_SIZE - offset);
|
||||||
|
memcpy(buffer, this->contents + offset, count);
|
||||||
|
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
ssize_t fake_eeprom_write(struct eeprom *parent, size_t offset, const char *buffer, size_t count)
|
||||||
|
{
|
||||||
|
struct fake_eeprom *this = container_of(parent, struct fake_eeprom, parent);
|
||||||
|
|
||||||
|
count = min(count, FAKE_EEPROM_CONTENTS_SIZE - offset);
|
||||||
|
memcpy(this->contents + offset, buffer, count);
|
||||||
|
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
void fake_eeprom_init(struct fake_eeprom *this)
|
||||||
|
{
|
||||||
|
this->parent.read = fake_eeprom_read;
|
||||||
|
this->parent.write = fake_eeprom_write;
|
||||||
|
memset(this->contents, 0, FAKE_EEPROM_CONTENTS_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
|
We can now use it to test ``struct eeprom_buffer``:
|
||||||
|
|
||||||
|
.. code-block:: c
|
||||||
|
|
||||||
|
struct eeprom_buffer_test {
|
||||||
|
struct fake_eeprom *fake_eeprom;
|
||||||
|
struct eeprom_buffer *eeprom_buffer;
|
||||||
|
};
|
||||||
|
|
||||||
|
static void eeprom_buffer_test_does_not_write_until_flush(struct kunit *test)
|
||||||
|
{
|
||||||
|
struct eeprom_buffer_test *ctx = test->priv;
|
||||||
|
struct eeprom_buffer *eeprom_buffer = ctx->eeprom_buffer;
|
||||||
|
struct fake_eeprom *fake_eeprom = ctx->fake_eeprom;
|
||||||
|
char buffer[] = {0xff};
|
||||||
|
|
||||||
|
eeprom_buffer->flush_count = SIZE_MAX;
|
||||||
|
|
||||||
|
eeprom_buffer->write(eeprom_buffer, buffer, 1);
|
||||||
|
KUNIT_EXPECT_EQ(test, fake_eeprom->contents[0], 0);
|
||||||
|
|
||||||
|
eeprom_buffer->write(eeprom_buffer, buffer, 1);
|
||||||
|
KUNIT_EXPECT_EQ(test, fake_eeprom->contents[1], 0);
|
||||||
|
|
||||||
|
eeprom_buffer->flush(eeprom_buffer);
|
||||||
|
KUNIT_EXPECT_EQ(test, fake_eeprom->contents[0], 0xff);
|
||||||
|
KUNIT_EXPECT_EQ(test, fake_eeprom->contents[1], 0xff);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void eeprom_buffer_test_flushes_after_flush_count_met(struct kunit *test)
|
||||||
|
{
|
||||||
|
struct eeprom_buffer_test *ctx = test->priv;
|
||||||
|
struct eeprom_buffer *eeprom_buffer = ctx->eeprom_buffer;
|
||||||
|
struct fake_eeprom *fake_eeprom = ctx->fake_eeprom;
|
||||||
|
char buffer[] = {0xff};
|
||||||
|
|
||||||
|
eeprom_buffer->flush_count = 2;
|
||||||
|
|
||||||
|
eeprom_buffer->write(eeprom_buffer, buffer, 1);
|
||||||
|
KUNIT_EXPECT_EQ(test, fake_eeprom->contents[0], 0);
|
||||||
|
|
||||||
|
eeprom_buffer->write(eeprom_buffer, buffer, 1);
|
||||||
|
KUNIT_EXPECT_EQ(test, fake_eeprom->contents[0], 0xff);
|
||||||
|
KUNIT_EXPECT_EQ(test, fake_eeprom->contents[1], 0xff);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void eeprom_buffer_test_flushes_increments_of_flush_count(struct kunit *test)
|
||||||
|
{
|
||||||
|
struct eeprom_buffer_test *ctx = test->priv;
|
||||||
|
struct eeprom_buffer *eeprom_buffer = ctx->eeprom_buffer;
|
||||||
|
struct fake_eeprom *fake_eeprom = ctx->fake_eeprom;
|
||||||
|
char buffer[] = {0xff, 0xff};
|
||||||
|
|
||||||
|
eeprom_buffer->flush_count = 2;
|
||||||
|
|
||||||
|
eeprom_buffer->write(eeprom_buffer, buffer, 1);
|
||||||
|
KUNIT_EXPECT_EQ(test, fake_eeprom->contents[0], 0);
|
||||||
|
|
||||||
|
eeprom_buffer->write(eeprom_buffer, buffer, 2);
|
||||||
|
KUNIT_EXPECT_EQ(test, fake_eeprom->contents[0], 0xff);
|
||||||
|
KUNIT_EXPECT_EQ(test, fake_eeprom->contents[1], 0xff);
|
||||||
|
/* Should have only flushed the first two bytes. */
|
||||||
|
KUNIT_EXPECT_EQ(test, fake_eeprom->contents[2], 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int eeprom_buffer_test_init(struct kunit *test)
|
||||||
|
{
|
||||||
|
struct eeprom_buffer_test *ctx;
|
||||||
|
|
||||||
|
ctx = kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL);
|
||||||
|
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx);
|
||||||
|
|
||||||
|
ctx->fake_eeprom = kunit_kzalloc(test, sizeof(*ctx->fake_eeprom), GFP_KERNEL);
|
||||||
|
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx->fake_eeprom);
|
||||||
|
fake_eeprom_init(ctx->fake_eeprom);
|
||||||
|
|
||||||
|
ctx->eeprom_buffer = new_eeprom_buffer(&ctx->fake_eeprom->parent);
|
||||||
|
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx->eeprom_buffer);
|
||||||
|
|
||||||
|
test->priv = ctx;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void eeprom_buffer_test_exit(struct kunit *test)
|
||||||
|
{
|
||||||
|
struct eeprom_buffer_test *ctx = test->priv;
|
||||||
|
|
||||||
|
destroy_eeprom_buffer(ctx->eeprom_buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
.. _kunit-on-non-uml:
|
||||||
|
|
||||||
|
KUnit on non-UML architectures
|
||||||
|
==============================
|
||||||
|
|
||||||
|
By default KUnit uses UML as a way to provide dependencies for code under test.
|
||||||
|
Under most circumstances KUnit's usage of UML should be treated as an
|
||||||
|
implementation detail of how KUnit works under the hood. Nevertheless, there
|
||||||
|
are instances where being able to run architecture specific code, or test
|
||||||
|
against real hardware is desirable. For these reasons KUnit supports running on
|
||||||
|
other architectures.
|
||||||
|
|
||||||
|
Running existing KUnit tests on non-UML architectures
|
||||||
|
-----------------------------------------------------
|
||||||
|
|
||||||
|
There are some special considerations when running existing KUnit tests on
|
||||||
|
non-UML architectures:
|
||||||
|
|
||||||
|
* Hardware may not be deterministic, so a test that always passes or fails
|
||||||
|
when run under UML may not always do so on real hardware.
|
||||||
|
* Hardware and VM environments may not be hermetic. KUnit tries its best to
|
||||||
|
provide a hermetic environment to run tests; however, it cannot manage state
|
||||||
|
that it doesn't know about outside of the kernel. Consequently, tests that
|
||||||
|
may be hermetic on UML may not be hermetic on other architectures.
|
||||||
|
* Some features and tooling may not be supported outside of UML.
|
||||||
|
* Hardware and VMs are slower than UML.
|
||||||
|
|
||||||
|
None of these are reasons not to run your KUnit tests on real hardware; they are
|
||||||
|
only things to be aware of when doing so.
|
||||||
|
|
||||||
|
The biggest impediment will likely be that certain KUnit features and
|
||||||
|
infrastructure may not support your target environment. For example, at this
|
||||||
|
time the KUnit Wrapper (``tools/testing/kunit/kunit.py``) does not work outside
|
||||||
|
of UML. Unfortunately, there is no way around this. Using UML (or even just a
|
||||||
|
particular architecture) allows us to make a lot of assumptions that make it
|
||||||
|
possible to do things which might otherwise be impossible.
|
||||||
|
|
||||||
|
Nevertheless, all core KUnit framework features are fully supported on all
|
||||||
|
architectures, and using them is straightforward: all you need to do is to take
|
||||||
|
your kunitconfig, your Kconfig options for the tests you would like to run, and
|
||||||
|
merge them into whatever config you are using for your platform. That's it!
|
||||||
|
|
||||||
|
For example, let's say you have the following kunitconfig:
|
||||||
|
|
||||||
|
.. code-block:: none
|
||||||
|
|
||||||
|
CONFIG_KUNIT=y
|
||||||
|
CONFIG_KUNIT_EXAMPLE_TEST=y
|
||||||
|
|
||||||
|
If you wanted to run this test on an x86 VM, you might add the following config
|
||||||
|
options to your ``.config``:
|
||||||
|
|
||||||
|
.. code-block:: none
|
||||||
|
|
||||||
|
CONFIG_KUNIT=y
|
||||||
|
CONFIG_KUNIT_EXAMPLE_TEST=y
|
||||||
|
CONFIG_SERIAL_8250=y
|
||||||
|
CONFIG_SERIAL_8250_CONSOLE=y
|
||||||
|
|
||||||
|
All these new options do is enable support for a common serial console needed
|
||||||
|
for logging.
|
||||||
|
|
||||||
|
Next, you could build a kernel with these tests as follows:
|
||||||
|
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
make ARCH=x86 olddefconfig
|
||||||
|
make ARCH=x86
|
||||||
|
|
||||||
|
Once you have built a kernel, you could run it on QEMU as follows:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
qemu-system-x86_64 -enable-kvm \
|
||||||
|
-m 1024 \
|
||||||
|
-kernel arch/x86_64/boot/bzImage \
|
||||||
|
-append 'console=ttyS0' \
|
||||||
|
--nographic
|
||||||
|
|
||||||
|
Interspersed in the kernel logs you might see the following:
|
||||||
|
|
||||||
|
.. code-block:: none
|
||||||
|
|
||||||
|
TAP version 14
|
||||||
|
# Subtest: example
|
||||||
|
1..1
|
||||||
|
# example_simple_test: initializing
|
||||||
|
ok 1 - example_simple_test
|
||||||
|
ok 1 - example
|
||||||
|
|
||||||
|
Congratulations, you just ran a KUnit test on the x86 architecture!
|
||||||
|
|
||||||
|
Writing new tests for other architectures
|
||||||
|
-----------------------------------------
|
||||||
|
|
||||||
|
The first thing you must do is ask yourself whether it is necessary to write a
|
||||||
|
KUnit test for a specific architecture, and then whether it is necessary to
|
||||||
|
write that test for a particular piece of hardware. In general, writing a test
|
||||||
|
that depends on having access to a particular piece of hardware or software (not
|
||||||
|
included in the Linux source repo) should be avoided at all costs.
|
||||||
|
|
||||||
|
Even if you only ever plan on running your KUnit test on your hardware
|
||||||
|
configuration, other people may want to run your tests and may not have access
|
||||||
|
to your hardware. If you write your test to run on UML, then anyone can run your
|
||||||
|
tests without knowing anything about your particular setup, and you can still
|
||||||
|
run your tests on your hardware setup just by compiling for your architecture.
|
||||||
|
|
||||||
|
.. important::
|
||||||
|
Always prefer tests that run on UML to tests that only run under a particular
|
||||||
|
architecture, and always prefer tests that run under QEMU or another easy
|
||||||
|
(and monetarily free) to obtain software environment to a specific piece of
|
||||||
|
hardware.
|
||||||
|
|
||||||
|
Nevertheless, there are still valid reasons to write an architecture or hardware
|
||||||
|
specific test: for example, you might want to test some code that really belongs
|
||||||
|
in ``arch/some-arch/*``. Even so, try your best to write the test so that it
|
||||||
|
does not depend on physical hardware: if some of your test cases don't need the
|
||||||
|
hardware, only require the hardware for tests that actually need it.
|
||||||
|
|
||||||
|
Now that you have narrowed down exactly what bits are hardware specific, the
|
||||||
|
actual procedure for writing and running the tests is pretty much the same as
|
||||||
|
writing normal KUnit tests. One special caveat is that you have to reset
|
||||||
|
hardware state in between test cases; if this is not possible, you may only be
|
||||||
|
able to run one test case per invocation.
|
||||||
|
|
||||||
|
.. TODO(brendanhiggins@google.com): Add an actual example of an architecture
|
||||||
|
dependent KUnit test.
|
@@ -12,7 +12,6 @@ $(obj)/%.example.dts: $(src)/%.yaml FORCE
|
|||||||
$(call if_changed,chk_binding)
|
$(call if_changed,chk_binding)
|
||||||
|
|
||||||
DT_TMP_SCHEMA := processed-schema.yaml
|
DT_TMP_SCHEMA := processed-schema.yaml
|
||||||
extra-y += $(DT_TMP_SCHEMA)
|
|
||||||
|
|
||||||
quiet_cmd_mk_schema = SCHEMA $@
|
quiet_cmd_mk_schema = SCHEMA $@
|
||||||
cmd_mk_schema = $(DT_MK_SCHEMA) $(DT_MK_SCHEMA_FLAGS) -o $@ $(real-prereqs)
|
cmd_mk_schema = $(DT_MK_SCHEMA) $(DT_MK_SCHEMA_FLAGS) -o $@ $(real-prereqs)
|
||||||
@@ -26,8 +25,12 @@ DT_DOCS = $(shell \
|
|||||||
|
|
||||||
DT_SCHEMA_FILES ?= $(addprefix $(src)/,$(DT_DOCS))
|
DT_SCHEMA_FILES ?= $(addprefix $(src)/,$(DT_DOCS))
|
||||||
|
|
||||||
|
ifeq ($(CHECK_DTBS),)
|
||||||
extra-y += $(patsubst $(src)/%.yaml,%.example.dts, $(DT_SCHEMA_FILES))
|
extra-y += $(patsubst $(src)/%.yaml,%.example.dts, $(DT_SCHEMA_FILES))
|
||||||
extra-y += $(patsubst $(src)/%.yaml,%.example.dt.yaml, $(DT_SCHEMA_FILES))
|
extra-y += $(patsubst $(src)/%.yaml,%.example.dt.yaml, $(DT_SCHEMA_FILES))
|
||||||
|
endif
|
||||||
|
|
||||||
$(obj)/$(DT_TMP_SCHEMA): $(DT_SCHEMA_FILES) FORCE
|
$(obj)/$(DT_TMP_SCHEMA): $(DT_SCHEMA_FILES) FORCE
|
||||||
$(call if_changed,mk_schema)
|
$(call if_changed,mk_schema)
|
||||||
|
|
||||||
|
extra-y += $(DT_TMP_SCHEMA)
|
||||||
|
@@ -94,7 +94,7 @@ properties:
|
|||||||
- amlogic,p212
|
- amlogic,p212
|
||||||
- hwacom,amazetv
|
- hwacom,amazetv
|
||||||
- khadas,vim
|
- khadas,vim
|
||||||
- libretech,cc
|
- libretech,aml-s905x-cc
|
||||||
- nexbox,a95x
|
- nexbox,a95x
|
||||||
- const: amlogic,s905x
|
- const: amlogic,s905x
|
||||||
- const: amlogic,meson-gxl
|
- const: amlogic,meson-gxl
|
||||||
@@ -147,6 +147,7 @@ properties:
|
|||||||
- enum:
|
- enum:
|
||||||
- hardkernel,odroid-n2
|
- hardkernel,odroid-n2
|
||||||
- khadas,vim3
|
- khadas,vim3
|
||||||
|
- ugoos,am6
|
||||||
- const: amlogic,s922x
|
- const: amlogic,s922x
|
||||||
- const: amlogic,g12b
|
- const: amlogic,g12b
|
||||||
|
|
||||||
@@ -156,4 +157,10 @@ properties:
|
|||||||
- seirobotics,sei610
|
- seirobotics,sei610
|
||||||
- khadas,vim3l
|
- khadas,vim3l
|
||||||
- const: amlogic,sm1
|
- const: amlogic,sm1
|
||||||
|
|
||||||
|
- description: Boards with the Amlogic Meson A1 A113L SoC
|
||||||
|
items:
|
||||||
|
- enum:
|
||||||
|
- amlogic,ad401
|
||||||
|
- const: amlogic,a1
|
||||||
...
|
...
|
||||||
|
@@ -1,32 +0,0 @@
|
|||||||
Amlogic Meson8 and Meson8b SRAM for smp bringup:
|
|
||||||
------------------------------------------------
|
|
||||||
|
|
||||||
Amlogic's SMP-capable SoCs use part of the sram for the bringup of the cores.
|
|
||||||
Once the core gets powered up it executes the code that is residing at a
|
|
||||||
specific location.
|
|
||||||
|
|
||||||
Therefore a reserved section sub-node has to be added to the mmio-sram
|
|
||||||
declaration.
|
|
||||||
|
|
||||||
Required sub-node properties:
|
|
||||||
- compatible : depending on the SoC this should be one of:
|
|
||||||
"amlogic,meson8-smp-sram"
|
|
||||||
"amlogic,meson8b-smp-sram"
|
|
||||||
|
|
||||||
The rest of the properties should follow the generic mmio-sram discription
|
|
||||||
found in ../../misc/sram.txt
|
|
||||||
|
|
||||||
Example:
|
|
||||||
|
|
||||||
sram: sram@d9000000 {
|
|
||||||
compatible = "mmio-sram";
|
|
||||||
reg = <0xd9000000 0x20000>;
|
|
||||||
#address-cells = <1>;
|
|
||||||
#size-cells = <1>;
|
|
||||||
ranges = <0 0xd9000000 0x20000>;
|
|
||||||
|
|
||||||
smp-sram@1ff80 {
|
|
||||||
compatible = "amlogic,meson8b-smp-sram";
|
|
||||||
reg = <0x1ff80 0x8>;
|
|
||||||
};
|
|
||||||
};
|
|
@@ -100,7 +100,7 @@ Required sub-node properties:
|
|||||||
|
|
||||||
[0] http://infocenter.arm.com/help/topic/com.arm.doc.den0056a/index.html
|
[0] http://infocenter.arm.com/help/topic/com.arm.doc.den0056a/index.html
|
||||||
[1] Documentation/devicetree/bindings/clock/clock-bindings.txt
|
[1] Documentation/devicetree/bindings/clock/clock-bindings.txt
|
||||||
[2] Documentation/devicetree/bindings/power/power_domain.txt
|
[2] Documentation/devicetree/bindings/power/power-domain.yaml
|
||||||
[3] Documentation/devicetree/bindings/thermal/thermal.txt
|
[3] Documentation/devicetree/bindings/thermal/thermal.txt
|
||||||
[4] Documentation/devicetree/bindings/sram/sram.txt
|
[4] Documentation/devicetree/bindings/sram/sram.txt
|
||||||
[5] Documentation/devicetree/bindings/reset/reset.txt
|
[5] Documentation/devicetree/bindings/reset/reset.txt
|
||||||
|
@@ -110,7 +110,7 @@ Required properties:
|
|||||||
[1] Documentation/devicetree/bindings/clock/clock-bindings.txt
|
[1] Documentation/devicetree/bindings/clock/clock-bindings.txt
|
||||||
[2] Documentation/devicetree/bindings/thermal/thermal.txt
|
[2] Documentation/devicetree/bindings/thermal/thermal.txt
|
||||||
[3] Documentation/devicetree/bindings/sram/sram.txt
|
[3] Documentation/devicetree/bindings/sram/sram.txt
|
||||||
[4] Documentation/devicetree/bindings/power/power_domain.txt
|
[4] Documentation/devicetree/bindings/power/power-domain.yaml
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user