Merge tag 'v5.2-rc1' into asoc-5.3
Linux 5.2-rc1
This commit is contained in:
@@ -387,14 +387,14 @@ ForEachMacros:
|
||||
- 'rhl_for_each_entry_rcu'
|
||||
- 'rhl_for_each_rcu'
|
||||
- 'rht_for_each'
|
||||
- 'rht_for_each_continue'
|
||||
- 'rht_for_each_from'
|
||||
- 'rht_for_each_entry'
|
||||
- 'rht_for_each_entry_continue'
|
||||
- 'rht_for_each_entry_from'
|
||||
- 'rht_for_each_entry_rcu'
|
||||
- 'rht_for_each_entry_rcu_continue'
|
||||
- 'rht_for_each_entry_rcu_from'
|
||||
- 'rht_for_each_entry_safe'
|
||||
- 'rht_for_each_rcu'
|
||||
- 'rht_for_each_rcu_continue'
|
||||
- 'rht_for_each_rcu_from'
|
||||
- '__rq_for_each_bio'
|
||||
- 'rq_for_each_bvec'
|
||||
- 'rq_for_each_segment'
|
||||
|
@@ -1 +1,2 @@
|
||||
Christoph Hellwig <hch@lst.de>
|
||||
Marc Gonzalez <marc.w.gonzalez@free.fr>
|
||||
|
24
.gitignore
vendored
24
.gitignore
vendored
@@ -58,6 +58,7 @@ modules.builtin
|
||||
/vmlinuz
|
||||
/System.map
|
||||
/Module.markers
|
||||
/modules.builtin.modinfo
|
||||
|
||||
#
|
||||
# RPM spec file (make rpm-pkg)
|
||||
@@ -80,20 +81,22 @@ modules.builtin
|
||||
/tar-install/
|
||||
|
||||
#
|
||||
# git files that we don't want to ignore even if they are dot-files
|
||||
# We don't want to ignore the following even if they are dot-files
|
||||
#
|
||||
!.clang-format
|
||||
!.cocciconfig
|
||||
!.get_maintainer.ignore
|
||||
!.gitattributes
|
||||
!.gitignore
|
||||
!.mailmap
|
||||
!.cocciconfig
|
||||
!.clang-format
|
||||
|
||||
#
|
||||
# Generated include files
|
||||
#
|
||||
include/config
|
||||
include/generated
|
||||
include/ksym
|
||||
arch/*/include/generated
|
||||
/include/config/
|
||||
/include/generated/
|
||||
/include/ksym/
|
||||
/arch/*/include/generated/
|
||||
|
||||
# stgit generated dirs
|
||||
patches-*
|
||||
@@ -129,7 +132,12 @@ signing_key.x509
|
||||
x509.genkey
|
||||
|
||||
# Kconfig presets
|
||||
all.config
|
||||
/all.config
|
||||
/alldef.config
|
||||
/allmod.config
|
||||
/allno.config
|
||||
/allrandom.config
|
||||
/allyes.config
|
||||
|
||||
# Kdevelop4
|
||||
*.kdev4
|
||||
|
16
.mailmap
16
.mailmap
@@ -16,6 +16,11 @@ Alan Cox <alan@lxorguk.ukuu.org.uk>
|
||||
Alan Cox <root@hraefn.swansea.linux.org.uk>
|
||||
Aleksey Gorelov <aleksey_gorelov@phoenix.com>
|
||||
Aleksandar Markovic <aleksandar.markovic@mips.com> <aleksandar.markovic@imgtec.com>
|
||||
Alex Shi <alex.shi@linux.alibaba.com> <alex.shi@intel.com>
|
||||
Alex Shi <alex.shi@linux.alibaba.com> <alex.shi@linaro.org>
|
||||
Alexei Starovoitov <ast@kernel.org> <ast@plumgrid.com>
|
||||
Alexei Starovoitov <ast@kernel.org> <alexei.starovoitov@gmail.com>
|
||||
Alexei Starovoitov <ast@kernel.org> <ast@fb.com>
|
||||
Al Viro <viro@ftp.linux.org.uk>
|
||||
Al Viro <viro@zenIV.linux.org.uk>
|
||||
Andi Shyti <andi@etezian.org> <andi.shyti@samsung.com>
|
||||
@@ -46,6 +51,12 @@ Christoph Hellwig <hch@lst.de>
|
||||
Christophe Ricard <christophe.ricard@gmail.com>
|
||||
Corey Minyard <minyard@acm.org>
|
||||
Damian Hobson-Garcia <dhobsong@igel.co.jp>
|
||||
Daniel Borkmann <daniel@iogearbox.net> <dborkman@redhat.com>
|
||||
Daniel Borkmann <daniel@iogearbox.net> <dborkmann@redhat.com>
|
||||
Daniel Borkmann <daniel@iogearbox.net> <danborkmann@iogearbox.net>
|
||||
Daniel Borkmann <daniel@iogearbox.net> <daniel.borkmann@tik.ee.ethz.ch>
|
||||
Daniel Borkmann <daniel@iogearbox.net> <danborkmann@googlemail.com>
|
||||
Daniel Borkmann <daniel@iogearbox.net> <dxchgb@gmail.com>
|
||||
David Brownell <david-b@pacbell.net>
|
||||
David Woodhouse <dwmw2@shinybook.infradead.org>
|
||||
Dengcheng Zhu <dzhu@wavecomp.com> <dengcheng.zhu@mips.com>
|
||||
@@ -117,6 +128,8 @@ Leonid I Ananiev <leonid.i.ananiev@intel.com>
|
||||
Linas Vepstas <linas@austin.ibm.com>
|
||||
Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@web.de>
|
||||
Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@ascom.ch>
|
||||
Li Yang <leoyang.li@nxp.com> <leo@zh-kernel.org>
|
||||
Li Yang <leoyang.li@nxp.com> <leoli@freescale.com>
|
||||
Maciej W. Rozycki <macro@mips.com> <macro@imgtec.com>
|
||||
Marcin Nowakowski <marcin.nowakowski@mips.com> <marcin.nowakowski@imgtec.com>
|
||||
Mark Brown <broonie@sirena.org.uk>
|
||||
@@ -189,6 +202,7 @@ Santosh Shilimkar <ssantosh@kernel.org>
|
||||
Santosh Shilimkar <santosh.shilimkar@oracle.org>
|
||||
Sascha Hauer <s.hauer@pengutronix.de>
|
||||
S.Çağlar Onur <caglar@pardus.org.tr>
|
||||
Sean Nyekjaer <sean@geanix.com> <sean.nyekjaer@prevas.dk>
|
||||
Sebastian Reichel <sre@kernel.org> <sre@debian.org>
|
||||
Sebastian Reichel <sre@kernel.org> <sebastian.reichel@collabora.co.uk>
|
||||
Shiraz Hashim <shiraz.linux.kernel@gmail.com> <shiraz.hashim@st.com>
|
||||
@@ -207,6 +221,8 @@ Tejun Heo <htejun@gmail.com>
|
||||
Thomas Graf <tgraf@suug.ch>
|
||||
Thomas Pedersen <twp@codeaurora.org>
|
||||
Tony Luck <tony.luck@intel.com>
|
||||
TripleX Chung <xxx.phy@gmail.com> <zhongyu@18mail.cn>
|
||||
TripleX Chung <xxx.phy@gmail.com> <triplex@zh-kernel.org>
|
||||
Tsuneo Yoshioka <Tsuneo.Yoshioka@f-secure.com>
|
||||
Uwe Kleine-König <ukleinek@informatik.uni-freiburg.de>
|
||||
Uwe Kleine-König <ukl@pengutronix.de>
|
||||
|
@@ -1,3 +1,5 @@
|
||||
This ABI is deprecated and will be removed after 2021. It is
|
||||
replaced with the batadv generic netlink family.
|
||||
|
||||
What: /sys/class/net/<iface>/batman-adv/elp_interval
|
||||
Date: Feb 2014
|
@@ -1,3 +1,5 @@
|
||||
This ABI is deprecated and will be removed after 2021. It is
|
||||
replaced with the batadv generic netlink family.
|
||||
|
||||
What: /sys/class/net/<mesh_iface>/mesh/aggregated_ogms
|
||||
Date: May 2010
|
@@ -6,6 +6,8 @@ Description:
|
||||
This file allows user to read/write the raw NVMEM contents.
|
||||
Permissions for write to this file depends on the nvmem
|
||||
provider configuration.
|
||||
Note: This file is only present if CONFIG_NVMEM_SYSFS
|
||||
is enabled
|
||||
|
||||
ex:
|
||||
hexdump /sys/bus/nvmem/devices/qfprom0/nvmem
|
||||
|
@@ -81,7 +81,9 @@ What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/latency
|
||||
Date: September. 2017
|
||||
KernelVersion: 4.14
|
||||
Contact: Stephen Hemminger <sthemmin@microsoft.com>
|
||||
Description: Channel signaling latency
|
||||
Description: Channel signaling latency. This file is available only for
|
||||
performance critical channels (storage, network, etc.) that use
|
||||
the monitor page mechanism.
|
||||
Users: Debugging tools
|
||||
|
||||
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/out_mask
|
||||
@@ -95,7 +97,9 @@ What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/pending
|
||||
Date: September. 2017
|
||||
KernelVersion: 4.14
|
||||
Contact: Stephen Hemminger <sthemmin@microsoft.com>
|
||||
Description: Channel interrupt pending state
|
||||
Description: Channel interrupt pending state. This file is available only for
|
||||
performance critical channels (storage, network, etc.) that use
|
||||
the monitor page mechanism.
|
||||
Users: Debugging tools
|
||||
|
||||
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/read_avail
|
||||
@@ -137,7 +141,9 @@ What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/monitor_id
|
||||
Date: January. 2018
|
||||
KernelVersion: 4.16
|
||||
Contact: Stephen Hemminger <sthemmin@microsoft.com>
|
||||
Description: Monitor bit associated with channel
|
||||
Description: Monitor bit associated with channel. This file is available only
|
||||
for performance critical channels (storage, network, etc.) that
|
||||
use the monitor page mechanism.
|
||||
Users: Debugging tools and userspace drivers
|
||||
|
||||
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/ring
|
||||
|
@@ -90,4 +90,89 @@ Date: December 2009
|
||||
Contact: Lee Schermerhorn <lee.schermerhorn@hp.com>
|
||||
Description:
|
||||
The node's huge page size control/query attributes.
|
||||
See Documentation/admin-guide/mm/hugetlbpage.rst
|
||||
See Documentation/admin-guide/mm/hugetlbpage.rst
|
||||
|
||||
What: /sys/devices/system/node/nodeX/accessY/
|
||||
Date: December 2018
|
||||
Contact: Keith Busch <keith.busch@intel.com>
|
||||
Description:
|
||||
The node's relationship to other nodes for access class "Y".
|
||||
|
||||
What: /sys/devices/system/node/nodeX/accessY/initiators/
|
||||
Date: December 2018
|
||||
Contact: Keith Busch <keith.busch@intel.com>
|
||||
Description:
|
||||
The directory containing symlinks to memory initiator
|
||||
nodes that have class "Y" access to this target node's
|
||||
memory. CPUs and other memory initiators in nodes not in
|
||||
the list accessing this node's memory may have different
|
||||
performance.
|
||||
|
||||
What: /sys/devices/system/node/nodeX/accessY/targets/
|
||||
Date: December 2018
|
||||
Contact: Keith Busch <keith.busch@intel.com>
|
||||
Description:
|
||||
The directory containing symlinks to memory targets that
|
||||
this initiator node has class "Y" access.
|
||||
|
||||
What: /sys/devices/system/node/nodeX/accessY/initiators/read_bandwidth
|
||||
Date: December 2018
|
||||
Contact: Keith Busch <keith.busch@intel.com>
|
||||
Description:
|
||||
This node's read bandwidth in MB/s when accessed from
|
||||
nodes found in this access class's linked initiators.
|
||||
|
||||
What: /sys/devices/system/node/nodeX/accessY/initiators/read_latency
|
||||
Date: December 2018
|
||||
Contact: Keith Busch <keith.busch@intel.com>
|
||||
Description:
|
||||
This node's read latency in nanoseconds when accessed
|
||||
from nodes found in this access class's linked initiators.
|
||||
|
||||
What: /sys/devices/system/node/nodeX/accessY/initiators/write_bandwidth
|
||||
Date: December 2018
|
||||
Contact: Keith Busch <keith.busch@intel.com>
|
||||
Description:
|
||||
This node's write bandwidth in MB/s when accessed from
|
||||
nodes found in this access class's linked initiators.
|
||||
|
||||
What: /sys/devices/system/node/nodeX/accessY/initiators/write_latency
|
||||
Date: December 2018
|
||||
Contact: Keith Busch <keith.busch@intel.com>
|
||||
Description:
|
||||
This node's write latency in nanoseconds when accessed
|
||||
from nodes found in this class's linked initiators.
|
||||
|
||||
What: /sys/devices/system/node/nodeX/memory_side_cache/indexY/
|
||||
Date: December 2018
|
||||
Contact: Keith Busch <keith.busch@intel.com>
|
||||
Description:
|
||||
The directory containing attributes for the memory-side cache
|
||||
level 'Y'.
|
||||
|
||||
What: /sys/devices/system/node/nodeX/memory_side_cache/indexY/indexing
|
||||
Date: December 2018
|
||||
Contact: Keith Busch <keith.busch@intel.com>
|
||||
Description:
|
||||
The cache's associativity indexing: 0 for direct mapped,
|
||||
non-zero if indexed.
|
||||
|
||||
What: /sys/devices/system/node/nodeX/memory_side_cache/indexY/line_size
|
||||
Date: December 2018
|
||||
Contact: Keith Busch <keith.busch@intel.com>
|
||||
Description:
|
||||
The number of bytes accessed from the next cache level on a
|
||||
cache miss.
|
||||
|
||||
What: /sys/devices/system/node/nodeX/memory_side_cache/indexY/size
|
||||
Date: December 2018
|
||||
Contact: Keith Busch <keith.busch@intel.com>
|
||||
Description:
|
||||
The size of this memory side cache in bytes.
|
||||
|
||||
What: /sys/devices/system/node/nodeX/memory_side_cache/indexY/write_policy
|
||||
Date: December 2018
|
||||
Contact: Keith Busch <keith.busch@intel.com>
|
||||
Description:
|
||||
The cache write policy: 0 for write-back, 1 for write-through,
|
||||
other or unknown.
|
||||
|
@@ -1,23 +1,46 @@
|
||||
What: /sys/kernel/debug/wilco_ec/h1_gpio
|
||||
Date: April 2019
|
||||
KernelVersion: 5.2
|
||||
Description:
|
||||
As part of Chrome OS's FAFT (Fully Automated Firmware Testing)
|
||||
tests, we need to ensure that the H1 chip is properly setting
|
||||
some GPIO lines. The h1_gpio attribute exposes the state
|
||||
of the lines:
|
||||
- ENTRY_TO_FACT_MODE in BIT(0)
|
||||
- SPI_CHROME_SEL in BIT(1)
|
||||
|
||||
Output will be formatted with "0x%02x\n".
|
||||
|
||||
What: /sys/kernel/debug/wilco_ec/raw
|
||||
Date: January 2019
|
||||
KernelVersion: 5.1
|
||||
Description:
|
||||
Write and read raw mailbox commands to the EC.
|
||||
|
||||
For writing:
|
||||
Bytes 0-1 indicate the message type:
|
||||
00 F0 = Execute Legacy Command
|
||||
00 F2 = Read/Write NVRAM Property
|
||||
Byte 2 provides the command code
|
||||
Bytes 3+ consist of the data passed in the request
|
||||
You can write a hexadecimal sentence to raw, and that series of
|
||||
bytes will be sent to the EC. Then, you can read the bytes of
|
||||
response by reading from raw.
|
||||
|
||||
At least three bytes are required, for the msg type and command,
|
||||
with additional bytes optional for additional data.
|
||||
For writing, bytes 0-1 indicate the message type, one of enum
|
||||
wilco_ec_msg_type. Byte 2+ consist of the data passed in the
|
||||
request, starting at MBOX[0]
|
||||
|
||||
At least three bytes are required for writing, two for the type
|
||||
and at least a single byte of data. Only the first
|
||||
EC_MAILBOX_DATA_SIZE bytes of MBOX will be used.
|
||||
|
||||
Example:
|
||||
// Request EC info type 3 (EC firmware build date)
|
||||
$ echo 00 f0 38 00 03 00 > raw
|
||||
// Corresponds with sending type 0x00f0 with
|
||||
// MBOX = [38, 00, 03, 00]
|
||||
$ echo 00 f0 38 00 03 00 > /sys/kernel/debug/wilco_ec/raw
|
||||
// View the result. The decoded ASCII result "12/21/18" is
|
||||
// included after the raw hex.
|
||||
$ cat raw
|
||||
00 31 32 2f 32 31 2f 31 38 00 38 00 01 00 2f 00 .12/21/18.8...
|
||||
// Corresponds with MBOX = [00, 00, 31, 32, 2f, 32, 31, 2f, 31, 38, ...]
|
||||
$ cat /sys/kernel/debug/wilco_ec/raw
|
||||
00 00 31 32 2f 32 31 2f 31 38 00 38 00 01 00 2f 00 ..12/21/18.8...
|
||||
|
||||
Note that the first 32 bytes of the received MBOX[] will be
|
||||
printed, even if some of the data is junk. It is up to you to
|
||||
know how many of the first bytes of data are the actual
|
||||
response.
|
||||
|
230
Documentation/ABI/testing/sysfs-bus-counter
Normal file
230
Documentation/ABI/testing/sysfs-bus-counter
Normal file
@@ -0,0 +1,230 @@
|
||||
What: /sys/bus/counter/devices/counterX/countY/count
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Count data of Count Y represented as a string.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/countY/ceiling
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Count value ceiling for Count Y. This is the upper limit for the
|
||||
respective counter.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/countY/floor
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Count value floor for Count Y. This is the lower limit for the
|
||||
respective counter.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/countY/count_mode
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Count mode for channel Y. The ceiling and floor values for
|
||||
Count Y are used by the count mode where required. The following
|
||||
count modes are available:
|
||||
|
||||
normal:
|
||||
Counting is continuous in either direction.
|
||||
|
||||
range limit:
|
||||
An upper or lower limit is set, mimicking limit switches
|
||||
in the mechanical counterpart. The upper limit is set to
|
||||
the Count Y ceiling value, while the lower limit is set
|
||||
to the Count Y floor value. The counter freezes at
|
||||
count = ceiling when counting up, and at count = floor
|
||||
when counting down. At either of these limits, the
|
||||
counting is resumed only when the count direction is
|
||||
reversed.
|
||||
|
||||
non-recycle:
|
||||
The counter is disabled whenever a counter overflow or
|
||||
underflow takes place. The counter is re-enabled when a
|
||||
new count value is loaded to the counter via a preset
|
||||
operation or direct write.
|
||||
|
||||
modulo-n:
|
||||
A count value boundary is set between the Count Y floor
|
||||
value and the Count Y ceiling value. The counter is
|
||||
reset to the Count Y floor value at count = ceiling when
|
||||
counting up, while the counter is set to the Count Y
|
||||
ceiling value at count = floor when counting down; the
|
||||
counter does not freeze at the boundary points, but
|
||||
counts continuously throughout.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/countY/count_mode_available
|
||||
What: /sys/bus/counter/devices/counterX/countY/error_noise_available
|
||||
What: /sys/bus/counter/devices/counterX/countY/function_available
|
||||
What: /sys/bus/counter/devices/counterX/countY/signalZ_action_available
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Discrete set of available values for the respective Count Y
|
||||
configuration are listed in this file. Values are delimited by
|
||||
newline characters.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/countY/direction
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Read-only attribute that indicates the count direction of Count
|
||||
Y. Two count directions are available: forward and backward.
|
||||
|
||||
Some counter devices are able to determine the direction of
|
||||
their counting. For example, quadrature encoding counters can
|
||||
determine the direction of movement by evaluating the leading
|
||||
phase of the respective A and B quadrature encoding signals.
|
||||
This attribute exposes such count directions.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/countY/enable
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Whether channel Y counter is enabled. Valid attribute values are
|
||||
boolean.
|
||||
|
||||
This attribute is intended to serve as a pause/unpause mechanism
|
||||
for Count Y. Suppose a counter device is used to count the total
|
||||
movement of a conveyor belt: this attribute allows an operator
|
||||
to temporarily pause the counter, service the conveyor belt,
|
||||
and then finally unpause the counter to continue where it had
|
||||
left off.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/countY/error_noise
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Read-only attribute that indicates whether excessive noise is
|
||||
present at the channel Y counter inputs.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/countY/function
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Count function mode of Count Y; count function evaluation is
|
||||
triggered by conditions specified by the Count Y signalZ_action
|
||||
attributes. The following count functions are available:
|
||||
|
||||
increase:
|
||||
Accumulated count is incremented.
|
||||
|
||||
decrease:
|
||||
Accumulated count is decremented.
|
||||
|
||||
pulse-direction:
|
||||
Rising edges on signal A update the respective count.
|
||||
The input level of signal B determines direction.
|
||||
|
||||
quadrature x1 a:
|
||||
If direction is forward, rising edges on quadrature pair
|
||||
signal A updates the respective count; if the direction
|
||||
is backward, falling edges on quadrature pair signal A
|
||||
updates the respective count. Quadrature encoding
|
||||
determines the direction.
|
||||
|
||||
quadrature x1 b:
|
||||
If direction is forward, rising edges on quadrature pair
|
||||
signal B updates the respective count; if the direction
|
||||
is backward, falling edges on quadrature pair signal B
|
||||
updates the respective count. Quadrature encoding
|
||||
determines the direction.
|
||||
|
||||
quadrature x2 a:
|
||||
Any state transition on quadrature pair signal A updates
|
||||
the respective count. Quadrature encoding determines the
|
||||
direction.
|
||||
|
||||
quadrature x2 b:
|
||||
Any state transition on quadrature pair signal B updates
|
||||
the respective count. Quadrature encoding determines the
|
||||
direction.
|
||||
|
||||
quadrature x4:
|
||||
Any state transition on either quadrature pair signal
|
||||
updates the respective count. Quadrature encoding
|
||||
determines the direction.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/countY/name
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Read-only attribute that indicates the device-specific name of
|
||||
Count Y. If possible, this should match the name of the
|
||||
respective channel as it appears in the device datasheet.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/countY/preset
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
If the counter device supports preset registers -- registers
|
||||
used to load counter channels to a set count upon device-defined
|
||||
preset operation trigger events -- the preset count for channel
|
||||
Y is provided by this attribute.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/countY/preset_enable
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Whether channel Y counter preset operation is enabled. Valid
|
||||
attribute values are boolean.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/countY/signalZ_action
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Action mode of Count Y for Signal Z. This attribute indicates
|
||||
the condition of Signal Z that triggers the count function
|
||||
evaluation for Count Y. The following action modes are
|
||||
available:
|
||||
|
||||
none:
|
||||
Signal does not trigger the count function. In
|
||||
Pulse-Direction count function mode, this Signal is
|
||||
evaluated as Direction.
|
||||
|
||||
rising edge:
|
||||
Low state transitions to high state.
|
||||
|
||||
falling edge:
|
||||
High state transitions to low state.
|
||||
|
||||
both edges:
|
||||
Any state transition.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/name
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Read-only attribute that indicates the device-specific name of
|
||||
the Counter. This should match the name of the device as it
|
||||
appears in its respective datasheet.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/num_counts
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Read-only attribute that indicates the total number of Counts
|
||||
belonging to the Counter.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/num_signals
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Read-only attribute that indicates the total number of Signals
|
||||
belonging to the Counter.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/signalY/signal
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Signal data of Signal Y represented as a string.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/signalY/name
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Read-only attribute that indicates the device-specific name of
|
||||
Signal Y. If possible, this should match the name of the
|
||||
respective signal as it appears in the device datasheet.
|
36
Documentation/ABI/testing/sysfs-bus-counter-104-quad-8
Normal file
36
Documentation/ABI/testing/sysfs-bus-counter-104-quad-8
Normal file
@@ -0,0 +1,36 @@
|
||||
What: /sys/bus/counter/devices/counterX/signalY/index_polarity
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Active level of index input Signal Y; irrelevant in
|
||||
non-synchronous load mode.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/signalY/index_polarity_available
|
||||
What: /sys/bus/counter/devices/counterX/signalY/synchronous_mode_available
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Discrete set of available values for the respective Signal Y
|
||||
configuration are listed in this file.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/signalY/synchronous_mode
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Configure the counter associated with Signal Y for
|
||||
non-synchronous or synchronous load mode. Synchronous load mode
|
||||
cannot be selected in non-quadrature (Pulse-Direction) clock
|
||||
mode.
|
||||
|
||||
non-synchronous:
|
||||
A logic low level is the active level at this index
|
||||
input. The index function (as enabled via preset_enable)
|
||||
is performed directly on the active level of the index
|
||||
input.
|
||||
|
||||
synchronous:
|
||||
Intended for interfacing with encoder Index output in
|
||||
quadrature clock mode. The active level is configured
|
||||
via index_polarity. The index function (as enabled via
|
||||
preset_enable) is performed synchronously with the
|
||||
quadrature clock on the active level of the index input.
|
16
Documentation/ABI/testing/sysfs-bus-counter-ftm-quaddec
Normal file
16
Documentation/ABI/testing/sysfs-bus-counter-ftm-quaddec
Normal file
@@ -0,0 +1,16 @@
|
||||
What: /sys/bus/counter/devices/counterX/countY/prescaler_available
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Discrete set of available values for the respective Count Y
|
||||
configuration are listed in this file. Values are delimited by
|
||||
newline characters.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/countY/prescaler
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Configure the prescaler value associated with Count Y.
|
||||
On the FlexTimer, the counter clock source passes through a
|
||||
prescaler (i.e. a counter). This acts like a clock
|
||||
divider.
|
20
Documentation/ABI/testing/sysfs-bus-i2c-devices-pca954x
Normal file
20
Documentation/ABI/testing/sysfs-bus-i2c-devices-pca954x
Normal file
@@ -0,0 +1,20 @@
|
||||
What: /sys/bus/i2c/.../idle_state
|
||||
Date: January 2019
|
||||
KernelVersion: 5.2
|
||||
Contact: Robert Shearman <robert.shearman@att.com>
|
||||
Description:
|
||||
Value that exists only for mux devices that can be
|
||||
written to control the behaviour of the multiplexer on
|
||||
idle. Possible values:
|
||||
-2 - disconnect on idle, i.e. deselect the last used
|
||||
channel, which is useful when there is a device
|
||||
with an address that conflicts with another
|
||||
device on another mux on the same parent bus.
|
||||
-1 - leave the mux as-is, which is the most optimal
|
||||
setting in terms of I2C operations and is the
|
||||
default mode.
|
||||
0..<nchans> - set the mux to a predetermined channel,
|
||||
which is useful if there is one channel that is
|
||||
used almost always, and you want to reduce the
|
||||
latency for normal operations after rare
|
||||
transactions on other channels
|
@@ -1656,6 +1656,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_countY_raw
|
||||
KernelVersion: 4.10
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
This interface is deprecated; please use the Counter subsystem.
|
||||
|
||||
Raw counter device counts from channel Y. For quadrature
|
||||
counters, multiplication by an available [Y]_scale results in
|
||||
the counts of a single quadrature signal phase from channel Y.
|
||||
@@ -1664,6 +1666,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_indexY_raw
|
||||
KernelVersion: 4.10
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
This interface is deprecated; please use the Counter subsystem.
|
||||
|
||||
Raw counter device index value from channel Y. This attribute
|
||||
provides an absolute positional reference (e.g. a pulse once per
|
||||
revolution) which may be used to home positional systems as
|
||||
@@ -1673,6 +1677,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_count_count_direction_available
|
||||
KernelVersion: 4.12
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
This interface is deprecated; please use the Counter subsystem.
|
||||
|
||||
A list of possible counting directions which are:
|
||||
- "up" : counter device is increasing.
|
||||
- "down": counter device is decreasing.
|
||||
@@ -1681,6 +1687,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_countY_count_direction
|
||||
KernelVersion: 4.12
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
This interface is deprecated; please use the Counter subsystem.
|
||||
|
||||
Raw counter device counters direction for channel Y.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_phaseY_raw
|
||||
|
@@ -6,6 +6,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_index_synchronous_mode_available
|
||||
KernelVersion: 4.10
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
This interface is deprecated; please use the Counter subsystem.
|
||||
|
||||
Discrete set of available values for the respective counter
|
||||
configuration are listed in this file.
|
||||
|
||||
@@ -13,6 +15,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_countY_count_mode
|
||||
KernelVersion: 4.10
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
This interface is deprecated; please use the Counter subsystem.
|
||||
|
||||
Count mode for channel Y. Four count modes are available:
|
||||
normal, range limit, non-recycle, and modulo-n. The preset value
|
||||
for channel Y is used by the count mode where required.
|
||||
@@ -47,6 +51,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_countY_noise_error
|
||||
KernelVersion: 4.10
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
This interface is deprecated; please use the Counter subsystem.
|
||||
|
||||
Read-only attribute that indicates whether excessive noise is
|
||||
present at the channel Y count inputs in quadrature clock mode;
|
||||
irrelevant in non-quadrature clock mode.
|
||||
@@ -55,6 +61,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_countY_preset
|
||||
KernelVersion: 4.10
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
This interface is deprecated; please use the Counter subsystem.
|
||||
|
||||
If the counter device supports preset registers, the preset
|
||||
count for channel Y is provided by this attribute.
|
||||
|
||||
@@ -62,6 +70,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_countY_quadrature_mode
|
||||
KernelVersion: 4.10
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
This interface is deprecated; please use the Counter subsystem.
|
||||
|
||||
Configure channel Y counter for non-quadrature or quadrature
|
||||
clock mode. Selecting non-quadrature clock mode will disable
|
||||
synchronous load mode. In quadrature clock mode, the channel Y
|
||||
@@ -83,6 +93,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_countY_set_to_preset_on_index
|
||||
KernelVersion: 4.10
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
This interface is deprecated; please use the Counter subsystem.
|
||||
|
||||
Whether to set channel Y counter with channel Y preset value
|
||||
when channel Y index input is active, or continuously count.
|
||||
Valid attribute values are boolean.
|
||||
@@ -91,6 +103,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_indexY_index_polarity
|
||||
KernelVersion: 4.10
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
This interface is deprecated; please use the Counter subsystem.
|
||||
|
||||
Active level of channel Y index input; irrelevant in
|
||||
non-synchronous load mode.
|
||||
|
||||
@@ -98,6 +112,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_indexY_synchronous_mode
|
||||
KernelVersion: 4.10
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
This interface is deprecated; please use the Counter subsystem.
|
||||
|
||||
Configure channel Y counter for non-synchronous or synchronous
|
||||
load mode. Synchronous load mode cannot be selected in
|
||||
non-quadrature clock mode.
|
||||
|
@@ -1,26 +1,31 @@
|
||||
What: /sys/bus/iio/devices/iio:deviceX/outY_freq_start
|
||||
What: /sys/bus/iio/devices/iio:deviceX/out_altvoltageY_frequency_start
|
||||
Date: March 2019
|
||||
KernelVersion: 3.1.0
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Frequency sweep start frequency in Hz.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/outY_freq_increment
|
||||
What: /sys/bus/iio/devices/iio:deviceX/out_altvoltageY_frequency_increment
|
||||
Date: March 2019
|
||||
KernelVersion: 3.1.0
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Frequency increment in Hz (step size) between consecutive
|
||||
frequency points along the sweep.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/outY_freq_points
|
||||
What: /sys/bus/iio/devices/iio:deviceX/out_altvoltageY_frequency_points
|
||||
Date: March 2019
|
||||
KernelVersion: 3.1.0
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Number of frequency points (steps) in the frequency sweep.
|
||||
This value, in conjunction with the outY_freq_start and the
|
||||
outY_freq_increment, determines the frequency sweep range
|
||||
for the sweep operation.
|
||||
This value, in conjunction with the
|
||||
out_altvoltageY_frequency_start and the
|
||||
out_altvoltageY_frequency_increment, determines the frequency
|
||||
sweep range for the sweep operation.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/outY_settling_cycles
|
||||
What: /sys/bus/iio/devices/iio:deviceX/out_altvoltageY_settling_cycles
|
||||
Date: March 2019
|
||||
KernelVersion: 3.1.0
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
@@ -1,6 +1,6 @@
|
||||
What: /sys/bus/iio/devices/iio:deviceX/start_cleaning
|
||||
Date: December 2018
|
||||
KernelVersion: 4.22
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Writing 1 starts sensor self cleaning. Internal fan accelerates
|
||||
|
24
Documentation/ABI/testing/sysfs-bus-iio-temperature-max31856
Normal file
24
Documentation/ABI/testing/sysfs-bus-iio-temperature-max31856
Normal file
@@ -0,0 +1,24 @@
|
||||
What: /sys/bus/iio/devices/iio:deviceX/fault_oc
|
||||
KernelVersion: 5.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Open-circuit fault. The detection of open-circuit faults,
|
||||
such as those caused by broken thermocouple wires.
|
||||
Reading returns either '1' or '0'.
|
||||
'1' = An open circuit such as broken thermocouple wires
|
||||
has been detected.
|
||||
'0' = No open circuit or broken thermocouple wires are detected
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/fault_ovuv
|
||||
KernelVersion: 5.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Overvoltage or Undervoltage Input Fault. The internal circuitry
|
||||
is protected from excessive voltages applied to the thermocouple
|
||||
cables by integrated MOSFETs at the T+ and T- inputs, and the
|
||||
BIAS output. These MOSFETs turn off when the input voltage is
|
||||
negative or greater than VDD.
|
||||
Reading returns either '1' or '0'.
|
||||
'1' = The input voltage is negative or greater than VDD.
|
||||
'0' = The input voltage is positive and less than VDD (normal
|
||||
state).
|
@@ -30,4 +30,12 @@ Description: (RW) Configure MSC buffer size for "single" or "multi" modes.
|
||||
there are no active users and tracing is not enabled) and then
|
||||
allocates a new one.
|
||||
|
||||
What: /sys/bus/intel_th/devices/<intel_th_id>-msc<msc-id>/win_switch
|
||||
Date: May 2019
|
||||
KernelVersion: 5.2
|
||||
Contact: Alexander Shishkin <alexander.shishkin@linux.intel.com>
|
||||
Description: (RW) Trigger window switch for the MSC's buffer, in
|
||||
multi-window mode. In "multi" mode, accepts writes of "1", thereby
|
||||
triggering a window switch for the buffer. Returns an error in any
|
||||
other operating mode or attempts to write something other than "1".
|
||||
|
||||
|
@@ -65,3 +65,18 @@ Description: Display the ME firmware version.
|
||||
<platform>:<major>.<minor>.<milestone>.<build_no>.
|
||||
There can be up to three such blocks for different
|
||||
FW components.
|
||||
|
||||
What: /sys/class/mei/meiN/dev_state
|
||||
Date: Mar 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: Tomas Winkler <tomas.winkler@intel.com>
|
||||
Description: Display the ME device state.
|
||||
|
||||
The device state can have the following values:
|
||||
INITIALIZING
|
||||
INIT_CLIENTS
|
||||
ENABLED
|
||||
RESETTING
|
||||
DISABLED
|
||||
POWER_DOWN
|
||||
POWER_UP
|
||||
|
@@ -114,15 +114,60 @@ Description:
|
||||
Access: Read
|
||||
Valid values: Represented in microamps
|
||||
|
||||
What: /sys/class/power_supply/<supply_name>/charge_control_limit
|
||||
Date: Oct 2012
|
||||
Contact: linux-pm@vger.kernel.org
|
||||
Description:
|
||||
Maximum allowable charging current. Used for charge rate
|
||||
throttling for thermal cooling or improving battery health.
|
||||
|
||||
Access: Read, Write
|
||||
Valid values: Represented in microamps
|
||||
|
||||
What: /sys/class/power_supply/<supply_name>/charge_control_limit_max
|
||||
Date: Oct 2012
|
||||
Contact: linux-pm@vger.kernel.org
|
||||
Description:
|
||||
Maximum legal value for the charge_control_limit property.
|
||||
|
||||
Access: Read
|
||||
Valid values: Represented in microamps
|
||||
|
||||
What: /sys/class/power_supply/<supply_name>/charge_control_start_threshold
|
||||
Date: April 2019
|
||||
Contact: linux-pm@vger.kernel.org
|
||||
Description:
|
||||
Represents a battery percentage level, below which charging will
|
||||
begin.
|
||||
|
||||
Access: Read, Write
|
||||
Valid values: 0 - 100 (percent)
|
||||
|
||||
What: /sys/class/power_supply/<supply_name>/charge_control_end_threshold
|
||||
Date: April 2019
|
||||
Contact: linux-pm@vger.kernel.org
|
||||
Description:
|
||||
Represents a battery percentage level, above which charging will
|
||||
stop.
|
||||
|
||||
Access: Read, Write
|
||||
Valid values: 0 - 100 (percent)
|
||||
|
||||
What: /sys/class/power_supply/<supply_name>/charge_type
|
||||
Date: July 2009
|
||||
Contact: linux-pm@vger.kernel.org
|
||||
Description:
|
||||
Represents the type of charging currently being applied to the
|
||||
battery.
|
||||
battery. "Trickle", "Fast", and "Standard" all mean different
|
||||
charging speeds. "Adaptive" means that the charger uses some
|
||||
algorithm to adjust the charge rate dynamically, without
|
||||
any user configuration required. "Custom" means that the charger
|
||||
uses the charge_control_* properties as configuration for some
|
||||
different algorithm.
|
||||
|
||||
Access: Read
|
||||
Valid values: "Unknown", "N/A", "Trickle", "Fast"
|
||||
Access: Read, Write
|
||||
Valid values: "Unknown", "N/A", "Trickle", "Fast", "Standard",
|
||||
"Adaptive", "Custom"
|
||||
|
||||
What: /sys/class/power_supply/<supply_name>/charge_term_current
|
||||
Date: July 2014
|
||||
|
@@ -212,7 +212,7 @@ Description:
|
||||
Messages may be broken into parts if
|
||||
they are long.
|
||||
|
||||
receieved_messages: (RO) Number of message responses
|
||||
received_messages: (RO) Number of message responses
|
||||
received.
|
||||
|
||||
received_message_parts: (RO) Number of message fragments
|
||||
|
@@ -484,6 +484,7 @@ What: /sys/devices/system/cpu/vulnerabilities
|
||||
/sys/devices/system/cpu/vulnerabilities/spectre_v2
|
||||
/sys/devices/system/cpu/vulnerabilities/spec_store_bypass
|
||||
/sys/devices/system/cpu/vulnerabilities/l1tf
|
||||
/sys/devices/system/cpu/vulnerabilities/mds
|
||||
Date: January 2018
|
||||
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
|
||||
Description: Information about CPU vulnerabilities
|
||||
@@ -496,8 +497,7 @@ Description: Information about CPU vulnerabilities
|
||||
"Vulnerable" CPU is affected and no mitigation in effect
|
||||
"Mitigation: $M" CPU is affected and mitigation $M is in effect
|
||||
|
||||
Details about the l1tf file can be found in
|
||||
Documentation/admin-guide/l1tf.rst
|
||||
See also: Documentation/admin-guide/hw-vuln/index.rst
|
||||
|
||||
What: /sys/devices/system/cpu/smt
|
||||
/sys/devices/system/cpu/smt/active
|
||||
@@ -511,10 +511,30 @@ Description: Control Symetric Multi Threading (SMT)
|
||||
control: Read/write interface to control SMT. Possible
|
||||
values:
|
||||
|
||||
"on" SMT is enabled
|
||||
"off" SMT is disabled
|
||||
"forceoff" SMT is force disabled. Cannot be changed.
|
||||
"notsupported" SMT is not supported by the CPU
|
||||
"on" SMT is enabled
|
||||
"off" SMT is disabled
|
||||
"forceoff" SMT is force disabled. Cannot be changed.
|
||||
"notsupported" SMT is not supported by the CPU
|
||||
"notimplemented" SMT runtime toggling is not
|
||||
implemented for the architecture
|
||||
|
||||
If control status is "forceoff" or "notsupported" writes
|
||||
are rejected.
|
||||
|
||||
What: /sys/devices/system/cpu/cpu#/power/energy_perf_bias
|
||||
Date: March 2019
|
||||
Contact: linux-pm@vger.kernel.org
|
||||
Description: Intel Energy and Performance Bias Hint (EPB)
|
||||
|
||||
EPB for the given CPU in a sliding scale 0 - 15, where a value
|
||||
of 0 corresponds to a hint preference for highest performance
|
||||
and a value of 15 corresponds to the maximum energy savings.
|
||||
|
||||
In order to change the EPB value for the CPU, write either
|
||||
a number in the 0 - 15 sliding scale above, or one of the
|
||||
strings: "performance", "balance-performance", "normal",
|
||||
"balance-power", "power" (that represent values reflected by
|
||||
their meaning), to this attribute.
|
||||
|
||||
This attribute is present for all online CPUs supporting the
|
||||
Intel EPB feature.
|
||||
|
6
Documentation/ABI/testing/sysfs-driver-ucsi-ccg
Normal file
6
Documentation/ABI/testing/sysfs-driver-ucsi-ccg
Normal file
@@ -0,0 +1,6 @@
|
||||
What: /sys/bus/i2c/drivers/ucsi_ccg/.../do_flash
|
||||
Date: May 2019
|
||||
Contact: Ajay Gupta <ajayg@nvidia.com>
|
||||
Description:
|
||||
Tell the driver for Cypress CCGx Type-C controller to attempt
|
||||
firmware upgrade by writing [Yy1] to the file.
|
@@ -45,7 +45,7 @@ Description:
|
||||
use this feature without a clearance from a patch
|
||||
distributor. Removal (rmmod) of patch modules is permanently
|
||||
disabled when the feature is used. See
|
||||
Documentation/livepatch/livepatch.txt for more information.
|
||||
Documentation/livepatch/livepatch.rst for more information.
|
||||
|
||||
What: /sys/kernel/livepatch/<patch>/<object>
|
||||
Date: Nov 2014
|
||||
|
27
Documentation/ABI/testing/usb-uevent
Normal file
27
Documentation/ABI/testing/usb-uevent
Normal file
@@ -0,0 +1,27 @@
|
||||
What: Raise a uevent when a USB Host Controller has died
|
||||
Date: 2019-04-17
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-usb@vger.kernel.org
|
||||
Description: When the USB Host Controller has entered a state where it is no
|
||||
longer functional a uevent will be raised. The uevent will
|
||||
contain ACTION=offline and ERROR=DEAD.
|
||||
|
||||
Here is an example taken using udevadm monitor -p:
|
||||
|
||||
KERNEL[130.428945] offline /devices/pci0000:00/0000:00:10.0/usb2 (usb)
|
||||
ACTION=offline
|
||||
BUSNUM=002
|
||||
DEVNAME=/dev/bus/usb/002/001
|
||||
DEVNUM=001
|
||||
DEVPATH=/devices/pci0000:00/0000:00:10.0/usb2
|
||||
DEVTYPE=usb_device
|
||||
DRIVER=usb
|
||||
ERROR=DEAD
|
||||
MAJOR=189
|
||||
MINOR=128
|
||||
PRODUCT=1d6b/2/414
|
||||
SEQNUM=2168
|
||||
SUBSYSTEM=usb
|
||||
TYPE=9/0/1
|
||||
|
||||
Users: chromium-os-dev@chromium.org
|
@@ -147,7 +147,7 @@ networking subsystems make sure that the buffers they use are valid
|
||||
for you to DMA from/to.
|
||||
|
||||
DMA addressing capabilities
|
||||
==========================
|
||||
===========================
|
||||
|
||||
By default, the kernel assumes that your device can address 32-bits of DMA
|
||||
addressing. For a 64-bit capable device, this needs to be increased, and for
|
||||
@@ -365,13 +365,12 @@ __get_free_pages() (but takes size instead of a page order). If your
|
||||
driver needs regions sized smaller than a page, you may prefer using
|
||||
the dma_pool interface, described below.
|
||||
|
||||
The consistent DMA mapping interfaces, for non-NULL dev, will by
|
||||
default return a DMA address which is 32-bit addressable. Even if the
|
||||
device indicates (via DMA mask) that it may address the upper 32-bits,
|
||||
consistent allocation will only return > 32-bit addresses for DMA if
|
||||
the consistent DMA mask has been explicitly changed via
|
||||
dma_set_coherent_mask(). This is true of the dma_pool interface as
|
||||
well.
|
||||
The consistent DMA mapping interfaces will by default return a DMA address
|
||||
which is 32-bit addressable. Even if the device indicates (via the DMA mask)
|
||||
that it may address the upper 32-bits, consistent allocation will only
|
||||
return > 32-bit addresses for DMA if the consistent DMA mask has been
|
||||
explicitly changed via dma_set_coherent_mask(). This is true of the
|
||||
dma_pool interface as well.
|
||||
|
||||
dma_alloc_coherent() returns two values: the virtual address which you
|
||||
can use to access it from the CPU and dma_handle which you pass to the
|
||||
|
@@ -28,8 +28,13 @@ ifeq ($(HAVE_SPHINX),0)
|
||||
|
||||
else # HAVE_SPHINX
|
||||
|
||||
# User-friendly check for pdflatex
|
||||
# User-friendly check for pdflatex and latexmk
|
||||
HAVE_PDFLATEX := $(shell if which $(PDFLATEX) >/dev/null 2>&1; then echo 1; else echo 0; fi)
|
||||
HAVE_LATEXMK := $(shell if which latexmk >/dev/null 2>&1; then echo 1; else echo 0; fi)
|
||||
|
||||
ifeq ($(HAVE_LATEXMK),1)
|
||||
PDFLATEX := latexmk -$(PDFLATEX)
|
||||
endif #HAVE_LATEXMK
|
||||
|
||||
# Internal variables.
|
||||
PAPEROPT_a4 = -D latex_paper_size=a4
|
||||
@@ -82,7 +87,7 @@ pdfdocs:
|
||||
else # HAVE_PDFLATEX
|
||||
|
||||
pdfdocs: latexdocs
|
||||
$(foreach var,$(SPHINXDIRS), $(MAKE) PDFLATEX=$(PDFLATEX) LATEXOPTS="$(LATEXOPTS)" -C $(BUILDDIR)/$(var)/latex || exit;)
|
||||
$(foreach var,$(SPHINXDIRS), $(MAKE) PDFLATEX="$(PDFLATEX)" LATEXOPTS="$(LATEXOPTS)" -C $(BUILDDIR)/$(var)/latex || exit;)
|
||||
|
||||
endif # HAVE_PDFLATEX
|
||||
|
||||
|
@@ -155,8 +155,7 @@ keeping lock contention under control at all tree levels regardless
|
||||
of the level of loading on the system.
|
||||
|
||||
</p><p>RCU updaters wait for normal grace periods by registering
|
||||
RCU callbacks, either directly via <tt>call_rcu()</tt> and
|
||||
friends (namely <tt>call_rcu_bh()</tt> and <tt>call_rcu_sched()</tt>),
|
||||
RCU callbacks, either directly via <tt>call_rcu()</tt>
|
||||
or indirectly via <tt>synchronize_rcu()</tt> and friends.
|
||||
RCU callbacks are represented by <tt>rcu_head</tt> structures,
|
||||
which are queued on <tt>rcu_data</tt> structures while they are
|
||||
|
@@ -56,6 +56,7 @@ sections.
|
||||
RCU-preempt Expedited Grace Periods</a></h2>
|
||||
|
||||
<p>
|
||||
<tt>CONFIG_PREEMPT=y</tt> kernels implement RCU-preempt.
|
||||
The overall flow of the handling of a given CPU by an RCU-preempt
|
||||
expedited grace period is shown in the following diagram:
|
||||
|
||||
@@ -139,6 +140,7 @@ or offline, among other things.
|
||||
RCU-sched Expedited Grace Periods</a></h2>
|
||||
|
||||
<p>
|
||||
<tt>CONFIG_PREEMPT=n</tt> kernels implement RCU-sched.
|
||||
The overall flow of the handling of a given CPU by an RCU-sched
|
||||
expedited grace period is shown in the following diagram:
|
||||
|
||||
@@ -146,7 +148,7 @@ expedited grace period is shown in the following diagram:
|
||||
|
||||
<p>
|
||||
As with RCU-preempt, RCU-sched's
|
||||
<tt>synchronize_sched_expedited()</tt> ignores offline and
|
||||
<tt>synchronize_rcu_expedited()</tt> ignores offline and
|
||||
idle CPUs, again because they are in remotely detectable
|
||||
quiescent states.
|
||||
However, because the
|
||||
|
@@ -34,12 +34,11 @@ Similarly, any code that happens before the beginning of a given RCU grace
|
||||
period is guaranteed to see the effects of all accesses following the end
|
||||
of that grace period that are within RCU read-side critical sections.
|
||||
|
||||
<p>This guarantee is particularly pervasive for <tt>synchronize_sched()</tt>,
|
||||
for which RCU-sched read-side critical sections include any region
|
||||
<p>Note well that RCU-sched read-side critical sections include any region
|
||||
of code for which preemption is disabled.
|
||||
Given that each individual machine instruction can be thought of as
|
||||
an extremely small region of preemption-disabled code, one can think of
|
||||
<tt>synchronize_sched()</tt> as <tt>smp_mb()</tt> on steroids.
|
||||
<tt>synchronize_rcu()</tt> as <tt>smp_mb()</tt> on steroids.
|
||||
|
||||
<p>RCU updaters use this guarantee by splitting their updates into
|
||||
two phases, one of which is executed before the grace period and
|
||||
|
@@ -81,18 +81,19 @@ currently executing on some other CPU. We therefore cannot free
|
||||
up any data structures used by the old NMI handler until execution
|
||||
of it completes on all other CPUs.
|
||||
|
||||
One way to accomplish this is via synchronize_sched(), perhaps as
|
||||
One way to accomplish this is via synchronize_rcu(), perhaps as
|
||||
follows:
|
||||
|
||||
unset_nmi_callback();
|
||||
synchronize_sched();
|
||||
synchronize_rcu();
|
||||
kfree(my_nmi_data);
|
||||
|
||||
This works because synchronize_sched() blocks until all CPUs complete
|
||||
any preemption-disabled segments of code that they were executing.
|
||||
Since NMI handlers disable preemption, synchronize_sched() is guaranteed
|
||||
This works because (as of v4.20) synchronize_rcu() blocks until all
|
||||
CPUs complete any preemption-disabled segments of code that they were
|
||||
executing.
|
||||
Since NMI handlers disable preemption, synchronize_rcu() is guaranteed
|
||||
not to return until all ongoing NMI handlers exit. It is therefore safe
|
||||
to free up the handler's data as soon as synchronize_sched() returns.
|
||||
to free up the handler's data as soon as synchronize_rcu() returns.
|
||||
|
||||
Important note: for this to work, the architecture in question must
|
||||
invoke nmi_enter() and nmi_exit() on NMI entry and exit, respectively.
|
||||
|
@@ -86,10 +86,8 @@ even on a UP system. So do not do it! Even on a UP system, the RCU
|
||||
infrastructure -must- respect grace periods, and -must- invoke callbacks
|
||||
from a known environment in which no locks are held.
|
||||
|
||||
It -is- safe for synchronize_sched() and synchronize_rcu_bh() to return
|
||||
immediately on an UP system. It is also safe for synchronize_rcu()
|
||||
to return immediately on UP systems, except when running preemptable
|
||||
RCU.
|
||||
Note that it -is- safe for synchronize_rcu() to return immediately on
|
||||
UP systems, including !PREEMPT SMP builds running on UP systems.
|
||||
|
||||
Quick Quiz #3: Why can't synchronize_rcu() return immediately on
|
||||
UP systems running preemptable RCU?
|
||||
|
@@ -182,16 +182,13 @@ over a rather long period of time, but improvements are always welcome!
|
||||
when publicizing a pointer to a structure that can
|
||||
be traversed by an RCU read-side critical section.
|
||||
|
||||
5. If call_rcu(), or a related primitive such as call_rcu_bh(),
|
||||
call_rcu_sched(), or call_srcu() is used, the callback function
|
||||
will be called from softirq context. In particular, it cannot
|
||||
block.
|
||||
5. If call_rcu() or call_srcu() is used, the callback function will
|
||||
be called from softirq context. In particular, it cannot block.
|
||||
|
||||
6. Since synchronize_rcu() can block, it cannot be called from
|
||||
any sort of irq context. The same rule applies for
|
||||
synchronize_rcu_bh(), synchronize_sched(), synchronize_srcu(),
|
||||
synchronize_rcu_expedited(), synchronize_rcu_bh_expedited(),
|
||||
synchronize_sched_expedite(), and synchronize_srcu_expedited().
|
||||
6. Since synchronize_rcu() can block, it cannot be called
|
||||
from any sort of irq context. The same rule applies
|
||||
for synchronize_srcu(), synchronize_rcu_expedited(), and
|
||||
synchronize_srcu_expedited().
|
||||
|
||||
The expedited forms of these primitives have the same semantics
|
||||
as the non-expedited forms, but expediting is both expensive and
|
||||
@@ -212,20 +209,20 @@ over a rather long period of time, but improvements are always welcome!
|
||||
of the system, especially to real-time workloads running on
|
||||
the rest of the system.
|
||||
|
||||
7. If the updater uses call_rcu() or synchronize_rcu(), then the
|
||||
corresponding readers must use rcu_read_lock() and
|
||||
rcu_read_unlock(). If the updater uses call_rcu_bh() or
|
||||
synchronize_rcu_bh(), then the corresponding readers must
|
||||
use rcu_read_lock_bh() and rcu_read_unlock_bh(). If the
|
||||
updater uses call_rcu_sched() or synchronize_sched(), then
|
||||
the corresponding readers must disable preemption, possibly
|
||||
by calling rcu_read_lock_sched() and rcu_read_unlock_sched().
|
||||
If the updater uses synchronize_srcu() or call_srcu(), then
|
||||
the corresponding readers must use srcu_read_lock() and
|
||||
7. As of v4.20, a given kernel implements only one RCU flavor,
|
||||
which is RCU-sched for PREEMPT=n and RCU-preempt for PREEMPT=y.
|
||||
If the updater uses call_rcu() or synchronize_rcu(),
|
||||
then the corresponding readers may use rcu_read_lock() and
|
||||
rcu_read_unlock(), rcu_read_lock_bh() and rcu_read_unlock_bh(),
|
||||
or any pair of primitives that disables and re-enables preemption,
|
||||
for example, rcu_read_lock_sched() and rcu_read_unlock_sched().
|
||||
If the updater uses synchronize_srcu() or call_srcu(),
|
||||
then the corresponding readers must use srcu_read_lock() and
|
||||
srcu_read_unlock(), and with the same srcu_struct. The rules for
|
||||
the expedited primitives are the same as for their non-expedited
|
||||
counterparts. Mixing things up will result in confusion and
|
||||
broken kernels.
|
||||
broken kernels, and has even resulted in an exploitable security
|
||||
issue.
|
||||
|
||||
One exception to this rule: rcu_read_lock() and rcu_read_unlock()
|
||||
may be substituted for rcu_read_lock_bh() and rcu_read_unlock_bh()
|
||||
@@ -288,8 +285,7 @@ over a rather long period of time, but improvements are always welcome!
|
||||
d. Periodically invoke synchronize_rcu(), permitting a limited
|
||||
number of updates per grace period.
|
||||
|
||||
The same cautions apply to call_rcu_bh(), call_rcu_sched(),
|
||||
call_srcu(), and kfree_rcu().
|
||||
The same cautions apply to call_srcu() and kfree_rcu().
|
||||
|
||||
Note that although these primitives do take action to avoid memory
|
||||
exhaustion when any given CPU has too many callbacks, a determined
|
||||
@@ -322,7 +318,7 @@ over a rather long period of time, but improvements are always welcome!
|
||||
|
||||
11. Any lock acquired by an RCU callback must be acquired elsewhere
|
||||
with softirq disabled, e.g., via spin_lock_irqsave(),
|
||||
spin_lock_bh(), etc. Failing to disable irq on a given
|
||||
spin_lock_bh(), etc. Failing to disable softirq on a given
|
||||
acquisition of that lock will result in deadlock as soon as
|
||||
the RCU softirq handler happens to run your RCU callback while
|
||||
interrupting that acquisition's critical section.
|
||||
@@ -335,13 +331,16 @@ over a rather long period of time, but improvements are always welcome!
|
||||
must use whatever locking or other synchronization is required
|
||||
to safely access and/or modify that data structure.
|
||||
|
||||
RCU callbacks are -usually- executed on the same CPU that executed
|
||||
the corresponding call_rcu(), call_rcu_bh(), or call_rcu_sched(),
|
||||
but are by -no- means guaranteed to be. For example, if a given
|
||||
CPU goes offline while having an RCU callback pending, then that
|
||||
RCU callback will execute on some surviving CPU. (If this was
|
||||
not the case, a self-spawning RCU callback would prevent the
|
||||
victim CPU from ever going offline.)
|
||||
Do not assume that RCU callbacks will be executed on the same
|
||||
CPU that executed the corresponding call_rcu() or call_srcu().
|
||||
For example, if a given CPU goes offline while having an RCU
|
||||
callback pending, then that RCU callback will execute on some
|
||||
surviving CPU. (If this was not the case, a self-spawning RCU
|
||||
callback would prevent the victim CPU from ever going offline.)
|
||||
Furthermore, CPUs designated by rcu_nocbs= might well -always-
|
||||
have their RCU callbacks executed on some other CPUs; in fact,
|
||||
for some real-time workloads, this is the whole point of using
|
||||
the rcu_nocbs= kernel boot parameter.
|
||||
|
||||
13. Unlike other forms of RCU, it -is- permissible to block in an
|
||||
SRCU read-side critical section (demarked by srcu_read_lock()
|
||||
@@ -381,11 +380,11 @@ over a rather long period of time, but improvements are always welcome!
|
||||
|
||||
SRCU's expedited primitive (synchronize_srcu_expedited())
|
||||
never sends IPIs to other CPUs, so it is easier on
|
||||
real-time workloads than is synchronize_rcu_expedited(),
|
||||
synchronize_rcu_bh_expedited() or synchronize_sched_expedited().
|
||||
real-time workloads than is synchronize_rcu_expedited().
|
||||
|
||||
Note that rcu_dereference() and rcu_assign_pointer() relate to
|
||||
SRCU just as they do to other forms of RCU.
|
||||
Note that rcu_assign_pointer() relates to SRCU just as it does to
|
||||
other forms of RCU, but instead of rcu_dereference() you should
|
||||
use srcu_dereference() in order to avoid lockdep splats.
|
||||
|
||||
14. The whole point of call_rcu(), synchronize_rcu(), and friends
|
||||
is to wait until all pre-existing readers have finished before
|
||||
@@ -405,6 +404,9 @@ over a rather long period of time, but improvements are always welcome!
|
||||
read-side critical sections. It is the responsibility of the
|
||||
RCU update-side primitives to deal with this.
|
||||
|
||||
For SRCU readers, you can use smp_mb__after_srcu_read_unlock()
|
||||
immediately after an srcu_read_unlock() to get a full barrier.
|
||||
|
||||
16. Use CONFIG_PROVE_LOCKING, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the
|
||||
__rcu sparse checks to validate your RCU code. These can help
|
||||
find problems as follows:
|
||||
@@ -428,22 +430,19 @@ over a rather long period of time, but improvements are always welcome!
|
||||
These debugging aids can help you find problems that are
|
||||
otherwise extremely difficult to spot.
|
||||
|
||||
17. If you register a callback using call_rcu(), call_rcu_bh(),
|
||||
call_rcu_sched(), or call_srcu(), and pass in a function defined
|
||||
within a loadable module, then it in necessary to wait for
|
||||
all pending callbacks to be invoked after the last invocation
|
||||
and before unloading that module. Note that it is absolutely
|
||||
-not- sufficient to wait for a grace period! The current (say)
|
||||
synchronize_rcu() implementation waits only for all previous
|
||||
callbacks registered on the CPU that synchronize_rcu() is running
|
||||
on, but it is -not- guaranteed to wait for callbacks registered
|
||||
on other CPUs.
|
||||
17. If you register a callback using call_rcu() or call_srcu(), and
|
||||
pass in a function defined within a loadable module, then it is
|
||||
necessary to wait for all pending callbacks to be invoked after
|
||||
the last invocation and before unloading that module. Note that
|
||||
it is absolutely -not- sufficient to wait for a grace period!
|
||||
The current (say) synchronize_rcu() implementation is -not-
|
||||
guaranteed to wait for callbacks registered on other CPUs.
|
||||
Or even on the current CPU if that CPU recently went offline
|
||||
and came back online.
|
||||
|
||||
You instead need to use one of the barrier functions:
|
||||
|
||||
o call_rcu() -> rcu_barrier()
|
||||
o call_rcu_bh() -> rcu_barrier()
|
||||
o call_rcu_sched() -> rcu_barrier()
|
||||
o call_srcu() -> srcu_barrier()
|
||||
|
||||
However, these barrier functions are absolutely -not- guaranteed
|
||||
|
@@ -52,10 +52,10 @@ o If I am running on a uniprocessor kernel, which can only do one
|
||||
o How can I see where RCU is currently used in the Linux kernel?
|
||||
|
||||
Search for "rcu_read_lock", "rcu_read_unlock", "call_rcu",
|
||||
"rcu_read_lock_bh", "rcu_read_unlock_bh", "call_rcu_bh",
|
||||
"srcu_read_lock", "srcu_read_unlock", "synchronize_rcu",
|
||||
"synchronize_net", "synchronize_srcu", and the other RCU
|
||||
primitives. Or grab one of the cscope databases from:
|
||||
"rcu_read_lock_bh", "rcu_read_unlock_bh", "srcu_read_lock",
|
||||
"srcu_read_unlock", "synchronize_rcu", "synchronize_net",
|
||||
"synchronize_srcu", and the other RCU primitives. Or grab one
|
||||
of the cscope databases from:
|
||||
|
||||
http://www.rdrop.com/users/paulmck/RCU/linuxusage/rculocktab.html
|
||||
|
||||
|
@@ -351,3 +351,106 @@ garbage values.
|
||||
|
||||
In short, rcu_dereference() is -not- optional when you are going to
|
||||
dereference the resulting pointer.
|
||||
|
||||
|
||||
WHICH MEMBER OF THE rcu_dereference() FAMILY SHOULD YOU USE?
|
||||
|
||||
First, please avoid using rcu_dereference_raw() and also please avoid
|
||||
using rcu_dereference_check() and rcu_dereference_protected() with a
|
||||
second argument with a constant value of 1 (or true, for that matter).
|
||||
With that caution out of the way, here is some guidance for which
|
||||
member of the rcu_dereference() family to use in various situations:
|
||||
|
||||
1. If the access needs to be within an RCU read-side critical
|
||||
section, use rcu_dereference(). With the new consolidated
|
||||
RCU flavors, an RCU read-side critical section is entered
|
||||
using rcu_read_lock(), anything that disables bottom halves,
|
||||
anything that disables interrupts, or anything that disables
|
||||
preemption.
|
||||
|
||||
2. If the access might be within an RCU read-side critical section
|
||||
on the one hand, or protected by (say) my_lock on the other,
|
||||
use rcu_dereference_check(), for example:
|
||||
|
||||
p1 = rcu_dereference_check(p->rcu_protected_pointer,
|
||||
lockdep_is_held(&my_lock));
|
||||
|
||||
|
||||
3. If the access might be within an RCU read-side critical section
|
||||
on the one hand, or protected by either my_lock or your_lock on
|
||||
the other, again use rcu_dereference_check(), for example:
|
||||
|
||||
p1 = rcu_dereference_check(p->rcu_protected_pointer,
|
||||
lockdep_is_held(&my_lock) ||
|
||||
lockdep_is_held(&your_lock));
|
||||
|
||||
4. If the access is on the update side, so that it is always protected
|
||||
by my_lock, use rcu_dereference_protected():
|
||||
|
||||
p1 = rcu_dereference_protected(p->rcu_protected_pointer,
|
||||
lockdep_is_held(&my_lock));
|
||||
|
||||
This can be extended to handle multiple locks as in #3 above,
|
||||
and both can be extended to check other conditions as well.
|
||||
|
||||
5. If the protection is supplied by the caller, and is thus unknown
|
||||
to this code, that is the rare case when rcu_dereference_raw()
|
||||
is appropriate. In addition, rcu_dereference_raw() might be
|
||||
appropriate when the lockdep expression would be excessively
|
||||
complex, except that a better approach in that case might be to
|
||||
take a long hard look at your synchronization design. Still,
|
||||
there are data-locking cases where any one of a very large number
|
||||
of locks or reference counters suffices to protect the pointer,
|
||||
so rcu_dereference_raw() does have its place.
|
||||
|
||||
However, its place is probably quite a bit smaller than one
|
||||
might expect given the number of uses in the current kernel.
|
||||
Ditto for its synonym, rcu_dereference_check( ... , 1), and
|
||||
its close relative, rcu_dereference_protected(... , 1).
|
||||
|
||||
|
||||
SPARSE CHECKING OF RCU-PROTECTED POINTERS
|
||||
|
||||
The sparse static-analysis tool checks for direct access to RCU-protected
|
||||
pointers, which can result in "interesting" bugs due to compiler
|
||||
optimizations involving invented loads and perhaps also load tearing.
|
||||
For example, suppose someone mistakenly does something like this:
|
||||
|
||||
p = q->rcu_protected_pointer;
|
||||
do_something_with(p->a);
|
||||
do_something_else_with(p->b);
|
||||
|
||||
If register pressure is high, the compiler might optimize "p" out
|
||||
of existence, transforming the code to something like this:
|
||||
|
||||
do_something_with(q->rcu_protected_pointer->a);
|
||||
do_something_else_with(q->rcu_protected_pointer->b);
|
||||
|
||||
This could fatally disappoint your code if q->rcu_protected_pointer
|
||||
changed in the meantime. Nor is this a theoretical problem: Exactly
|
||||
this sort of bug cost Paul E. McKenney (and several of his innocent
|
||||
colleagues) a three-day weekend back in the early 1990s.
|
||||
|
||||
Load tearing could of course result in dereferencing a mashup of a pair
|
||||
of pointers, which also might fatally disappoint your code.
|
||||
|
||||
These problems could have been avoided simply by making the code instead
|
||||
read as follows:
|
||||
|
||||
p = rcu_dereference(q->rcu_protected_pointer);
|
||||
do_something_with(p->a);
|
||||
do_something_else_with(p->b);
|
||||
|
||||
Unfortunately, these sorts of bugs can be extremely hard to spot during
|
||||
review. This is where the sparse tool comes into play, along with the
|
||||
"__rcu" marker. If you mark a pointer declaration, whether in a structure
|
||||
or as a formal parameter, with "__rcu", this tells sparse to complain if
|
||||
this pointer is accessed directly. It will also cause sparse to complain
|
||||
if a pointer not marked with "__rcu" is accessed using rcu_dereference()
|
||||
and friends. For example, ->rcu_protected_pointer might be declared as
|
||||
follows:
|
||||
|
||||
struct foo __rcu *rcu_protected_pointer;
|
||||
|
||||
Use of "__rcu" is opt-in. If you choose not to use it, then you should
|
||||
ignore the sparse warnings.
|
||||
|
@@ -83,16 +83,15 @@ Pseudo-code using rcu_barrier() is as follows:
|
||||
2. Execute rcu_barrier().
|
||||
3. Allow the module to be unloaded.
|
||||
|
||||
There are also rcu_barrier_bh(), rcu_barrier_sched(), and srcu_barrier()
|
||||
functions for the other flavors of RCU, and you of course must match
|
||||
the flavor of rcu_barrier() with that of call_rcu(). If your module
|
||||
uses multiple flavors of call_rcu(), then it must also use multiple
|
||||
There is also an srcu_barrier() function for SRCU, and you of course
|
||||
must match the flavor of rcu_barrier() with that of call_rcu(). If your
|
||||
module uses multiple flavors of call_rcu(), then it must also use multiple
|
||||
flavors of rcu_barrier() when unloading that module. For example, if
|
||||
it uses call_rcu_bh(), call_srcu() on srcu_struct_1, and call_srcu() on
|
||||
it uses call_rcu(), call_srcu() on srcu_struct_1, and call_srcu() on
|
||||
srcu_struct_2, then the following three lines of code will be required
|
||||
when unloading:
|
||||
|
||||
1 rcu_barrier_bh();
|
||||
1 rcu_barrier();
|
||||
2 srcu_barrier(&srcu_struct_1);
|
||||
3 srcu_barrier(&srcu_struct_2);
|
||||
|
||||
@@ -185,12 +184,12 @@ module invokes call_rcu() from timers, you will need to first cancel all
|
||||
the timers, and only then invoke rcu_barrier() to wait for any remaining
|
||||
RCU callbacks to complete.
|
||||
|
||||
Of course, if you module uses call_rcu_bh(), you will need to invoke
|
||||
rcu_barrier_bh() before unloading. Similarly, if your module uses
|
||||
call_rcu_sched(), you will need to invoke rcu_barrier_sched() before
|
||||
unloading. If your module uses call_rcu(), call_rcu_bh(), -and-
|
||||
call_rcu_sched(), then you will need to invoke each of rcu_barrier(),
|
||||
rcu_barrier_bh(), and rcu_barrier_sched().
|
||||
Of course, if your module uses call_rcu(), you will need to invoke
|
||||
rcu_barrier() before unloading. Similarly, if your module uses
|
||||
call_srcu(), you will need to invoke srcu_barrier() before unloading,
|
||||
and on the same srcu_struct structure. If your module uses call_rcu()
|
||||
-and- call_srcu(), then you will need to invoke rcu_barrier() -and-
|
||||
srcu_barrier().
|
||||
|
||||
|
||||
Implementing rcu_barrier()
|
||||
@@ -223,8 +222,8 @@ shown below. Note that the final "1" in on_each_cpu()'s argument list
|
||||
ensures that all the calls to rcu_barrier_func() will have completed
|
||||
before on_each_cpu() returns. Line 9 then waits for the completion.
|
||||
|
||||
This code was rewritten in 2008 to support rcu_barrier_bh() and
|
||||
rcu_barrier_sched() in addition to the original rcu_barrier().
|
||||
This code was rewritten in 2008 and several times thereafter, but this
|
||||
still gives the general idea.
|
||||
|
||||
The rcu_barrier_func() runs on each CPU, where it invokes call_rcu()
|
||||
to post an RCU callback, as follows:
|
||||
|
@@ -310,7 +310,7 @@ reader, updater, and reclaimer.
|
||||
|
||||
|
||||
rcu_assign_pointer()
|
||||
+--------+
|
||||
+--------+
|
||||
+---------------------->| reader |---------+
|
||||
| +--------+ |
|
||||
| | |
|
||||
@@ -318,12 +318,12 @@ reader, updater, and reclaimer.
|
||||
| | | rcu_read_lock()
|
||||
| | | rcu_read_unlock()
|
||||
| rcu_dereference() | |
|
||||
+---------+ | |
|
||||
| updater |<---------------------+ |
|
||||
+---------+ V
|
||||
+---------+ | |
|
||||
| updater |<----------------+ |
|
||||
+---------+ V
|
||||
| +-----------+
|
||||
+----------------------------------->| reclaimer |
|
||||
+-----------+
|
||||
+-----------+
|
||||
Defer:
|
||||
synchronize_rcu() & call_rcu()
|
||||
|
||||
|
@@ -63,6 +63,110 @@ as well as medium and long term trends. The total absolute stall time
|
||||
spikes which wouldn't necessarily make a dent in the time averages,
|
||||
or to average trends over custom time frames.
|
||||
|
||||
Monitoring for pressure thresholds
|
||||
==================================
|
||||
|
||||
Users can register triggers and use poll() to be woken up when resource
|
||||
pressure exceeds certain thresholds.
|
||||
|
||||
A trigger describes the maximum cumulative stall time over a specific
|
||||
time window, e.g. 100ms of total stall time within any 500ms window to
|
||||
generate a wakeup event.
|
||||
|
||||
To register a trigger, the user has to open the psi interface file under
|
||||
/proc/pressure/ representing the resource to be monitored and write the
|
||||
desired threshold and time window. The open file descriptor should be
|
||||
used to wait for trigger events using select(), poll() or epoll().
|
||||
The following format is used:
|
||||
|
||||
<some|full> <stall amount in us> <time window in us>
|
||||
|
||||
For example writing "some 150000 1000000" into /proc/pressure/memory
|
||||
would add 150ms threshold for partial memory stall measured within
|
||||
1sec time window. Writing "full 50000 1000000" into /proc/pressure/io
|
||||
would add 50ms threshold for full io stall measured within 1sec time window.
|
||||
|
||||
Triggers can be set on more than one psi metric and more than one trigger
|
||||
for the same psi metric can be specified. However for each trigger a separate
|
||||
file descriptor is required to be able to poll it separately from others,
|
||||
therefore for each trigger a separate open() syscall should be made even
|
||||
when opening the same psi interface file.
|
||||
|
||||
Monitors activate only when system enters stall state for the monitored
|
||||
psi metric and deactivate upon exit from the stall state. While the system is
|
||||
in the stall state psi signal growth is monitored at a rate of 10 times per
|
||||
tracking window.
|
||||
|
||||
The kernel accepts window sizes ranging from 500ms to 10s, therefore min
|
||||
monitoring update interval is 50ms and max is 1s. Min limit is set to
|
||||
prevent overly frequent polling. Max limit is chosen as a high enough number
|
||||
after which monitors are most likely not needed and psi averages can be used
|
||||
instead.
|
||||
|
||||
When activated, psi monitor stays active for at least the duration of one
|
||||
tracking window to avoid repeated activations/deactivations when system is
|
||||
bouncing in and out of the stall state.
|
||||
|
||||
Notifications to the userspace are rate-limited to one per tracking window.
|
||||
|
||||
The trigger will de-register when the file descriptor used to define the
|
||||
trigger is closed.
|
||||
|
||||
Userspace monitor usage example
|
||||
===============================
|
||||
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <stdio.h>
|
||||
#include <poll.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
|
||||
/*
|
||||
* Monitor memory partial stall with 1s tracking window size
|
||||
* and 150ms threshold.
|
||||
*/
|
||||
int main() {
|
||||
const char trig[] = "some 150000 1000000";
|
||||
struct pollfd fds;
|
||||
int n;
|
||||
|
||||
fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
|
||||
if (fds.fd < 0) {
|
||||
printf("/proc/pressure/memory open error: %s\n",
|
||||
strerror(errno));
|
||||
return 1;
|
||||
}
|
||||
fds.events = POLLPRI;
|
||||
|
||||
if (write(fds.fd, trig, strlen(trig) + 1) < 0) {
|
||||
printf("/proc/pressure/memory write error: %s\n",
|
||||
strerror(errno));
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("waiting for events...\n");
|
||||
while (1) {
|
||||
n = poll(&fds, 1, -1);
|
||||
if (n < 0) {
|
||||
printf("poll error: %s\n", strerror(errno));
|
||||
return 1;
|
||||
}
|
||||
if (fds.revents & POLLERR) {
|
||||
printf("got POLLERR, event source is gone\n");
|
||||
return 0;
|
||||
}
|
||||
if (fds.revents & POLLPRI) {
|
||||
printf("event triggered!\n");
|
||||
} else {
|
||||
printf("unknown event received: 0x%x\n", fds.revents);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
Cgroup2 interface
|
||||
=================
|
||||
|
||||
@@ -71,3 +175,6 @@ mounted, pressure stall information is also tracked for tasks grouped
|
||||
into cgroups. Each subdirectory in the cgroupfs mountpoint contains
|
||||
cpu.pressure, memory.pressure, and io.pressure files; the format is
|
||||
the same as the /proc/pressure/ files.
|
||||
|
||||
Per-cgroup psi monitors can be specified and used the same way as
|
||||
system-wide ones.
|
||||
|
@@ -1,66 +0,0 @@
|
||||
The AML Debugger
|
||||
|
||||
Copyright (C) 2016, Intel Corporation
|
||||
Author: Lv Zheng <lv.zheng@intel.com>
|
||||
|
||||
|
||||
This document describes the usage of the AML debugger embedded in the Linux
|
||||
kernel.
|
||||
|
||||
1. Build the debugger
|
||||
|
||||
The following kernel configuration items are required to enable the AML
|
||||
debugger interface from the Linux kernel:
|
||||
|
||||
CONFIG_ACPI_DEBUGGER=y
|
||||
CONFIG_ACPI_DEBUGGER_USER=m
|
||||
|
||||
The userspace utilities can be built from the kernel source tree using
|
||||
the following commands:
|
||||
|
||||
$ cd tools
|
||||
$ make acpi
|
||||
|
||||
The resultant userspace tool binary is then located at:
|
||||
|
||||
tools/power/acpi/acpidbg
|
||||
|
||||
It can be installed to system directories by running "make install" (as a
|
||||
sufficiently privileged user).
|
||||
|
||||
2. Start the userspace debugger interface
|
||||
|
||||
After booting the kernel with the debugger built-in, the debugger can be
|
||||
started by using the following commands:
|
||||
|
||||
# mount -t debugfs none /sys/kernel/debug
|
||||
# modprobe acpi_dbg
|
||||
# tools/power/acpi/acpidbg
|
||||
|
||||
That spawns the interactive AML debugger environment where you can execute
|
||||
debugger commands.
|
||||
|
||||
The commands are documented in the "ACPICA Overview and Programmer Reference"
|
||||
that can be downloaded from
|
||||
|
||||
https://acpica.org/documentation
|
||||
|
||||
The detailed debugger commands reference is located in Chapter 12 "ACPICA
|
||||
Debugger Reference". The "help" command can be used for a quick reference.
|
||||
|
||||
3. Stop the userspace debugger interface
|
||||
|
||||
The interactive debugger interface can be closed by pressing Ctrl+C or using
|
||||
the "quit" or "exit" commands. When finished, unload the module with:
|
||||
|
||||
# rmmod acpi_dbg
|
||||
|
||||
The module unloading may fail if there is an acpidbg instance running.
|
||||
|
||||
4. Run the debugger in a script
|
||||
|
||||
It may be useful to run the AML debugger in a test script. "acpidbg" supports
|
||||
this in a special "batch" mode. For example, the following command outputs
|
||||
the entire ACPI namespace:
|
||||
|
||||
# acpidbg -b "namespace"
|
@@ -1,147 +0,0 @@
|
||||
APEI output format
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
APEI uses printk as hardware error reporting interface, the output
|
||||
format is as follows.
|
||||
|
||||
<error record> :=
|
||||
APEI generic hardware error status
|
||||
severity: <integer>, <severity string>
|
||||
section: <integer>, severity: <integer>, <severity string>
|
||||
flags: <integer>
|
||||
<section flags strings>
|
||||
fru_id: <uuid string>
|
||||
fru_text: <string>
|
||||
section_type: <section type string>
|
||||
<section data>
|
||||
|
||||
<severity string>* := recoverable | fatal | corrected | info
|
||||
|
||||
<section flags strings># :=
|
||||
[primary][, containment warning][, reset][, threshold exceeded]\
|
||||
[, resource not accessible][, latent error]
|
||||
|
||||
<section type string> := generic processor error | memory error | \
|
||||
PCIe error | unknown, <uuid string>
|
||||
|
||||
<section data> :=
|
||||
<generic processor section data> | <memory section data> | \
|
||||
<pcie section data> | <null>
|
||||
|
||||
<generic processor section data> :=
|
||||
[processor_type: <integer>, <proc type string>]
|
||||
[processor_isa: <integer>, <proc isa string>]
|
||||
[error_type: <integer>
|
||||
<proc error type strings>]
|
||||
[operation: <integer>, <proc operation string>]
|
||||
[flags: <integer>
|
||||
<proc flags strings>]
|
||||
[level: <integer>]
|
||||
[version_info: <integer>]
|
||||
[processor_id: <integer>]
|
||||
[target_address: <integer>]
|
||||
[requestor_id: <integer>]
|
||||
[responder_id: <integer>]
|
||||
[IP: <integer>]
|
||||
|
||||
<proc type string>* := IA32/X64 | IA64
|
||||
|
||||
<proc isa string>* := IA32 | IA64 | X64
|
||||
|
||||
<processor error type strings># :=
|
||||
[cache error][, TLB error][, bus error][, micro-architectural error]
|
||||
|
||||
<proc operation string>* := unknown or generic | data read | data write | \
|
||||
instruction execution
|
||||
|
||||
<proc flags strings># :=
|
||||
[restartable][, precise IP][, overflow][, corrected]
|
||||
|
||||
<memory section data> :=
|
||||
[error_status: <integer>]
|
||||
[physical_address: <integer>]
|
||||
[physical_address_mask: <integer>]
|
||||
[node: <integer>]
|
||||
[card: <integer>]
|
||||
[module: <integer>]
|
||||
[bank: <integer>]
|
||||
[device: <integer>]
|
||||
[row: <integer>]
|
||||
[column: <integer>]
|
||||
[bit_position: <integer>]
|
||||
[requestor_id: <integer>]
|
||||
[responder_id: <integer>]
|
||||
[target_id: <integer>]
|
||||
[error_type: <integer>, <mem error type string>]
|
||||
|
||||
<mem error type string>* :=
|
||||
unknown | no error | single-bit ECC | multi-bit ECC | \
|
||||
single-symbol chipkill ECC | multi-symbol chipkill ECC | master abort | \
|
||||
target abort | parity error | watchdog timeout | invalid address | \
|
||||
mirror Broken | memory sparing | scrub corrected error | \
|
||||
scrub uncorrected error
|
||||
|
||||
<pcie section data> :=
|
||||
[port_type: <integer>, <pcie port type string>]
|
||||
[version: <integer>.<integer>]
|
||||
[command: <integer>, status: <integer>]
|
||||
[device_id: <integer>:<integer>:<integer>.<integer>
|
||||
slot: <integer>
|
||||
secondary_bus: <integer>
|
||||
vendor_id: <integer>, device_id: <integer>
|
||||
class_code: <integer>]
|
||||
[serial number: <integer>, <integer>]
|
||||
[bridge: secondary_status: <integer>, control: <integer>]
|
||||
[aer_status: <integer>, aer_mask: <integer>
|
||||
<aer status string>
|
||||
[aer_uncor_severity: <integer>]
|
||||
aer_layer=<aer layer string>, aer_agent=<aer agent string>
|
||||
aer_tlp_header: <integer> <integer> <integer> <integer>]
|
||||
|
||||
<pcie port type string>* := PCIe end point | legacy PCI end point | \
|
||||
unknown | unknown | root port | upstream switch port | \
|
||||
downstream switch port | PCIe to PCI/PCI-X bridge | \
|
||||
PCI/PCI-X to PCIe bridge | root complex integrated endpoint device | \
|
||||
root complex event collector
|
||||
|
||||
if section severity is fatal or recoverable
|
||||
<aer status string># :=
|
||||
unknown | unknown | unknown | unknown | Data Link Protocol | \
|
||||
unknown | unknown | unknown | unknown | unknown | unknown | unknown | \
|
||||
Poisoned TLP | Flow Control Protocol | Completion Timeout | \
|
||||
Completer Abort | Unexpected Completion | Receiver Overflow | \
|
||||
Malformed TLP | ECRC | Unsupported Request
|
||||
else
|
||||
<aer status string># :=
|
||||
Receiver Error | unknown | unknown | unknown | unknown | unknown | \
|
||||
Bad TLP | Bad DLLP | RELAY_NUM Rollover | unknown | unknown | unknown | \
|
||||
Replay Timer Timeout | Advisory Non-Fatal
|
||||
fi
|
||||
|
||||
<aer layer string> :=
|
||||
Physical Layer | Data Link Layer | Transaction Layer
|
||||
|
||||
<aer agent string> :=
|
||||
Receiver ID | Requester ID | Completer ID | Transmitter ID
|
||||
|
||||
Where [] designates that the corresponding content is optional
|
||||
|
||||
All <field string> description with * has the following format:
|
||||
|
||||
field: <integer>, <field string>
|
||||
|
||||
Where value of <integer> should be the position of "string" in <field
|
||||
string> description. Otherwise, <field string> will be "unknown".
|
||||
|
||||
All <field strings> description with # has the following format:
|
||||
|
||||
field: <integer>
|
||||
<field strings>
|
||||
|
||||
Where each string in <field strings> corresponds to one set bit of
|
||||
<integer>. The bit position is the position of "string" in <field
|
||||
strings> description.
|
||||
|
||||
For more detailed explanation of every field, please refer to UEFI
|
||||
specification version 2.3 or later, section Appendix N: Common
|
||||
Platform Error Record.
|
99
Documentation/acpi/dsd/leds.txt
Normal file
99
Documentation/acpi/dsd/leds.txt
Normal file
@@ -0,0 +1,99 @@
|
||||
Describing and referring to LEDs in ACPI
|
||||
|
||||
Individual LEDs are described by hierarchical data extension [6] nodes under the
|
||||
device node, the LED driver chip. The "reg" property in the LED specific nodes
|
||||
tells the numerical ID of each individual LED output to which the LEDs are
|
||||
connected. [3] The hierarchical data nodes are named "led@X", where X is the
|
||||
number of the LED output.
|
||||
|
||||
Referring to LEDs in Device tree is documented in [4], in "flash-leds" property
|
||||
documentation. In short, LEDs are directly referred to by using phandles.
|
||||
|
||||
While Device tree allows referring to any node in the tree[1], in ACPI
|
||||
references are limited to device nodes only [2]. For this reason using the same
|
||||
mechanism on ACPI is not possible. A mechanism to refer to non-device ACPI nodes
|
||||
is documented in [7].
|
||||
|
||||
ACPI allows (as does DT) using integer arguments after the reference. A
|
||||
combination of the LED driver device reference and an integer argument,
|
||||
referring to the "reg" property of the relevant LED, is used to identify
|
||||
individual LEDs. The value of the "reg" property is a contract between the
|
||||
firmware and software; it uniquely identifies the LED driver outputs.
|
||||
|
||||
Under the LED driver device, the first hierarchical data extension package list
|
||||
entry shall contain the string "led@" followed by the number of the LED,
|
||||
followed by the referred object name. That object shall be named "LED" followed
|
||||
by the number of the LED.
|
||||
|
||||
An ASL example of a camera sensor device and a LED driver device for two LEDs.
|
||||
Objects not relevant for LEDs or the references to them have been omitted.
|
||||
|
||||
Device (LED)
|
||||
{
|
||||
Name (_DSD, Package () {
|
||||
ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"),
|
||||
Package () {
|
||||
Package () { "led@0", LED0 },
|
||||
Package () { "led@1", LED1 },
|
||||
}
|
||||
})
|
||||
Name (LED0, Package () {
|
||||
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
||||
Package () {
|
||||
Package () { "reg", 0 },
|
||||
Package () { "flash-max-microamp", 1000000 },
|
||||
Package () { "flash-timeout-us", 200000 },
|
||||
Package () { "led-max-microamp", 100000 },
|
||||
Package () { "label", "white:flash" },
|
||||
}
|
||||
})
|
||||
Name (LED1, Package () {
|
||||
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
||||
Package () {
|
||||
Package () { "reg", 1 },
|
||||
Package () { "led-max-microamp", 10000 },
|
||||
Package () { "label", "red:indicator" },
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
Device (SEN)
|
||||
{
|
||||
Name (_DSD, Package () {
|
||||
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
||||
Package () {
|
||||
Package () {
|
||||
"flash-leds",
|
||||
Package () { ^LED, "led@0", ^LED, "led@1" },
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
where
|
||||
|
||||
LED LED driver device
|
||||
LED0 First LED
|
||||
LED1 Second LED
|
||||
SEN Camera sensor device (or another device the LED is
|
||||
related to)
|
||||
|
||||
[1] Device tree. <URL:http://www.devicetree.org>, referenced 2019-02-21.
|
||||
|
||||
[2] Advanced Configuration and Power Interface Specification.
|
||||
<URL:https://uefi.org/sites/default/files/resources/ACPI_6_3_final_Jan30.pdf>,
|
||||
referenced 2019-02-21.
|
||||
|
||||
[3] Documentation/devicetree/bindings/leds/common.txt
|
||||
|
||||
[4] Documentation/devicetree/bindings/media/video-interfaces.txt
|
||||
|
||||
[5] Device Properties UUID For _DSD.
|
||||
<URL:http://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf>,
|
||||
referenced 2019-02-21.
|
||||
|
||||
[6] Hierarchical Data Extension UUID For _DSD.
|
||||
<URL:http://www.uefi.org/sites/default/files/resources/_DSD-hierarchical-data-extension-UUID-v1.1.pdf>,
|
||||
referenced 2019-02-21.
|
||||
|
||||
[7] Documentation/acpi/dsd/data-node-reference.txt
|
@@ -1,58 +0,0 @@
|
||||
ACPI I2C Muxes
|
||||
--------------
|
||||
|
||||
Describing an I2C device hierarchy that includes I2C muxes requires an ACPI
|
||||
Device () scope per mux channel.
|
||||
|
||||
Consider this topology:
|
||||
|
||||
+------+ +------+
|
||||
| SMB1 |-->| MUX0 |--CH00--> i2c client A (0x50)
|
||||
| | | 0x70 |--CH01--> i2c client B (0x50)
|
||||
+------+ +------+
|
||||
|
||||
which corresponds to the following ASL:
|
||||
|
||||
Device (SMB1)
|
||||
{
|
||||
Name (_HID, ...)
|
||||
Device (MUX0)
|
||||
{
|
||||
Name (_HID, ...)
|
||||
Name (_CRS, ResourceTemplate () {
|
||||
I2cSerialBus (0x70, ControllerInitiated, I2C_SPEED,
|
||||
AddressingMode7Bit, "^SMB1", 0x00,
|
||||
ResourceConsumer,,)
|
||||
}
|
||||
|
||||
Device (CH00)
|
||||
{
|
||||
Name (_ADR, 0)
|
||||
|
||||
Device (CLIA)
|
||||
{
|
||||
Name (_HID, ...)
|
||||
Name (_CRS, ResourceTemplate () {
|
||||
I2cSerialBus (0x50, ControllerInitiated, I2C_SPEED,
|
||||
AddressingMode7Bit, "^CH00", 0x00,
|
||||
ResourceConsumer,,)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Device (CH01)
|
||||
{
|
||||
Name (_ADR, 1)
|
||||
|
||||
Device (CLIB)
|
||||
{
|
||||
Name (_HID, ...)
|
||||
Name (_CRS, ResourceTemplate () {
|
||||
I2cSerialBus (0x50, ControllerInitiated, I2C_SPEED,
|
||||
AddressingMode7Bit, "^CH01", 0x00,
|
||||
ResourceConsumer,,)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@@ -1,111 +0,0 @@
|
||||
Upgrading ACPI tables via initrd
|
||||
================================
|
||||
|
||||
1) Introduction (What is this about)
|
||||
2) What is this for
|
||||
3) How does it work
|
||||
4) References (Where to retrieve userspace tools)
|
||||
|
||||
1) What is this about
|
||||
---------------------
|
||||
|
||||
If the ACPI_TABLE_UPGRADE compile option is true, it is possible to
|
||||
upgrade the ACPI execution environment that is defined by the ACPI tables
|
||||
via upgrading the ACPI tables provided by the BIOS with an instrumented,
|
||||
modified, more recent version, or installing brand new ACPI tables.
|
||||
|
||||
When building initrd with kernel in a single image, option
|
||||
ACPI_TABLE_OVERRIDE_VIA_BUILTIN_INITRD should also be true for this
|
||||
feature to work.
|
||||
|
||||
For a full list of ACPI tables that can be upgraded/installed, take a look
|
||||
at the char *table_sigs[MAX_ACPI_SIGNATURE]; definition in
|
||||
drivers/acpi/tables.c.
|
||||
All ACPI tables iasl (Intel's ACPI compiler and disassembler) knows should
|
||||
be overridable, except:
|
||||
- ACPI_SIG_RSDP (has a signature of 6 bytes)
|
||||
- ACPI_SIG_FACS (does not have an ordinary ACPI table header)
|
||||
Both could get implemented as well.
|
||||
|
||||
|
||||
2) What is this for
|
||||
-------------------
|
||||
|
||||
Complain to your platform/BIOS vendor if you find a bug which is so severe
|
||||
that a workaround is not accepted in the Linux kernel. And this facility
|
||||
allows you to upgrade the buggy tables before your platform/BIOS vendor
|
||||
releases an upgraded BIOS binary.
|
||||
|
||||
This facility can be used by platform/BIOS vendors to provide a Linux
|
||||
compatible environment without modifying the underlying platform firmware.
|
||||
|
||||
This facility also provides a powerful feature to easily debug and test
|
||||
ACPI BIOS table compatibility with the Linux kernel by modifying old
|
||||
platform provided ACPI tables or inserting new ACPI tables.
|
||||
|
||||
It can and should be enabled in any kernel because there is no functional
|
||||
change with not instrumented initrds.
|
||||
|
||||
|
||||
3) How does it work
|
||||
-------------------
|
||||
|
||||
# Extract the machine's ACPI tables:
|
||||
cd /tmp
|
||||
acpidump >acpidump
|
||||
acpixtract -a acpidump
|
||||
# Disassemble, modify and recompile them:
|
||||
iasl -d *.dat
|
||||
# For example add this statement into a _PRT (PCI Routing Table) function
|
||||
# of the DSDT:
|
||||
Store("HELLO WORLD", debug)
|
||||
# And increase the OEM Revision. For example, before modification:
|
||||
DefinitionBlock ("DSDT.aml", "DSDT", 2, "INTEL ", "TEMPLATE", 0x00000000)
|
||||
# After modification:
|
||||
DefinitionBlock ("DSDT.aml", "DSDT", 2, "INTEL ", "TEMPLATE", 0x00000001)
|
||||
iasl -sa dsdt.dsl
|
||||
# Add the raw ACPI tables to an uncompressed cpio archive.
|
||||
# They must be put into a /kernel/firmware/acpi directory inside the cpio
|
||||
# archive. Note that if the table put here matches a platform table
|
||||
# (similar Table Signature, and similar OEMID, and similar OEM Table ID)
|
||||
# with a more recent OEM Revision, the platform table will be upgraded by
|
||||
# this table. If the table put here doesn't match a platform table
|
||||
# (dissimilar Table Signature, or dissimilar OEMID, or dissimilar OEM Table
|
||||
# ID), this table will be appended.
|
||||
mkdir -p kernel/firmware/acpi
|
||||
cp dsdt.aml kernel/firmware/acpi
|
||||
# A maximum of "NR_ACPI_INITRD_TABLES (64)" tables are currently allowed
|
||||
# (see osl.c):
|
||||
iasl -sa facp.dsl
|
||||
iasl -sa ssdt1.dsl
|
||||
cp facp.aml kernel/firmware/acpi
|
||||
cp ssdt1.aml kernel/firmware/acpi
|
||||
# The uncompressed cpio archive must be the first. Other, typically
|
||||
# compressed cpio archives, must be concatenated on top of the uncompressed
|
||||
# one. Following command creates the uncompressed cpio archive and
|
||||
# concatenates the original initrd on top:
|
||||
find kernel | cpio -H newc --create > /boot/instrumented_initrd
|
||||
cat /boot/initrd >>/boot/instrumented_initrd
|
||||
# reboot with increased acpi debug level, e.g. boot params:
|
||||
acpi.debug_level=0x2 acpi.debug_layer=0xFFFFFFFF
|
||||
# and check your syslog:
|
||||
[ 1.268089] ACPI: PCI Interrupt Routing Table [\_SB_.PCI0._PRT]
|
||||
[ 1.272091] [ACPI Debug] String [0x0B] "HELLO WORLD"
|
||||
|
||||
iasl is able to disassemble and recompile quite a lot of different,
|
||||
also static ACPI tables.
|
||||
|
||||
|
||||
4) Where to retrieve userspace tools
|
||||
------------------------------------
|
||||
|
||||
iasl and acpixtract are part of Intel's ACPICA project:
|
||||
http://acpica.org/
|
||||
and should be packaged by distributions (for example in the acpica package
|
||||
on SUSE).
|
||||
|
||||
acpidump can be found in Len Brown's pmtools:
|
||||
ftp://kernel.org/pub/linux/kernel/people/lenb/acpi/utils/pmtools/acpidump
|
||||
This tool is also part of the acpica package on SUSE.
|
||||
Alternatively, used ACPI tables can be retrieved via sysfs in latest kernels:
|
||||
/sys/firmware/acpi/tables
|
@@ -1,73 +0,0 @@
|
||||
Linux ACPI Custom Control Method How To
|
||||
=======================================
|
||||
|
||||
Written by Zhang Rui <rui.zhang@intel.com>
|
||||
|
||||
|
||||
Linux supports customizing ACPI control methods at runtime.
|
||||
|
||||
Users can use this to
|
||||
1. override an existing method which may not work correctly,
|
||||
or just for debugging purposes.
|
||||
2. insert a completely new method in order to create a missing
|
||||
method such as _OFF, _ON, _STA, _INI, etc.
|
||||
For these cases, it is far simpler to dynamically install a single
|
||||
control method rather than override the entire DSDT, because kernel
|
||||
rebuild/reboot is not needed and test result can be got in minutes.
|
||||
|
||||
Note: Only ACPI METHOD can be overridden, any other object types like
|
||||
"Device", "OperationRegion", are not recognized. Methods
|
||||
declared inside scope operators are also not supported.
|
||||
Note: The same ACPI control method can be overridden for many times,
|
||||
and it's always the latest one that is used by Linux/kernel.
|
||||
Note: To get the ACPI debug object output (Store (AAAA, Debug)),
|
||||
please run "echo 1 > /sys/module/acpi/parameters/aml_debug_output".
|
||||
|
||||
1. override an existing method
|
||||
a) get the ACPI table via ACPI sysfs I/F. e.g. to get the DSDT,
|
||||
just run "cat /sys/firmware/acpi/tables/DSDT > /tmp/dsdt.dat"
|
||||
b) disassemble the table by running "iasl -d dsdt.dat".
|
||||
c) rewrite the ASL code of the method and save it in a new file,
|
||||
d) package the new file (psr.asl) to an ACPI table format.
|
||||
Here is an example of a customized \_SB_.AC._PSR method,
|
||||
|
||||
DefinitionBlock ("", "SSDT", 1, "", "", 0x20080715)
|
||||
{
|
||||
Method (\_SB_.AC._PSR, 0, NotSerialized)
|
||||
{
|
||||
Store ("In AC _PSR", Debug)
|
||||
Return (ACON)
|
||||
}
|
||||
}
|
||||
Note that the full pathname of the method in ACPI namespace
|
||||
should be used.
|
||||
e) assemble the file to generate the AML code of the method.
|
||||
e.g. "iasl -vw 6084 psr.asl" (psr.aml is generated as a result)
|
||||
If parameter "-vw 6084" is not supported by your iASL compiler,
|
||||
please try a newer version.
|
||||
f) mount debugfs by "mount -t debugfs none /sys/kernel/debug"
|
||||
g) override the old method via the debugfs by running
|
||||
"cat /tmp/psr.aml > /sys/kernel/debug/acpi/custom_method"
|
||||
|
||||
2. insert a new method
|
||||
This is easier than overriding an existing method.
|
||||
We just need to create the ASL code of the method we want to
|
||||
insert and then follow the step c) ~ g) in section 1.
|
||||
|
||||
3. undo your changes
|
||||
The "undo" operation is not supported for a new inserted method
|
||||
right now, i.e. we can not remove a method currently.
|
||||
For an overridden method, in order to undo your changes, please
|
||||
save a copy of the method original ASL code in step c) section 1,
|
||||
and redo step c) ~ g) to override the method with the original one.
|
||||
|
||||
|
||||
Note: We can use a kernel with multiple custom ACPI method running,
|
||||
But each individual write to debugfs can implement a SINGLE
|
||||
method override. i.e. if we want to insert/override multiple
|
||||
ACPI methods, we need to redo step c) ~ g) for multiple times.
|
||||
|
||||
Note: Be aware that root can mis-use this driver to modify arbitrary
|
||||
memory and gain additional rights, if root's privileges got
|
||||
restricted (for example if root is not allowed to load additional
|
||||
modules after boot).
|
@@ -1,192 +0,0 @@
|
||||
ACPICA Trace Facility
|
||||
|
||||
Copyright (C) 2015, Intel Corporation
|
||||
Author: Lv Zheng <lv.zheng@intel.com>
|
||||
|
||||
|
||||
Abstract:
|
||||
|
||||
This document describes the functions and the interfaces of the method
|
||||
tracing facility.
|
||||
|
||||
1. Functionalities and usage examples:
|
||||
|
||||
ACPICA provides method tracing capability. And two functions are
|
||||
currently implemented using this capability.
|
||||
|
||||
A. Log reducer
|
||||
ACPICA subsystem provides debugging outputs when CONFIG_ACPI_DEBUG is
|
||||
enabled. The debugging messages which are deployed via
|
||||
ACPI_DEBUG_PRINT() macro can be reduced at 2 levels - per-component
|
||||
level (known as debug layer, configured via
|
||||
/sys/module/acpi/parameters/debug_layer) and per-type level (known as
|
||||
debug level, configured via /sys/module/acpi/parameters/debug_level).
|
||||
|
||||
But when the particular layer/level is applied to the control method
|
||||
evaluations, the quantity of the debugging outputs may still be too
|
||||
large to be put into the kernel log buffer. The idea thus is worked out
|
||||
to only enable the particular debug layer/level (normally more detailed)
|
||||
logs when the control method evaluation is started, and disable the
|
||||
detailed logging when the control method evaluation is stopped.
|
||||
|
||||
The following command examples illustrate the usage of the "log reducer"
|
||||
functionality:
|
||||
a. Filter out the debug layer/level matched logs when control methods
|
||||
are being evaluated:
|
||||
# cd /sys/module/acpi/parameters
|
||||
# echo "0xXXXXXXXX" > trace_debug_layer
|
||||
# echo "0xYYYYYYYY" > trace_debug_level
|
||||
# echo "enable" > trace_state
|
||||
b. Filter out the debug layer/level matched logs when the specified
|
||||
control method is being evaluated:
|
||||
# cd /sys/module/acpi/parameters
|
||||
# echo "0xXXXXXXXX" > trace_debug_layer
|
||||
# echo "0xYYYYYYYY" > trace_debug_level
|
||||
# echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name
|
||||
# echo "method" > /sys/module/acpi/parameters/trace_state
|
||||
c. Filter out the debug layer/level matched logs when the specified
|
||||
control method is being evaluated for the first time:
|
||||
# cd /sys/module/acpi/parameters
|
||||
# echo "0xXXXXXXXX" > trace_debug_layer
|
||||
# echo "0xYYYYYYYY" > trace_debug_level
|
||||
# echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name
|
||||
# echo "method-once" > /sys/module/acpi/parameters/trace_state
|
||||
Where:
|
||||
0xXXXXXXXX/0xYYYYYYYY: Refer to Documentation/acpi/debug.txt for
|
||||
possible debug layer/level masking values.
|
||||
\PPPP.AAAA.TTTT.HHHH: Full path of a control method that can be found
|
||||
in the ACPI namespace. It needn't be an entry
|
||||
of a control method evaluation.
|
||||
|
||||
B. AML tracer
|
||||
|
||||
There are special log entries added by the method tracing facility at
|
||||
the "trace points" the AML interpreter starts/stops to execute a control
|
||||
method, or an AML opcode. Note that the format of the log entries is
|
||||
subject to change:
|
||||
[ 0.186427] exdebug-0398 ex_trace_point : Method Begin [0xf58394d8:\_SB.PCI0.LPCB.ECOK] execution.
|
||||
[ 0.186630] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905c88:If] execution.
|
||||
[ 0.186820] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905cc0:LEqual] execution.
|
||||
[ 0.187010] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905a20:-NamePath-] execution.
|
||||
[ 0.187214] exdebug-0398 ex_trace_point : Opcode End [0xf5905a20:-NamePath-] execution.
|
||||
[ 0.187407] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905f60:One] execution.
|
||||
[ 0.187594] exdebug-0398 ex_trace_point : Opcode End [0xf5905f60:One] execution.
|
||||
[ 0.187789] exdebug-0398 ex_trace_point : Opcode End [0xf5905cc0:LEqual] execution.
|
||||
[ 0.187980] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905cc0:Return] execution.
|
||||
[ 0.188146] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905f60:One] execution.
|
||||
[ 0.188334] exdebug-0398 ex_trace_point : Opcode End [0xf5905f60:One] execution.
|
||||
[ 0.188524] exdebug-0398 ex_trace_point : Opcode End [0xf5905cc0:Return] execution.
|
||||
[ 0.188712] exdebug-0398 ex_trace_point : Opcode End [0xf5905c88:If] execution.
|
||||
[ 0.188903] exdebug-0398 ex_trace_point : Method End [0xf58394d8:\_SB.PCI0.LPCB.ECOK] execution.
|
||||
|
||||
Developers can utilize these special log entries to track the AML
|
||||
interpretation, thus can aid issue debugging and performance tuning. Note
|
||||
that, as the "AML tracer" logs are implemented via ACPI_DEBUG_PRINT()
|
||||
macro, CONFIG_ACPI_DEBUG is also required to be enabled for enabling
|
||||
"AML tracer" logs.
|
||||
|
||||
The following command examples illustrate the usage of the "AML tracer"
|
||||
functionality:
|
||||
a. Filter out the method start/stop "AML tracer" logs when control
|
||||
methods are being evaluated:
|
||||
# cd /sys/module/acpi/parameters
|
||||
# echo "0x80" > trace_debug_layer
|
||||
# echo "0x10" > trace_debug_level
|
||||
# echo "enable" > trace_state
|
||||
b. Filter out the method start/stop "AML tracer" when the specified
|
||||
control method is being evaluated:
|
||||
# cd /sys/module/acpi/parameters
|
||||
# echo "0x80" > trace_debug_layer
|
||||
# echo "0x10" > trace_debug_level
|
||||
# echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name
|
||||
# echo "method" > trace_state
|
||||
c. Filter out the method start/stop "AML tracer" logs when the specified
|
||||
control method is being evaluated for the first time:
|
||||
# cd /sys/module/acpi/parameters
|
||||
# echo "0x80" > trace_debug_layer
|
||||
# echo "0x10" > trace_debug_level
|
||||
# echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name
|
||||
# echo "method-once" > trace_state
|
||||
d. Filter out the method/opcode start/stop "AML tracer" when the
|
||||
specified control method is being evaluated:
|
||||
# cd /sys/module/acpi/parameters
|
||||
# echo "0x80" > trace_debug_layer
|
||||
# echo "0x10" > trace_debug_level
|
||||
# echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name
|
||||
# echo "opcode" > trace_state
|
||||
e. Filter out the method/opcode start/stop "AML tracer" when the
|
||||
specified control method is being evaluated for the first time:
|
||||
# cd /sys/module/acpi/parameters
|
||||
# echo "0x80" > trace_debug_layer
|
||||
# echo "0x10" > trace_debug_level
|
||||
# echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name
|
||||
# echo "opcode-once" > trace_state
|
||||
|
||||
Note that all above method tracing facility related module parameters can
|
||||
be used as the boot parameters, for example:
|
||||
acpi.trace_debug_layer=0x80 acpi.trace_debug_level=0x10 \
|
||||
acpi.trace_method_name=\_SB.LID0._LID acpi.trace_state=opcode-once
|
||||
|
||||
2. Interface descriptions:
|
||||
|
||||
All method tracing functions can be configured via ACPI module
|
||||
parameters that are accessible at /sys/module/acpi/parameters/:
|
||||
|
||||
trace_method_name
|
||||
The full path of the AML method that the user wants to trace.
|
||||
Note that the full path shouldn't contain the trailing "_"s in its
|
||||
name segments but may contain "\" to form an absolute path.
|
||||
|
||||
trace_debug_layer
|
||||
The temporary debug_layer used when the tracing feature is enabled.
|
||||
Using ACPI_EXECUTER (0x80) by default, which is the debug_layer
|
||||
used to match all "AML tracer" logs.
|
||||
|
||||
trace_debug_level
|
||||
The temporary debug_level used when the tracing feature is enabled.
|
||||
Using ACPI_LV_TRACE_POINT (0x10) by default, which is the
|
||||
debug_level used to match all "AML tracer" logs.
|
||||
|
||||
trace_state
|
||||
The status of the tracing feature.
|
||||
Users can enable/disable this debug tracing feature by executing
|
||||
the following command:
|
||||
# echo string > /sys/module/acpi/parameters/trace_state
|
||||
Where "string" should be one of the following:
|
||||
"disable"
|
||||
Disable the method tracing feature.
|
||||
"enable"
|
||||
Enable the method tracing feature.
|
||||
ACPICA debugging messages matching
|
||||
"trace_debug_layer/trace_debug_level" during any method
|
||||
execution will be logged.
|
||||
"method"
|
||||
Enable the method tracing feature.
|
||||
ACPICA debugging messages matching
|
||||
"trace_debug_layer/trace_debug_level" during method execution
|
||||
of "trace_method_name" will be logged.
|
||||
"method-once"
|
||||
Enable the method tracing feature.
|
||||
ACPICA debugging messages matching
|
||||
"trace_debug_layer/trace_debug_level" during method execution
|
||||
of "trace_method_name" will be logged only once.
|
||||
"opcode"
|
||||
Enable the method tracing feature.
|
||||
ACPICA debugging messages matching
|
||||
"trace_debug_layer/trace_debug_level" during method/opcode
|
||||
execution of "trace_method_name" will be logged.
|
||||
"opcode-once"
|
||||
Enable the method tracing feature.
|
||||
ACPICA debugging messages matching
|
||||
"trace_debug_layer/trace_debug_level" during method/opcode
|
||||
execution of "trace_method_name" will be logged only once.
|
||||
Note that the differences between the "enable" and other feature
|
||||
enabling options are:
|
||||
1. When "enable" is specified, since
|
||||
"trace_debug_layer/trace_debug_level" shall apply to all control
|
||||
method evaluations, after configuring "trace_state" to "enable",
|
||||
"trace_method_name" will be reset to NULL.
|
||||
2. When "method/opcode" is specified, if
|
||||
"trace_method_name" is NULL when "trace_state" is configured to
|
||||
these options, the "trace_debug_layer/trace_debug_level" will
|
||||
apply to all control method evaluations.
|
@@ -1,172 +0,0 @@
|
||||
|
||||
In order to support ACPI open-ended hardware configurations (e.g. development
|
||||
boards) we need a way to augment the ACPI configuration provided by the firmware
|
||||
image. A common example is connecting sensors on I2C / SPI buses on development
|
||||
boards.
|
||||
|
||||
Although this can be accomplished by creating a kernel platform driver or
|
||||
recompiling the firmware image with updated ACPI tables, neither is practical:
|
||||
the former proliferates board specific kernel code while the latter requires
|
||||
access to firmware tools which are often not publicly available.
|
||||
|
||||
Because ACPI supports external references in AML code a more practical
|
||||
way to augment firmware ACPI configuration is by dynamically loading
|
||||
user defined SSDT tables that contain the board specific information.
|
||||
|
||||
For example, to enumerate a Bosch BMA222E accelerometer on the I2C bus of the
|
||||
Minnowboard MAX development board exposed via the LSE connector [1], the
|
||||
following ASL code can be used:
|
||||
|
||||
DefinitionBlock ("minnowmax.aml", "SSDT", 1, "Vendor", "Accel", 0x00000003)
|
||||
{
|
||||
External (\_SB.I2C6, DeviceObj)
|
||||
|
||||
Scope (\_SB.I2C6)
|
||||
{
|
||||
Device (STAC)
|
||||
{
|
||||
Name (_ADR, Zero)
|
||||
Name (_HID, "BMA222E")
|
||||
|
||||
Method (_CRS, 0, Serialized)
|
||||
{
|
||||
Name (RBUF, ResourceTemplate ()
|
||||
{
|
||||
I2cSerialBus (0x0018, ControllerInitiated, 0x00061A80,
|
||||
AddressingMode7Bit, "\\_SB.I2C6", 0x00,
|
||||
ResourceConsumer, ,)
|
||||
GpioInt (Edge, ActiveHigh, Exclusive, PullDown, 0x0000,
|
||||
"\\_SB.GPO2", 0x00, ResourceConsumer, , )
|
||||
{ // Pin list
|
||||
0
|
||||
}
|
||||
})
|
||||
Return (RBUF)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
which can then be compiled to AML binary format:
|
||||
|
||||
$ iasl minnowmax.asl
|
||||
|
||||
Intel ACPI Component Architecture
|
||||
ASL Optimizing Compiler version 20140214-64 [Mar 29 2014]
|
||||
Copyright (c) 2000 - 2014 Intel Corporation
|
||||
|
||||
ASL Input: minnowmax.asl - 30 lines, 614 bytes, 7 keywords
|
||||
AML Output: minnowmax.aml - 165 bytes, 6 named objects, 1 executable opcodes
|
||||
|
||||
[1] http://wiki.minnowboard.org/MinnowBoard_MAX#Low_Speed_Expansion_Connector_.28Top.29
|
||||
|
||||
The resulting AML code can then be loaded by the kernel using one of the methods
|
||||
below.
|
||||
|
||||
== Loading ACPI SSDTs from initrd ==
|
||||
|
||||
This option allows loading of user defined SSDTs from initrd and it is useful
|
||||
when the system does not support EFI or when there is not enough EFI storage.
|
||||
|
||||
It works in a similar way with initrd based ACPI tables override/upgrade: SSDT
|
||||
aml code must be placed in the first, uncompressed, initrd under the
|
||||
"kernel/firmware/acpi" path. Multiple files can be used and this will translate
|
||||
in loading multiple tables. Only SSDT and OEM tables are allowed. See
|
||||
initrd_table_override.txt for more details.
|
||||
|
||||
Here is an example:
|
||||
|
||||
# Add the raw ACPI tables to an uncompressed cpio archive.
|
||||
# They must be put into a /kernel/firmware/acpi directory inside the
|
||||
# cpio archive.
|
||||
# The uncompressed cpio archive must be the first.
|
||||
# Other, typically compressed cpio archives, must be
|
||||
# concatenated on top of the uncompressed one.
|
||||
mkdir -p kernel/firmware/acpi
|
||||
cp ssdt.aml kernel/firmware/acpi
|
||||
|
||||
# Create the uncompressed cpio archive and concatenate the original initrd
|
||||
# on top:
|
||||
find kernel | cpio -H newc --create > /boot/instrumented_initrd
|
||||
cat /boot/initrd >>/boot/instrumented_initrd
|
||||
|
||||
== Loading ACPI SSDTs from EFI variables ==
|
||||
|
||||
This is the preferred method, when EFI is supported on the platform, because it
|
||||
allows a persistent, OS independent way of storing the user defined SSDTs. There
|
||||
is also work underway to implement EFI support for loading user defined SSDTs
|
||||
and using this method will make it easier to convert to the EFI loading
|
||||
mechanism when that will arrive.
|
||||
|
||||
In order to load SSDTs from an EFI variable the efivar_ssdt kernel command line
|
||||
parameter can be used. The argument for the option is the variable name to
|
||||
use. If there are multiple variables with the same name but with different
|
||||
vendor GUIDs, all of them will be loaded.
|
||||
|
||||
In order to store the AML code in an EFI variable the efivarfs filesystem can be
|
||||
used. It is enabled and mounted by default in /sys/firmware/efi/efivars in all
|
||||
recent distributions.
|
||||
|
||||
Creating a new file in /sys/firmware/efi/efivars will automatically create a new
|
||||
EFI variable. Updating a file in /sys/firmware/efi/efivars will update the EFI
|
||||
variable. Please note that the file name needs to be specially formatted as
|
||||
"Name-GUID" and that the first 4 bytes in the file (little-endian format)
|
||||
represent the attributes of the EFI variable (see EFI_VARIABLE_MASK in
|
||||
include/linux/efi.h). Writing to the file must also be done with one write
|
||||
operation.
|
||||
|
||||
For example, you can use the following bash script to create/update an EFI
|
||||
variable with the content from a given file:
|
||||
|
||||
#!/bin/sh -e
|
||||
|
||||
while ! [ -z "$1" ]; do
|
||||
case "$1" in
|
||||
"-f") filename="$2"; shift;;
|
||||
"-g") guid="$2"; shift;;
|
||||
*) name="$1";;
|
||||
esac
|
||||
shift
|
||||
done
|
||||
|
||||
usage()
|
||||
{
|
||||
echo "Syntax: ${0##*/} -f filename [ -g guid ] name"
|
||||
exit 1
|
||||
}
|
||||
|
||||
[ -n "$name" -a -f "$filename" ] || usage
|
||||
|
||||
EFIVARFS="/sys/firmware/efi/efivars"
|
||||
|
||||
[ -d "$EFIVARFS" ] || exit 2
|
||||
|
||||
if stat -tf $EFIVARFS | grep -q -v de5e81e4; then
|
||||
mount -t efivarfs none $EFIVARFS
|
||||
fi
|
||||
|
||||
# try to pick up an existing GUID
|
||||
[ -n "$guid" ] || guid=$(find "$EFIVARFS" -name "$name-*" | head -n1 | cut -f2- -d-)
|
||||
|
||||
# use a randomly generated GUID
|
||||
[ -n "$guid" ] || guid="$(cat /proc/sys/kernel/random/uuid)"
|
||||
|
||||
# efivarfs expects all of the data in one write
|
||||
tmp=$(mktemp)
|
||||
/bin/echo -ne "\007\000\000\000" | cat - $filename > $tmp
|
||||
dd if=$tmp of="$EFIVARFS/$name-$guid" bs=$(stat -c %s $tmp)
|
||||
rm $tmp
|
||||
|
||||
== Loading ACPI SSDTs from configfs ==
|
||||
|
||||
This option allows loading of user defined SSDTs from userspace via the configfs
|
||||
interface. The CONFIG_ACPI_CONFIGFS option must be selected and configfs must be
|
||||
mounted. In the following examples, we assume that configfs has been mounted in
|
||||
/config.
|
||||
|
||||
New tables can be loaded by creating new directories in /config/acpi/table/ and
|
||||
writing the SSDT aml code in the aml attribute:
|
||||
|
||||
cd /config/acpi/table
|
||||
mkdir my_ssdt
|
||||
cat ~/ssdt.aml > my_ssdt/aml
|
@@ -1,5 +1,11 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
Collaborative Processor Performance Control (CPPC)
|
||||
==================================================
|
||||
Collaborative Processor Performance Control (CPPC)
|
||||
==================================================
|
||||
|
||||
CPPC
|
||||
====
|
||||
|
||||
CPPC defined in the ACPI spec describes a mechanism for the OS to manage the
|
||||
performance of a logical processor on a contiguous and abstract performance
|
||||
@@ -10,31 +16,28 @@ For more details on CPPC please refer to the ACPI specification at:
|
||||
|
||||
http://uefi.org/specifications
|
||||
|
||||
Some of the CPPC registers are exposed via sysfs under:
|
||||
Some of the CPPC registers are exposed via sysfs under::
|
||||
|
||||
/sys/devices/system/cpu/cpuX/acpi_cppc/
|
||||
/sys/devices/system/cpu/cpuX/acpi_cppc/
|
||||
|
||||
for each cpu X
|
||||
for each cpu X::
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
$ ls -lR /sys/devices/system/cpu/cpu0/acpi_cppc/
|
||||
/sys/devices/system/cpu/cpu0/acpi_cppc/:
|
||||
total 0
|
||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 feedback_ctrs
|
||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 highest_perf
|
||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 lowest_freq
|
||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 lowest_nonlinear_perf
|
||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 lowest_perf
|
||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 nominal_freq
|
||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 nominal_perf
|
||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 reference_perf
|
||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 wraparound_time
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
$ ls -lR /sys/devices/system/cpu/cpu0/acpi_cppc/
|
||||
/sys/devices/system/cpu/cpu0/acpi_cppc/:
|
||||
total 0
|
||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 feedback_ctrs
|
||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 highest_perf
|
||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 lowest_freq
|
||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 lowest_nonlinear_perf
|
||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 lowest_perf
|
||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 nominal_freq
|
||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 nominal_perf
|
||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 reference_perf
|
||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 wraparound_time
|
||||
|
||||
* highest_perf : Highest performance of this processor (abstract scale).
|
||||
* nominal_perf : Highest sustained performance of this processor (abstract scale).
|
||||
* nominal_perf : Highest sustained performance of this processor
|
||||
(abstract scale).
|
||||
* lowest_nonlinear_perf : Lowest performance of this processor with nonlinear
|
||||
power savings (abstract scale).
|
||||
* lowest_perf : Lowest performance of this processor (abstract scale).
|
||||
@@ -48,22 +51,26 @@ total 0
|
||||
* feedback_ctrs : Includes both Reference and delivered performance counter.
|
||||
Reference counter ticks up proportional to processor's reference performance.
|
||||
Delivered counter ticks up proportional to processor's delivered performance.
|
||||
* wraparound_time: Minimum time for the feedback counters to wraparound (seconds).
|
||||
* wraparound_time: Minimum time for the feedback counters to wraparound
|
||||
(seconds).
|
||||
* reference_perf : Performance level at which reference performance counter
|
||||
accumulates (abstract scale).
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
Computing Average Delivered Performance
|
||||
Computing Average Delivered Performance
|
||||
=======================================
|
||||
|
||||
Below describes the steps to compute the average performance delivered by taking
|
||||
two different snapshots of feedback counters at time T1 and T2.
|
||||
Below describes the steps to compute the average performance delivered by
|
||||
taking two different snapshots of feedback counters at time T1 and T2.
|
||||
|
||||
T1: Read feedback_ctrs as fbc_t1
|
||||
Wait or run some workload
|
||||
T2: Read feedback_ctrs as fbc_t2
|
||||
T1: Read feedback_ctrs as fbc_t1
|
||||
Wait or run some workload
|
||||
|
||||
delivered_counter_delta = fbc_t2[del] - fbc_t1[del]
|
||||
reference_counter_delta = fbc_t2[ref] - fbc_t1[ref]
|
||||
T2: Read feedback_ctrs as fbc_t2
|
||||
|
||||
delivered_perf = (reference_perf x delivered_counter_delta) / reference_counter_delta
|
||||
::
|
||||
|
||||
delivered_counter_delta = fbc_t2[del] - fbc_t1[del]
|
||||
reference_counter_delta = fbc_t2[ref] - fbc_t1[ref]
|
||||
|
||||
delivered_perf = (reference_perf x delivered_counter_delta) / reference_counter_delta
|
@@ -1,6 +1,12 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
===============
|
||||
Overriding DSDT
|
||||
===============
|
||||
|
||||
Linux supports a method of overriding the BIOS DSDT:
|
||||
|
||||
CONFIG_ACPI_CUSTOM_DSDT builds the image into the kernel.
|
||||
CONFIG_ACPI_CUSTOM_DSDT - builds the image into the kernel.
|
||||
|
||||
When to use this method is described in detail on the
|
||||
Linux/ACPI home page:
|
14
Documentation/admin-guide/acpi/index.rst
Normal file
14
Documentation/admin-guide/acpi/index.rst
Normal file
@@ -0,0 +1,14 @@
|
||||
============
|
||||
ACPI Support
|
||||
============
|
||||
|
||||
Here we document in detail how to interact with various mechanisms in
|
||||
the Linux ACPI support.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
initrd_table_override
|
||||
dsdt-override
|
||||
ssdt-overlays
|
||||
cppc_sysfs
|
115
Documentation/admin-guide/acpi/initrd_table_override.rst
Normal file
115
Documentation/admin-guide/acpi/initrd_table_override.rst
Normal file
@@ -0,0 +1,115 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
================================
|
||||
Upgrading ACPI tables via initrd
|
||||
================================
|
||||
|
||||
What is this about
|
||||
==================
|
||||
|
||||
If the ACPI_TABLE_UPGRADE compile option is true, it is possible to
|
||||
upgrade the ACPI execution environment that is defined by the ACPI tables
|
||||
via upgrading the ACPI tables provided by the BIOS with an instrumented,
|
||||
modified, more recent version one, or installing brand new ACPI tables.
|
||||
|
||||
When building initrd with kernel in a single image, option
|
||||
ACPI_TABLE_OVERRIDE_VIA_BUILTIN_INITRD should also be true for this
|
||||
feature to work.
|
||||
|
||||
For a full list of ACPI tables that can be upgraded/installed, take a look
|
||||
at the char `*table_sigs[MAX_ACPI_SIGNATURE];` definition in
|
||||
drivers/acpi/tables.c.
|
||||
|
||||
All ACPI tables iasl (Intel's ACPI compiler and disassembler) knows should
|
||||
be overridable, except:
|
||||
|
||||
- ACPI_SIG_RSDP (has a signature of 6 bytes)
|
||||
- ACPI_SIG_FACS (does not have an ordinary ACPI table header)
|
||||
|
||||
Both could get implemented as well.
|
||||
|
||||
|
||||
What is this for
|
||||
================
|
||||
|
||||
Complain to your platform/BIOS vendor if you find a bug which is so severe
|
||||
that a workaround is not accepted in the Linux kernel. And this facility
|
||||
allows you to upgrade the buggy tables before your platform/BIOS vendor
|
||||
releases an upgraded BIOS binary.
|
||||
|
||||
This facility can be used by platform/BIOS vendors to provide a Linux
|
||||
compatible environment without modifying the underlying platform firmware.
|
||||
|
||||
This facility also provides a powerful feature to easily debug and test
|
||||
ACPI BIOS table compatibility with the Linux kernel by modifying old
|
||||
platform provided ACPI tables or inserting new ACPI tables.
|
||||
|
||||
It can and should be enabled in any kernel because there is no functional
|
||||
change with not instrumented initrds.
|
||||
|
||||
|
||||
How does it work
|
||||
================
|
||||
::
|
||||
|
||||
# Extract the machine's ACPI tables:
|
||||
cd /tmp
|
||||
acpidump >acpidump
|
||||
acpixtract -a acpidump
|
||||
# Disassemble, modify and recompile them:
|
||||
iasl -d *.dat
|
||||
# For example add this statement into a _PRT (PCI Routing Table) function
|
||||
# of the DSDT:
|
||||
Store("HELLO WORLD", debug)
|
||||
# And increase the OEM Revision. For example, before modification:
|
||||
DefinitionBlock ("DSDT.aml", "DSDT", 2, "INTEL ", "TEMPLATE", 0x00000000)
|
||||
# After modification:
|
||||
DefinitionBlock ("DSDT.aml", "DSDT", 2, "INTEL ", "TEMPLATE", 0x00000001)
|
||||
iasl -sa dsdt.dsl
|
||||
# Add the raw ACPI tables to an uncompressed cpio archive.
|
||||
# They must be put into a /kernel/firmware/acpi directory inside the cpio
|
||||
# archive. Note that if the table put here matches a platform table
|
||||
# (similar Table Signature, and similar OEMID, and similar OEM Table ID)
|
||||
# with a more recent OEM Revision, the platform table will be upgraded by
|
||||
# this table. If the table put here doesn't match a platform table
|
||||
# (dissimilar Table Signature, or dissimilar OEMID, or dissimilar OEM Table
|
||||
# ID), this table will be appended.
|
||||
mkdir -p kernel/firmware/acpi
|
||||
cp dsdt.aml kernel/firmware/acpi
|
||||
# A maximum of "NR_ACPI_INITRD_TABLES (64)" tables are currently allowed
|
||||
# (see osl.c):
|
||||
iasl -sa facp.dsl
|
||||
iasl -sa ssdt1.dsl
|
||||
cp facp.aml kernel/firmware/acpi
|
||||
cp ssdt1.aml kernel/firmware/acpi
|
||||
# The uncompressed cpio archive must be the first. Other, typically
|
||||
# compressed cpio archives, must be concatenated on top of the uncompressed
|
||||
# one. Following command creates the uncompressed cpio archive and
|
||||
# concatenates the original initrd on top:
|
||||
find kernel | cpio -H newc --create > /boot/instrumented_initrd
|
||||
cat /boot/initrd >>/boot/instrumented_initrd
|
||||
# reboot with increased acpi debug level, e.g. boot params:
|
||||
acpi.debug_level=0x2 acpi.debug_layer=0xFFFFFFFF
|
||||
# and check your syslog:
|
||||
[ 1.268089] ACPI: PCI Interrupt Routing Table [\_SB_.PCI0._PRT]
|
||||
[ 1.272091] [ACPI Debug] String [0x0B] "HELLO WORLD"
|
||||
|
||||
iasl is able to disassemble and recompile quite a lot of different,
|
||||
also static ACPI tables.
|
||||
|
||||
|
||||
Where to retrieve userspace tools
|
||||
=================================
|
||||
|
||||
iasl and acpixtract are part of Intel's ACPICA project:
|
||||
http://acpica.org/
|
||||
|
||||
and should be packaged by distributions (for example in the acpica package
|
||||
on SUSE).
|
||||
|
||||
acpidump can be found in Len Brown's pmtools:
|
||||
ftp://kernel.org/pub/linux/kernel/people/lenb/acpi/utils/pmtools/acpidump
|
||||
|
||||
This tool is also part of the acpica package on SUSE.
|
||||
Alternatively, used ACPI tables can be retrieved via sysfs in latest kernels:
|
||||
/sys/firmware/acpi/tables
|
180
Documentation/admin-guide/acpi/ssdt-overlays.rst
Normal file
180
Documentation/admin-guide/acpi/ssdt-overlays.rst
Normal file
@@ -0,0 +1,180 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=============
|
||||
SSDT Overlays
|
||||
=============
|
||||
|
||||
In order to support ACPI open-ended hardware configurations (e.g. development
|
||||
boards) we need a way to augment the ACPI configuration provided by the firmware
|
||||
image. A common example is connecting sensors on I2C / SPI buses on development
|
||||
boards.
|
||||
|
||||
Although this can be accomplished by creating a kernel platform driver or
|
||||
recompiling the firmware image with updated ACPI tables, neither is practical:
|
||||
the former proliferates board specific kernel code while the latter requires
|
||||
access to firmware tools which are often not publicly available.
|
||||
|
||||
Because ACPI supports external references in AML code a more practical
|
||||
way to augment firmware ACPI configuration is by dynamically loading
|
||||
user defined SSDT tables that contain the board specific information.
|
||||
|
||||
For example, to enumerate a Bosch BMA222E accelerometer on the I2C bus of the
|
||||
Minnowboard MAX development board exposed via the LSE connector [1], the
|
||||
following ASL code can be used::
|
||||
|
||||
DefinitionBlock ("minnowmax.aml", "SSDT", 1, "Vendor", "Accel", 0x00000003)
|
||||
{
|
||||
External (\_SB.I2C6, DeviceObj)
|
||||
|
||||
Scope (\_SB.I2C6)
|
||||
{
|
||||
Device (STAC)
|
||||
{
|
||||
Name (_ADR, Zero)
|
||||
Name (_HID, "BMA222E")
|
||||
|
||||
Method (_CRS, 0, Serialized)
|
||||
{
|
||||
Name (RBUF, ResourceTemplate ()
|
||||
{
|
||||
I2cSerialBus (0x0018, ControllerInitiated, 0x00061A80,
|
||||
AddressingMode7Bit, "\\_SB.I2C6", 0x00,
|
||||
ResourceConsumer, ,)
|
||||
GpioInt (Edge, ActiveHigh, Exclusive, PullDown, 0x0000,
|
||||
"\\_SB.GPO2", 0x00, ResourceConsumer, , )
|
||||
{ // Pin list
|
||||
0
|
||||
}
|
||||
})
|
||||
Return (RBUF)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
which can then be compiled to AML binary format::
|
||||
|
||||
$ iasl minnowmax.asl
|
||||
|
||||
Intel ACPI Component Architecture
|
||||
ASL Optimizing Compiler version 20140214-64 [Mar 29 2014]
|
||||
Copyright (c) 2000 - 2014 Intel Corporation
|
||||
|
||||
ASL Input: minnowmax.asl - 30 lines, 614 bytes, 7 keywords
|
||||
AML Output: minnowmax.aml - 165 bytes, 6 named objects, 1 executable opcodes
|
||||
|
||||
[1] http://wiki.minnowboard.org/MinnowBoard_MAX#Low_Speed_Expansion_Connector_.28Top.29
|
||||
|
||||
The resulting AML code can then be loaded by the kernel using one of the methods
|
||||
below.
|
||||
|
||||
Loading ACPI SSDTs from initrd
|
||||
==============================
|
||||
|
||||
This option allows loading of user defined SSDTs from initrd and it is useful
|
||||
when the system does not support EFI or when there is not enough EFI storage.
|
||||
|
||||
It works in a similar way with initrd based ACPI tables override/upgrade: SSDT
|
||||
aml code must be placed in the first, uncompressed, initrd under the
|
||||
"kernel/firmware/acpi" path. Multiple files can be used and this will translate
|
||||
in loading multiple tables. Only SSDT and OEM tables are allowed. See
|
||||
initrd_table_override.rst for more details.
|
||||
|
||||
Here is an example::
|
||||
|
||||
# Add the raw ACPI tables to an uncompressed cpio archive.
|
||||
# They must be put into a /kernel/firmware/acpi directory inside the
|
||||
# cpio archive.
|
||||
# The uncompressed cpio archive must be the first.
|
||||
# Other, typically compressed cpio archives, must be
|
||||
# concatenated on top of the uncompressed one.
|
||||
mkdir -p kernel/firmware/acpi
|
||||
cp ssdt.aml kernel/firmware/acpi
|
||||
|
||||
# Create the uncompressed cpio archive and concatenate the original initrd
|
||||
# on top:
|
||||
find kernel | cpio -H newc --create > /boot/instrumented_initrd
|
||||
cat /boot/initrd >>/boot/instrumented_initrd
|
||||
|
||||
Loading ACPI SSDTs from EFI variables
|
||||
=====================================
|
||||
|
||||
This is the preferred method, when EFI is supported on the platform, because it
|
||||
allows a persistent, OS independent way of storing the user defined SSDTs. There
|
||||
is also work underway to implement EFI support for loading user defined SSDTs
|
||||
and using this method will make it easier to convert to the EFI loading
|
||||
mechanism when that will arrive.
|
||||
|
||||
In order to load SSDTs from an EFI variable the efivar_ssdt kernel command line
|
||||
parameter can be used. The argument for the option is the variable name to
|
||||
use. If there are multiple variables with the same name but with different
|
||||
vendor GUIDs, all of them will be loaded.
|
||||
|
||||
In order to store the AML code in an EFI variable the efivarfs filesystem can be
|
||||
used. It is enabled and mounted by default in /sys/firmware/efi/efivars in all
|
||||
recent distributions.
|
||||
|
||||
Creating a new file in /sys/firmware/efi/efivars will automatically create a new
|
||||
EFI variable. Updating a file in /sys/firmware/efi/efivars will update the EFI
|
||||
variable. Please note that the file name needs to be specially formatted as
|
||||
"Name-GUID" and that the first 4 bytes in the file (little-endian format)
|
||||
represent the attributes of the EFI variable (see EFI_VARIABLE_MASK in
|
||||
include/linux/efi.h). Writing to the file must also be done with one write
|
||||
operation.
|
||||
|
||||
For example, you can use the following bash script to create/update an EFI
|
||||
variable with the content from a given file::
|
||||
|
||||
#!/bin/sh -e
|
||||
|
||||
while ! [ -z "$1" ]; do
|
||||
case "$1" in
|
||||
"-f") filename="$2"; shift;;
|
||||
"-g") guid="$2"; shift;;
|
||||
*) name="$1";;
|
||||
esac
|
||||
shift
|
||||
done
|
||||
|
||||
usage()
|
||||
{
|
||||
echo "Syntax: ${0##*/} -f filename [ -g guid ] name"
|
||||
exit 1
|
||||
}
|
||||
|
||||
[ -n "$name" -a -f "$filename" ] || usage
|
||||
|
||||
EFIVARFS="/sys/firmware/efi/efivars"
|
||||
|
||||
[ -d "$EFIVARFS" ] || exit 2
|
||||
|
||||
if stat -tf $EFIVARFS | grep -q -v de5e81e4; then
|
||||
mount -t efivarfs none $EFIVARFS
|
||||
fi
|
||||
|
||||
# try to pick up an existing GUID
|
||||
[ -n "$guid" ] || guid=$(find "$EFIVARFS" -name "$name-*" | head -n1 | cut -f2- -d-)
|
||||
|
||||
# use a randomly generated GUID
|
||||
[ -n "$guid" ] || guid="$(cat /proc/sys/kernel/random/uuid)"
|
||||
|
||||
# efivarfs expects all of the data in one write
|
||||
tmp=$(mktemp)
|
||||
/bin/echo -ne "\007\000\000\000" | cat - $filename > $tmp
|
||||
dd if=$tmp of="$EFIVARFS/$name-$guid" bs=$(stat -c %s $tmp)
|
||||
rm $tmp
|
||||
|
||||
Loading ACPI SSDTs from configfs
|
||||
================================
|
||||
|
||||
This option allows loading of user defined SSDTs from userspace via the configfs
|
||||
interface. The CONFIG_ACPI_CONFIGFS option must be selected and configfs must be
|
||||
mounted. In the following examples, we assume that configfs has been mounted in
|
||||
/config.
|
||||
|
||||
New tables can be loaded by creating new directories in /config/acpi/table/ and
|
||||
writing the SSDT aml code in the aml attribute::
|
||||
|
||||
cd /config/acpi/table
|
||||
mkdir my_ssdt
|
||||
cat ~/ssdt.aml > my_ssdt/aml
|
@@ -864,6 +864,8 @@ All cgroup core files are prefixed with "cgroup."
|
||||
populated
|
||||
1 if the cgroup or its descendants contains any live
|
||||
processes; otherwise, 0.
|
||||
frozen
|
||||
1 if the cgroup is frozen; otherwise, 0.
|
||||
|
||||
cgroup.max.descendants
|
||||
A read-write single value file. The default is "max".
|
||||
@@ -897,6 +899,31 @@ All cgroup core files are prefixed with "cgroup."
|
||||
A dying cgroup can consume system resources not exceeding
|
||||
limits, which were active at the moment of cgroup deletion.
|
||||
|
||||
cgroup.freeze
|
||||
A read-write single value file which exists on non-root cgroups.
|
||||
Allowed values are "0" and "1". The default is "0".
|
||||
|
||||
Writing "1" to the file causes freezing of the cgroup and all
|
||||
descendant cgroups. This means that all belonging processes will
|
||||
be stopped and will not run until the cgroup is explicitly
|
||||
unfrozen. Freezing of the cgroup may take some time; when this action
|
||||
is completed, the "frozen" value in the cgroup.events control file
|
||||
will be updated to "1" and the corresponding notification will be
|
||||
issued.
|
||||
|
||||
A cgroup can be frozen either by its own settings, or by settings
|
||||
of any ancestor cgroups. If any of the ancestor cgroups is frozen, the
|
||||
cgroup will remain frozen.
|
||||
|
||||
Processes in the frozen cgroup can be killed by a fatal signal.
|
||||
They also can enter and leave a frozen cgroup: either by an explicit
|
||||
move by a user, or if freezing of the cgroup races with fork().
|
||||
If a process is moved to a frozen cgroup, it stops. If a process is
|
||||
moved out of a frozen cgroup, it becomes running.
|
||||
|
||||
Frozen status of a cgroup doesn't affect any cgroup tree operations:
|
||||
it's possible to delete a frozen (and empty) cgroup, as well as
|
||||
create new sub-cgroups.
|
||||
|
||||
Controllers
|
||||
===========
|
||||
|
@@ -91,10 +91,48 @@ Currently Available
|
||||
* large block (up to pagesize) support
|
||||
* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force
|
||||
the ordering)
|
||||
* Case-insensitive file name lookups
|
||||
|
||||
[1] Filesystems with a block size of 1k may see a limit imposed by the
|
||||
directory hash tree having a maximum depth of two.
|
||||
|
||||
case-insensitive file name lookups
|
||||
======================================================
|
||||
|
||||
The case-insensitive file name lookup feature is supported on a
|
||||
per-directory basis, allowing the user to mix case-insensitive and
|
||||
case-sensitive directories in the same filesystem. It is enabled by
|
||||
flipping the +F inode attribute of an empty directory. The
|
||||
case-insensitive string match operation is only defined when we know how
|
||||
text is encoded in a byte sequence. For that reason, in order to enable
|
||||
case-insensitive directories, the filesystem must have the
|
||||
casefold feature, which stores the filesystem-wide encoding
|
||||
model used. By default, the charset adopted is the latest version of
|
||||
Unicode (12.1.0, by the time of this writing), encoded in the UTF-8
|
||||
form. The comparison algorithm is implemented by normalizing the
|
||||
strings to the Canonical decomposition form, as defined by Unicode,
|
||||
followed by a byte per byte comparison.
|
||||
|
||||
The case-awareness is name-preserving on the disk, meaning that the file
|
||||
name provided by userspace is a byte-per-byte match to what is actually
|
||||
written on the disk. The Unicode normalization format used by the
|
||||
kernel is thus an internal representation, and not exposed to the
|
||||
userspace nor to the disk, with the important exception of disk hashes,
|
||||
used on large case-insensitive directories with DX feature. On DX
|
||||
directories, the hash must be calculated using the casefolded version of
|
||||
the filename, meaning that the normalization format used actually has an
|
||||
impact on where the directory entry is stored.
|
||||
|
||||
When we change from viewing filenames as opaque byte sequences to seeing
|
||||
them as encoded strings we need to address what happens when a program
|
||||
tries to create a file with an invalid name. The Unicode subsystem
|
||||
within the kernel leaves the decision of what to do in this case to the
|
||||
filesystem, which selects its preferred behavior by enabling/disabling
|
||||
the strict mode. When Ext4 encounters one of those strings and the
|
||||
filesystem did not require strict mode, it falls back to considering the
|
||||
entire string as an opaque byte sequence, which still allows the user to
|
||||
operate on that file, but the case-insensitive lookups won't work.
|
||||
|
||||
Options
|
||||
=======
|
||||
|
||||
|
13
Documentation/admin-guide/hw-vuln/index.rst
Normal file
13
Documentation/admin-guide/hw-vuln/index.rst
Normal file
@@ -0,0 +1,13 @@
|
||||
========================
|
||||
Hardware vulnerabilities
|
||||
========================
|
||||
|
||||
This section describes CPU vulnerabilities and provides an overview of the
|
||||
possible mitigations along with guidance for selecting mitigations if they
|
||||
are configurable at compile, boot or run time.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
l1tf
|
||||
mds
|
@@ -445,6 +445,7 @@ The default is 'cond'. If 'l1tf=full,force' is given on the kernel command
|
||||
line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush
|
||||
module parameter is ignored and writes to the sysfs file are rejected.
|
||||
|
||||
.. _mitigation_selection:
|
||||
|
||||
Mitigation selection guide
|
||||
--------------------------
|
308
Documentation/admin-guide/hw-vuln/mds.rst
Normal file
308
Documentation/admin-guide/hw-vuln/mds.rst
Normal file
@@ -0,0 +1,308 @@
|
||||
MDS - Microarchitectural Data Sampling
|
||||
======================================
|
||||
|
||||
Microarchitectural Data Sampling is a hardware vulnerability which allows
|
||||
unprivileged speculative access to data which is available in various CPU
|
||||
internal buffers.
|
||||
|
||||
Affected processors
|
||||
-------------------
|
||||
|
||||
This vulnerability affects a wide range of Intel processors. The
|
||||
vulnerability is not present on:
|
||||
|
||||
- Processors from AMD, Centaur and other non Intel vendors
|
||||
|
||||
- Older processor models, where the CPU family is < 6
|
||||
|
||||
- Some Atoms (Bonnell, Saltwell, Goldmont, GoldmontPlus)
|
||||
|
||||
- Intel processors which have the ARCH_CAP_MDS_NO bit set in the
|
||||
IA32_ARCH_CAPABILITIES MSR.
|
||||
|
||||
Whether a processor is affected or not can be read out from the MDS
|
||||
vulnerability file in sysfs. See :ref:`mds_sys_info`.
|
||||
|
||||
Not all processors are affected by all variants of MDS, but the mitigation
|
||||
is identical for all of them so the kernel treats them as a single
|
||||
vulnerability.
|
||||
|
||||
Related CVEs
|
||||
------------
|
||||
|
||||
The following CVE entries are related to the MDS vulnerability:
|
||||
|
||||
============== ===== ===================================================
|
||||
CVE-2018-12126 MSBDS Microarchitectural Store Buffer Data Sampling
|
||||
CVE-2018-12130 MFBDS Microarchitectural Fill Buffer Data Sampling
|
||||
CVE-2018-12127 MLPDS Microarchitectural Load Port Data Sampling
|
||||
CVE-2019-11091 MDSUM Microarchitectural Data Sampling Uncacheable Memory
|
||||
============== ===== ===================================================
|
||||
|
||||
Problem
|
||||
-------
|
||||
|
||||
When performing store, load, L1 refill operations, processors write data
|
||||
into temporary microarchitectural structures (buffers). The data in the
|
||||
buffer can be forwarded to load operations as an optimization.
|
||||
|
||||
Under certain conditions, usually a fault/assist caused by a load
|
||||
operation, data unrelated to the load memory address can be speculatively
|
||||
forwarded from the buffers. Because the load operation causes a fault or
|
||||
assist and its result will be discarded, the forwarded data will not cause
|
||||
incorrect program execution or state changes. But a malicious operation
|
||||
may be able to forward this speculative data to a disclosure gadget which
|
||||
allows in turn to infer the value via a cache side channel attack.
|
||||
|
||||
Because the buffers are potentially shared between Hyper-Threads, cross
|
||||
Hyper-Thread attacks are possible.
|
||||
|
||||
Deeper technical information is available in the MDS specific x86
|
||||
architecture section: :ref:`Documentation/x86/mds.rst <mds>`.
|
||||
|
||||
|
||||
Attack scenarios
|
||||
----------------
|
||||
|
||||
Attacks against the MDS vulnerabilities can be mounted from malicious non
|
||||
privileged user space applications running on hosts or guest. Malicious
|
||||
guest OSes can obviously mount attacks as well.
|
||||
|
||||
Contrary to other speculation based vulnerabilities the MDS vulnerability
|
||||
does not allow the attacker to control the memory target address. As a
|
||||
consequence the attacks are purely sampling based, but as demonstrated with
|
||||
the TLBleed attack samples can be postprocessed successfully.
|
||||
|
||||
Web-Browsers
|
||||
^^^^^^^^^^^^
|
||||
|
||||
It's unclear whether attacks through Web-Browsers are possible at
|
||||
all. The exploitation through Java-Script is considered very unlikely,
|
||||
but other widely used web technologies like Webassembly could possibly be
|
||||
abused.
|
||||
|
||||
|
||||
.. _mds_sys_info:
|
||||
|
||||
MDS system information
|
||||
-----------------------
|
||||
|
||||
The Linux kernel provides a sysfs interface to enumerate the current MDS
|
||||
status of the system: whether the system is vulnerable, and which
|
||||
mitigations are active. The relevant sysfs file is:
|
||||
|
||||
/sys/devices/system/cpu/vulnerabilities/mds
|
||||
|
||||
The possible values in this file are:
|
||||
|
||||
.. list-table::
|
||||
|
||||
* - 'Not affected'
|
||||
- The processor is not vulnerable
|
||||
* - 'Vulnerable'
|
||||
- The processor is vulnerable, but no mitigation is enabled
|
||||
* - 'Vulnerable: Clear CPU buffers attempted, no microcode'
|
||||
- The processor is vulnerable but microcode is not updated.
|
||||
|
||||
The mitigation is enabled on a best effort basis. See :ref:`vmwerv`
|
||||
* - 'Mitigation: Clear CPU buffers'
|
||||
- The processor is vulnerable and the CPU buffer clearing mitigation is
|
||||
enabled.
|
||||
|
||||
If the processor is vulnerable then the following information is appended
|
||||
to the above information:
|
||||
|
||||
======================== ============================================
|
||||
'SMT vulnerable' SMT is enabled
|
||||
'SMT mitigated' SMT is enabled and mitigated
|
||||
'SMT disabled' SMT is disabled
|
||||
'SMT Host state unknown' Kernel runs in a VM, Host SMT state unknown
|
||||
======================== ============================================
|
||||
|
||||
.. _vmwerv:
|
||||
|
||||
Best effort mitigation mode
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If the processor is vulnerable, but the availability of the microcode based
|
||||
mitigation mechanism is not advertised via CPUID the kernel selects a best
|
||||
effort mitigation mode. This mode invokes the mitigation instructions
|
||||
without a guarantee that they clear the CPU buffers.
|
||||
|
||||
This is done to address virtualization scenarios where the host has the
|
||||
microcode update applied, but the hypervisor is not yet updated to expose
|
||||
the CPUID to the guest. If the host has updated microcode the protection
|
||||
takes effect; otherwise a few CPU cycles are wasted pointlessly.
|
||||
|
||||
The state in the mds sysfs file reflects this situation accordingly.
|
||||
|
||||
|
||||
Mitigation mechanism
|
||||
-------------------------
|
||||
|
||||
The kernel detects the affected CPUs and the presence of the microcode
|
||||
which is required.
|
||||
|
||||
If a CPU is affected and the microcode is available, then the kernel
|
||||
enables the mitigation by default. The mitigation can be controlled at boot
|
||||
time via a kernel command line option. See
|
||||
:ref:`mds_mitigation_control_command_line`.
|
||||
|
||||
.. _cpu_buffer_clear:
|
||||
|
||||
CPU buffer clearing
|
||||
^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The mitigation for MDS clears the affected CPU buffers on return to user
|
||||
space and when entering a guest.
|
||||
|
||||
If SMT is enabled it also clears the buffers on idle entry when the CPU
|
||||
is only affected by MSBDS and not any other MDS variant, because the
|
||||
other variants cannot be protected against cross Hyper-Thread attacks.
|
||||
|
||||
For CPUs which are only affected by MSBDS the user space, guest and idle
|
||||
transition mitigations are sufficient and SMT is not affected.
|
||||
|
||||
.. _virt_mechanism:
|
||||
|
||||
Virtualization mitigation
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The protection for host to guest transition depends on the L1TF
|
||||
vulnerability of the CPU:
|
||||
|
||||
- CPU is affected by L1TF:
|
||||
|
||||
If the L1D flush mitigation is enabled and up to date microcode is
|
||||
available, the L1D flush mitigation is automatically protecting the
|
||||
guest transition.
|
||||
|
||||
If the L1D flush mitigation is disabled then the MDS mitigation is
|
||||
invoked explicitly when the host MDS mitigation is enabled.
|
||||
|
||||
For details on L1TF and virtualization see:
|
||||
:ref:`Documentation/admin-guide/hw-vuln/l1tf.rst <mitigation_control_kvm>`.
|
||||
|
||||
- CPU is not affected by L1TF:
|
||||
|
||||
CPU buffers are flushed before entering the guest when the host MDS
|
||||
mitigation is enabled.
|
||||
|
||||
The resulting MDS protection matrix for the host to guest transition:
|
||||
|
||||
============ ===== ============= ============ =================
|
||||
L1TF MDS VMX-L1FLUSH Host MDS MDS-State
|
||||
|
||||
Don't care No Don't care N/A Not affected
|
||||
|
||||
Yes Yes Disabled Off Vulnerable
|
||||
|
||||
Yes Yes Disabled Full Mitigated
|
||||
|
||||
Yes Yes Enabled Don't care Mitigated
|
||||
|
||||
No Yes N/A Off Vulnerable
|
||||
|
||||
No Yes N/A Full Mitigated
|
||||
============ ===== ============= ============ =================
|
||||
|
||||
This only covers the host to guest transition, i.e. prevents leakage from
|
||||
host to guest, but does not protect the guest internally. Guests need to
|
||||
have their own protections.
|
||||
|
||||
.. _xeon_phi:
|
||||
|
||||
XEON PHI specific considerations
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The XEON PHI processor family is affected by MSBDS which can be exploited
|
||||
cross Hyper-Threads when entering idle states. Some XEON PHI variants allow
|
||||
to use MWAIT in user space (Ring 3) which opens a potential attack vector
|
||||
for malicious user space. The exposure can be disabled on the kernel
|
||||
command line with the 'ring3mwait=disable' command line option.
|
||||
|
||||
XEON PHI is not affected by the other MDS variants and MSBDS is mitigated
|
||||
before the CPU enters an idle state. As XEON PHI is not affected by L1TF
|
||||
either, disabling SMT is not required for full protection.
|
||||
|
||||
.. _mds_smt_control:
|
||||
|
||||
SMT control
|
||||
^^^^^^^^^^^
|
||||
|
||||
All MDS variants except MSBDS can be attacked cross Hyper-Threads. That
|
||||
means on CPUs which are affected by MFBDS or MLPDS it is necessary to
|
||||
disable SMT for full protection. These are most of the affected CPUs; the
|
||||
exception is XEON PHI, see :ref:`xeon_phi`.
|
||||
|
||||
Disabling SMT can have a significant performance impact, but the impact
|
||||
depends on the type of workloads.
|
||||
|
||||
See the relevant chapter in the L1TF mitigation documentation for details:
|
||||
:ref:`Documentation/admin-guide/hw-vuln/l1tf.rst <smt_control>`.
|
||||
|
||||
|
||||
.. _mds_mitigation_control_command_line:
|
||||
|
||||
Mitigation control on the kernel command line
|
||||
---------------------------------------------
|
||||
|
||||
The kernel command line allows to control the MDS mitigations at boot
|
||||
time with the option "mds=". The valid arguments for this option are:
|
||||
|
||||
============ =============================================================
|
||||
full If the CPU is vulnerable, enable all available mitigations
|
||||
for the MDS vulnerability, CPU buffer clearing on exit to
|
||||
userspace and when entering a VM. Idle transitions are
|
||||
protected as well if SMT is enabled.
|
||||
|
||||
It does not automatically disable SMT.
|
||||
|
||||
full,nosmt The same as mds=full, with SMT disabled on vulnerable
|
||||
CPUs. This is the complete mitigation.
|
||||
|
||||
off Disables MDS mitigations completely.
|
||||
|
||||
============ =============================================================
|
||||
|
||||
Not specifying this option is equivalent to "mds=full".
|
||||
|
||||
|
||||
Mitigation selection guide
|
||||
--------------------------
|
||||
|
||||
1. Trusted userspace
|
||||
^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If all userspace applications are from a trusted source and do not
|
||||
execute untrusted code which is supplied externally, then the mitigation
|
||||
can be disabled.
|
||||
|
||||
|
||||
2. Virtualization with trusted guests
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The same considerations as above versus trusted user space apply.
|
||||
|
||||
3. Virtualization with untrusted guests
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The protection depends on the state of the L1TF mitigations.
|
||||
See :ref:`virt_mechanism`.
|
||||
|
||||
If the MDS mitigation is enabled and SMT is disabled, guest to host and
|
||||
guest to guest attacks are prevented.
|
||||
|
||||
.. _mds_default_mitigations:
|
||||
|
||||
Default mitigations
|
||||
-------------------
|
||||
|
||||
The kernel default mitigations for vulnerable processors are:
|
||||
|
||||
- Enable CPU buffer clearing
|
||||
|
||||
The kernel does not by default enforce the disabling of SMT, which leaves
|
||||
SMT systems vulnerable when running untrusted code. The same rationale as
|
||||
for L1TF applies.
|
||||
See :ref:`Documentation/admin-guide/hw-vuln/l1tf.rst <default_mitigations>`.
|
@@ -17,14 +17,12 @@ etc.
|
||||
kernel-parameters
|
||||
devices
|
||||
|
||||
This section describes CPU vulnerabilities and provides an overview of the
|
||||
possible mitigations along with guidance for selecting mitigations if they
|
||||
are configurable at compile, boot or run time.
|
||||
This section describes CPU vulnerabilities and their mitigations.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
l1tf
|
||||
hw-vuln/index
|
||||
|
||||
Here is a set of documents aimed at users who are trying to track down
|
||||
problems and bugs in particular.
|
||||
@@ -77,6 +75,7 @@ configure specific aspects of kernel behavior to your liking.
|
||||
LSM/index
|
||||
mm/index
|
||||
perf-security
|
||||
acpi/index
|
||||
|
||||
.. only:: subproject and html
|
||||
|
||||
|
@@ -88,6 +88,7 @@ parameter is applicable::
|
||||
APIC APIC support is enabled.
|
||||
APM Advanced Power Management support is enabled.
|
||||
ARM ARM architecture is enabled.
|
||||
ARM64 ARM64 architecture is enabled.
|
||||
AX25 Appropriate AX.25 support is enabled.
|
||||
CLK Common clock infrastructure is enabled.
|
||||
CMA Contiguous Memory Area support is enabled.
|
||||
|
@@ -704,8 +704,11 @@
|
||||
upon panic. This parameter reserves the physical
|
||||
memory region [offset, offset + size] for that kernel
|
||||
image. If '@offset' is omitted, then a suitable offset
|
||||
is selected automatically. Check
|
||||
Documentation/kdump/kdump.txt for further details.
|
||||
is selected automatically.
|
||||
[KNL, x86_64] select a region under 4G first, and
|
||||
fall back to reserve region above 4G when '@offset'
|
||||
hasn't been specified.
|
||||
See Documentation/kdump/kdump.txt for further details.
|
||||
|
||||
crashkernel=range1:size1[,range2:size2,...][@offset]
|
||||
[KNL] Same as above, but depends on the memory
|
||||
@@ -1585,7 +1588,7 @@
|
||||
Format: { "off" | "enforce" | "fix" | "log" }
|
||||
default: "enforce"
|
||||
|
||||
ima_appraise_tcb [IMA]
|
||||
ima_appraise_tcb [IMA] Deprecated. Use ima_policy= instead.
|
||||
The builtin appraise policy appraises all files
|
||||
owned by uid=0.
|
||||
|
||||
@@ -1612,8 +1615,7 @@
|
||||
uid=0.
|
||||
|
||||
The "appraise_tcb" policy appraises the integrity of
|
||||
all files owned by root. (This is the equivalent
|
||||
of ima_appraise_tcb.)
|
||||
all files owned by root.
|
||||
|
||||
The "secure_boot" policy appraises the integrity
|
||||
of files (eg. kexec kernel image, kernel modules,
|
||||
@@ -1828,6 +1830,9 @@
|
||||
ip= [IP_PNP]
|
||||
See Documentation/filesystems/nfs/nfsroot.txt.
|
||||
|
||||
ipcmni_extend [KNL] Extend the maximum number of unique System V
|
||||
IPC identifiers from 32,768 to 16,777,216.
|
||||
|
||||
irqaffinity= [SMP] Set the default irq affinity mask
|
||||
The argument is a cpu list, as described above.
|
||||
|
||||
@@ -2141,7 +2146,7 @@
|
||||
|
||||
Default is 'flush'.
|
||||
|
||||
For details see: Documentation/admin-guide/l1tf.rst
|
||||
For details see: Documentation/admin-guide/hw-vuln/l1tf.rst
|
||||
|
||||
l2cr= [PPC]
|
||||
|
||||
@@ -2387,6 +2392,32 @@
|
||||
Format: <first>,<last>
|
||||
Specifies range of consoles to be captured by the MDA.
|
||||
|
||||
mds= [X86,INTEL]
|
||||
Control mitigation for the Micro-architectural Data
|
||||
Sampling (MDS) vulnerability.
|
||||
|
||||
Certain CPUs are vulnerable to an exploit against CPU
|
||||
internal buffers which can forward information to a
|
||||
disclosure gadget under certain conditions.
|
||||
|
||||
In vulnerable processors, the speculatively
|
||||
forwarded data can be used in a cache side channel
|
||||
attack, to access data to which the attacker does
|
||||
not have direct access.
|
||||
|
||||
This parameter controls the MDS mitigation. The
|
||||
options are:
|
||||
|
||||
full - Enable MDS mitigation on vulnerable CPUs
|
||||
full,nosmt - Enable MDS mitigation and disable
|
||||
SMT on vulnerable CPUs
|
||||
off - Unconditionally disable MDS mitigation
|
||||
|
||||
Not specifying this option is equivalent to
|
||||
mds=full.
|
||||
|
||||
For details see: Documentation/admin-guide/hw-vuln/mds.rst
|
||||
|
||||
mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory
|
||||
Amount of memory to be used when the kernel is not able
|
||||
to see the whole system memory or for test.
|
||||
@@ -2544,6 +2575,42 @@
|
||||
in the "bleeding edge" mini2440 support kernel at
|
||||
http://repo.or.cz/w/linux-2.6/mini2440.git
|
||||
|
||||
mitigations=
|
||||
[X86,PPC,S390,ARM64] Control optional mitigations for
|
||||
CPU vulnerabilities. This is a set of curated,
|
||||
arch-independent options, each of which is an
|
||||
aggregation of existing arch-specific options.
|
||||
|
||||
off
|
||||
Disable all optional CPU mitigations. This
|
||||
improves system performance, but it may also
|
||||
expose users to several CPU vulnerabilities.
|
||||
Equivalent to: nopti [X86,PPC]
|
||||
kpti=0 [ARM64]
|
||||
nospectre_v1 [PPC]
|
||||
nobp=0 [S390]
|
||||
nospectre_v2 [X86,PPC,S390,ARM64]
|
||||
spectre_v2_user=off [X86]
|
||||
spec_store_bypass_disable=off [X86,PPC]
|
||||
ssbd=force-off [ARM64]
|
||||
l1tf=off [X86]
|
||||
mds=off [X86]
|
||||
|
||||
auto (default)
|
||||
Mitigate all CPU vulnerabilities, but leave SMT
|
||||
enabled, even if it's vulnerable. This is for
|
||||
users who don't want to be surprised by SMT
|
||||
getting disabled across kernel upgrades, or who
|
||||
have other ways of avoiding SMT-based attacks.
|
||||
Equivalent to: (default behavior)
|
||||
|
||||
auto,nosmt
|
||||
Mitigate all CPU vulnerabilities, disabling SMT
|
||||
if needed. This is for users who always want to
|
||||
be fully mitigated, even if it means losing SMT.
|
||||
Equivalent to: l1tf=flush,nosmt [X86]
|
||||
mds=full,nosmt [X86]
|
||||
|
||||
mminit_loglevel=
|
||||
[KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
|
||||
parameter allows control of the logging verbosity for
|
||||
@@ -2839,11 +2906,11 @@
|
||||
noexec=on: enable non-executable mappings (default)
|
||||
noexec=off: disable non-executable mappings
|
||||
|
||||
nosmap [X86]
|
||||
nosmap [X86,PPC]
|
||||
Disable SMAP (Supervisor Mode Access Prevention)
|
||||
even if it is supported by processor.
|
||||
|
||||
nosmep [X86]
|
||||
nosmep [X86,PPC]
|
||||
Disable SMEP (Supervisor Mode Execution Prevention)
|
||||
even if it is supported by processor.
|
||||
|
||||
@@ -2873,10 +2940,10 @@
|
||||
check bypass). With this option data leaks are possible
|
||||
in the system.
|
||||
|
||||
nospectre_v2 [X86,PPC_FSL_BOOK3E] Disable all mitigations for the Spectre variant 2
|
||||
(indirect branch prediction) vulnerability. System may
|
||||
allow data leaks with this option, which is equivalent
|
||||
to spectre_v2=off.
|
||||
nospectre_v2 [X86,PPC_FSL_BOOK3E,ARM64] Disable all mitigations for
|
||||
the Spectre variant 2 (indirect branch prediction)
|
||||
vulnerability. System may allow data leaks with this
|
||||
option.
|
||||
|
||||
nospec_store_bypass_disable
|
||||
[HW] Disable all mitigations for the Speculative Store Bypass vulnerability
|
||||
@@ -3110,6 +3177,16 @@
|
||||
This will also cause panics on machine check exceptions.
|
||||
Useful together with panic=30 to trigger a reboot.
|
||||
|
||||
page_alloc.shuffle=
|
||||
[KNL] Boolean flag to control whether the page allocator
|
||||
should randomize its free lists. The randomization may
|
||||
be automatically enabled if the kernel detects it is
|
||||
running on a platform with a direct-mapped memory-side
|
||||
cache, and this parameter can be used to
|
||||
override/disable that behavior. The state of the flag
|
||||
can be read from sysfs at:
|
||||
/sys/module/page_alloc/parameters/shuffle.
|
||||
|
||||
page_owner= [KNL] Boot-time page_owner enabling option.
|
||||
Storage of the information about who allocated
|
||||
each page is disabled by default. With this switch,
|
||||
@@ -3135,6 +3212,7 @@
|
||||
bit 2: print timer info
|
||||
bit 3: print locks info if CONFIG_LOCKDEP is on
|
||||
bit 4: print ftrace buffer
|
||||
bit 5: print all printk messages in buffer
|
||||
|
||||
panic_on_warn panic() instead of WARN(). Useful to cause kdump
|
||||
on a WARN().
|
||||
@@ -3394,6 +3472,8 @@
|
||||
bridges without forcing it upstream. Note:
|
||||
this removes isolation between devices and
|
||||
may put more devices in an IOMMU group.
|
||||
force_floating [S390] Force usage of floating interrupts.
|
||||
nomio [S390] Do not use MIO instructions.
|
||||
|
||||
pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power
|
||||
Management.
|
||||
@@ -3623,7 +3703,9 @@
|
||||
see CONFIG_RAS_CEC help text.
|
||||
|
||||
rcu_nocbs= [KNL]
|
||||
The argument is a cpu list, as described above.
|
||||
The argument is a cpu list, as described above,
|
||||
except that the string "all" can be used to
|
||||
specify every CPU on the system.
|
||||
|
||||
In kernels built with CONFIG_RCU_NOCB_CPU=y, set
|
||||
the specified list of CPUs to be no-callback CPUs.
|
||||
@@ -3986,7 +4068,9 @@
|
||||
[[,]s[mp]#### \
|
||||
[[,]b[ios] | a[cpi] | k[bd] | t[riple] | e[fi] | p[ci]] \
|
||||
[[,]f[orce]
|
||||
Where reboot_mode is one of warm (soft) or cold (hard) or gpio,
|
||||
Where reboot_mode is one of warm (soft) or cold (hard) or gpio
|
||||
(prefix with 'panic_' to set mode for panic
|
||||
reboot only),
|
||||
reboot_type is one of bios, acpi, kbd, triple, efi, or pci,
|
||||
reboot_force is either force or not specified,
|
||||
reboot_cpu is s[mp]#### with #### being the processor
|
||||
@@ -4703,6 +4787,10 @@
|
||||
[x86] unstable: mark the TSC clocksource as unstable, this
|
||||
marks the TSC unconditionally unstable at bootup and
|
||||
avoids any further wobbles once the TSC watchdog notices.
|
||||
[x86] nowatchdog: disable clocksource watchdog. Used
|
||||
in situations with strict latency requirements (where
|
||||
interruptions from clocksource watchdog are not
|
||||
acceptable).
|
||||
|
||||
turbografx.map[2|3]= [HW,JOY]
|
||||
TurboGraFX parallel port interface
|
||||
@@ -5173,6 +5261,13 @@
|
||||
with /sys/devices/system/xen_memory/xen_memory0/scrub_pages.
|
||||
Default value controlled with CONFIG_XEN_SCRUB_PAGES_DEFAULT.
|
||||
|
||||
xen_timer_slop= [X86-64,XEN]
|
||||
Set the timer slop (in nanoseconds) for the virtual Xen
|
||||
timers (default is 100000). This adjusts the minimum
|
||||
delta of virtualized Xen timers, where lower values
|
||||
improve timer resolution at the expense of processing
|
||||
more timer interrupts.
|
||||
|
||||
xirc2ps_cs= [NET,PCMCIA]
|
||||
Format:
|
||||
<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
|
||||
|
169
Documentation/admin-guide/mm/numaperf.rst
Normal file
169
Documentation/admin-guide/mm/numaperf.rst
Normal file
@@ -0,0 +1,169 @@
|
||||
.. _numaperf:
|
||||
|
||||
=============
|
||||
NUMA Locality
|
||||
=============
|
||||
|
||||
Some platforms may have multiple types of memory attached to a compute
|
||||
node. These disparate memory ranges may share some characteristics, such
|
||||
as CPU cache coherence, but may have different performance. For example,
|
||||
different media types and buses affect bandwidth and latency.
|
||||
|
||||
A system supports such heterogeneous memory by grouping each memory type
|
||||
under different domains, or "nodes", based on locality and performance
|
||||
characteristics. Some memory may share the same node as a CPU, and others
|
||||
are provided as memory only nodes. While memory only nodes do not provide
|
||||
CPUs, they may still be local to one or more compute nodes relative to
|
||||
other nodes. The following diagram shows one such example of two compute
|
||||
nodes with local memory and a memory only node for each compute node:
|
||||
|
||||
+------------------+ +------------------+
|
||||
| Compute Node 0 +-----+ Compute Node 1 |
|
||||
| Local Node0 Mem | | Local Node1 Mem |
|
||||
+--------+---------+ +--------+---------+
|
||||
| |
|
||||
+--------+---------+ +--------+---------+
|
||||
| Slower Node2 Mem | | Slower Node3 Mem |
|
||||
+------------------+ +--------+---------+
|
||||
|
||||
A "memory initiator" is a node containing one or more devices such as
|
||||
CPUs or separate memory I/O devices that can initiate memory requests.
|
||||
A "memory target" is a node containing one or more physical address
|
||||
ranges accessible from one or more memory initiators.
|
||||
|
||||
When multiple memory initiators exist, they may not all have the same
|
||||
performance when accessing a given memory target. Each initiator-target
|
||||
pair may be organized into different ranked access classes to represent
|
||||
this relationship. The highest performing initiator to a given target
|
||||
is considered to be one of that target's local initiators, and given
|
||||
the highest access class, 0. Any given target may have one or more
|
||||
local initiators, and any given initiator may have multiple local
|
||||
memory targets.
|
||||
|
||||
To aid applications matching memory targets with their initiators, the
|
||||
kernel provides symlinks to each other. The following example lists the
|
||||
relationship for the access class "0" memory initiators and targets::
|
||||
|
||||
# symlinks -v /sys/devices/system/node/nodeX/access0/targets/
|
||||
relative: /sys/devices/system/node/nodeX/access0/targets/nodeY -> ../../nodeY
|
||||
|
||||
# symlinks -v /sys/devices/system/node/nodeY/access0/initiators/
|
||||
relative: /sys/devices/system/node/nodeY/access0/initiators/nodeX -> ../../nodeX
|
||||
|
||||
A memory initiator may have multiple memory targets in the same access
|
||||
class. The target memory's initiators in a given class indicate the
|
||||
nodes' access characteristics share the same performance relative to other
|
||||
linked initiator nodes. Each target within an initiator's access class,
|
||||
though, does not necessarily perform the same as each other.
|
||||
|
||||
================
|
||||
NUMA Performance
|
||||
================
|
||||
|
||||
Applications may wish to consider which node they want their memory to
|
||||
be allocated from based on the node's performance characteristics. If
|
||||
the system provides these attributes, the kernel exports them under the
|
||||
node sysfs hierarchy by appending the attributes directory under the
|
||||
memory node's access class 0 initiators as follows::
|
||||
|
||||
/sys/devices/system/node/nodeY/access0/initiators/
|
||||
|
||||
These attributes apply only when accessed from nodes that
|
||||
are linked under this access's initiators.
|
||||
|
||||
The performance characteristics the kernel provides for the local initiators
|
||||
are exported as follows::
|
||||
|
||||
# tree -P "read*|write*" /sys/devices/system/node/nodeY/access0/initiators/
|
||||
/sys/devices/system/node/nodeY/access0/initiators/
|
||||
|-- read_bandwidth
|
||||
|-- read_latency
|
||||
|-- write_bandwidth
|
||||
`-- write_latency
|
||||
|
||||
The bandwidth attributes are provided in MiB/second.
|
||||
|
||||
The latency attributes are provided in nanoseconds.
|
||||
|
||||
The values reported here correspond to the rated latency and bandwidth
|
||||
for the platform.
|
||||
|
||||
==========
|
||||
NUMA Cache
|
||||
==========
|
||||
|
||||
System memory may be constructed in a hierarchy of elements with various
|
||||
performance characteristics in order to provide a large address space of
|
||||
slower performing memory cached by a smaller higher performing memory. The
|
||||
system physical addresses memory initiators are aware of are provided
|
||||
by the last memory level in the hierarchy. The system meanwhile uses
|
||||
higher performing memory to transparently cache access to progressively
|
||||
slower levels.
|
||||
|
||||
The term "far memory" is used to denote the last level memory in the
|
||||
hierarchy. Each increasing cache level provides higher performing
|
||||
initiator access, and the term "near memory" represents the fastest
|
||||
cache provided by the system.
|
||||
|
||||
This numbering is different than CPU caches where the cache level (ex:
|
||||
L1, L2, L3) uses the CPU-side view where each increased level is lower
|
||||
performing. In contrast, the memory cache level is centric to the last
|
||||
level memory, so the higher numbered cache level corresponds to memory
|
||||
nearer to the CPU, and further from far memory.
|
||||
|
||||
The memory-side caches are not directly addressable by software. When
|
||||
software accesses a system address, the system will return it from the
|
||||
near memory cache if it is present. If it is not present, the system
|
||||
accesses the next level of memory until there is either a hit in that
|
||||
cache level, or it reaches far memory.
|
||||
|
||||
An application does not need to know about caching attributes in order
|
||||
to use the system. Software may optionally query the memory cache
|
||||
attributes in order to maximize the performance out of such a setup.
|
||||
If the system provides a way for the kernel to discover this information,
|
||||
for example with ACPI HMAT (Heterogeneous Memory Attribute Table),
|
||||
the kernel will append these attributes to the NUMA node memory target.
|
||||
|
||||
When the kernel first registers a memory cache with a node, the kernel
|
||||
will create the following directory::
|
||||
|
||||
/sys/devices/system/node/nodeX/memory_side_cache/
|
||||
|
||||
If that directory is not present, the system either does not provide
|
||||
a memory-side cache, or that information is not accessible to the kernel.
|
||||
|
||||
The attributes for each level of cache are provided under its cache
|
||||
level index::
|
||||
|
||||
/sys/devices/system/node/nodeX/memory_side_cache/indexA/
|
||||
/sys/devices/system/node/nodeX/memory_side_cache/indexB/
|
||||
/sys/devices/system/node/nodeX/memory_side_cache/indexC/
|
||||
|
||||
Each cache level's directory provides its attributes. For example, the
|
||||
following shows a single cache level and the attributes available for
|
||||
software to query::
|
||||
|
||||
# tree /sys/devices/system/node/node0/memory_side_cache/
|
||||
/sys/devices/system/node/node0/memory_side_cache/
|
||||
|-- index1
|
||||
| |-- indexing
|
||||
| |-- line_size
|
||||
| |-- size
|
||||
| `-- write_policy
|
||||
|
||||
The "indexing" will be 0 if it is a direct-mapped cache, and non-zero
|
||||
for any other index-based, multi-way associativity.
|
||||
|
||||
The "line_size" is the number of bytes accessed from the next cache
|
||||
level on a miss.
|
||||
|
||||
The "size" is the number of bytes provided by this cache level.
|
||||
|
||||
The "write_policy" will be 0 for write-back, and non-zero for
|
||||
write-through caching.
|
||||
|
||||
========
|
||||
See Also
|
||||
========
|
||||
.. [1] https://www.uefi.org/sites/default/files/resources/ACPI_6_2.pdf
|
||||
Section 5.2.27
|
@@ -1,3 +1,6 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
.. include:: <isonum.txt>
|
||||
|
||||
.. |struct cpufreq_policy| replace:: :c:type:`struct cpufreq_policy <cpufreq_policy>`
|
||||
.. |intel_pstate| replace:: :doc:`intel_pstate <intel_pstate>`
|
||||
|
||||
@@ -5,9 +8,10 @@
|
||||
CPU Performance Scaling
|
||||
=======================
|
||||
|
||||
::
|
||||
:Copyright: |copy| 2017 Intel Corporation
|
||||
|
||||
:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||
|
||||
Copyright (c) 2017 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||
|
||||
The Concept of CPU Performance Scaling
|
||||
======================================
|
||||
@@ -396,8 +400,8 @@ RT or deadline scheduling classes, the governor will increase the frequency to
|
||||
the allowed maximum (that is, the ``scaling_max_freq`` policy limit). In turn,
|
||||
if it is invoked by the CFS scheduling class, the governor will use the
|
||||
Per-Entity Load Tracking (PELT) metric for the root control group of the
|
||||
given CPU as the CPU utilization estimate (see the `Per-entity load tracking`_
|
||||
LWN.net article for a description of the PELT mechanism). Then, the new
|
||||
given CPU as the CPU utilization estimate (see the *Per-entity load tracking*
|
||||
LWN.net article [1]_ for a description of the PELT mechanism). Then, the new
|
||||
CPU frequency to apply is computed in accordance with the formula
|
||||
|
||||
f = 1.25 * ``f_0`` * ``util`` / ``max``
|
||||
@@ -698,4 +702,8 @@ hardware feature (e.g. all Intel ones), even if the
|
||||
:c:macro:`CONFIG_X86_ACPI_CPUFREQ_CPB` configuration option is set.
|
||||
|
||||
|
||||
.. _Per-entity load tracking: https://lwn.net/Articles/531853/
|
||||
References
|
||||
==========
|
||||
|
||||
.. [1] Jonathan Corbet, *Per-entity load tracking*,
|
||||
https://lwn.net/Articles/531853/
|
||||
|
@@ -1,3 +1,6 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
.. include:: <isonum.txt>
|
||||
|
||||
.. |struct cpuidle_state| replace:: :c:type:`struct cpuidle_state <cpuidle_state>`
|
||||
.. |cpufreq| replace:: :doc:`CPU Performance Scaling <cpufreq>`
|
||||
|
||||
@@ -5,9 +8,10 @@
|
||||
CPU Idle Time Management
|
||||
========================
|
||||
|
||||
::
|
||||
:Copyright: |copy| 2018 Intel Corporation
|
||||
|
||||
:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||
|
||||
Copyright (c) 2018 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||
|
||||
Concepts
|
||||
========
|
||||
|
@@ -1,3 +1,5 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
================
|
||||
Power Management
|
||||
================
|
||||
|
41
Documentation/admin-guide/pm/intel_epb.rst
Normal file
41
Documentation/admin-guide/pm/intel_epb.rst
Normal file
@@ -0,0 +1,41 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
.. include:: <isonum.txt>
|
||||
|
||||
======================================
|
||||
Intel Performance and Energy Bias Hint
|
||||
======================================
|
||||
|
||||
:Copyright: |copy| 2019 Intel Corporation
|
||||
|
||||
:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||
|
||||
|
||||
.. kernel-doc:: arch/x86/kernel/cpu/intel_epb.c
|
||||
:doc: overview
|
||||
|
||||
Intel Performance and Energy Bias Attribute in ``sysfs``
|
||||
========================================================
|
||||
|
||||
The Intel Performance and Energy Bias Hint (EPB) value for a given (logical) CPU
|
||||
can be checked or updated through a ``sysfs`` attribute (file) under
|
||||
:file:`/sys/devices/system/cpu/cpu<N>/power/`, where the CPU number ``<N>``
|
||||
is allocated at the system initialization time:
|
||||
|
||||
``energy_perf_bias``
|
||||
Shows the current EPB value for the CPU on a sliding scale 0 - 15, where
|
||||
a value of 0 corresponds to a hint preference for highest performance
|
||||
and a value of 15 corresponds to the maximum energy savings.
|
||||
|
||||
In order to update the EPB value for the CPU, this attribute can be
|
||||
written to, either with a number in the 0 - 15 sliding scale above, or
|
||||
with one of the strings: "performance", "balance-performance", "normal",
|
||||
"balance-power", "power" that represent values reflected by their
|
||||
meaning.
|
||||
|
||||
This attribute is present for all online CPUs supporting the EPB
|
||||
feature.
|
||||
|
||||
Note that while the EPB interface to the processor is defined at the logical CPU
|
||||
level, the physical register backing it may be shared by multiple CPUs (for
|
||||
example, SMT siblings or cores in one package). For this reason, updating the
|
||||
EPB value for one CPU may cause the EPB values for other CPUs to change.
|
@@ -1,10 +1,13 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
.. include:: <isonum.txt>
|
||||
|
||||
===============================================
|
||||
``intel_pstate`` CPU Performance Scaling Driver
|
||||
===============================================
|
||||
|
||||
::
|
||||
:Copyright: |copy| 2017 Intel Corporation
|
||||
|
||||
Copyright (c) 2017 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||
:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||
|
||||
|
||||
General Information
|
||||
@@ -20,11 +23,10 @@ you have not done that yet.]
|
||||
|
||||
For the processors supported by ``intel_pstate``, the P-state concept is broader
|
||||
than just an operating frequency or an operating performance point (see the
|
||||
`LinuxCon Europe 2015 presentation by Kristen Accardi <LCEU2015_>`_ for more
|
||||
LinuxCon Europe 2015 presentation by Kristen Accardi [1]_ for more
|
||||
information about that). For this reason, the representation of P-states used
|
||||
by ``intel_pstate`` internally follows the hardware specification (for details
|
||||
refer to `Intel® 64 and IA-32 Architectures Software Developer’s Manual
|
||||
Volume 3: System Programming Guide <SDM_>`_). However, the ``CPUFreq`` core
|
||||
refer to Intel Software Developer’s Manual [2]_). However, the ``CPUFreq`` core
|
||||
uses frequencies for identifying operating performance points of CPUs and
|
||||
frequencies are involved in the user space interface exposed by it, so
|
||||
``intel_pstate`` maps its internal representation of P-states to frequencies too
|
||||
@@ -561,9 +563,9 @@ or to pin every task potentially sensitive to them to a specific CPU.]
|
||||
|
||||
On the majority of systems supported by ``intel_pstate``, the ACPI tables
|
||||
provided by the platform firmware contain ``_PSS`` objects returning information
|
||||
that can be used for CPU performance scaling (refer to the `ACPI specification`_
|
||||
for details on the ``_PSS`` objects and the format of the information returned
|
||||
by them).
|
||||
that can be used for CPU performance scaling (refer to the ACPI specification
|
||||
[3]_ for details on the ``_PSS`` objects and the format of the information
|
||||
returned by them).
|
||||
|
||||
The information returned by the ACPI ``_PSS`` objects is used by the
|
||||
``acpi-cpufreq`` scaling driver. On systems supported by ``intel_pstate``
|
||||
@@ -728,6 +730,14 @@ P-state is called, the ``ftrace`` filter can be set to to
|
||||
<idle>-0 [000] ..s. 2537.654843: intel_pstate_set_pstate <-intel_pstate_timer_func
|
||||
|
||||
|
||||
.. _LCEU2015: http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf
|
||||
.. _SDM: http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-system-programming-manual-325384.html
|
||||
.. _ACPI specification: http://www.uefi.org/sites/default/files/resources/ACPI_6_1.pdf
|
||||
References
|
||||
==========
|
||||
|
||||
.. [1] Kristen Accardi, *Balancing Power and Performance in the Linux Kernel*,
|
||||
http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf
|
||||
|
||||
.. [2] *Intel® 64 and IA-32 Architectures Software Developer’s Manual Volume 3: System Programming Guide*,
|
||||
http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-system-programming-manual-325384.html
|
||||
|
||||
.. [3] *Advanced Configuration and Power Interface Specification*,
|
||||
https://uefi.org/sites/default/files/resources/ACPI_6_3_final_Jan30.pdf
|
||||
|
@@ -1,10 +1,14 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
.. include:: <isonum.txt>
|
||||
|
||||
===================
|
||||
System Sleep States
|
||||
===================
|
||||
|
||||
::
|
||||
:Copyright: |copy| 2017 Intel Corporation
|
||||
|
||||
:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||
|
||||
Copyright (c) 2017 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||
|
||||
Sleep states are global low-power states of the entire system in which user
|
||||
space code cannot be executed and the overall system activity is significantly
|
||||
|
@@ -1,10 +1,14 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
.. include:: <isonum.txt>
|
||||
|
||||
===========================
|
||||
Power Management Strategies
|
||||
===========================
|
||||
|
||||
::
|
||||
:Copyright: |copy| 2017 Intel Corporation
|
||||
|
||||
:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||
|
||||
Copyright (c) 2017 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||
|
||||
The Linux kernel supports two major high-level power management strategies.
|
||||
|
||||
|
@@ -1,3 +1,5 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
============================
|
||||
System-Wide Power Management
|
||||
============================
|
||||
|
@@ -1,3 +1,5 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
==============================
|
||||
Working-State Power Management
|
||||
==============================
|
||||
@@ -8,3 +10,4 @@ Working-State Power Management
|
||||
cpuidle
|
||||
cpufreq
|
||||
intel_pstate
|
||||
intel_epb
|
||||
|
@@ -209,6 +209,22 @@ infrastructure:
|
||||
| AT | [35-32] | y |
|
||||
x--------------------------------------------------x
|
||||
|
||||
6) ID_AA64ZFR0_EL1 - SVE feature ID register 0
|
||||
|
||||
x--------------------------------------------------x
|
||||
| Name | bits | visible |
|
||||
|--------------------------------------------------|
|
||||
| SM4 | [43-40] | y |
|
||||
|--------------------------------------------------|
|
||||
| SHA3 | [35-32] | y |
|
||||
|--------------------------------------------------|
|
||||
| BitPerm | [19-16] | y |
|
||||
|--------------------------------------------------|
|
||||
| AES | [7-4] | y |
|
||||
|--------------------------------------------------|
|
||||
| SVEVer | [3-0] | y |
|
||||
x--------------------------------------------------x
|
||||
|
||||
Appendix I: Example
|
||||
---------------------------
|
||||
|
||||
|
@@ -13,9 +13,9 @@ architected discovery mechanism available to userspace code at EL0. The
|
||||
kernel exposes the presence of these features to userspace through a set
|
||||
of flags called hwcaps, exposed in the auxiliary vector.
|
||||
|
||||
Userspace software can test for features by acquiring the AT_HWCAP entry
|
||||
of the auxilliary vector, and testing whether the relevant flags are
|
||||
set, e.g.
|
||||
Userspace software can test for features by acquiring the AT_HWCAP or
|
||||
AT_HWCAP2 entry of the auxiliary vector, and testing whether the relevant
|
||||
flags are set, e.g.
|
||||
|
||||
bool floating_point_is_present(void)
|
||||
{
|
||||
@@ -135,6 +135,10 @@ HWCAP_DCPOP
|
||||
|
||||
Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0001.
|
||||
|
||||
HWCAP2_DCPODP
|
||||
|
||||
Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0010.
|
||||
|
||||
HWCAP_SHA3
|
||||
|
||||
Functionality implied by ID_AA64ISAR0_EL1.SHA3 == 0b0001.
|
||||
@@ -159,6 +163,30 @@ HWCAP_SVE
|
||||
|
||||
Functionality implied by ID_AA64PFR0_EL1.SVE == 0b0001.
|
||||
|
||||
HWCAP2_SVE2
|
||||
|
||||
Functionality implied by ID_AA64ZFR0_EL1.SVEVer == 0b0001.
|
||||
|
||||
HWCAP2_SVEAES
|
||||
|
||||
Functionality implied by ID_AA64ZFR0_EL1.AES == 0b0001.
|
||||
|
||||
HWCAP2_SVEPMULL
|
||||
|
||||
Functionality implied by ID_AA64ZFR0_EL1.AES == 0b0010.
|
||||
|
||||
HWCAP2_SVEBITPERM
|
||||
|
||||
Functionality implied by ID_AA64ZFR0_EL1.BitPerm == 0b0001.
|
||||
|
||||
HWCAP2_SVESHA3
|
||||
|
||||
Functionality implied by ID_AA64ZFR0_EL1.SHA3 == 0b0001.
|
||||
|
||||
HWCAP2_SVESM4
|
||||
|
||||
Functionality implied by ID_AA64ZFR0_EL1.SM4 == 0b0001.
|
||||
|
||||
HWCAP_ASIMDFHM
|
||||
|
||||
Functionality implied by ID_AA64ISAR0_EL1.FHM == 0b0001.
|
||||
@@ -194,3 +222,10 @@ HWCAP_PACG
|
||||
Functionality implied by ID_AA64ISAR1_EL1.GPA == 0b0001 or
|
||||
ID_AA64ISAR1_EL1.GPI == 0b0001, as described by
|
||||
Documentation/arm64/pointer-authentication.txt.
|
||||
|
||||
|
||||
4. Unused AT_HWCAP bits
|
||||
-----------------------
|
||||
|
||||
For interoperation with userspace, the kernel guarantees that bits 62
|
||||
and 63 of AT_HWCAP will always be returned as 0.
|
||||
|
85
Documentation/arm64/perf.txt
Normal file
85
Documentation/arm64/perf.txt
Normal file
@@ -0,0 +1,85 @@
|
||||
Perf Event Attributes
|
||||
=====================
|
||||
|
||||
Author: Andrew Murray <andrew.murray@arm.com>
|
||||
Date: 2019-03-06
|
||||
|
||||
exclude_user
|
||||
------------
|
||||
|
||||
This attribute excludes userspace.
|
||||
|
||||
Userspace always runs at EL0 and thus this attribute will exclude EL0.
|
||||
|
||||
|
||||
exclude_kernel
|
||||
--------------
|
||||
|
||||
This attribute excludes the kernel.
|
||||
|
||||
The kernel runs at EL2 with VHE and EL1 without. Guest kernels always run
|
||||
at EL1.
|
||||
|
||||
For the host this attribute will exclude EL1 and additionally EL2 on a VHE
|
||||
system.
|
||||
|
||||
For the guest this attribute will exclude EL1. Please note that EL2 is
|
||||
never counted within a guest.
|
||||
|
||||
|
||||
exclude_hv
|
||||
----------
|
||||
|
||||
This attribute excludes the hypervisor.
|
||||
|
||||
For a VHE host this attribute is ignored as we consider the host kernel to
|
||||
be the hypervisor.
|
||||
|
||||
For a non-VHE host this attribute will exclude EL2 as we consider the
|
||||
hypervisor to be any code that runs at EL2 which is predominantly used for
|
||||
guest/host transitions.
|
||||
|
||||
For the guest this attribute has no effect. Please note that EL2 is
|
||||
never counted within a guest.
|
||||
|
||||
|
||||
exclude_host / exclude_guest
|
||||
----------------------------
|
||||
|
||||
These attributes exclude the KVM host and guest, respectively.
|
||||
|
||||
The KVM host may run at EL0 (userspace), EL1 (non-VHE kernel) and EL2 (VHE
|
||||
kernel or non-VHE hypervisor).
|
||||
|
||||
The KVM guest may run at EL0 (userspace) and EL1 (kernel).
|
||||
|
||||
Due to the overlapping exception levels between host and guests we cannot
|
||||
exclusively rely on the PMU's hardware exception filtering - therefore we
|
||||
must enable/disable counting on the entry and exit to the guest. This is
|
||||
performed differently on VHE and non-VHE systems.
|
||||
|
||||
For non-VHE systems we exclude EL2 for exclude_host - upon entering and
|
||||
exiting the guest we disable/enable the event as appropriate based on the
|
||||
exclude_host and exclude_guest attributes.
|
||||
|
||||
For VHE systems we exclude EL1 for exclude_guest and exclude both EL0,EL2
|
||||
for exclude_host. Upon entering and exiting the guest we modify the event
|
||||
to include/exclude EL0 as appropriate based on the exclude_host and
|
||||
exclude_guest attributes.
|
||||
|
||||
The statements above also apply when these attributes are used within a
|
||||
non-VHE guest; however, please note that EL2 is never counted within a guest.
|
||||
|
||||
|
||||
Accuracy
|
||||
--------
|
||||
|
||||
On non-VHE hosts we enable/disable counters on the entry/exit of host/guest
|
||||
transition at EL2 - however there is a period of time between
|
||||
enabling/disabling the counters and entering/exiting the guest. We are
|
||||
able to eliminate counters counting host events on the boundaries of guest
|
||||
entry/exit when counting guest events by filtering out EL2 for
|
||||
exclude_host. However when using !exclude_hv there is a small blackout
|
||||
window at the guest entry/exit where host events are not captured.
|
||||
|
||||
On VHE systems there are no blackout windows.
|
@@ -87,7 +87,21 @@ used to get and set the keys for a thread.
|
||||
Virtualization
|
||||
--------------
|
||||
|
||||
Pointer authentication is not currently supported in KVM guests. KVM
|
||||
will mask the feature bits from ID_AA64ISAR1_EL1, and attempted use of
|
||||
the feature will result in an UNDEFINED exception being injected into
|
||||
the guest.
|
||||
Pointer authentication is enabled in KVM guest when each virtual cpu is
|
||||
initialised by passing flags KVM_ARM_VCPU_PTRAUTH_[ADDRESS/GENERIC] and
|
||||
requesting these two separate cpu features to be enabled. The current KVM
|
||||
guest implementation works by enabling both features together, so both
|
||||
these userspace flags are checked before enabling pointer authentication.
|
||||
The separate userspace flag will allow to have no userspace ABI changes
|
||||
if support is added in the future to allow these two features to be
|
||||
enabled independently of one another.
|
||||
|
||||
As the Arm Architecture specifies that the Pointer Authentication feature is
|
||||
implemented along with the VHE feature, the KVM arm64 ptrauth code relies
|
||||
on VHE mode to be present.
|
||||
|
||||
Additionally, when these vcpu feature flags are not set then KVM will
|
||||
filter out the Pointer Authentication system key registers from
|
||||
KVM_GET/SET_REG_* ioctls and mask those features from cpufeature ID
|
||||
register. Any attempt to use the Pointer Authentication instructions will
|
||||
result in an UNDEFINED exception being injected into the guest.
|
||||
|
@@ -61,6 +61,7 @@ stable kernels.
|
||||
| ARM | Cortex-A76 | #1188873 | ARM64_ERRATUM_1188873 |
|
||||
| ARM | Cortex-A76 | #1165522 | ARM64_ERRATUM_1165522 |
|
||||
| ARM | Cortex-A76 | #1286807 | ARM64_ERRATUM_1286807 |
|
||||
| ARM | Neoverse-N1 | #1188873 | ARM64_ERRATUM_1188873 |
|
||||
| ARM | MMU-500 | #841119,#826419 | N/A |
|
||||
| | | | |
|
||||
| Cavium | ThunderX ITS | #22375, #24313 | CAVIUM_ERRATUM_22375 |
|
||||
@@ -77,6 +78,7 @@ stable kernels.
|
||||
| Hisilicon | Hip0{5,6,7} | #161010101 | HISILICON_ERRATUM_161010101 |
|
||||
| Hisilicon | Hip0{6,7} | #161010701 | N/A |
|
||||
| Hisilicon | Hip07 | #161600802 | HISILICON_ERRATUM_161600802 |
|
||||
| Hisilicon | Hip08 SMMU PMCG | #162001800 | N/A |
|
||||
| | | | |
|
||||
| Qualcomm Tech. | Kryo/Falkor v1 | E1003 | QCOM_FALKOR_ERRATUM_1003 |
|
||||
| Qualcomm Tech. | Falkor v1 | E1009 | QCOM_FALKOR_ERRATUM_1009 |
|
||||
|
@@ -34,6 +34,23 @@ model features for SVE is included in Appendix A.
|
||||
following sections: software that needs to verify that those interfaces are
|
||||
present must check for HWCAP_SVE instead.
|
||||
|
||||
* On hardware that supports the SVE2 extensions, HWCAP2_SVE2 will also
|
||||
be reported in the AT_HWCAP2 aux vector entry. In addition to this,
|
||||
optional extensions to SVE2 may be reported by the presence of:
|
||||
|
||||
HWCAP2_SVE2
|
||||
HWCAP2_SVEAES
|
||||
HWCAP2_SVEPMULL
|
||||
HWCAP2_SVEBITPERM
|
||||
HWCAP2_SVESHA3
|
||||
HWCAP2_SVESM4
|
||||
|
||||
This list may be extended over time as the SVE architecture evolves.
|
||||
|
||||
These extensions are also reported via the CPU ID register ID_AA64ZFR0_EL1,
|
||||
which userspace can read using an MRS instruction. See elf_hwcaps.txt and
|
||||
cpu-feature-registers.txt for details.
|
||||
|
||||
* Debuggers should restrict themselves to interacting with the target via the
|
||||
NT_ARM_SVE regset. The recommended way of detecting support for this regset
|
||||
is to connect to a target process first and then attempt a
|
||||
|
@@ -1,6 +1,6 @@
|
||||
|
||||
On atomic bitops.
|
||||
|
||||
=============
|
||||
Atomic bitops
|
||||
=============
|
||||
|
||||
While our bitmap_{}() functions are non-atomic, we have a number of operations
|
||||
operating on single bits in a bitmap that are atomic.
|
||||
|
@@ -56,6 +56,23 @@ Barriers:
|
||||
smp_mb__{before,after}_atomic()
|
||||
|
||||
|
||||
TYPES (signed vs unsigned)
|
||||
-----
|
||||
|
||||
While atomic_t, atomic_long_t and atomic64_t use int, long and s64
|
||||
respectively (for hysterical raisins), the kernel uses -fno-strict-overflow
|
||||
(which implies -fwrapv) and defines signed overflow to behave like
|
||||
2s-complement.
|
||||
|
||||
Therefore, an explicitly unsigned variant of the atomic ops is strictly
|
||||
unnecessary and we can simply cast, there is no UB.
|
||||
|
||||
There was a bug in UBSAN prior to GCC-8 that would generate UB warnings for
|
||||
signed types.
|
||||
|
||||
With this we also conform to the C/C++ _Atomic behaviour and things like
|
||||
P1236R1.
|
||||
|
||||
|
||||
SEMANTICS
|
||||
---------
|
||||
|
@@ -20,13 +20,26 @@ for that device, by setting low_latency to 0. See Section 3 for
|
||||
details on how to configure BFQ for the desired tradeoff between
|
||||
latency and throughput, or on how to maximize throughput.
|
||||
|
||||
BFQ has a non-null overhead, which limits the maximum IOPS that a CPU
|
||||
can process for a device scheduled with BFQ. To give an idea of the
|
||||
limits on slow or average CPUs, here are, first, the limits of BFQ for
|
||||
three different CPUs, on, respectively, an average laptop, an old
|
||||
desktop, and a cheap embedded system, in case full hierarchical
|
||||
support is enabled (i.e., CONFIG_BFQ_GROUP_IOSCHED is set), but
|
||||
CONFIG_DEBUG_BLK_CGROUP is not set (Section 4-2):
|
||||
Like every I/O scheduler, BFQ adds some overhead to per-I/O-request
|
||||
processing. To give an idea of this overhead, the total,
|
||||
single-lock-protected, per-request processing time of BFQ---i.e., the
|
||||
sum of the execution times of the request insertion, dispatch and
|
||||
completion hooks---is, e.g., 1.9 us on an Intel Core i7-2760QM@2.40GHz
|
||||
(dated CPU for notebooks; time measured with simple code
|
||||
instrumentation, and using the throughput-sync.sh script of the S
|
||||
suite [3], in performance-profiling mode). To put this result into
|
||||
context, the total, single-lock-protected, per-request execution time
|
||||
of the lightest I/O scheduler available in blk-mq, mq-deadline, is 0.7
|
||||
us (mq-deadline is ~800 LOC, against ~10500 LOC for BFQ).
|
||||
|
||||
Scheduling overhead further limits the maximum IOPS that a CPU can
|
||||
process (already limited by the execution of the rest of the I/O
|
||||
stack). To give an idea of the limits with BFQ, on slow or average
|
||||
CPUs, here are, first, the limits of BFQ for three different CPUs, on,
|
||||
respectively, an average laptop, an old desktop, and a cheap embedded
|
||||
system, in case full hierarchical support is enabled (i.e.,
|
||||
CONFIG_BFQ_GROUP_IOSCHED is set), but CONFIG_DEBUG_BLK_CGROUP is not
|
||||
set (Section 4-2):
|
||||
- Intel i7-4850HQ: 400 KIOPS
|
||||
- AMD A8-3850: 250 KIOPS
|
||||
- ARM CortexTM-A53 Octa-core: 80 KIOPS
|
||||
@@ -566,3 +579,5 @@ applications. Unset this tunable if you need/want to control weights.
|
||||
Slightly extended version:
|
||||
http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite-
|
||||
results.pdf
|
||||
|
||||
[3] https://github.com/Algodev-github/S
|
||||
|
@@ -93,3 +93,7 @@ zoned=[0/1]: Default: 0
|
||||
|
||||
zone_size=[MB]: Default: 256
|
||||
Per zone size when exposed as a zoned block device. Must be a power of two.
|
||||
|
||||
zone_nr_conv=[nr_conv]: Default: 0
|
||||
The number of conventional zones to create when the block device is zoned. If
|
||||
zone_nr_conv >= nr_zones, it will be reduced to nr_zones - 1.
|
||||
|
@@ -85,8 +85,33 @@ Q: Can loops be supported in a safe way?
|
||||
A: It's not clear yet.
|
||||
|
||||
BPF developers are trying to find a way to
|
||||
support bounded loops where the verifier can guarantee that
|
||||
the program terminates in less than 4096 instructions.
|
||||
support bounded loops.
|
||||
|
||||
Q: What are the verifier limits?
|
||||
--------------------------------
|
||||
A: The only limit known to the user space is BPF_MAXINSNS (4096).
|
||||
It's the maximum number of instructions that the unprivileged bpf
|
||||
program can have. The verifier has various internal limits.
|
||||
Like the maximum number of instructions that can be explored during
|
||||
program analysis. Currently, that limit is set to 1 million.
|
||||
Which essentially means that the largest program can consist
|
||||
of 1 million NOP instructions. There is a limit to the maximum number
|
||||
of subsequent branches, a limit to the number of nested bpf-to-bpf
|
||||
calls, a limit to the number of the verifier states per instruction,
|
||||
a limit to the number of maps used by the program.
|
||||
All these limits can be hit with a sufficiently complex program.
|
||||
There are also non-numerical limits that can cause the program
|
||||
to be rejected. The verifier used to recognize only pointer + constant
|
||||
expressions. Now it can recognize pointer + bounded_register.
|
||||
bpf_lookup_map_elem(key) had a requirement that 'key' must be
|
||||
a pointer to the stack. Now, 'key' can be a pointer to map value.
|
||||
The verifier is steadily getting 'smarter'. The limits are
|
||||
being removed. The only way to know that the program is going to
|
||||
be accepted by the verifier is to try to load it.
|
||||
The bpf development process guarantees that the future kernel
|
||||
versions will accept all bpf programs that were accepted by
|
||||
the earlier versions.
|
||||
|
||||
|
||||
Instruction level questions
|
||||
---------------------------
|
||||
|
@@ -82,6 +82,8 @@ sequentially and type id is assigned to each recognized type starting from id
|
||||
#define BTF_KIND_RESTRICT 11 /* Restrict */
|
||||
#define BTF_KIND_FUNC 12 /* Function */
|
||||
#define BTF_KIND_FUNC_PROTO 13 /* Function Proto */
|
||||
#define BTF_KIND_VAR 14 /* Variable */
|
||||
#define BTF_KIND_DATASEC 15 /* Section */
|
||||
|
||||
Note that the type section encodes debug info, not just pure types.
|
||||
``BTF_KIND_FUNC`` is not a type, and it represents a defined subprogram.
|
||||
@@ -393,6 +395,61 @@ refers to parameter type.
|
||||
If the function has variable arguments, the last parameter is encoded with
|
||||
``name_off = 0`` and ``type = 0``.
|
||||
|
||||
2.2.14 BTF_KIND_VAR
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
``struct btf_type`` encoding requirement:
|
||||
* ``name_off``: offset to a valid C identifier
|
||||
* ``info.kind_flag``: 0
|
||||
* ``info.kind``: BTF_KIND_VAR
|
||||
* ``info.vlen``: 0
|
||||
* ``type``: the type of the variable
|
||||
|
||||
``btf_type`` is followed by a single ``struct btf_var`` with the
|
||||
following data::
|
||||
|
||||
struct btf_var {
|
||||
__u32 linkage;
|
||||
};
|
||||
|
||||
``struct btf_var`` encoding:
|
||||
* ``linkage``: currently only static variable 0, or globally allocated
|
||||
variable in ELF sections 1
|
||||
|
||||
Not all types of global variables are supported by LLVM at this point.
|
||||
The following is currently available:
|
||||
|
||||
* static variables with or without section attributes
|
||||
* global variables with section attributes
|
||||
|
||||
The latter is for future extraction of map key/value type id's from a
|
||||
map definition.
|
||||
|
||||
2.2.15 BTF_KIND_DATASEC
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
``struct btf_type`` encoding requirement:
|
||||
* ``name_off``: offset to a valid name associated with a variable or
|
||||
one of .data/.bss/.rodata
|
||||
* ``info.kind_flag``: 0
|
||||
* ``info.kind``: BTF_KIND_DATASEC
|
||||
* ``info.vlen``: # of variables
|
||||
* ``size``: total section size in bytes (0 at compilation time, patched
|
||||
to actual size by BPF loaders such as libbpf)
|
||||
|
||||
``btf_type`` is followed by ``info.vlen`` number of ``struct btf_var_secinfo``::
|
||||
|
||||
struct btf_var_secinfo {
|
||||
__u32 type;
|
||||
__u32 offset;
|
||||
__u32 size;
|
||||
};
|
||||
|
||||
``struct btf_var_secinfo`` encoding:
|
||||
* ``type``: the type of the BTF_KIND_VAR variable
|
||||
* ``offset``: the in-section offset of the variable
|
||||
* ``size``: the size of the variable in bytes
|
||||
|
||||
3. BTF Kernel API
|
||||
*****************
|
||||
|
||||
@@ -521,6 +578,7 @@ For line_info, the line number and column number are defined as below:
|
||||
#define BPF_LINE_INFO_LINE_COL(line_col) ((line_col) & 0x3ff)
|
||||
|
||||
3.4 BPF_{PROG,MAP}_GET_NEXT_ID
|
||||
==============================
|
||||
|
||||
In kernel, every loaded program, map or btf has a unique id. The id won't
|
||||
change during the lifetime of a program, map, or btf.
|
||||
@@ -530,6 +588,7 @@ each command, to user space, for bpf program or maps, respectively, so an
|
||||
inspection tool can inspect all programs and maps.
|
||||
|
||||
3.5 BPF_{PROG,MAP}_GET_FD_BY_ID
|
||||
===============================
|
||||
|
||||
An introspection tool cannot use id to get details about program or maps.
|
||||
A file descriptor needs to be obtained first for reference-counting purpose.
|
||||
|
@@ -36,6 +36,16 @@ Two sets of Questions and Answers (Q&A) are maintained.
|
||||
bpf_devel_QA
|
||||
|
||||
|
||||
Program types
|
||||
=============
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
prog_cgroup_sysctl
|
||||
prog_flow_dissector
|
||||
|
||||
|
||||
.. Links:
|
||||
.. _Documentation/networking/filter.txt: ../networking/filter.txt
|
||||
.. _man-pages: https://www.kernel.org/doc/man-pages/
|
||||
|
125
Documentation/bpf/prog_cgroup_sysctl.rst
Normal file
125
Documentation/bpf/prog_cgroup_sysctl.rst
Normal file
@@ -0,0 +1,125 @@
|
||||
.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
|
||||
|
||||
===========================
|
||||
BPF_PROG_TYPE_CGROUP_SYSCTL
|
||||
===========================
|
||||
|
||||
This document describes ``BPF_PROG_TYPE_CGROUP_SYSCTL`` program type that
|
||||
provides cgroup-bpf hook for sysctl.
|
||||
|
||||
The hook has to be attached to a cgroup and will be called every time a
|
||||
process inside that cgroup tries to read from or write to a sysctl knob in proc.
|
||||
|
||||
1. Attach type
|
||||
**************
|
||||
|
||||
``BPF_CGROUP_SYSCTL`` attach type has to be used to attach
|
||||
``BPF_PROG_TYPE_CGROUP_SYSCTL`` program to a cgroup.
|
||||
|
||||
2. Context
|
||||
**********
|
||||
|
||||
``BPF_PROG_TYPE_CGROUP_SYSCTL`` provides access to the following context from
|
||||
BPF program::
|
||||
|
||||
struct bpf_sysctl {
|
||||
__u32 write;
|
||||
__u32 file_pos;
|
||||
};
|
||||
|
||||
* ``write`` indicates whether sysctl value is being read (``0``) or written
|
||||
(``1``). This field is read-only.
|
||||
|
||||
* ``file_pos`` indicates file position sysctl is being accessed at, read
|
||||
or written. This field is read-write. Writing to the field sets the starting
|
||||
position in sysctl proc file ``read(2)`` will be reading from or ``write(2)``
|
||||
will be writing to. Writing zero to the field can be used e.g. to override
|
||||
whole sysctl value by ``bpf_sysctl_set_new_value()`` on ``write(2)`` even
|
||||
when it's called by user space on ``file_pos > 0``. Writing non-zero
|
||||
value to the field can be used to access part of sysctl value starting from
|
||||
specified ``file_pos``. Not all sysctl support access with ``file_pos !=
|
||||
0``, e.g. writes to numeric sysctl entries must always be at file position
|
||||
``0``. See also ``kernel.sysctl_writes_strict`` sysctl.
|
||||
|
||||
See `linux/bpf.h`_ for more details on how context field can be accessed.
|
||||
|
||||
3. Return code
|
||||
**************
|
||||
|
||||
``BPF_PROG_TYPE_CGROUP_SYSCTL`` program must return one of the following
|
||||
return codes:
|
||||
|
||||
* ``0`` means "reject access to sysctl";
|
||||
* ``1`` means "proceed with access".
|
||||
|
||||
If program returns ``0`` user space will get ``-1`` from ``read(2)`` or
|
||||
``write(2)`` and ``errno`` will be set to ``EPERM``.
|
||||
|
||||
4. Helpers
|
||||
**********
|
||||
|
||||
Since sysctl knob is represented by a name and a value, sysctl specific BPF
|
||||
helpers focus on providing access to these properties:
|
||||
|
||||
* ``bpf_sysctl_get_name()`` to get sysctl name as it is visible in
|
||||
``/proc/sys`` into a buffer provided by the BPF program;
|
||||
|
||||
* ``bpf_sysctl_get_current_value()`` to get string value currently held by
|
||||
sysctl into a buffer provided by the BPF program. This helper is available on both
|
||||
``read(2)`` from and ``write(2)`` to sysctl;
|
||||
|
||||
* ``bpf_sysctl_get_new_value()`` to get new string value currently being
|
||||
written to sysctl before actual write happens. This helper can be used only
|
||||
on ``ctx->write == 1``;
|
||||
|
||||
* ``bpf_sysctl_set_new_value()`` to override new string value currently being
|
||||
written to sysctl before actual write happens. Sysctl value will be
|
||||
overridden starting from the current ``ctx->file_pos``. If the whole value
|
||||
has to be overridden BPF program can set ``file_pos`` to zero before calling
|
||||
to the helper. This helper can be used only on ``ctx->write == 1``. New
|
||||
string value set by the helper is treated and verified by kernel same way as
|
||||
an equivalent string passed by user space.
|
||||
|
||||
BPF program sees sysctl value same way as user space does in proc filesystem,
|
||||
i.e. as a string. Since many sysctl values represent an integer or a vector
|
||||
of integers, the following helpers can be used to get numeric value from the
|
||||
string:
|
||||
|
||||
* ``bpf_strtol()`` to convert initial part of the string to long integer
|
||||
similar to user space `strtol(3)`_;
|
||||
* ``bpf_strtoul()`` to convert initial part of the string to unsigned long
|
||||
integer similar to user space `strtoul(3)`_;
|
||||
|
||||
See `linux/bpf.h`_ for more details on helpers described here.
|
||||
|
||||
5. Examples
|
||||
***********
|
||||
|
||||
See `test_sysctl_prog.c`_ for an example of a BPF program in C that accesses
|
||||
sysctl name and value, parses string value to get vector of integers and uses
|
||||
the result to make decision whether to allow or deny access to sysctl.
|
||||
|
||||
6. Notes
|
||||
********
|
||||
|
||||
``BPF_PROG_TYPE_CGROUP_SYSCTL`` is intended to be used in **trusted** root
|
||||
environment, for example to monitor sysctl usage or catch unreasonable values
|
||||
an application, running as root in a separate cgroup, is trying to set.
|
||||
|
||||
Since `task_dfl_cgroup(current)` is called at `sys_read` / `sys_write` time it
|
||||
may return results different from that at `sys_open` time, i.e. process that
|
||||
opened sysctl file in proc filesystem may differ from process that is trying
|
||||
to read from / write to it and two such processes may run in different
|
||||
cgroups, which means ``BPF_PROG_TYPE_CGROUP_SYSCTL`` should not be used as a
|
||||
security mechanism to limit sysctl usage.
|
||||
|
||||
As with any cgroup-bpf program additional care should be taken if an
|
||||
application running as root in a cgroup should not be allowed to
|
||||
detach/replace BPF program attached by administrator.
|
||||
|
||||
.. Links
|
||||
.. _linux/bpf.h: ../../include/uapi/linux/bpf.h
|
||||
.. _strtol(3): http://man7.org/linux/man-pages/man3/strtol.3p.html
|
||||
.. _strtoul(3): http://man7.org/linux/man-pages/man3/strtoul.3p.html
|
||||
.. _test_sysctl_prog.c:
|
||||
../../tools/testing/selftests/bpf/progs/test_sysctl_prog.c
|
@@ -1,8 +1,8 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
==================
|
||||
BPF Flow Dissector
|
||||
==================
|
||||
============================
|
||||
BPF_PROG_TYPE_FLOW_DISSECTOR
|
||||
============================
|
||||
|
||||
Overview
|
||||
========
|
@@ -1,5 +1,7 @@
|
||||
Clearing WARN_ONCE
|
||||
------------------
|
||||
|
||||
WARN_ONCE / WARN_ON_ONCE only print a warning once.
|
||||
WARN_ONCE / WARN_ON_ONCE / printk_once only emit a message once.
|
||||
|
||||
echo 1 > /sys/kernel/debug/clear_warn_once
|
||||
|
||||
|
@@ -101,16 +101,6 @@ changes occur:
|
||||
translations for software managed TLB configurations.
|
||||
The sparc64 port currently does this.
|
||||
|
||||
6) ``void tlb_migrate_finish(struct mm_struct *mm)``
|
||||
|
||||
This interface is called at the end of an explicit
|
||||
process migration. This interface provides a hook
|
||||
to allow a platform to update TLB or context-specific
|
||||
information for the address space.
|
||||
|
||||
The ia64 sn2 platform is one example of a platform
|
||||
that uses this interface.
|
||||
|
||||
Next, we have the cache flushing interfaces. In general, when Linux
|
||||
is changing an existing virtual-->physical mapping to a new value,
|
||||
the sequence will be in one of the following forms::
|
||||
|
@@ -22,7 +22,6 @@ Core utilities
|
||||
workqueue
|
||||
genericirq
|
||||
xarray
|
||||
flexible-arrays
|
||||
librs
|
||||
genalloc
|
||||
errseq
|
||||
|
@@ -147,10 +147,10 @@ Division Functions
|
||||
.. kernel-doc:: include/linux/math64.h
|
||||
:internal:
|
||||
|
||||
.. kernel-doc:: lib/div64.c
|
||||
.. kernel-doc:: lib/math/div64.c
|
||||
:functions: div_s64_rem div64_u64_rem div64_u64 div64_s64
|
||||
|
||||
.. kernel-doc:: lib/gcd.c
|
||||
.. kernel-doc:: lib/math/gcd.c
|
||||
:export:
|
||||
|
||||
UUID/GUID
|
||||
|
@@ -58,6 +58,14 @@ A raw pointer value may be printed with %p which will hash the address
|
||||
before printing. The kernel also supports extended specifiers for printing
|
||||
pointers of different types.
|
||||
|
||||
Some of the extended specifiers print the data on the given address instead
|
||||
of printing the address itself. In this case, the following error messages
|
||||
might be printed instead of the unreachable information::
|
||||
|
||||
(null) data on plain NULL address
|
||||
(efault) data on invalid address
|
||||
(einval) invalid data on a valid address
|
||||
|
||||
Plain Pointers
|
||||
--------------
|
||||
|
||||
|
@@ -3,79 +3,79 @@ How CPU topology info is exported via sysfs
|
||||
===========================================
|
||||
|
||||
Export CPU topology info via sysfs. Items (attributes) are similar
|
||||
to /proc/cpuinfo output of some architectures:
|
||||
to /proc/cpuinfo output of some architectures. They reside in
|
||||
/sys/devices/system/cpu/cpuX/topology/:
|
||||
|
||||
1) /sys/devices/system/cpu/cpuX/topology/physical_package_id:
|
||||
physical_package_id:
|
||||
|
||||
physical package id of cpuX. Typically corresponds to a physical
|
||||
socket number, but the actual value is architecture and platform
|
||||
dependent.
|
||||
|
||||
2) /sys/devices/system/cpu/cpuX/topology/core_id:
|
||||
core_id:
|
||||
|
||||
the CPU core ID of cpuX. Typically it is the hardware platform's
|
||||
identifier (rather than the kernel's). The actual value is
|
||||
architecture and platform dependent.
|
||||
|
||||
3) /sys/devices/system/cpu/cpuX/topology/book_id:
|
||||
book_id:
|
||||
|
||||
the book ID of cpuX. Typically it is the hardware platform's
|
||||
identifier (rather than the kernel's). The actual value is
|
||||
architecture and platform dependent.
|
||||
|
||||
4) /sys/devices/system/cpu/cpuX/topology/drawer_id:
|
||||
drawer_id:
|
||||
|
||||
the drawer ID of cpuX. Typically it is the hardware platform's
|
||||
identifier (rather than the kernel's). The actual value is
|
||||
architecture and platform dependent.
|
||||
|
||||
5) /sys/devices/system/cpu/cpuX/topology/thread_siblings:
|
||||
thread_siblings:
|
||||
|
||||
internal kernel map of cpuX's hardware threads within the same
|
||||
core as cpuX.
|
||||
|
||||
6) /sys/devices/system/cpu/cpuX/topology/thread_siblings_list:
|
||||
thread_siblings_list:
|
||||
|
||||
human-readable list of cpuX's hardware threads within the same
|
||||
core as cpuX.
|
||||
|
||||
7) /sys/devices/system/cpu/cpuX/topology/core_siblings:
|
||||
core_siblings:
|
||||
|
||||
internal kernel map of cpuX's hardware threads within the same
|
||||
physical_package_id.
|
||||
|
||||
8) /sys/devices/system/cpu/cpuX/topology/core_siblings_list:
|
||||
core_siblings_list:
|
||||
|
||||
human-readable list of cpuX's hardware threads within the same
|
||||
physical_package_id.
|
||||
|
||||
9) /sys/devices/system/cpu/cpuX/topology/book_siblings:
|
||||
book_siblings:
|
||||
|
||||
internal kernel map of cpuX's hardware threads within the same
|
||||
book_id.
|
||||
|
||||
10) /sys/devices/system/cpu/cpuX/topology/book_siblings_list:
|
||||
book_siblings_list:
|
||||
|
||||
human-readable list of cpuX's hardware threads within the same
|
||||
book_id.
|
||||
|
||||
11) /sys/devices/system/cpu/cpuX/topology/drawer_siblings:
|
||||
drawer_siblings:
|
||||
|
||||
internal kernel map of cpuX's hardware threads within the same
|
||||
drawer_id.
|
||||
|
||||
12) /sys/devices/system/cpu/cpuX/topology/drawer_siblings_list:
|
||||
drawer_siblings_list:
|
||||
|
||||
human-readable list of cpuX's hardware threads within the same
|
||||
drawer_id.
|
||||
|
||||
To implement it in an architecture-neutral way, a new source file,
|
||||
drivers/base/topology.c, is to export the 6 to 12 attributes. The book
|
||||
and drawer related sysfs files will only be created if CONFIG_SCHED_BOOK
|
||||
and CONFIG_SCHED_DRAWER are selected.
|
||||
Architecture-neutral, drivers/base/topology.c, exports these attributes.
|
||||
However, the book and drawer related sysfs files will only be created if
|
||||
CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are selected, respectively.
|
||||
|
||||
CONFIG_SCHED_BOOK and CONFIG_DRAWER are currently only used on s390, where
|
||||
they reflect the cpu and cache hierarchy.
|
||||
CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are currently only used on s390,
|
||||
where they reflect the cpu and cache hierarchy.
|
||||
|
||||
For an architecture to support this feature, it must define some of
|
||||
these macros in include/asm-XXX/topology.h::
|
||||
@@ -98,10 +98,10 @@ To be consistent on all architectures, include/linux/topology.h
|
||||
provides default definitions for any of the above macros that are
|
||||
not defined by include/asm-XXX/topology.h:
|
||||
|
||||
1) physical_package_id: -1
|
||||
2) core_id: 0
|
||||
3) sibling_cpumask: just the given CPU
|
||||
4) core_cpumask: just the given CPU
|
||||
1) topology_physical_package_id: -1
|
||||
2) topology_core_id: 0
|
||||
3) topology_sibling_cpumask: just the given CPU
|
||||
4) topology_core_cpumask: just the given CPU
|
||||
|
||||
For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
|
||||
default definitions for topology_book_id() and topology_book_cpumask().
|
||||
|
@@ -133,7 +133,6 @@ Code Example For Use of Operational State Memory With SHASH
|
||||
if (!sdesc)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
sdesc->shash.tfm = alg;
|
||||
sdesc->shash.flags = 0x0;
|
||||
return sdesc;
|
||||
}
|
||||
|
||||
|
@@ -34,10 +34,6 @@ Configure the kernel with::
|
||||
CONFIG_DEBUG_FS=y
|
||||
CONFIG_GCOV_KERNEL=y
|
||||
|
||||
select the gcc's gcov format, default is autodetect based on gcc version::
|
||||
|
||||
CONFIG_GCOV_FORMAT_AUTODETECT=y
|
||||
|
||||
and to get coverage data for the entire kernel::
|
||||
|
||||
CONFIG_GCOV_PROFILE_ALL=y
|
||||
@@ -169,6 +165,20 @@ b) gcov is run on the BUILD machine
|
||||
[user@build] gcov -o /tmp/coverage/tmp/out/init main.c
|
||||
|
||||
|
||||
Note on compilers
|
||||
-----------------
|
||||
|
||||
GCC and LLVM gcov tools are not necessarily compatible. Use gcov_ to work with
|
||||
GCC-generated .gcno and .gcda files, and use llvm-cov_ for Clang.
|
||||
|
||||
.. _gcov: http://gcc.gnu.org/onlinedocs/gcc/Gcov.html
|
||||
.. _llvm-cov: https://llvm.org/docs/CommandGuide/llvm-cov.html
|
||||
|
||||
Build differences between GCC and Clang gcov are handled by Kconfig. It
|
||||
automatically selects the appropriate gcov format depending on the detected
|
||||
toolchain.
|
||||
|
||||
|
||||
Troubleshooting
|
||||
---------------
|
||||
|
||||
|
@@ -7,6 +7,11 @@ directory. These are intended to be small tests to exercise individual code
|
||||
paths in the kernel. Tests are intended to be run after building, installing
|
||||
and booting a kernel.
|
||||
|
||||
You can find additional information on the Kselftest framework and how to
|
||||
write new tests using the framework on Kselftest wiki:
|
||||
|
||||
https://kselftest.wiki.kernel.org/
|
||||
|
||||
On some systems, hot-plug tests could hang forever waiting for cpu and
|
||||
memory to be ready to be offlined. A special hot-plug target is created
|
||||
to run the full range of hot-plug tests. In default mode, hot-plug tests run
|
||||
@@ -14,6 +19,10 @@ in safe mode with a limited scope. In limited mode, cpu-hotplug test is
|
||||
run on a single cpu as opposed to all hotplug capable cpus, and memory
|
||||
hotplug test is run on 2% of hotplug capable memory instead of 10%.
|
||||
|
||||
kselftest runs as a userspace process. Tests that can be written/run in
|
||||
userspace may wish to use the `Test Harness`_. Tests that need to be
|
||||
run in kernel space may wish to use a `Test Module`_.
|
||||
|
||||
Running the selftests (hotplug tests are run in limited mode)
|
||||
=============================================================
|
||||
|
||||
@@ -31,17 +40,32 @@ To build and run the tests with a single command, use::
|
||||
|
||||
Note that some tests will require root privileges.
|
||||
|
||||
Build and run from user specific object directory (make O=dir)::
|
||||
Kselftest supports saving output files in a separate directory and then
|
||||
running tests. To locate output files in a separate directory two syntaxes
|
||||
are supported. In both cases the working directory must be the root of the
|
||||
kernel src. This is applicable to "Running a subset of selftests" section
|
||||
below.
|
||||
|
||||
To build, save output files in a separate directory with O= ::
|
||||
|
||||
$ make O=/tmp/kselftest kselftest
|
||||
|
||||
Build and run KBUILD_OUTPUT directory (make KBUILD_OUTPUT=)::
|
||||
To build, save output files in a separate directory with KBUILD_OUTPUT ::
|
||||
|
||||
$ make KBUILD_OUTPUT=/tmp/kselftest kselftest
|
||||
$ export KBUILD_OUTPUT=/tmp/kselftest; make kselftest
|
||||
|
||||
The above commands run the tests and print pass/fail summary to make it
|
||||
easier to understand the test results. Please find the detailed individual
|
||||
test results for each test in /tmp/testname file(s).
|
||||
The O= assignment takes precedence over the KBUILD_OUTPUT environment
|
||||
variable.
|
||||
|
||||
The above commands by default run the tests and print full pass/fail report.
|
||||
Kselftest supports "summary" option to make it easier to understand the test
|
||||
results. Please find the detailed individual test results for each test in
|
||||
/tmp/testname file(s) when summary option is specified. This is applicable
|
||||
to "Running a subset of selftests" section below.
|
||||
|
||||
To run kselftest with summary option enabled ::
|
||||
|
||||
$ make summary=1 kselftest
|
||||
|
||||
Running a subset of selftests
|
||||
=============================
|
||||
@@ -57,17 +81,13 @@ You can specify multiple tests to build and run::
|
||||
|
||||
$ make TARGETS="size timers" kselftest
|
||||
|
||||
Build and run from user specific object directory (make O=dir)::
|
||||
To build, save output files in a separate directory with O= ::
|
||||
|
||||
$ make O=/tmp/kselftest TARGETS="size timers" kselftest
|
||||
|
||||
Build and run KBUILD_OUTPUT directory (make KBUILD_OUTPUT=)::
|
||||
To build, save output files in a separate directory with KBUILD_OUTPUT ::
|
||||
|
||||
$ make KBUILD_OUTPUT=/tmp/kselftest TARGETS="size timers" kselftest
|
||||
|
||||
The above commands run the tests and print pass/fail summary to make it
|
||||
easier to understand the test results. Please find the detailed individual
|
||||
test results for each test in /tmp/testname file(s).
|
||||
$ export KBUILD_OUTPUT=/tmp/kselftest; make TARGETS="size timers" kselftest
|
||||
|
||||
See the top-level tools/testing/selftests/Makefile for the list of all
|
||||
possible targets.
|
||||
@@ -161,11 +181,97 @@ Contributing new tests (details)
|
||||
|
||||
e.g: tools/testing/selftests/android/config
|
||||
|
||||
Test Module
|
||||
===========
|
||||
|
||||
Kselftest tests the kernel from userspace. Sometimes things need
|
||||
testing from within the kernel; one method of doing this is to create a
|
||||
test module. We can tie the module into the kselftest framework by
|
||||
using a shell script test runner. ``kselftest_module.sh`` is designed
|
||||
to facilitate this process. There is also a header file provided to
|
||||
assist writing kernel modules that are for use with kselftest:
|
||||
|
||||
- ``tools/testing/selftests/kselftest_module.h``
|
||||
- ``tools/testing/selftests/kselftest_module.sh``
|
||||
|
||||
How to use
|
||||
----------
|
||||
|
||||
Here we show the typical steps to create a test module and tie it into
|
||||
kselftest. We use kselftests for lib/ as an example.
|
||||
|
||||
1. Create the test module
|
||||
|
||||
2. Create the test script that will run (load/unload) the module
|
||||
e.g. ``tools/testing/selftests/lib/printf.sh``
|
||||
|
||||
3. Add line to config file e.g. ``tools/testing/selftests/lib/config``
|
||||
|
||||
4. Add test script to makefile e.g. ``tools/testing/selftests/lib/Makefile``
|
||||
|
||||
5. Verify it works:
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
# Assumes you have booted a fresh build of this kernel tree
|
||||
cd /path/to/linux/tree
|
||||
make kselftest-merge
|
||||
make modules
|
||||
sudo make modules_install
|
||||
make TARGETS=lib kselftest
|
||||
|
||||
Example Module
|
||||
--------------
|
||||
|
||||
A bare bones test module might look like this:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
// SPDX-License-Identifier: GPL-2.0+
|
||||
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
|
||||
#include "../tools/testing/selftests/kselftest_module.h"
|
||||
|
||||
KSTM_MODULE_GLOBALS();
|
||||
|
||||
/*
|
||||
* Kernel module for testing the foobinator
|
||||
*/
|
||||
|
||||
static int __init test_function()
|
||||
{
|
||||
...
|
||||
}
|
||||
|
||||
static void __init selftest(void)
|
||||
{
|
||||
KSTM_CHECK_ZERO(do_test_case("", 0));
|
||||
}
|
||||
|
||||
KSTM_MODULE_LOADERS(test_foo);
|
||||
MODULE_AUTHOR("John Developer <jd@fooman.org>");
|
||||
MODULE_LICENSE("GPL");
|
||||
|
||||
Example test script
|
||||
-------------------
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
#!/bin/bash
|
||||
# SPDX-License-Identifier: GPL-2.0+
|
||||
$(dirname $0)/../kselftest_module.sh "foo" test_foo
|
||||
|
||||
|
||||
Test Harness
|
||||
============
|
||||
|
||||
The kselftest_harness.h file contains useful helpers to build tests. The tests
|
||||
from tools/testing/selftests/seccomp/seccomp_bpf.c can be used as example.
|
||||
The kselftest_harness.h file contains useful helpers to build tests. The
|
||||
test harness is for userspace testing, for kernel space testing see `Test
|
||||
Module`_ above.
|
||||
|
||||
The tests from tools/testing/selftests/seccomp/seccomp_bpf.c can be used as
|
||||
example.
|
||||
|
||||
Example
|
||||
-------
|
||||
|
272
Documentation/device-mapper/dm-dust.txt
Normal file
272
Documentation/device-mapper/dm-dust.txt
Normal file
@@ -0,0 +1,272 @@
|
||||
dm-dust
|
||||
=======
|
||||
|
||||
This target emulates the behavior of bad sectors at arbitrary
|
||||
locations, and the ability to enable the emulation of the failures
|
||||
at an arbitrary time.
|
||||
|
||||
This target behaves similarly to a linear target. At a given time,
|
||||
the user can send a message to the target to start failing read
|
||||
requests on specific blocks (to emulate the behavior of a hard disk
|
||||
drive with bad sectors).
|
||||
|
||||
When the failure behavior is enabled (i.e.: when the output of
|
||||
"dmsetup status" displays "fail_read_on_bad_block"), reads of blocks
|
||||
in the "bad block list" will fail with EIO ("Input/output error").
|
||||
|
||||
Writes of blocks in the "bad block list" will result in the following:
|
||||
|
||||
1. Remove the block from the "bad block list".
|
||||
2. Successfully complete the write.
|
||||
|
||||
This emulates the "remapped sector" behavior of a drive with bad
|
||||
sectors.
|
||||
|
||||
Normally, a drive that is encountering bad sectors will most likely
|
||||
encounter more bad sectors, at an unknown time or location.
|
||||
With dm-dust, the user can use the "addbadblock" and "removebadblock"
|
||||
messages to add arbitrary bad blocks at new locations, and the
|
||||
"enable" and "disable" messages to modulate the state of whether the
|
||||
configured "bad blocks" will be treated as bad, or bypassed.
|
||||
This allows the pre-writing of test data and metadata prior to
|
||||
simulating a "failure" event where bad sectors start to appear.
|
||||
|
||||
Table parameters:
|
||||
-----------------
|
||||
<device_path> <offset> <blksz>
|
||||
|
||||
Mandatory parameters:
|
||||
<device_path>: path to the block device.
|
||||
<offset>: offset to data area from start of device_path
|
||||
<blksz>: block size in bytes
|
||||
(minimum 512, maximum 1073741824, must be a power of 2)
|
||||
|
||||
Usage instructions:
|
||||
-------------------
|
||||
|
||||
First, find the size (in 512-byte sectors) of the device to be used:
|
||||
|
||||
$ sudo blockdev --getsz /dev/vdb1
|
||||
33552384
|
||||
|
||||
Create the dm-dust device:
|
||||
(For a device with a block size of 512 bytes)
|
||||
$ sudo dmsetup create dust1 --table '0 33552384 dust /dev/vdb1 0 512'
|
||||
|
||||
(For a device with a block size of 4096 bytes)
|
||||
$ sudo dmsetup create dust1 --table '0 33552384 dust /dev/vdb1 0 4096'
|
||||
|
||||
Check the status of the read behavior ("bypass" indicates that all I/O
|
||||
will be passed through to the underlying device):
|
||||
$ sudo dmsetup status dust1
|
||||
0 33552384 dust 252:17 bypass
|
||||
|
||||
$ sudo dd if=/dev/mapper/dust1 of=/dev/null bs=512 count=128 iflag=direct
|
||||
128+0 records in
|
||||
128+0 records out
|
||||
|
||||
$ sudo dd if=/dev/zero of=/dev/mapper/dust1 bs=512 count=128 oflag=direct
|
||||
128+0 records in
|
||||
128+0 records out
|
||||
|
||||
Adding and removing bad blocks:
|
||||
-------------------------------
|
||||
|
||||
At any time (i.e.: whether the device has the "bad block" emulation
|
||||
enabled or disabled), bad blocks may be added or removed from the
|
||||
device via the "addbadblock" and "removebadblock" messages:
|
||||
|
||||
$ sudo dmsetup message dust1 0 addbadblock 60
|
||||
kernel: device-mapper: dust: badblock added at block 60
|
||||
|
||||
$ sudo dmsetup message dust1 0 addbadblock 67
|
||||
kernel: device-mapper: dust: badblock added at block 67
|
||||
|
||||
$ sudo dmsetup message dust1 0 addbadblock 72
|
||||
kernel: device-mapper: dust: badblock added at block 72
|
||||
|
||||
These bad blocks will be stored in the "bad block list".
|
||||
While the device is in "bypass" mode, reads and writes will succeed:
|
||||
|
||||
$ sudo dmsetup status dust1
|
||||
0 33552384 dust 252:17 bypass
|
||||
|
||||
Enabling block read failures:
|
||||
-----------------------------
|
||||
|
||||
To enable the "fail read on bad block" behavior, send the "enable" message:
|
||||
|
||||
$ sudo dmsetup message dust1 0 enable
|
||||
kernel: device-mapper: dust: enabling read failures on bad sectors
|
||||
|
||||
$ sudo dmsetup status dust1
|
||||
0 33552384 dust 252:17 fail_read_on_bad_block
|
||||
|
||||
With the device in "fail read on bad block" mode, attempting to read a
|
||||
block will encounter an "Input/output error":
|
||||
|
||||
$ sudo dd if=/dev/mapper/dust1 of=/dev/null bs=512 count=1 skip=67 iflag=direct
|
||||
dd: error reading '/dev/mapper/dust1': Input/output error
|
||||
0+0 records in
|
||||
0+0 records out
|
||||
0 bytes copied, 0.00040651 s, 0.0 kB/s
|
||||
|
||||
...and writing to the bad blocks will remove the blocks from the list,
|
||||
therefore emulating the "remap" behavior of hard disk drives:
|
||||
|
||||
$ sudo dd if=/dev/zero of=/dev/mapper/dust1 bs=512 count=128 oflag=direct
|
||||
128+0 records in
|
||||
128+0 records out
|
||||
|
||||
kernel: device-mapper: dust: block 60 removed from badblocklist by write
|
||||
kernel: device-mapper: dust: block 67 removed from badblocklist by write
|
||||
kernel: device-mapper: dust: block 72 removed from badblocklist by write
|
||||
kernel: device-mapper: dust: block 87 removed from badblocklist by write
|
||||
|
||||
Bad block add/remove error handling:
|
||||
------------------------------------
|
||||
|
||||
Attempting to add a bad block that already exists in the list will
|
||||
result in an "Invalid argument" error, as well as a helpful message:
|
||||
|
||||
$ sudo dmsetup message dust1 0 addbadblock 88
|
||||
device-mapper: message ioctl on dust1 failed: Invalid argument
|
||||
kernel: device-mapper: dust: block 88 already in badblocklist
|
||||
|
||||
Attempting to remove a bad block that doesn't exist in the list will
|
||||
result in an "Invalid argument" error, as well as a helpful message:
|
||||
|
||||
$ sudo dmsetup message dust1 0 removebadblock 87
|
||||
device-mapper: message ioctl on dust1 failed: Invalid argument
|
||||
kernel: device-mapper: dust: block 87 not found in badblocklist
|
||||
|
||||
Counting the number of bad blocks in the bad block list:
|
||||
--------------------------------------------------------
|
||||
|
||||
To count the number of bad blocks configured in the device, run the
|
||||
following message command:
|
||||
|
||||
$ sudo dmsetup message dust1 0 countbadblocks
|
||||
|
||||
A message will print with the number of bad blocks currently
|
||||
configured on the device:
|
||||
|
||||
kernel: device-mapper: dust: countbadblocks: 895 badblock(s) found
|
||||
|
||||
Querying for specific bad blocks:
|
||||
---------------------------------
|
||||
|
||||
To find out if a specific block is in the bad block list, run the
|
||||
following message command:
|
||||
|
||||
$ sudo dmsetup message dust1 0 queryblock 72
|
||||
|
||||
The following message will print if the block is in the list:
|
||||
device-mapper: dust: queryblock: block 72 found in badblocklist
|
||||
|
||||
The following message will print if the block is not in the list:
|
||||
device-mapper: dust: queryblock: block 72 not found in badblocklist
|
||||
|
||||
The "queryblock" message command will work in both the "enabled"
|
||||
and "disabled" modes, allowing the verification of whether a block
|
||||
will be treated as "bad" without having to issue I/O to the device,
|
||||
or having to "enable" the bad block emulation.
|
||||
|
||||
Clearing the bad block list:
|
||||
----------------------------
|
||||
|
||||
To clear the bad block list (without needing to individually run
|
||||
a "removebadblock" message command for every block), run the
|
||||
following message command:
|
||||
|
||||
$ sudo dmsetup message dust1 0 clearbadblocks
|
||||
|
||||
After clearing the bad block list, the following message will appear:
|
||||
|
||||
kernel: device-mapper: dust: clearbadblocks: badblocks cleared
|
||||
|
||||
If there were no bad blocks to clear, the following message will
|
||||
appear:
|
||||
|
||||
kernel: device-mapper: dust: clearbadblocks: no badblocks found
|
||||
|
||||
Message commands list:
|
||||
----------------------
|
||||
|
||||
Below is a list of the messages that can be sent to a dust device:
|
||||
|
||||
Operations on blocks (requires a <blknum> argument):
|
||||
|
||||
addbadblock <blknum>
|
||||
queryblock <blknum>
|
||||
removebadblock <blknum>
|
||||
|
||||
...where <blknum> is a block number within range of the device
|
||||
(corresponding to the block size of the device.)
|
||||
|
||||
Single argument message commands:
|
||||
|
||||
countbadblocks
|
||||
clearbadblocks
|
||||
disable
|
||||
enable
|
||||
quiet
|
||||
|
||||
Device removal:
|
||||
---------------
|
||||
|
||||
When finished, remove the device via the "dmsetup remove" command:
|
||||
|
||||
$ sudo dmsetup remove dust1
|
||||
|
||||
Quiet mode:
|
||||
-----------
|
||||
|
||||
On test runs with many bad blocks, it may be desirable to avoid
|
||||
excessive logging (from bad blocks added, removed, or "remapped").
|
||||
This can be done by enabling "quiet mode" via the following message:
|
||||
|
||||
$ sudo dmsetup message dust1 0 quiet
|
||||
|
||||
This will suppress log messages from add / remove / removed by write
|
||||
operations. Log messages from "countbadblocks" or "queryblock"
|
||||
message commands will still print in quiet mode.
|
||||
|
||||
The status of quiet mode can be seen by running "dmsetup status":
|
||||
|
||||
$ sudo dmsetup status dust1
|
||||
0 33552384 dust 252:17 fail_read_on_bad_block quiet
|
||||
|
||||
To disable quiet mode, send the "quiet" message again:
|
||||
|
||||
$ sudo dmsetup message dust1 0 quiet
|
||||
|
||||
$ sudo dmsetup status dust1
|
||||
0 33552384 dust 252:17 fail_read_on_bad_block verbose
|
||||
|
||||
(The presence of "verbose" indicates normal logging.)
|
||||
|
||||
"Why not...?"
|
||||
-------------
|
||||
|
||||
scsi_debug has a "medium error" mode that can fail reads on one
|
||||
specified sector (sector 0x1234, hardcoded in the source code), but
|
||||
it uses RAM for the persistent storage, which drastically decreases
|
||||
the potential device size.
|
||||
|
||||
dm-flakey fails all I/O from all block locations at a specified time
|
||||
frequency, and not a given point in time.
|
||||
|
||||
When a bad sector occurs on a hard disk drive, reads to that sector
|
||||
are failed by the device, usually resulting in an error code of EIO
|
||||
("I/O error") or ENODATA ("No data available"). However, a write to
|
||||
the sector may succeed, and result in the sector becoming readable
|
||||
after the device controller no longer experiences errors reading the
|
||||
sector (or after a reallocation of the sector). However, there may
|
||||
be bad sectors that occur on the device in the future, in a different,
|
||||
unpredictable location.
|
||||
|
||||
This target seeks to provide a device that can exhibit the behavior
|
||||
of a bad sector at a known sector location, at a known time, based
|
||||
on a large storage device (at least tens of gigabytes, not occupying
|
||||
system memory).
|
@@ -21,6 +21,13 @@ mode it calculates and verifies the integrity tag internally. In this
|
||||
mode, the dm-integrity target can be used to detect silent data
|
||||
corruption on the disk or in the I/O path.
|
||||
|
||||
There's an alternate mode of operation where dm-integrity uses bitmap
|
||||
instead of a journal. If a bit in the bitmap is 1, the corresponding
|
||||
region's data and integrity tags are not synchronized - if the machine
|
||||
crashes, the unsynchronized regions will be recalculated. The bitmap mode
|
||||
is faster than the journal mode, because we don't have to write the data
|
||||
twice, but it is also less reliable, because if data corruption happens
|
||||
when the machine crashes, it may not be detected.
|
||||
|
||||
When loading the target for the first time, the kernel driver will format
|
||||
the device. But it will only format the device if the superblock contains
|
||||
@@ -59,6 +66,10 @@ Target arguments:
|
||||
either both data and tag or none of them are written. The
|
||||
journaled mode halves write throughput because the
|
||||
data have to be written twice.
|
||||
B - bitmap mode - data and metadata are written without any
|
||||
synchronization, the driver maintains a bitmap of dirty
|
||||
regions where data and metadata don't match. This mode can
|
||||
only be used with internal hash.
|
||||
R - recovery mode - in this mode, journal is not replayed,
|
||||
checksums are not checked and writes to the device are not
|
||||
allowed. This mode is useful for data recovery if the
|
||||
@@ -79,6 +90,10 @@ interleave_sectors:number
|
||||
a power of two. If the device is already formatted, the value from
|
||||
the superblock is used.
|
||||
|
||||
meta_device:device
|
||||
Don't interleave the data and metadata on the device. Use a
|
||||
separate device for metadata.
|
||||
|
||||
buffer_sectors:number
|
||||
The number of sectors in one buffer. The value is rounded down to
|
||||
a power of two.
|
||||
@@ -146,6 +161,15 @@ block_size:number
|
||||
Supported values are 512, 1024, 2048 and 4096 bytes. If not
|
||||
specified the default block size is 512 bytes.
|
||||
|
||||
sectors_per_bit:number
|
||||
In the bitmap mode, this parameter specifies the number of
|
||||
512-byte sectors that corresponds to one bitmap bit.
|
||||
|
||||
bitmap_flush_interval:number
|
||||
The bitmap flush interval in milliseconds. The metadata buffers
|
||||
are synchronized when this interval expires.
|
||||
|
||||
|
||||
The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can
|
||||
be changed when reloading the target (load an inactive table and swap the
|
||||
tables with suspend and resume). The other arguments should not be changed
|
||||
@@ -167,7 +191,13 @@ The layout of the formatted block device:
|
||||
provides (i.e. the size of the device minus the size of all
|
||||
metadata and padding). The user of this target should not send
|
||||
bios that access data beyond the "provided data sectors" limit.
|
||||
* flags - a flag is set if journal_mac is used
|
||||
* flags
|
||||
SB_FLAG_HAVE_JOURNAL_MAC - a flag is set if journal_mac is used
|
||||
SB_FLAG_RECALCULATING - recalculating is in progress
|
||||
SB_FLAG_DIRTY_BITMAP - journal area contains the bitmap of dirty
|
||||
blocks
|
||||
* log2(sectors per block)
|
||||
* a position where recalculating finished
|
||||
* journal
|
||||
The journal is divided into sections, each section contains:
|
||||
* metadata area (4kiB), it contains journal entries
|
||||
|
@@ -11,3 +11,15 @@ Example:
|
||||
reg = <0xffd08000 0x1000>;
|
||||
cpu1-start-addr = <0xffd080c4>;
|
||||
};
|
||||
|
||||
ARM64 - Stratix10
|
||||
Required properties:
|
||||
- compatible : "altr,sys-mgr-s10"
|
||||
- reg : Should contain 1 register range (address and length)
|
||||
for system manager register.
|
||||
|
||||
Example:
|
||||
sysmgr@ffd12000 {
|
||||
compatible = "altr,sys-mgr-s10";
|
||||
reg = <0xffd12000 0x228>;
|
||||
};
|
||||
|
@@ -110,6 +110,7 @@ Board compatible values (alphabetically, grouped by SoC):
|
||||
|
||||
- "amlogic,u200" (Meson g12a s905d2)
|
||||
- "amediatech,x96-max" (Meson g12a s905x2)
|
||||
- "seirobotics,sei510" (Meson g12a s905x2)
|
||||
|
||||
Amlogic Meson Firmware registers Interface
|
||||
------------------------------------------
|
||||
|
@@ -25,6 +25,7 @@ compatible: must be one of:
|
||||
o "atmel,at91sam9n12"
|
||||
o "atmel,at91sam9rl"
|
||||
o "atmel,at91sam9xe"
|
||||
o "microchip,sam9x60"
|
||||
* "atmel,sama5" for SoCs using a Cortex-A5, shall be extended with the specific
|
||||
SoC family:
|
||||
o "atmel,sama5d2" shall be extended with the specific SoC compatible:
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user