Merge drm/drm-next into drm-misc-next
Backmerging 5.2-rc1 to -misc-next for robher Signed-off-by: Sean Paul <seanpaul@chromium.org>
This commit is contained in:
@@ -387,14 +387,14 @@ ForEachMacros:
|
|||||||
- 'rhl_for_each_entry_rcu'
|
- 'rhl_for_each_entry_rcu'
|
||||||
- 'rhl_for_each_rcu'
|
- 'rhl_for_each_rcu'
|
||||||
- 'rht_for_each'
|
- 'rht_for_each'
|
||||||
- 'rht_for_each_continue'
|
- 'rht_for_each_from'
|
||||||
- 'rht_for_each_entry'
|
- 'rht_for_each_entry'
|
||||||
- 'rht_for_each_entry_continue'
|
- 'rht_for_each_entry_from'
|
||||||
- 'rht_for_each_entry_rcu'
|
- 'rht_for_each_entry_rcu'
|
||||||
- 'rht_for_each_entry_rcu_continue'
|
- 'rht_for_each_entry_rcu_from'
|
||||||
- 'rht_for_each_entry_safe'
|
- 'rht_for_each_entry_safe'
|
||||||
- 'rht_for_each_rcu'
|
- 'rht_for_each_rcu'
|
||||||
- 'rht_for_each_rcu_continue'
|
- 'rht_for_each_rcu_from'
|
||||||
- '__rq_for_each_bio'
|
- '__rq_for_each_bio'
|
||||||
- 'rq_for_each_bvec'
|
- 'rq_for_each_bvec'
|
||||||
- 'rq_for_each_segment'
|
- 'rq_for_each_segment'
|
||||||
|
@@ -1 +1,2 @@
|
|||||||
Christoph Hellwig <hch@lst.de>
|
Christoph Hellwig <hch@lst.de>
|
||||||
|
Marc Gonzalez <marc.w.gonzalez@free.fr>
|
||||||
|
24
.gitignore
vendored
24
.gitignore
vendored
@@ -58,6 +58,7 @@ modules.builtin
|
|||||||
/vmlinuz
|
/vmlinuz
|
||||||
/System.map
|
/System.map
|
||||||
/Module.markers
|
/Module.markers
|
||||||
|
/modules.builtin.modinfo
|
||||||
|
|
||||||
#
|
#
|
||||||
# RPM spec file (make rpm-pkg)
|
# RPM spec file (make rpm-pkg)
|
||||||
@@ -80,20 +81,22 @@ modules.builtin
|
|||||||
/tar-install/
|
/tar-install/
|
||||||
|
|
||||||
#
|
#
|
||||||
# git files that we don't want to ignore even if they are dot-files
|
# We don't want to ignore the following even if they are dot-files
|
||||||
#
|
#
|
||||||
|
!.clang-format
|
||||||
|
!.cocciconfig
|
||||||
|
!.get_maintainer.ignore
|
||||||
|
!.gitattributes
|
||||||
!.gitignore
|
!.gitignore
|
||||||
!.mailmap
|
!.mailmap
|
||||||
!.cocciconfig
|
|
||||||
!.clang-format
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Generated include files
|
# Generated include files
|
||||||
#
|
#
|
||||||
include/config
|
/include/config/
|
||||||
include/generated
|
/include/generated/
|
||||||
include/ksym
|
/include/ksym/
|
||||||
arch/*/include/generated
|
/arch/*/include/generated/
|
||||||
|
|
||||||
# stgit generated dirs
|
# stgit generated dirs
|
||||||
patches-*
|
patches-*
|
||||||
@@ -129,7 +132,12 @@ signing_key.x509
|
|||||||
x509.genkey
|
x509.genkey
|
||||||
|
|
||||||
# Kconfig presets
|
# Kconfig presets
|
||||||
all.config
|
/all.config
|
||||||
|
/alldef.config
|
||||||
|
/allmod.config
|
||||||
|
/allno.config
|
||||||
|
/allrandom.config
|
||||||
|
/allyes.config
|
||||||
|
|
||||||
# Kdevelop4
|
# Kdevelop4
|
||||||
*.kdev4
|
*.kdev4
|
||||||
|
16
.mailmap
16
.mailmap
@@ -16,6 +16,11 @@ Alan Cox <alan@lxorguk.ukuu.org.uk>
|
|||||||
Alan Cox <root@hraefn.swansea.linux.org.uk>
|
Alan Cox <root@hraefn.swansea.linux.org.uk>
|
||||||
Aleksey Gorelov <aleksey_gorelov@phoenix.com>
|
Aleksey Gorelov <aleksey_gorelov@phoenix.com>
|
||||||
Aleksandar Markovic <aleksandar.markovic@mips.com> <aleksandar.markovic@imgtec.com>
|
Aleksandar Markovic <aleksandar.markovic@mips.com> <aleksandar.markovic@imgtec.com>
|
||||||
|
Alex Shi <alex.shi@linux.alibaba.com> <alex.shi@intel.com>
|
||||||
|
Alex Shi <alex.shi@linux.alibaba.com> <alex.shi@linaro.org>
|
||||||
|
Alexei Starovoitov <ast@kernel.org> <ast@plumgrid.com>
|
||||||
|
Alexei Starovoitov <ast@kernel.org> <alexei.starovoitov@gmail.com>
|
||||||
|
Alexei Starovoitov <ast@kernel.org> <ast@fb.com>
|
||||||
Al Viro <viro@ftp.linux.org.uk>
|
Al Viro <viro@ftp.linux.org.uk>
|
||||||
Al Viro <viro@zenIV.linux.org.uk>
|
Al Viro <viro@zenIV.linux.org.uk>
|
||||||
Andi Shyti <andi@etezian.org> <andi.shyti@samsung.com>
|
Andi Shyti <andi@etezian.org> <andi.shyti@samsung.com>
|
||||||
@@ -46,6 +51,12 @@ Christoph Hellwig <hch@lst.de>
|
|||||||
Christophe Ricard <christophe.ricard@gmail.com>
|
Christophe Ricard <christophe.ricard@gmail.com>
|
||||||
Corey Minyard <minyard@acm.org>
|
Corey Minyard <minyard@acm.org>
|
||||||
Damian Hobson-Garcia <dhobsong@igel.co.jp>
|
Damian Hobson-Garcia <dhobsong@igel.co.jp>
|
||||||
|
Daniel Borkmann <daniel@iogearbox.net> <dborkman@redhat.com>
|
||||||
|
Daniel Borkmann <daniel@iogearbox.net> <dborkmann@redhat.com>
|
||||||
|
Daniel Borkmann <daniel@iogearbox.net> <danborkmann@iogearbox.net>
|
||||||
|
Daniel Borkmann <daniel@iogearbox.net> <daniel.borkmann@tik.ee.ethz.ch>
|
||||||
|
Daniel Borkmann <daniel@iogearbox.net> <danborkmann@googlemail.com>
|
||||||
|
Daniel Borkmann <daniel@iogearbox.net> <dxchgb@gmail.com>
|
||||||
David Brownell <david-b@pacbell.net>
|
David Brownell <david-b@pacbell.net>
|
||||||
David Woodhouse <dwmw2@shinybook.infradead.org>
|
David Woodhouse <dwmw2@shinybook.infradead.org>
|
||||||
Dengcheng Zhu <dzhu@wavecomp.com> <dengcheng.zhu@mips.com>
|
Dengcheng Zhu <dzhu@wavecomp.com> <dengcheng.zhu@mips.com>
|
||||||
@@ -117,6 +128,8 @@ Leonid I Ananiev <leonid.i.ananiev@intel.com>
|
|||||||
Linas Vepstas <linas@austin.ibm.com>
|
Linas Vepstas <linas@austin.ibm.com>
|
||||||
Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@web.de>
|
Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@web.de>
|
||||||
Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@ascom.ch>
|
Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@ascom.ch>
|
||||||
|
Li Yang <leoyang.li@nxp.com> <leo@zh-kernel.org>
|
||||||
|
Li Yang <leoyang.li@nxp.com> <leoli@freescale.com>
|
||||||
Maciej W. Rozycki <macro@mips.com> <macro@imgtec.com>
|
Maciej W. Rozycki <macro@mips.com> <macro@imgtec.com>
|
||||||
Marcin Nowakowski <marcin.nowakowski@mips.com> <marcin.nowakowski@imgtec.com>
|
Marcin Nowakowski <marcin.nowakowski@mips.com> <marcin.nowakowski@imgtec.com>
|
||||||
Mark Brown <broonie@sirena.org.uk>
|
Mark Brown <broonie@sirena.org.uk>
|
||||||
@@ -189,6 +202,7 @@ Santosh Shilimkar <ssantosh@kernel.org>
|
|||||||
Santosh Shilimkar <santosh.shilimkar@oracle.org>
|
Santosh Shilimkar <santosh.shilimkar@oracle.org>
|
||||||
Sascha Hauer <s.hauer@pengutronix.de>
|
Sascha Hauer <s.hauer@pengutronix.de>
|
||||||
S.Çağlar Onur <caglar@pardus.org.tr>
|
S.Çağlar Onur <caglar@pardus.org.tr>
|
||||||
|
Sean Nyekjaer <sean@geanix.com> <sean.nyekjaer@prevas.dk>
|
||||||
Sebastian Reichel <sre@kernel.org> <sre@debian.org>
|
Sebastian Reichel <sre@kernel.org> <sre@debian.org>
|
||||||
Sebastian Reichel <sre@kernel.org> <sebastian.reichel@collabora.co.uk>
|
Sebastian Reichel <sre@kernel.org> <sebastian.reichel@collabora.co.uk>
|
||||||
Shiraz Hashim <shiraz.linux.kernel@gmail.com> <shiraz.hashim@st.com>
|
Shiraz Hashim <shiraz.linux.kernel@gmail.com> <shiraz.hashim@st.com>
|
||||||
@@ -207,6 +221,8 @@ Tejun Heo <htejun@gmail.com>
|
|||||||
Thomas Graf <tgraf@suug.ch>
|
Thomas Graf <tgraf@suug.ch>
|
||||||
Thomas Pedersen <twp@codeaurora.org>
|
Thomas Pedersen <twp@codeaurora.org>
|
||||||
Tony Luck <tony.luck@intel.com>
|
Tony Luck <tony.luck@intel.com>
|
||||||
|
TripleX Chung <xxx.phy@gmail.com> <zhongyu@18mail.cn>
|
||||||
|
TripleX Chung <xxx.phy@gmail.com> <triplex@zh-kernel.org>
|
||||||
Tsuneo Yoshioka <Tsuneo.Yoshioka@f-secure.com>
|
Tsuneo Yoshioka <Tsuneo.Yoshioka@f-secure.com>
|
||||||
Uwe Kleine-König <ukleinek@informatik.uni-freiburg.de>
|
Uwe Kleine-König <ukleinek@informatik.uni-freiburg.de>
|
||||||
Uwe Kleine-König <ukl@pengutronix.de>
|
Uwe Kleine-König <ukl@pengutronix.de>
|
||||||
|
@@ -1,3 +1,5 @@
|
|||||||
|
This ABI is deprecated and will be removed after 2021. It is
|
||||||
|
replaced with the batadv generic netlink family.
|
||||||
|
|
||||||
What: /sys/class/net/<iface>/batman-adv/elp_interval
|
What: /sys/class/net/<iface>/batman-adv/elp_interval
|
||||||
Date: Feb 2014
|
Date: Feb 2014
|
@@ -1,3 +1,5 @@
|
|||||||
|
This ABI is deprecated and will be removed after 2021. It is
|
||||||
|
replaced with the batadv generic netlink family.
|
||||||
|
|
||||||
What: /sys/class/net/<mesh_iface>/mesh/aggregated_ogms
|
What: /sys/class/net/<mesh_iface>/mesh/aggregated_ogms
|
||||||
Date: May 2010
|
Date: May 2010
|
@@ -6,6 +6,8 @@ Description:
|
|||||||
This file allows user to read/write the raw NVMEM contents.
|
This file allows user to read/write the raw NVMEM contents.
|
||||||
Permissions for write to this file depend on the nvmem
|
Permissions for write to this file depend on the nvmem
|
||||||
provider configuration.
|
provider configuration.
|
||||||
|
Note: This file is only present if CONFIG_NVMEM_SYSFS
|
||||||
|
is enabled
|
||||||
|
|
||||||
ex:
|
ex:
|
||||||
hexdump /sys/bus/nvmem/devices/qfprom0/nvmem
|
hexdump /sys/bus/nvmem/devices/qfprom0/nvmem
|
||||||
|
@@ -81,7 +81,9 @@ What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/latency
|
|||||||
Date: September. 2017
|
Date: September. 2017
|
||||||
KernelVersion: 4.14
|
KernelVersion: 4.14
|
||||||
Contact: Stephen Hemminger <sthemmin@microsoft.com>
|
Contact: Stephen Hemminger <sthemmin@microsoft.com>
|
||||||
Description: Channel signaling latency
|
Description: Channel signaling latency. This file is available only for
|
||||||
|
performance critical channels (storage, network, etc.) that use
|
||||||
|
the monitor page mechanism.
|
||||||
Users: Debugging tools
|
Users: Debugging tools
|
||||||
|
|
||||||
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/out_mask
|
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/out_mask
|
||||||
@@ -95,7 +97,9 @@ What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/pending
|
|||||||
Date: September. 2017
|
Date: September. 2017
|
||||||
KernelVersion: 4.14
|
KernelVersion: 4.14
|
||||||
Contact: Stephen Hemminger <sthemmin@microsoft.com>
|
Contact: Stephen Hemminger <sthemmin@microsoft.com>
|
||||||
Description: Channel interrupt pending state
|
Description: Channel interrupt pending state. This file is available only for
|
||||||
|
performance critical channels (storage, network, etc.) that use
|
||||||
|
the monitor page mechanism.
|
||||||
Users: Debugging tools
|
Users: Debugging tools
|
||||||
|
|
||||||
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/read_avail
|
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/read_avail
|
||||||
@@ -137,7 +141,9 @@ What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/monitor_id
|
|||||||
Date: January. 2018
|
Date: January. 2018
|
||||||
KernelVersion: 4.16
|
KernelVersion: 4.16
|
||||||
Contact: Stephen Hemminger <sthemmin@microsoft.com>
|
Contact: Stephen Hemminger <sthemmin@microsoft.com>
|
||||||
Description: Monitor bit associated with channel
|
Description: Monitor bit associated with channel. This file is available only
|
||||||
|
for performance critical channels (storage, network, etc.) that
|
||||||
|
use the monitor page mechanism.
|
||||||
Users: Debugging tools and userspace drivers
|
Users: Debugging tools and userspace drivers
|
||||||
|
|
||||||
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/ring
|
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/ring
|
||||||
|
@@ -90,4 +90,89 @@ Date: December 2009
|
|||||||
Contact: Lee Schermerhorn <lee.schermerhorn@hp.com>
|
Contact: Lee Schermerhorn <lee.schermerhorn@hp.com>
|
||||||
Description:
|
Description:
|
||||||
The node's huge page size control/query attributes.
|
The node's huge page size control/query attributes.
|
||||||
See Documentation/admin-guide/mm/hugetlbpage.rst
|
See Documentation/admin-guide/mm/hugetlbpage.rst
|
||||||
|
|
||||||
|
What: /sys/devices/system/node/nodeX/accessY/
|
||||||
|
Date: December 2018
|
||||||
|
Contact: Keith Busch <keith.busch@intel.com>
|
||||||
|
Description:
|
||||||
|
The node's relationship to other nodes for access class "Y".
|
||||||
|
|
||||||
|
What: /sys/devices/system/node/nodeX/accessY/initiators/
|
||||||
|
Date: December 2018
|
||||||
|
Contact: Keith Busch <keith.busch@intel.com>
|
||||||
|
Description:
|
||||||
|
The directory containing symlinks to memory initiator
|
||||||
|
nodes that have class "Y" access to this target node's
|
||||||
|
memory. CPUs and other memory initiators in nodes not in
|
||||||
|
the list accessing this node's memory may have different
|
||||||
|
performance.
|
||||||
|
|
||||||
|
What: /sys/devices/system/node/nodeX/accessY/targets/
|
||||||
|
Date: December 2018
|
||||||
|
Contact: Keith Busch <keith.busch@intel.com>
|
||||||
|
Description:
|
||||||
|
The directory containing symlinks to memory targets that
|
||||||
|
this initiator node has class "Y" access.
|
||||||
|
|
||||||
|
What: /sys/devices/system/node/nodeX/accessY/initiators/read_bandwidth
|
||||||
|
Date: December 2018
|
||||||
|
Contact: Keith Busch <keith.busch@intel.com>
|
||||||
|
Description:
|
||||||
|
This node's read bandwidth in MB/s when accessed from
|
||||||
|
nodes found in this access class's linked initiators.
|
||||||
|
|
||||||
|
What: /sys/devices/system/node/nodeX/accessY/initiators/read_latency
|
||||||
|
Date: December 2018
|
||||||
|
Contact: Keith Busch <keith.busch@intel.com>
|
||||||
|
Description:
|
||||||
|
This node's read latency in nanoseconds when accessed
|
||||||
|
from nodes found in this access class's linked initiators.
|
||||||
|
|
||||||
|
What: /sys/devices/system/node/nodeX/accessY/initiators/write_bandwidth
|
||||||
|
Date: December 2018
|
||||||
|
Contact: Keith Busch <keith.busch@intel.com>
|
||||||
|
Description:
|
||||||
|
This node's write bandwidth in MB/s when accessed from
|
||||||
|
nodes found in this access class's linked initiators.
|
||||||
|
|
||||||
|
What: /sys/devices/system/node/nodeX/accessY/initiators/write_latency
|
||||||
|
Date: December 2018
|
||||||
|
Contact: Keith Busch <keith.busch@intel.com>
|
||||||
|
Description:
|
||||||
|
This node's write latency in nanoseconds when accessed
|
||||||
|
from nodes found in this class's linked initiators.
|
||||||
|
|
||||||
|
What: /sys/devices/system/node/nodeX/memory_side_cache/indexY/
|
||||||
|
Date: December 2018
|
||||||
|
Contact: Keith Busch <keith.busch@intel.com>
|
||||||
|
Description:
|
||||||
|
The directory containing attributes for the memory-side cache
|
||||||
|
level 'Y'.
|
||||||
|
|
||||||
|
What: /sys/devices/system/node/nodeX/memory_side_cache/indexY/indexing
|
||||||
|
Date: December 2018
|
||||||
|
Contact: Keith Busch <keith.busch@intel.com>
|
||||||
|
Description:
|
||||||
|
The cache's associativity indexing: 0 for direct mapped,
|
||||||
|
non-zero if indexed.
|
||||||
|
|
||||||
|
What: /sys/devices/system/node/nodeX/memory_side_cache/indexY/line_size
|
||||||
|
Date: December 2018
|
||||||
|
Contact: Keith Busch <keith.busch@intel.com>
|
||||||
|
Description:
|
||||||
|
The number of bytes accessed from the next cache level on a
|
||||||
|
cache miss.
|
||||||
|
|
||||||
|
What: /sys/devices/system/node/nodeX/memory_side_cache/indexY/size
|
||||||
|
Date: December 2018
|
||||||
|
Contact: Keith Busch <keith.busch@intel.com>
|
||||||
|
Description:
|
||||||
|
The size of this memory side cache in bytes.
|
||||||
|
|
||||||
|
What: /sys/devices/system/node/nodeX/memory_side_cache/indexY/write_policy
|
||||||
|
Date: December 2018
|
||||||
|
Contact: Keith Busch <keith.busch@intel.com>
|
||||||
|
Description:
|
||||||
|
The cache write policy: 0 for write-back, 1 for write-through,
|
||||||
|
other or unknown.
|
||||||
|
@@ -1,23 +1,46 @@
|
|||||||
|
What: /sys/kernel/debug/wilco_ec/h1_gpio
|
||||||
|
Date: April 2019
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Description:
|
||||||
|
As part of Chrome OS's FAFT (Fully Automated Firmware Testing)
|
||||||
|
tests, we need to ensure that the H1 chip is properly setting
|
||||||
|
some GPIO lines. The h1_gpio attribute exposes the state
|
||||||
|
of the lines:
|
||||||
|
- ENTRY_TO_FACT_MODE in BIT(0)
|
||||||
|
- SPI_CHROME_SEL in BIT(1)
|
||||||
|
|
||||||
|
Output will be formatted with "0x%02x\n".
|
||||||
|
|
||||||
What: /sys/kernel/debug/wilco_ec/raw
|
What: /sys/kernel/debug/wilco_ec/raw
|
||||||
Date: January 2019
|
Date: January 2019
|
||||||
KernelVersion: 5.1
|
KernelVersion: 5.1
|
||||||
Description:
|
Description:
|
||||||
Write and read raw mailbox commands to the EC.
|
Write and read raw mailbox commands to the EC.
|
||||||
|
|
||||||
For writing:
|
You can write a hexadecimal sentence to raw, and that series of
|
||||||
Bytes 0-1 indicate the message type:
|
bytes will be sent to the EC. Then, you can read the bytes of
|
||||||
00 F0 = Execute Legacy Command
|
response by reading from raw.
|
||||||
00 F2 = Read/Write NVRAM Property
|
|
||||||
Byte 2 provides the command code
|
|
||||||
Bytes 3+ consist of the data passed in the request
|
|
||||||
|
|
||||||
At least three bytes are required, for the msg type and command,
|
For writing, bytes 0-1 indicate the message type, one of enum
|
||||||
with additional bytes optional for additional data.
|
wilco_ec_msg_type. Bytes 2+ consist of the data passed in the
|
||||||
|
request, starting at MBOX[0]
|
||||||
|
|
||||||
|
At least three bytes are required for writing, two for the type
|
||||||
|
and at least a single byte of data. Only the first
|
||||||
|
EC_MAILBOX_DATA_SIZE bytes of MBOX will be used.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
// Request EC info type 3 (EC firmware build date)
|
// Request EC info type 3 (EC firmware build date)
|
||||||
$ echo 00 f0 38 00 03 00 > raw
|
// Corresponds with sending type 0x00f0 with
|
||||||
|
// MBOX = [38, 00, 03, 00]
|
||||||
|
$ echo 00 f0 38 00 03 00 > /sys/kernel/debug/wilco_ec/raw
|
||||||
// View the result. The decoded ASCII result "12/21/18" is
|
// View the result. The decoded ASCII result "12/21/18" is
|
||||||
// included after the raw hex.
|
// included after the raw hex.
|
||||||
$ cat raw
|
// Corresponds with MBOX = [00, 00, 31, 32, 2f, 32, 31, 38, ...]
|
||||||
00 31 32 2f 32 31 2f 31 38 00 38 00 01 00 2f 00 .12/21/18.8...
|
$ cat /sys/kernel/debug/wilco_ec/raw
|
||||||
|
00 00 31 32 2f 32 31 2f 31 38 00 38 00 01 00 2f 00 ..12/21/18.8...
|
||||||
|
|
||||||
|
Note that the first 32 bytes of the received MBOX[] will be
|
||||||
|
printed, even if some of the data is junk. It is up to you to
|
||||||
|
know how many of the first bytes of data are the actual
|
||||||
|
response.
|
||||||
|
230
Documentation/ABI/testing/sysfs-bus-counter
Normal file
230
Documentation/ABI/testing/sysfs-bus-counter
Normal file
@@ -0,0 +1,230 @@
|
|||||||
|
What: /sys/bus/counter/devices/counterX/countY/count
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Count data of Count Y represented as a string.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/ceiling
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Count value ceiling for Count Y. This is the upper limit for the
|
||||||
|
respective counter.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/floor
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Count value floor for Count Y. This is the lower limit for the
|
||||||
|
respective counter.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/count_mode
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Count mode for channel Y. The ceiling and floor values for
|
||||||
|
Count Y are used by the count mode where required. The following
|
||||||
|
count modes are available:
|
||||||
|
|
||||||
|
normal:
|
||||||
|
Counting is continuous in either direction.
|
||||||
|
|
||||||
|
range limit:
|
||||||
|
An upper or lower limit is set, mimicking limit switches
|
||||||
|
in the mechanical counterpart. The upper limit is set to
|
||||||
|
the Count Y ceiling value, while the lower limit is set
|
||||||
|
to the Count Y floor value. The counter freezes at
|
||||||
|
count = ceiling when counting up, and at count = floor
|
||||||
|
when counting down. At either of these limits, the
|
||||||
|
counting is resumed only when the count direction is
|
||||||
|
reversed.
|
||||||
|
|
||||||
|
non-recycle:
|
||||||
|
The counter is disabled whenever a counter overflow or
|
||||||
|
underflow takes place. The counter is re-enabled when a
|
||||||
|
new count value is loaded to the counter via a preset
|
||||||
|
operation or direct write.
|
||||||
|
|
||||||
|
modulo-n:
|
||||||
|
A count value boundary is set between the Count Y floor
|
||||||
|
value and the Count Y ceiling value. The counter is
|
||||||
|
reset to the Count Y floor value at count = ceiling when
|
||||||
|
counting up, while the counter is set to the Count Y
|
||||||
|
ceiling value at count = floor when counting down; the
|
||||||
|
counter does not freeze at the boundary points, but
|
||||||
|
counts continuously throughout.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/count_mode_available
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/error_noise_available
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/function_available
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/signalZ_action_available
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Discrete set of available values for the respective Count Y
|
||||||
|
configuration are listed in this file. Values are delimited by
|
||||||
|
newline characters.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/direction
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Read-only attribute that indicates the count direction of Count
|
||||||
|
Y. Two count directions are available: forward and backward.
|
||||||
|
|
||||||
|
Some counter devices are able to determine the direction of
|
||||||
|
their counting. For example, quadrature encoding counters can
|
||||||
|
determine the direction of movement by evaluating the leading
|
||||||
|
phase of the respective A and B quadrature encoding signals.
|
||||||
|
This attribute exposes such count directions.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/enable
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Whether channel Y counter is enabled. Valid attribute values are
|
||||||
|
boolean.
|
||||||
|
|
||||||
|
This attribute is intended to serve as a pause/unpause mechanism
|
||||||
|
for Count Y. Suppose a counter device is used to count the total
|
||||||
|
movement of a conveyor belt: this attribute allows an operator
|
||||||
|
to temporarily pause the counter, service the conveyor belt,
|
||||||
|
and then finally unpause the counter to continue where it had
|
||||||
|
left off.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/error_noise
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Read-only attribute that indicates whether excessive noise is
|
||||||
|
present at the channel Y counter inputs.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/function
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Count function mode of Count Y; count function evaluation is
|
||||||
|
triggered by conditions specified by the Count Y signalZ_action
|
||||||
|
attributes. The following count functions are available:
|
||||||
|
|
||||||
|
increase:
|
||||||
|
Accumulated count is incremented.
|
||||||
|
|
||||||
|
decrease:
|
||||||
|
Accumulated count is decremented.
|
||||||
|
|
||||||
|
pulse-direction:
|
||||||
|
Rising edges on signal A update the respective count.
|
||||||
|
The input level of signal B determines direction.
|
||||||
|
|
||||||
|
quadrature x1 a:
|
||||||
|
If direction is forward, rising edges on quadrature pair
|
||||||
|
signal A updates the respective count; if the direction
|
||||||
|
is backward, falling edges on quadrature pair signal A
|
||||||
|
updates the respective count. Quadrature encoding
|
||||||
|
determines the direction.
|
||||||
|
|
||||||
|
quadrature x1 b:
|
||||||
|
If direction is forward, rising edges on quadrature pair
|
||||||
|
signal B updates the respective count; if the direction
|
||||||
|
is backward, falling edges on quadrature pair signal B
|
||||||
|
updates the respective count. Quadrature encoding
|
||||||
|
determines the direction.
|
||||||
|
|
||||||
|
quadrature x2 a:
|
||||||
|
Any state transition on quadrature pair signal A updates
|
||||||
|
the respective count. Quadrature encoding determines the
|
||||||
|
direction.
|
||||||
|
|
||||||
|
quadrature x2 b:
|
||||||
|
Any state transition on quadrature pair signal B updates
|
||||||
|
the respective count. Quadrature encoding determines the
|
||||||
|
direction.
|
||||||
|
|
||||||
|
quadrature x4:
|
||||||
|
Any state transition on either quadrature pair signals
|
||||||
|
updates the respective count. Quadrature encoding
|
||||||
|
determines the direction.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/name
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Read-only attribute that indicates the device-specific name of
|
||||||
|
Count Y. If possible, this should match the name of the
|
||||||
|
respective channel as it appears in the device datasheet.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/preset
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
If the counter device supports preset registers -- registers
|
||||||
|
used to load counter channels to a set count upon device-defined
|
||||||
|
preset operation trigger events -- the preset count for channel
|
||||||
|
Y is provided by this attribute.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/preset_enable
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Whether channel Y counter preset operation is enabled. Valid
|
||||||
|
attribute values are boolean.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/signalZ_action
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Action mode of Count Y for Signal Z. This attribute indicates
|
||||||
|
the condition of Signal Z that triggers the count function
|
||||||
|
evaluation for Count Y. The following action modes are
|
||||||
|
available:
|
||||||
|
|
||||||
|
none:
|
||||||
|
Signal does not trigger the count function. In
|
||||||
|
Pulse-Direction count function mode, this Signal is
|
||||||
|
evaluated as Direction.
|
||||||
|
|
||||||
|
rising edge:
|
||||||
|
Low state transitions to high state.
|
||||||
|
|
||||||
|
falling edge:
|
||||||
|
High state transitions to low state.
|
||||||
|
|
||||||
|
both edges:
|
||||||
|
Any state transition.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/name
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Read-only attribute that indicates the device-specific name of
|
||||||
|
the Counter. This should match the name of the device as it
|
||||||
|
appears in its respective datasheet.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/num_counts
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Read-only attribute that indicates the total number of Counts
|
||||||
|
belonging to the Counter.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/num_signals
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Read-only attribute that indicates the total number of Signals
|
||||||
|
belonging to the Counter.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/signalY/signal
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Signal data of Signal Y represented as a string.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/signalY/name
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Read-only attribute that indicates the device-specific name of
|
||||||
|
Signal Y. If possible, this should match the name of the
|
||||||
|
respective signal as it appears in the device datasheet.
|
36
Documentation/ABI/testing/sysfs-bus-counter-104-quad-8
Normal file
36
Documentation/ABI/testing/sysfs-bus-counter-104-quad-8
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
What: /sys/bus/counter/devices/counterX/signalY/index_polarity
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Active level of index input Signal Y; irrelevant in
|
||||||
|
non-synchronous load mode.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/signalY/index_polarity_available
|
||||||
|
What: /sys/bus/counter/devices/counterX/signalY/synchronous_mode_available
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Discrete set of available values for the respective Signal Y
|
||||||
|
configuration are listed in this file.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/signalY/synchronous_mode
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Configure the counter associated with Signal Y for
|
||||||
|
non-synchronous or synchronous load mode. Synchronous load mode
|
||||||
|
cannot be selected in non-quadrature (Pulse-Direction) clock
|
||||||
|
mode.
|
||||||
|
|
||||||
|
non-synchronous:
|
||||||
|
A logic low level is the active level at this index
|
||||||
|
input. The index function (as enabled via preset_enable)
|
||||||
|
is performed directly on the active level of the index
|
||||||
|
input.
|
||||||
|
|
||||||
|
synchronous:
|
||||||
|
Intended for interfacing with encoder Index output in
|
||||||
|
quadrature clock mode. The active level is configured
|
||||||
|
via index_polarity. The index function (as enabled via
|
||||||
|
preset_enable) is performed synchronously with the
|
||||||
|
quadrature clock on the active level of the index input.
|
16
Documentation/ABI/testing/sysfs-bus-counter-ftm-quaddec
Normal file
16
Documentation/ABI/testing/sysfs-bus-counter-ftm-quaddec
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
What: /sys/bus/counter/devices/counterX/countY/prescaler_available
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Discrete set of available values for the respective Count Y
|
||||||
|
configuration are listed in this file. Values are delimited by
|
||||||
|
newline characters.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/prescaler
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Configure the prescaler value associated with Count Y.
|
||||||
|
On the FlexTimer, the counter clock source passes through a
|
||||||
|
prescaler (i.e. a counter). This acts like a clock
|
||||||
|
divider.
|
20
Documentation/ABI/testing/sysfs-bus-i2c-devices-pca954x
Normal file
20
Documentation/ABI/testing/sysfs-bus-i2c-devices-pca954x
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
What: /sys/bus/i2c/.../idle_state
|
||||||
|
Date: January 2019
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: Robert Shearman <robert.shearman@att.com>
|
||||||
|
Description:
|
||||||
|
Value that exists only for mux devices that can be
|
||||||
|
written to control the behaviour of the multiplexer on
|
||||||
|
idle. Possible values:
|
||||||
|
-2 - disconnect on idle, i.e. deselect the last used
|
||||||
|
channel, which is useful when there is a device
|
||||||
|
with an address that conflicts with another
|
||||||
|
device on another mux on the same parent bus.
|
||||||
|
-1 - leave the mux as-is, which is the most optimal
|
||||||
|
setting in terms of I2C operations and is the
|
||||||
|
default mode.
|
||||||
|
0..<nchans> - set the mux to a predetermined channel,
|
||||||
|
which is useful if there is one channel that is
|
||||||
|
used almost always, and you want to reduce the
|
||||||
|
latency for normal operations after rare
|
||||||
|
transactions on other channels
|
@@ -1656,6 +1656,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_countY_raw
|
|||||||
KernelVersion: 4.10
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
|
This interface is deprecated; please use the Counter subsystem.
|
||||||
|
|
||||||
Raw counter device counts from channel Y. For quadrature
|
Raw counter device counts from channel Y. For quadrature
|
||||||
counters, multiplication by an available [Y]_scale results in
|
counters, multiplication by an available [Y]_scale results in
|
||||||
the counts of a single quadrature signal phase from channel Y.
|
the counts of a single quadrature signal phase from channel Y.
|
||||||
@@ -1664,6 +1666,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_indexY_raw
|
|||||||
KernelVersion: 4.10
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
|
This interface is deprecated; please use the Counter subsystem.
|
||||||
|
|
||||||
Raw counter device index value from channel Y. This attribute
|
Raw counter device index value from channel Y. This attribute
|
||||||
provides an absolute positional reference (e.g. a pulse once per
|
provides an absolute positional reference (e.g. a pulse once per
|
||||||
revolution) which may be used to home positional systems as
|
revolution) which may be used to home positional systems as
|
||||||
@@ -1673,6 +1677,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_count_count_direction_available
|
|||||||
KernelVersion: 4.12
|
KernelVersion: 4.12
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
|
This interface is deprecated; please use the Counter subsystem.
|
||||||
|
|
||||||
A list of possible counting directions which are:
|
A list of possible counting directions which are:
|
||||||
- "up" : counter device is increasing.
|
- "up" : counter device is increasing.
|
||||||
- "down": counter device is decreasing.
|
- "down": counter device is decreasing.
|
||||||
@@ -1681,6 +1687,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_countY_count_direction
|
|||||||
KernelVersion: 4.12
|
KernelVersion: 4.12
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
|
This interface is deprecated; please use the Counter subsystem.
|
||||||
|
|
||||||
Raw counter device counters direction for channel Y.
|
Raw counter device counters direction for channel Y.
|
||||||
|
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_phaseY_raw
|
What: /sys/bus/iio/devices/iio:deviceX/in_phaseY_raw
|
||||||
|
@@ -6,6 +6,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_index_synchronous_mode_available
|
|||||||
KernelVersion: 4.10
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
|
This interface is deprecated; please use the Counter subsystem.
|
||||||
|
|
||||||
Discrete set of available values for the respective counter
|
Discrete set of available values for the respective counter
|
||||||
configuration are listed in this file.
|
configuration are listed in this file.
|
||||||
|
|
||||||
@@ -13,6 +15,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_countY_count_mode
|
|||||||
KernelVersion: 4.10
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
|
This interface is deprecated; please use the Counter subsystem.
|
||||||
|
|
||||||
Count mode for channel Y. Four count modes are available:
|
Count mode for channel Y. Four count modes are available:
|
||||||
normal, range limit, non-recycle, and modulo-n. The preset value
|
normal, range limit, non-recycle, and modulo-n. The preset value
|
||||||
for channel Y is used by the count mode where required.
|
for channel Y is used by the count mode where required.
|
||||||
@@ -47,6 +51,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_countY_noise_error
|
|||||||
KernelVersion: 4.10
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
|
This interface is deprecated; please use the Counter subsystem.
|
||||||
|
|
||||||
Read-only attribute that indicates whether excessive noise is
|
Read-only attribute that indicates whether excessive noise is
|
||||||
present at the channel Y count inputs in quadrature clock mode;
|
present at the channel Y count inputs in quadrature clock mode;
|
||||||
irrelevant in non-quadrature clock mode.
|
irrelevant in non-quadrature clock mode.
|
||||||
@@ -55,6 +61,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_countY_preset
|
|||||||
KernelVersion: 4.10
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
|
This interface is deprecated; please use the Counter subsystem.
|
||||||
|
|
||||||
If the counter device supports preset registers, the preset
|
If the counter device supports preset registers, the preset
|
||||||
count for channel Y is provided by this attribute.
|
count for channel Y is provided by this attribute.
|
||||||
|
|
||||||
@@ -62,6 +70,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_countY_quadrature_mode
|
|||||||
KernelVersion: 4.10
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
|
This interface is deprecated; please use the Counter subsystem.
|
||||||
|
|
||||||
Configure channel Y counter for non-quadrature or quadrature
|
Configure channel Y counter for non-quadrature or quadrature
|
||||||
clock mode. Selecting non-quadrature clock mode will disable
|
clock mode. Selecting non-quadrature clock mode will disable
|
||||||
synchronous load mode. In quadrature clock mode, the channel Y
|
synchronous load mode. In quadrature clock mode, the channel Y
|
||||||
@@ -83,6 +93,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_countY_set_to_preset_on_index
|
|||||||
KernelVersion: 4.10
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
|
This interface is deprecated; please use the Counter subsystem.
|
||||||
|
|
||||||
Whether to set channel Y counter with channel Y preset value
|
Whether to set channel Y counter with channel Y preset value
|
||||||
when channel Y index input is active, or continuously count.
|
when channel Y index input is active, or continuously count.
|
||||||
Valid attribute values are boolean.
|
Valid attribute values are boolean.
|
||||||
@@ -91,6 +103,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_indexY_index_polarity
|
|||||||
KernelVersion: 4.10
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
|
This interface is deprecated; please use the Counter subsystem.
|
||||||
|
|
||||||
Active level of channel Y index input; irrelevant in
|
Active level of channel Y index input; irrelevant in
|
||||||
non-synchronous load mode.
|
non-synchronous load mode.
|
||||||
|
|
||||||
@@ -98,6 +112,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_indexY_synchronous_mode
|
|||||||
KernelVersion: 4.10
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
|
This interface is deprecated; please use the Counter subsystem.
|
||||||
|
|
||||||
Configure channel Y counter for non-synchronous or synchronous
|
Configure channel Y counter for non-synchronous or synchronous
|
||||||
load mode. Synchronous load mode cannot be selected in
|
load mode. Synchronous load mode cannot be selected in
|
||||||
non-quadrature clock mode.
|
non-quadrature clock mode.
|
||||||
|
@@ -1,26 +1,31 @@
|
|||||||
What: /sys/bus/iio/devices/iio:deviceX/outY_freq_start
|
What: /sys/bus/iio/devices/iio:deviceX/out_altvoltageY_frequency_start
|
||||||
|
Date: March 2019
|
||||||
KernelVersion: 3.1.0
|
KernelVersion: 3.1.0
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
Frequency sweep start frequency in Hz.
|
Frequency sweep start frequency in Hz.
|
||||||
|
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/outY_freq_increment
|
What: /sys/bus/iio/devices/iio:deviceX/out_altvoltageY_frequency_increment
|
||||||
|
Date: March 2019
|
||||||
KernelVersion: 3.1.0
|
KernelVersion: 3.1.0
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
Frequency increment in Hz (step size) between consecutive
|
Frequency increment in Hz (step size) between consecutive
|
||||||
frequency points along the sweep.
|
frequency points along the sweep.
|
||||||
|
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/outY_freq_points
|
What: /sys/bus/iio/devices/iio:deviceX/out_altvoltageY_frequency_points
|
||||||
|
Date: March 2019
|
||||||
KernelVersion: 3.1.0
|
KernelVersion: 3.1.0
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
Number of frequency points (steps) in the frequency sweep.
|
Number of frequency points (steps) in the frequency sweep.
|
||||||
This value, in conjunction with the outY_freq_start and the
|
This value, in conjunction with the
|
||||||
outY_freq_increment, determines the frequency sweep range
|
out_altvoltageY_frequency_start and the
|
||||||
for the sweep operation.
|
out_altvoltageY_frequency_increment, determines the frequency
|
||||||
|
sweep range for the sweep operation.
|
||||||
|
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/outY_settling_cycles
|
What: /sys/bus/iio/devices/iio:deviceX/out_altvoltageY_settling_cycles
|
||||||
|
Date: March 2019
|
||||||
KernelVersion: 3.1.0
|
KernelVersion: 3.1.0
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
@@ -1,6 +1,6 @@
|
|||||||
What: /sys/bus/iio/devices/iio:deviceX/start_cleaning
|
What: /sys/bus/iio/devices/iio:deviceX/start_cleaning
|
||||||
Date: December 2018
|
Date: December 2018
|
||||||
KernelVersion: 4.22
|
KernelVersion: 5.0
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
Writing 1 starts sensor self cleaning. Internal fan accelerates
|
Writing 1 starts sensor self cleaning. Internal fan accelerates
|
||||||
|
24
Documentation/ABI/testing/sysfs-bus-iio-temperature-max31856
Normal file
24
Documentation/ABI/testing/sysfs-bus-iio-temperature-max31856
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
What: /sys/bus/iio/devices/iio:deviceX/fault_oc
|
||||||
|
KernelVersion: 5.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Open-circuit fault. Reports the detection of open-circuit faults,
|
||||||
|
such as those caused by broken thermocouple wires.
|
||||||
|
Reading returns either '1' or '0'.
|
||||||
|
'1' = An open circuit such as broken thermocouple wires
|
||||||
|
has been detected.
|
||||||
|
'0' = No open circuit or broken thermocouple wires are detected
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/fault_ovuv
|
||||||
|
KernelVersion: 5.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Overvoltage or Undervoltage Input Fault. The internal circuitry
|
||||||
|
is protected from excessive voltages applied to the thermocouple
|
||||||
|
cables by integrated MOSFETs at the T+ and T- inputs, and the
|
||||||
|
BIAS output. These MOSFETs turn off when the input voltage is
|
||||||
|
negative or greater than VDD.
|
||||||
|
Reading returns either '1' or '0'.
|
||||||
|
'1' = The input voltage is negative or greater than VDD.
|
||||||
|
'0' = The input voltage is positive and less than VDD (normal
|
||||||
|
state).
|
@@ -30,4 +30,12 @@ Description: (RW) Configure MSC buffer size for "single" or "multi" modes.
|
|||||||
there are no active users and tracing is not enabled) and then
|
there are no active users and tracing is not enabled) and then
|
||||||
allocates a new one.
|
allocates a new one.
|
||||||
|
|
||||||
|
What: /sys/bus/intel_th/devices/<intel_th_id>-msc<msc-id>/win_switch
|
||||||
|
Date: May 2019
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: Alexander Shishkin <alexander.shishkin@linux.intel.com>
|
||||||
|
Description: (RW) Trigger window switch for the MSC's buffer, in
|
||||||
|
multi-window mode. In "multi" mode, accepts writes of "1", thereby
|
||||||
|
triggering a window switch for the buffer. Returns an error in any
|
||||||
|
other operating mode, or on attempts to write something other than "1".
|
||||||
|
|
||||||
|
@@ -65,3 +65,18 @@ Description: Display the ME firmware version.
|
|||||||
<platform>:<major>.<minor>.<milestone>.<build_no>.
|
<platform>:<major>.<minor>.<milestone>.<build_no>.
|
||||||
There can be up to three such blocks for different
|
There can be up to three such blocks for different
|
||||||
FW components.
|
FW components.
|
||||||
|
|
||||||
|
What: /sys/class/mei/meiN/dev_state
|
||||||
|
Date: Mar 2019
|
||||||
|
KernelVersion: 5.1
|
||||||
|
Contact: Tomas Winkler <tomas.winkler@intel.com>
|
||||||
|
Description: Display the ME device state.
|
||||||
|
|
||||||
|
The device state can have the following values:
|
||||||
|
INITIALIZING
|
||||||
|
INIT_CLIENTS
|
||||||
|
ENABLED
|
||||||
|
RESETTING
|
||||||
|
DISABLED
|
||||||
|
POWER_DOWN
|
||||||
|
POWER_UP
|
||||||
|
@@ -114,15 +114,60 @@ Description:
|
|||||||
Access: Read
|
Access: Read
|
||||||
Valid values: Represented in microamps
|
Valid values: Represented in microamps
|
||||||
|
|
||||||
|
What: /sys/class/power_supply/<supply_name>/charge_control_limit
|
||||||
|
Date: Oct 2012
|
||||||
|
Contact: linux-pm@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Maximum allowable charging current. Used for charge rate
|
||||||
|
throttling for thermal cooling or improving battery health.
|
||||||
|
|
||||||
|
Access: Read, Write
|
||||||
|
Valid values: Represented in microamps
|
||||||
|
|
||||||
|
What: /sys/class/power_supply/<supply_name>/charge_control_limit_max
|
||||||
|
Date: Oct 2012
|
||||||
|
Contact: linux-pm@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Maximum legal value for the charge_control_limit property.
|
||||||
|
|
||||||
|
Access: Read
|
||||||
|
Valid values: Represented in microamps
|
||||||
|
|
||||||
|
What: /sys/class/power_supply/<supply_name>/charge_control_start_threshold
|
||||||
|
Date: April 2019
|
||||||
|
Contact: linux-pm@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Represents a battery percentage level, below which charging will
|
||||||
|
begin.
|
||||||
|
|
||||||
|
Access: Read, Write
|
||||||
|
Valid values: 0 - 100 (percent)
|
||||||
|
|
||||||
|
What: /sys/class/power_supply/<supply_name>/charge_control_end_threshold
|
||||||
|
Date: April 2019
|
||||||
|
Contact: linux-pm@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Represents a battery percentage level, above which charging will
|
||||||
|
stop.
|
||||||
|
|
||||||
|
Access: Read, Write
|
||||||
|
Valid values: 0 - 100 (percent)
|
||||||
|
|
||||||
What: /sys/class/power_supply/<supply_name>/charge_type
|
What: /sys/class/power_supply/<supply_name>/charge_type
|
||||||
Date: July 2009
|
Date: July 2009
|
||||||
Contact: linux-pm@vger.kernel.org
|
Contact: linux-pm@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
Represents the type of charging currently being applied to the
|
Represents the type of charging currently being applied to the
|
||||||
battery.
|
battery. "Trickle", "Fast", and "Standard" all mean different
|
||||||
|
charging speeds. "Adaptive" means that the charger uses some
|
||||||
|
algorithm to adjust the charge rate dynamically, without
|
||||||
|
any user configuration required. "Custom" means that the charger
|
||||||
|
uses the charge_control_* properties as configuration for some
|
||||||
|
different algorithm.
|
||||||
|
|
||||||
Access: Read
|
Access: Read, Write
|
||||||
Valid values: "Unknown", "N/A", "Trickle", "Fast"
|
Valid values: "Unknown", "N/A", "Trickle", "Fast", "Standard",
|
||||||
|
"Adaptive", "Custom"
|
||||||
|
|
||||||
What: /sys/class/power_supply/<supply_name>/charge_term_current
|
What: /sys/class/power_supply/<supply_name>/charge_term_current
|
||||||
Date: July 2014
|
Date: July 2014
|
||||||
|
@@ -212,7 +212,7 @@ Description:
|
|||||||
Messages may be broken into parts if
|
Messages may be broken into parts if
|
||||||
they are long.
|
they are long.
|
||||||
|
|
||||||
receieved_messages: (RO) Number of message responses
|
received_messages: (RO) Number of message responses
|
||||||
received.
|
received.
|
||||||
|
|
||||||
received_message_parts: (RO) Number of message fragments
|
received_message_parts: (RO) Number of message fragments
|
||||||
|
@@ -484,6 +484,7 @@ What: /sys/devices/system/cpu/vulnerabilities
|
|||||||
/sys/devices/system/cpu/vulnerabilities/spectre_v2
|
/sys/devices/system/cpu/vulnerabilities/spectre_v2
|
||||||
/sys/devices/system/cpu/vulnerabilities/spec_store_bypass
|
/sys/devices/system/cpu/vulnerabilities/spec_store_bypass
|
||||||
/sys/devices/system/cpu/vulnerabilities/l1tf
|
/sys/devices/system/cpu/vulnerabilities/l1tf
|
||||||
|
/sys/devices/system/cpu/vulnerabilities/mds
|
||||||
Date: January 2018
|
Date: January 2018
|
||||||
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
|
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
|
||||||
Description: Information about CPU vulnerabilities
|
Description: Information about CPU vulnerabilities
|
||||||
@@ -496,8 +497,7 @@ Description: Information about CPU vulnerabilities
|
|||||||
"Vulnerable" CPU is affected and no mitigation in effect
|
"Vulnerable" CPU is affected and no mitigation in effect
|
||||||
"Mitigation: $M" CPU is affected and mitigation $M is in effect
|
"Mitigation: $M" CPU is affected and mitigation $M is in effect
|
||||||
|
|
||||||
Details about the l1tf file can be found in
|
See also: Documentation/admin-guide/hw-vuln/index.rst
|
||||||
Documentation/admin-guide/l1tf.rst
|
|
||||||
|
|
||||||
What: /sys/devices/system/cpu/smt
|
What: /sys/devices/system/cpu/smt
|
||||||
/sys/devices/system/cpu/smt/active
|
/sys/devices/system/cpu/smt/active
|
||||||
@@ -511,10 +511,30 @@ Description: Control Symmetric Multi Threading (SMT)
|
|||||||
control: Read/write interface to control SMT. Possible
|
control: Read/write interface to control SMT. Possible
|
||||||
values:
|
values:
|
||||||
|
|
||||||
"on" SMT is enabled
|
"on" SMT is enabled
|
||||||
"off" SMT is disabled
|
"off" SMT is disabled
|
||||||
"forceoff" SMT is force disabled. Cannot be changed.
|
"forceoff" SMT is force disabled. Cannot be changed.
|
||||||
"notsupported" SMT is not supported by the CPU
|
"notsupported" SMT is not supported by the CPU
|
||||||
|
"notimplemented" SMT runtime toggling is not
|
||||||
|
implemented for the architecture
|
||||||
|
|
||||||
If control status is "forceoff" or "notsupported" writes
|
If control status is "forceoff" or "notsupported" writes
|
||||||
are rejected.
|
are rejected.
|
||||||
|
|
||||||
|
What: /sys/devices/system/cpu/cpu#/power/energy_perf_bias
|
||||||
|
Date: March 2019
|
||||||
|
Contact: linux-pm@vger.kernel.org
|
||||||
|
Description: Intel Energy and Performance Bias Hint (EPB)
|
||||||
|
|
||||||
|
EPB for the given CPU in a sliding scale 0 - 15, where a value
|
||||||
|
of 0 corresponds to a hint preference for highest performance
|
||||||
|
and a value of 15 corresponds to the maximum energy savings.
|
||||||
|
|
||||||
|
In order to change the EPB value for the CPU, write either
|
||||||
|
a number in the 0 - 15 sliding scale above, or one of the
|
||||||
|
strings: "performance", "balance-performance", "normal",
|
||||||
|
"balance-power", "power" (that represent values reflected by
|
||||||
|
their meaning), to this attribute.
|
||||||
|
|
||||||
|
This attribute is present for all online CPUs supporting the
|
||||||
|
Intel EPB feature.
|
||||||
|
6
Documentation/ABI/testing/sysfs-driver-ucsi-ccg
Normal file
6
Documentation/ABI/testing/sysfs-driver-ucsi-ccg
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
What: /sys/bus/i2c/drivers/ucsi_ccg/.../do_flash
|
||||||
|
Date: May 2019
|
||||||
|
Contact: Ajay Gupta <ajayg@nvidia.com>
|
||||||
|
Description:
|
||||||
|
Tell the driver for Cypress CCGx Type-C controller to attempt
|
||||||
|
firmware upgrade by writing [Yy1] to the file.
|
@@ -45,7 +45,7 @@ Description:
|
|||||||
use this feature without a clearance from a patch
|
use this feature without a clearance from a patch
|
||||||
distributor. Removal (rmmod) of patch modules is permanently
|
distributor. Removal (rmmod) of patch modules is permanently
|
||||||
disabled when the feature is used. See
|
disabled when the feature is used. See
|
||||||
Documentation/livepatch/livepatch.txt for more information.
|
Documentation/livepatch/livepatch.rst for more information.
|
||||||
|
|
||||||
What: /sys/kernel/livepatch/<patch>/<object>
|
What: /sys/kernel/livepatch/<patch>/<object>
|
||||||
Date: Nov 2014
|
Date: Nov 2014
|
||||||
|
27
Documentation/ABI/testing/usb-uevent
Normal file
27
Documentation/ABI/testing/usb-uevent
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
What: Raise a uevent when a USB Host Controller has died
|
||||||
|
Date: 2019-04-17
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-usb@vger.kernel.org
|
||||||
|
Description: When the USB Host Controller has entered a state where it is no
|
||||||
|
longer functional, a uevent will be raised. The uevent will
|
||||||
|
contain ACTION=offline and ERROR=DEAD.
|
||||||
|
|
||||||
|
Here is an example taken using udevadm monitor -p:
|
||||||
|
|
||||||
|
KERNEL[130.428945] offline /devices/pci0000:00/0000:00:10.0/usb2 (usb)
|
||||||
|
ACTION=offline
|
||||||
|
BUSNUM=002
|
||||||
|
DEVNAME=/dev/bus/usb/002/001
|
||||||
|
DEVNUM=001
|
||||||
|
DEVPATH=/devices/pci0000:00/0000:00:10.0/usb2
|
||||||
|
DEVTYPE=usb_device
|
||||||
|
DRIVER=usb
|
||||||
|
ERROR=DEAD
|
||||||
|
MAJOR=189
|
||||||
|
MINOR=128
|
||||||
|
PRODUCT=1d6b/2/414
|
||||||
|
SEQNUM=2168
|
||||||
|
SUBSYSTEM=usb
|
||||||
|
TYPE=9/0/1
|
||||||
|
|
||||||
|
Users: chromium-os-dev@chromium.org
|
@@ -147,7 +147,7 @@ networking subsystems make sure that the buffers they use are valid
|
|||||||
for you to DMA from/to.
|
for you to DMA from/to.
|
||||||
|
|
||||||
DMA addressing capabilities
|
DMA addressing capabilities
|
||||||
==========================
|
===========================
|
||||||
|
|
||||||
By default, the kernel assumes that your device can address 32-bits of DMA
|
By default, the kernel assumes that your device can address 32-bits of DMA
|
||||||
addressing. For a 64-bit capable device, this needs to be increased, and for
|
addressing. For a 64-bit capable device, this needs to be increased, and for
|
||||||
@@ -365,13 +365,12 @@ __get_free_pages() (but takes size instead of a page order). If your
|
|||||||
driver needs regions sized smaller than a page, you may prefer using
|
driver needs regions sized smaller than a page, you may prefer using
|
||||||
the dma_pool interface, described below.
|
the dma_pool interface, described below.
|
||||||
|
|
||||||
The consistent DMA mapping interfaces, for non-NULL dev, will by
|
The consistent DMA mapping interfaces will by default return a DMA address
|
||||||
default return a DMA address which is 32-bit addressable. Even if the
|
which is 32-bit addressable. Even if the device indicates (via the DMA mask)
|
||||||
device indicates (via DMA mask) that it may address the upper 32-bits,
|
that it may address the upper 32-bits, consistent allocation will only
|
||||||
consistent allocation will only return > 32-bit addresses for DMA if
|
return > 32-bit addresses for DMA if the consistent DMA mask has been
|
||||||
the consistent DMA mask has been explicitly changed via
|
explicitly changed via dma_set_coherent_mask(). This is true of the
|
||||||
dma_set_coherent_mask(). This is true of the dma_pool interface as
|
dma_pool interface as well.
|
||||||
well.
|
|
||||||
|
|
||||||
dma_alloc_coherent() returns two values: the virtual address which you
|
dma_alloc_coherent() returns two values: the virtual address which you
|
||||||
can use to access it from the CPU and dma_handle which you pass to the
|
can use to access it from the CPU and dma_handle which you pass to the
|
||||||
|
@@ -28,8 +28,13 @@ ifeq ($(HAVE_SPHINX),0)
|
|||||||
|
|
||||||
else # HAVE_SPHINX
|
else # HAVE_SPHINX
|
||||||
|
|
||||||
# User-friendly check for pdflatex
|
# User-friendly check for pdflatex and latexmk
|
||||||
HAVE_PDFLATEX := $(shell if which $(PDFLATEX) >/dev/null 2>&1; then echo 1; else echo 0; fi)
|
HAVE_PDFLATEX := $(shell if which $(PDFLATEX) >/dev/null 2>&1; then echo 1; else echo 0; fi)
|
||||||
|
HAVE_LATEXMK := $(shell if which latexmk >/dev/null 2>&1; then echo 1; else echo 0; fi)
|
||||||
|
|
||||||
|
ifeq ($(HAVE_LATEXMK),1)
|
||||||
|
PDFLATEX := latexmk -$(PDFLATEX)
|
||||||
|
endif #HAVE_LATEXMK
|
||||||
|
|
||||||
# Internal variables.
|
# Internal variables.
|
||||||
PAPEROPT_a4 = -D latex_paper_size=a4
|
PAPEROPT_a4 = -D latex_paper_size=a4
|
||||||
@@ -82,7 +87,7 @@ pdfdocs:
|
|||||||
else # HAVE_PDFLATEX
|
else # HAVE_PDFLATEX
|
||||||
|
|
||||||
pdfdocs: latexdocs
|
pdfdocs: latexdocs
|
||||||
$(foreach var,$(SPHINXDIRS), $(MAKE) PDFLATEX=$(PDFLATEX) LATEXOPTS="$(LATEXOPTS)" -C $(BUILDDIR)/$(var)/latex || exit;)
|
$(foreach var,$(SPHINXDIRS), $(MAKE) PDFLATEX="$(PDFLATEX)" LATEXOPTS="$(LATEXOPTS)" -C $(BUILDDIR)/$(var)/latex || exit;)
|
||||||
|
|
||||||
endif # HAVE_PDFLATEX
|
endif # HAVE_PDFLATEX
|
||||||
|
|
||||||
|
@@ -155,8 +155,7 @@ keeping lock contention under control at all tree levels regardless
|
|||||||
of the level of loading on the system.
|
of the level of loading on the system.
|
||||||
|
|
||||||
</p><p>RCU updaters wait for normal grace periods by registering
|
</p><p>RCU updaters wait for normal grace periods by registering
|
||||||
RCU callbacks, either directly via <tt>call_rcu()</tt> and
|
RCU callbacks, either directly via <tt>call_rcu()</tt>
|
||||||
friends (namely <tt>call_rcu_bh()</tt> and <tt>call_rcu_sched()</tt>),
|
|
||||||
or indirectly via <tt>synchronize_rcu()</tt> and friends.
|
or indirectly via <tt>synchronize_rcu()</tt> and friends.
|
||||||
RCU callbacks are represented by <tt>rcu_head</tt> structures,
|
RCU callbacks are represented by <tt>rcu_head</tt> structures,
|
||||||
which are queued on <tt>rcu_data</tt> structures while they are
|
which are queued on <tt>rcu_data</tt> structures while they are
|
||||||
|
@@ -56,6 +56,7 @@ sections.
|
|||||||
RCU-preempt Expedited Grace Periods</a></h2>
|
RCU-preempt Expedited Grace Periods</a></h2>
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
|
<tt>CONFIG_PREEMPT=y</tt> kernels implement RCU-preempt.
|
||||||
The overall flow of the handling of a given CPU by an RCU-preempt
|
The overall flow of the handling of a given CPU by an RCU-preempt
|
||||||
expedited grace period is shown in the following diagram:
|
expedited grace period is shown in the following diagram:
|
||||||
|
|
||||||
@@ -139,6 +140,7 @@ or offline, among other things.
|
|||||||
RCU-sched Expedited Grace Periods</a></h2>
|
RCU-sched Expedited Grace Periods</a></h2>
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
|
<tt>CONFIG_PREEMPT=n</tt> kernels implement RCU-sched.
|
||||||
The overall flow of the handling of a given CPU by an RCU-sched
|
The overall flow of the handling of a given CPU by an RCU-sched
|
||||||
expedited grace period is shown in the following diagram:
|
expedited grace period is shown in the following diagram:
|
||||||
|
|
||||||
@@ -146,7 +148,7 @@ expedited grace period is shown in the following diagram:
|
|||||||
|
|
||||||
<p>
|
<p>
|
||||||
As with RCU-preempt, RCU-sched's
|
As with RCU-preempt, RCU-sched's
|
||||||
<tt>synchronize_sched_expedited()</tt> ignores offline and
|
<tt>synchronize_rcu_expedited()</tt> ignores offline and
|
||||||
idle CPUs, again because they are in remotely detectable
|
idle CPUs, again because they are in remotely detectable
|
||||||
quiescent states.
|
quiescent states.
|
||||||
However, because the
|
However, because the
|
||||||
|
@@ -34,12 +34,11 @@ Similarly, any code that happens before the beginning of a given RCU grace
|
|||||||
period is guaranteed to see the effects of all accesses following the end
|
period is guaranteed to see the effects of all accesses following the end
|
||||||
of that grace period that are within RCU read-side critical sections.
|
of that grace period that are within RCU read-side critical sections.
|
||||||
|
|
||||||
<p>This guarantee is particularly pervasive for <tt>synchronize_sched()</tt>,
|
<p>Note well that RCU-sched read-side critical sections include any region
|
||||||
for which RCU-sched read-side critical sections include any region
|
|
||||||
of code for which preemption is disabled.
|
of code for which preemption is disabled.
|
||||||
Given that each individual machine instruction can be thought of as
|
Given that each individual machine instruction can be thought of as
|
||||||
an extremely small region of preemption-disabled code, one can think of
|
an extremely small region of preemption-disabled code, one can think of
|
||||||
<tt>synchronize_sched()</tt> as <tt>smp_mb()</tt> on steroids.
|
<tt>synchronize_rcu()</tt> as <tt>smp_mb()</tt> on steroids.
|
||||||
|
|
||||||
<p>RCU updaters use this guarantee by splitting their updates into
|
<p>RCU updaters use this guarantee by splitting their updates into
|
||||||
two phases, one of which is executed before the grace period and
|
two phases, one of which is executed before the grace period and
|
||||||
|
@@ -81,18 +81,19 @@ currently executing on some other CPU. We therefore cannot free
|
|||||||
up any data structures used by the old NMI handler until execution
|
up any data structures used by the old NMI handler until execution
|
||||||
of it completes on all other CPUs.
|
of it completes on all other CPUs.
|
||||||
|
|
||||||
One way to accomplish this is via synchronize_sched(), perhaps as
|
One way to accomplish this is via synchronize_rcu(), perhaps as
|
||||||
follows:
|
follows:
|
||||||
|
|
||||||
unset_nmi_callback();
|
unset_nmi_callback();
|
||||||
synchronize_sched();
|
synchronize_rcu();
|
||||||
kfree(my_nmi_data);
|
kfree(my_nmi_data);
|
||||||
|
|
||||||
This works because synchronize_sched() blocks until all CPUs complete
|
This works because (as of v4.20) synchronize_rcu() blocks until all
|
||||||
any preemption-disabled segments of code that they were executing.
|
CPUs complete any preemption-disabled segments of code that they were
|
||||||
Since NMI handlers disable preemption, synchronize_sched() is guaranteed
|
executing.
|
||||||
|
Since NMI handlers disable preemption, synchronize_rcu() is guaranteed
|
||||||
not to return until all ongoing NMI handlers exit. It is therefore safe
|
not to return until all ongoing NMI handlers exit. It is therefore safe
|
||||||
to free up the handler's data as soon as synchronize_sched() returns.
|
to free up the handler's data as soon as synchronize_rcu() returns.
|
||||||
|
|
||||||
Important note: for this to work, the architecture in question must
|
Important note: for this to work, the architecture in question must
|
||||||
invoke nmi_enter() and nmi_exit() on NMI entry and exit, respectively.
|
invoke nmi_enter() and nmi_exit() on NMI entry and exit, respectively.
|
||||||
|
@@ -86,10 +86,8 @@ even on a UP system. So do not do it! Even on a UP system, the RCU
|
|||||||
infrastructure -must- respect grace periods, and -must- invoke callbacks
|
infrastructure -must- respect grace periods, and -must- invoke callbacks
|
||||||
from a known environment in which no locks are held.
|
from a known environment in which no locks are held.
|
||||||
|
|
||||||
It -is- safe for synchronize_sched() and synchronize_rcu_bh() to return
|
Note that it -is- safe for synchronize_rcu() to return immediately on
|
||||||
immediately on an UP system. It is also safe for synchronize_rcu()
|
UP systems, including !PREEMPT SMP builds running on UP systems.
|
||||||
to return immediately on UP systems, except when running preemptable
|
|
||||||
RCU.
|
|
||||||
|
|
||||||
Quick Quiz #3: Why can't synchronize_rcu() return immediately on
|
Quick Quiz #3: Why can't synchronize_rcu() return immediately on
|
||||||
UP systems running preemptable RCU?
|
UP systems running preemptable RCU?
|
||||||
|
@@ -182,16 +182,13 @@ over a rather long period of time, but improvements are always welcome!
|
|||||||
when publicizing a pointer to a structure that can
|
when publicizing a pointer to a structure that can
|
||||||
be traversed by an RCU read-side critical section.
|
be traversed by an RCU read-side critical section.
|
||||||
|
|
||||||
5. If call_rcu(), or a related primitive such as call_rcu_bh(),
|
5. If call_rcu() or call_srcu() is used, the callback function will
|
||||||
call_rcu_sched(), or call_srcu() is used, the callback function
|
be called from softirq context. In particular, it cannot block.
|
||||||
will be called from softirq context. In particular, it cannot
|
|
||||||
block.
|
|
||||||
|
|
||||||
6. Since synchronize_rcu() can block, it cannot be called from
|
6. Since synchronize_rcu() can block, it cannot be called
|
||||||
any sort of irq context. The same rule applies for
|
from any sort of irq context. The same rule applies
|
||||||
synchronize_rcu_bh(), synchronize_sched(), synchronize_srcu(),
|
for synchronize_srcu(), synchronize_rcu_expedited(), and
|
||||||
synchronize_rcu_expedited(), synchronize_rcu_bh_expedited(),
|
synchronize_srcu_expedited().
|
||||||
synchronize_sched_expedite(), and synchronize_srcu_expedited().
|
|
||||||
|
|
||||||
The expedited forms of these primitives have the same semantics
|
The expedited forms of these primitives have the same semantics
|
||||||
as the non-expedited forms, but expediting is both expensive and
|
as the non-expedited forms, but expediting is both expensive and
|
||||||
@@ -212,20 +209,20 @@ over a rather long period of time, but improvements are always welcome!
|
|||||||
of the system, especially to real-time workloads running on
|
of the system, especially to real-time workloads running on
|
||||||
the rest of the system.
|
the rest of the system.
|
||||||
|
|
||||||
7. If the updater uses call_rcu() or synchronize_rcu(), then the
|
7. As of v4.20, a given kernel implements only one RCU flavor,
|
||||||
corresponding readers must use rcu_read_lock() and
|
which is RCU-sched for PREEMPT=n and RCU-preempt for PREEMPT=y.
|
||||||
rcu_read_unlock(). If the updater uses call_rcu_bh() or
|
If the updater uses call_rcu() or synchronize_rcu(),
|
||||||
synchronize_rcu_bh(), then the corresponding readers must
|
then the corresponding readers may use rcu_read_lock() and
|
||||||
use rcu_read_lock_bh() and rcu_read_unlock_bh(). If the
|
rcu_read_unlock(), rcu_read_lock_bh() and rcu_read_unlock_bh(),
|
||||||
updater uses call_rcu_sched() or synchronize_sched(), then
|
or any pair of primitives that disables and re-enables preemption,
|
||||||
the corresponding readers must disable preemption, possibly
|
for example, rcu_read_lock_sched() and rcu_read_unlock_sched().
|
||||||
by calling rcu_read_lock_sched() and rcu_read_unlock_sched().
|
If the updater uses synchronize_srcu() or call_srcu(),
|
||||||
If the updater uses synchronize_srcu() or call_srcu(), then
|
then the corresponding readers must use srcu_read_lock() and
|
||||||
the corresponding readers must use srcu_read_lock() and
|
|
||||||
srcu_read_unlock(), and with the same srcu_struct. The rules for
|
srcu_read_unlock(), and with the same srcu_struct. The rules for
|
||||||
the expedited primitives are the same as for their non-expedited
|
the expedited primitives are the same as for their non-expedited
|
||||||
counterparts. Mixing things up will result in confusion and
|
counterparts. Mixing things up will result in confusion and
|
||||||
broken kernels.
|
broken kernels, and has even resulted in an exploitable security
|
||||||
|
issue.
|
||||||
|
|
||||||
One exception to this rule: rcu_read_lock() and rcu_read_unlock()
|
One exception to this rule: rcu_read_lock() and rcu_read_unlock()
|
||||||
may be substituted for rcu_read_lock_bh() and rcu_read_unlock_bh()
|
may be substituted for rcu_read_lock_bh() and rcu_read_unlock_bh()
|
||||||
@@ -288,8 +285,7 @@ over a rather long period of time, but improvements are always welcome!
|
|||||||
d. Periodically invoke synchronize_rcu(), permitting a limited
|
d. Periodically invoke synchronize_rcu(), permitting a limited
|
||||||
number of updates per grace period.
|
number of updates per grace period.
|
||||||
|
|
||||||
The same cautions apply to call_rcu_bh(), call_rcu_sched(),
|
The same cautions apply to call_srcu() and kfree_rcu().
|
||||||
call_srcu(), and kfree_rcu().
|
|
||||||
|
|
||||||
Note that although these primitives do take action to avoid memory
|
Note that although these primitives do take action to avoid memory
|
||||||
exhaustion when any given CPU has too many callbacks, a determined
|
exhaustion when any given CPU has too many callbacks, a determined
|
||||||
@@ -322,7 +318,7 @@ over a rather long period of time, but improvements are always welcome!
|
|||||||
|
|
||||||
11. Any lock acquired by an RCU callback must be acquired elsewhere
|
11. Any lock acquired by an RCU callback must be acquired elsewhere
|
||||||
with softirq disabled, e.g., via spin_lock_irqsave(),
|
with softirq disabled, e.g., via spin_lock_irqsave(),
|
||||||
spin_lock_bh(), etc. Failing to disable irq on a given
|
spin_lock_bh(), etc. Failing to disable softirq on a given
|
||||||
acquisition of that lock will result in deadlock as soon as
|
acquisition of that lock will result in deadlock as soon as
|
||||||
the RCU softirq handler happens to run your RCU callback while
|
the RCU softirq handler happens to run your RCU callback while
|
||||||
interrupting that acquisition's critical section.
|
interrupting that acquisition's critical section.
|
||||||
@@ -335,13 +331,16 @@ over a rather long period of time, but improvements are always welcome!
|
|||||||
must use whatever locking or other synchronization is required
|
must use whatever locking or other synchronization is required
|
||||||
to safely access and/or modify that data structure.
|
to safely access and/or modify that data structure.
|
||||||
|
|
||||||
RCU callbacks are -usually- executed on the same CPU that executed
|
Do not assume that RCU callbacks will be executed on the same
|
||||||
the corresponding call_rcu(), call_rcu_bh(), or call_rcu_sched(),
|
CPU that executed the corresponding call_rcu() or call_srcu().
|
||||||
but are by -no- means guaranteed to be. For example, if a given
|
For example, if a given CPU goes offline while having an RCU
|
||||||
CPU goes offline while having an RCU callback pending, then that
|
callback pending, then that RCU callback will execute on some
|
||||||
RCU callback will execute on some surviving CPU. (If this was
|
surviving CPU. (If this was not the case, a self-spawning RCU
|
||||||
not the case, a self-spawning RCU callback would prevent the
|
callback would prevent the victim CPU from ever going offline.)
|
||||||
victim CPU from ever going offline.)
|
Furthermore, CPUs designated by rcu_nocbs= might well -always-
|
||||||
|
have their RCU callbacks executed on some other CPUs, in fact,
|
||||||
|
for some real-time workloads, this is the whole point of using
|
||||||
|
the rcu_nocbs= kernel boot parameter.
|
||||||
|
|
||||||
13. Unlike other forms of RCU, it -is- permissible to block in an
|
13. Unlike other forms of RCU, it -is- permissible to block in an
|
||||||
SRCU read-side critical section (demarked by srcu_read_lock()
|
SRCU read-side critical section (demarked by srcu_read_lock()
|
||||||
@@ -381,11 +380,11 @@ over a rather long period of time, but improvements are always welcome!
|
|||||||
|
|
||||||
SRCU's expedited primitive (synchronize_srcu_expedited())
|
SRCU's expedited primitive (synchronize_srcu_expedited())
|
||||||
never sends IPIs to other CPUs, so it is easier on
|
never sends IPIs to other CPUs, so it is easier on
|
||||||
real-time workloads than is synchronize_rcu_expedited(),
|
real-time workloads than is synchronize_rcu_expedited().
|
||||||
synchronize_rcu_bh_expedited() or synchronize_sched_expedited().
|
|
||||||
|
|
||||||
Note that rcu_dereference() and rcu_assign_pointer() relate to
|
Note that rcu_assign_pointer() relates to SRCU just as it does to
|
||||||
SRCU just as they do to other forms of RCU.
|
other forms of RCU, but instead of rcu_dereference() you should
|
||||||
|
use srcu_dereference() in order to avoid lockdep splats.
|
||||||
|
|
||||||
14. The whole point of call_rcu(), synchronize_rcu(), and friends
|
14. The whole point of call_rcu(), synchronize_rcu(), and friends
|
||||||
is to wait until all pre-existing readers have finished before
|
is to wait until all pre-existing readers have finished before
|
||||||
@@ -405,6 +404,9 @@ over a rather long period of time, but improvements are always welcome!
|
|||||||
read-side critical sections. It is the responsibility of the
|
read-side critical sections. It is the responsibility of the
|
||||||
RCU update-side primitives to deal with this.
|
RCU update-side primitives to deal with this.
|
||||||
|
|
||||||
|
For SRCU readers, you can use smp_mb__after_srcu_read_unlock()
|
||||||
|
immediately after an srcu_read_unlock() to get a full barrier.
|
||||||
|
|
||||||
16. Use CONFIG_PROVE_LOCKING, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the
|
16. Use CONFIG_PROVE_LOCKING, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the
|
||||||
__rcu sparse checks to validate your RCU code. These can help
|
__rcu sparse checks to validate your RCU code. These can help
|
||||||
find problems as follows:
|
find problems as follows:
|
||||||
@@ -428,22 +430,19 @@ over a rather long period of time, but improvements are always welcome!
|
|||||||
These debugging aids can help you find problems that are
|
These debugging aids can help you find problems that are
|
||||||
otherwise extremely difficult to spot.
|
otherwise extremely difficult to spot.
|
||||||
|
|
||||||
17. If you register a callback using call_rcu(), call_rcu_bh(),
|
17. If you register a callback using call_rcu() or call_srcu(), and
|
||||||
call_rcu_sched(), or call_srcu(), and pass in a function defined
|
pass in a function defined within a loadable module, then it is
|
||||||
within a loadable module, then it in necessary to wait for
|
necessary to wait for all pending callbacks to be invoked after
|
||||||
all pending callbacks to be invoked after the last invocation
|
the last invocation and before unloading that module. Note that
|
||||||
and before unloading that module. Note that it is absolutely
|
it is absolutely -not- sufficient to wait for a grace period!
|
||||||
-not- sufficient to wait for a grace period! The current (say)
|
The current (say) synchronize_rcu() implementation is -not-
|
||||||
synchronize_rcu() implementation waits only for all previous
|
guaranteed to wait for callbacks registered on other CPUs.
|
||||||
callbacks registered on the CPU that synchronize_rcu() is running
|
Or even on the current CPU if that CPU recently went offline
|
||||||
on, but it is -not- guaranteed to wait for callbacks registered
|
and came back online.
|
||||||
on other CPUs.
|
|
||||||
|
|
||||||
You instead need to use one of the barrier functions:
|
You instead need to use one of the barrier functions:
|
||||||
|
|
||||||
o call_rcu() -> rcu_barrier()
|
o call_rcu() -> rcu_barrier()
|
||||||
o call_rcu_bh() -> rcu_barrier()
|
|
||||||
o call_rcu_sched() -> rcu_barrier()
|
|
||||||
o call_srcu() -> srcu_barrier()
|
o call_srcu() -> srcu_barrier()
|
||||||
|
|
||||||
However, these barrier functions are absolutely -not- guaranteed
|
However, these barrier functions are absolutely -not- guaranteed
|
||||||
|
@@ -52,10 +52,10 @@ o If I am running on a uniprocessor kernel, which can only do one
|
|||||||
o How can I see where RCU is currently used in the Linux kernel?
|
o How can I see where RCU is currently used in the Linux kernel?
|
||||||
|
|
||||||
Search for "rcu_read_lock", "rcu_read_unlock", "call_rcu",
|
Search for "rcu_read_lock", "rcu_read_unlock", "call_rcu",
|
||||||
"rcu_read_lock_bh", "rcu_read_unlock_bh", "call_rcu_bh",
|
"rcu_read_lock_bh", "rcu_read_unlock_bh", "srcu_read_lock",
|
||||||
"srcu_read_lock", "srcu_read_unlock", "synchronize_rcu",
|
"srcu_read_unlock", "synchronize_rcu", "synchronize_net",
|
||||||
"synchronize_net", "synchronize_srcu", and the other RCU
|
"synchronize_srcu", and the other RCU primitives. Or grab one
|
||||||
primitives. Or grab one of the cscope databases from:
|
of the cscope databases from:
|
||||||
|
|
||||||
http://www.rdrop.com/users/paulmck/RCU/linuxusage/rculocktab.html
|
http://www.rdrop.com/users/paulmck/RCU/linuxusage/rculocktab.html
|
||||||
|
|
||||||
|
@@ -351,3 +351,106 @@ garbage values.
|
|||||||
|
|
||||||
In short, rcu_dereference() is -not- optional when you are going to
|
In short, rcu_dereference() is -not- optional when you are going to
|
||||||
dereference the resulting pointer.
|
dereference the resulting pointer.
|
||||||
|
|
||||||
|
|
||||||
|
WHICH MEMBER OF THE rcu_dereference() FAMILY SHOULD YOU USE?
|
||||||
|
|
||||||
|
First, please avoid using rcu_dereference_raw() and also please avoid
|
||||||
|
using rcu_dereference_check() and rcu_dereference_protected() with a
|
||||||
|
second argument with a constant value of 1 (or true, for that matter).
|
||||||
|
With that caution out of the way, here is some guidance for which
|
||||||
|
member of the rcu_dereference() family to use in various situations:
|
||||||
|
|
||||||
|
1. If the access needs to be within an RCU read-side critical
|
||||||
|
section, use rcu_dereference(). With the new consolidated
|
||||||
|
RCU flavors, an RCU read-side critical section is entered
|
||||||
|
using rcu_read_lock(), anything that disables bottom halves,
|
||||||
|
anything that disables interrupts, or anything that disables
|
||||||
|
preemption.
|
||||||
|
|
||||||
|
2. If the access might be within an RCU read-side critical section
|
||||||
|
on the one hand, or protected by (say) my_lock on the other,
|
||||||
|
use rcu_dereference_check(), for example:
|
||||||
|
|
||||||
|
p1 = rcu_dereference_check(p->rcu_protected_pointer,
|
||||||
|
lockdep_is_held(&my_lock));
|
||||||
|
|
||||||
|
|
||||||
|
3. If the access might be within an RCU read-side critical section
|
||||||
|
on the one hand, or protected by either my_lock or your_lock on
|
||||||
|
the other, again use rcu_dereference_check(), for example:
|
||||||
|
|
||||||
|
p1 = rcu_dereference_check(p->rcu_protected_pointer,
|
||||||
|
lockdep_is_held(&my_lock) ||
|
||||||
|
lockdep_is_held(&your_lock));
|
||||||
|
|
||||||
|
4. If the access is on the update side, so that it is always protected
|
||||||
|
by my_lock, use rcu_dereference_protected():
|
||||||
|
|
||||||
|
p1 = rcu_dereference_protected(p->rcu_protected_pointer,
|
||||||
|
lockdep_is_held(&my_lock));
|
||||||
|
|
||||||
|
This can be extended to handle multiple locks as in #3 above,
|
||||||
|
and both can be extended to check other conditions as well.
|
||||||
|
|
||||||
|
5. If the protection is supplied by the caller, and is thus unknown
|
||||||
|
to this code, that is the rare case when rcu_dereference_raw()
|
||||||
|
is appropriate. In addition, rcu_dereference_raw() might be
|
||||||
|
appropriate when the lockdep expression would be excessively
|
||||||
|
complex, except that a better approach in that case might be to
|
||||||
|
take a long hard look at your synchronization design. Still,
|
||||||
|
there are data-locking cases where any one of a very large number
|
||||||
|
of locks or reference counters suffices to protect the pointer,
|
||||||
|
so rcu_dereference_raw() does have its place.
|
||||||
|
|
||||||
|
However, its place is probably quite a bit smaller than one
|
||||||
|
might expect given the number of uses in the current kernel.
|
||||||
|
Ditto for its synonym, rcu_dereference_check( ... , 1), and
|
||||||
|
its close relative, rcu_dereference_protected(... , 1).
|
||||||
|
|
||||||
|
|
||||||
|
SPARSE CHECKING OF RCU-PROTECTED POINTERS
|
||||||
|
|
||||||
|
The sparse static-analysis tool checks for direct access to RCU-protected
|
||||||
|
pointers, which can result in "interesting" bugs due to compiler
|
||||||
|
optimizations involving invented loads and perhaps also load tearing.
|
||||||
|
For example, suppose someone mistakenly does something like this:
|
||||||
|
|
||||||
|
p = q->rcu_protected_pointer;
|
||||||
|
do_something_with(p->a);
|
||||||
|
do_something_else_with(p->b);
|
||||||
|
|
||||||
|
If register pressure is high, the compiler might optimize "p" out
|
||||||
|
of existence, transforming the code to something like this:
|
||||||
|
|
||||||
|
do_something_with(q->rcu_protected_pointer->a);
|
||||||
|
do_something_else_with(q->rcu_protected_pointer->b);
|
||||||
|
|
||||||
|
This could fatally disappoint your code if q->rcu_protected_pointer
|
||||||
|
changed in the meantime. Nor is this a theoretical problem: Exactly
|
||||||
|
this sort of bug cost Paul E. McKenney (and several of his innocent
|
||||||
|
colleagues) a three-day weekend back in the early 1990s.
|
||||||
|
|
||||||
|
Load tearing could of course result in dereferencing a mashup of a pair
|
||||||
|
of pointers, which also might fatally disappoint your code.
|
||||||
|
|
||||||
|
These problems could have been avoided simply by making the code instead
|
||||||
|
read as follows:
|
||||||
|
|
||||||
|
p = rcu_dereference(q->rcu_protected_pointer);
|
||||||
|
do_something_with(p->a);
|
||||||
|
do_something_else_with(p->b);
|
||||||
|
|
||||||
|
Unfortunately, these sorts of bugs can be extremely hard to spot during
|
||||||
|
review. This is where the sparse tool comes into play, along with the
|
||||||
|
"__rcu" marker. If you mark a pointer declaration, whether in a structure
|
||||||
|
or as a formal parameter, with "__rcu", this tells sparse to complain if
|
||||||
|
this pointer is accessed directly. It will also cause sparse to complain
|
||||||
|
if a pointer not marked with "__rcu" is accessed using rcu_dereference()
|
||||||
|
and friends. For example, ->rcu_protected_pointer might be declared as
|
||||||
|
follows:
|
||||||
|
|
||||||
|
struct foo __rcu *rcu_protected_pointer;
|
||||||
|
|
||||||
|
Use of "__rcu" is opt-in. If you choose not to use it, then you should
|
||||||
|
ignore the sparse warnings.
|
||||||
|
@@ -83,16 +83,15 @@ Pseudo-code using rcu_barrier() is as follows:
|
|||||||
2. Execute rcu_barrier().
|
2. Execute rcu_barrier().
|
||||||
3. Allow the module to be unloaded.
|
3. Allow the module to be unloaded.
|
||||||
|
|
||||||
There are also rcu_barrier_bh(), rcu_barrier_sched(), and srcu_barrier()
|
There is also an srcu_barrier() function for SRCU, and you of course
|
||||||
functions for the other flavors of RCU, and you of course must match
|
must match the flavor of rcu_barrier() with that of call_rcu(). If your
|
||||||
the flavor of rcu_barrier() with that of call_rcu(). If your module
|
module uses multiple flavors of call_rcu(), then it must also use multiple
|
||||||
uses multiple flavors of call_rcu(), then it must also use multiple
|
|
||||||
flavors of rcu_barrier() when unloading that module. For example, if
|
flavors of rcu_barrier() when unloading that module. For example, if
|
||||||
it uses call_rcu_bh(), call_srcu() on srcu_struct_1, and call_srcu() on
|
it uses call_rcu(), call_srcu() on srcu_struct_1, and call_srcu() on
|
||||||
srcu_struct_2(), then the following three lines of code will be required
|
srcu_struct_2, then the following three lines of code will be required
|
||||||
when unloading:
|
when unloading:
|
||||||
|
|
||||||
1 rcu_barrier_bh();
|
1 rcu_barrier();
|
||||||
2 srcu_barrier(&srcu_struct_1);
|
2 srcu_barrier(&srcu_struct_1);
|
||||||
3 srcu_barrier(&srcu_struct_2);
|
3 srcu_barrier(&srcu_struct_2);
|
||||||
|
|
||||||
@@ -185,12 +184,12 @@ module invokes call_rcu() from timers, you will need to first cancel all
|
|||||||
the timers, and only then invoke rcu_barrier() to wait for any remaining
|
the timers, and only then invoke rcu_barrier() to wait for any remaining
|
||||||
RCU callbacks to complete.
|
RCU callbacks to complete.
|
||||||
|
|
||||||
Of course, if you module uses call_rcu_bh(), you will need to invoke
|
Of course, if your module uses call_rcu(), you will need to invoke
|
||||||
rcu_barrier_bh() before unloading. Similarly, if your module uses
|
rcu_barrier() before unloading. Similarly, if your module uses
|
||||||
call_rcu_sched(), you will need to invoke rcu_barrier_sched() before
|
call_srcu(), you will need to invoke srcu_barrier() before unloading,
|
||||||
unloading. If your module uses call_rcu(), call_rcu_bh(), -and-
|
and on the same srcu_struct structure. If your module uses call_rcu()
|
||||||
call_rcu_sched(), then you will need to invoke each of rcu_barrier(),
|
-and- call_srcu(), then you will need to invoke rcu_barrier() -and-
|
||||||
rcu_barrier_bh(), and rcu_barrier_sched().
|
srcu_barrier().
|
||||||
|
|
||||||
|
|
||||||
Implementing rcu_barrier()
|
Implementing rcu_barrier()
|
||||||
@@ -223,8 +222,8 @@ shown below. Note that the final "1" in on_each_cpu()'s argument list
|
|||||||
ensures that all the calls to rcu_barrier_func() will have completed
|
ensures that all the calls to rcu_barrier_func() will have completed
|
||||||
before on_each_cpu() returns. Line 9 then waits for the completion.
|
before on_each_cpu() returns. Line 9 then waits for the completion.
|
||||||
|
|
||||||
This code was rewritten in 2008 to support rcu_barrier_bh() and
|
This code was rewritten in 2008 and several times thereafter, but this
|
||||||
rcu_barrier_sched() in addition to the original rcu_barrier().
|
still gives the general idea.
|
||||||
|
|
||||||
The rcu_barrier_func() runs on each CPU, where it invokes call_rcu()
|
The rcu_barrier_func() runs on each CPU, where it invokes call_rcu()
|
||||||
to post an RCU callback, as follows:
|
to post an RCU callback, as follows:
|
||||||
|
@@ -310,7 +310,7 @@ reader, updater, and reclaimer.
|
|||||||
|
|
||||||
|
|
||||||
rcu_assign_pointer()
|
rcu_assign_pointer()
|
||||||
+--------+
|
+--------+
|
||||||
+---------------------->| reader |---------+
|
+---------------------->| reader |---------+
|
||||||
| +--------+ |
|
| +--------+ |
|
||||||
| | |
|
| | |
|
||||||
@@ -318,12 +318,12 @@ reader, updater, and reclaimer.
|
|||||||
| | | rcu_read_lock()
|
| | | rcu_read_lock()
|
||||||
| | | rcu_read_unlock()
|
| | | rcu_read_unlock()
|
||||||
| rcu_dereference() | |
|
| rcu_dereference() | |
|
||||||
+---------+ | |
|
+---------+ | |
|
||||||
| updater |<---------------------+ |
|
| updater |<----------------+ |
|
||||||
+---------+ V
|
+---------+ V
|
||||||
| +-----------+
|
| +-----------+
|
||||||
+----------------------------------->| reclaimer |
|
+----------------------------------->| reclaimer |
|
||||||
+-----------+
|
+-----------+
|
||||||
Defer:
|
Defer:
|
||||||
synchronize_rcu() & call_rcu()
|
synchronize_rcu() & call_rcu()
|
||||||
|
|
||||||
|
@@ -63,6 +63,110 @@ as well as medium and long term trends. The total absolute stall time
|
|||||||
spikes which wouldn't necessarily make a dent in the time averages,
|
spikes which wouldn't necessarily make a dent in the time averages,
|
||||||
or to average trends over custom time frames.
|
or to average trends over custom time frames.
|
||||||
|
|
||||||
|
Monitoring for pressure thresholds
|
||||||
|
==================================
|
||||||
|
|
||||||
|
Users can register triggers and use poll() to be woken up when resource
|
||||||
|
pressure exceeds certain thresholds.
|
||||||
|
|
||||||
|
A trigger describes the maximum cumulative stall time over a specific
|
||||||
|
time window, e.g. 100ms of total stall time within any 500ms window to
|
||||||
|
generate a wakeup event.
|
||||||
|
|
||||||
|
To register a trigger, the user has to open the psi interface file under
|
||||||
|
/proc/pressure/ representing the resource to be monitored and write the
|
||||||
|
desired threshold and time window. The open file descriptor should be
|
||||||
|
used to wait for trigger events using select(), poll() or epoll().
|
||||||
|
The following format is used:
|
||||||
|
|
||||||
|
<some|full> <stall amount in us> <time window in us>
|
||||||
|
|
||||||
|
For example writing "some 150000 1000000" into /proc/pressure/memory
|
||||||
|
would add 150ms threshold for partial memory stall measured within
|
||||||
|
1sec time window. Writing "full 50000 1000000" into /proc/pressure/io
|
||||||
|
would add 50ms threshold for full io stall measured within 1sec time window.
|
||||||
|
|
||||||
|
Triggers can be set on more than one psi metric and more than one trigger
|
||||||
|
for the same psi metric can be specified. However for each trigger a separate
|
||||||
|
file descriptor is required to be able to poll it separately from others,
|
||||||
|
therefore for each trigger a separate open() syscall should be made even
|
||||||
|
when opening the same psi interface file.
|
||||||
|
|
||||||
|
Monitors activate only when system enters stall state for the monitored
|
||||||
|
psi metric and deactivate upon exit from the stall state. While the system is
|
||||||
|
in the stall state psi signal growth is monitored at a rate of 10 times per
|
||||||
|
tracking window.
|
||||||
|
|
||||||
|
The kernel accepts window sizes ranging from 500ms to 10s, therefore min
|
||||||
|
monitoring update interval is 50ms and max is 1s. Min limit is set to
|
||||||
|
prevent overly frequent polling. Max limit is chosen as a high enough number
|
||||||
|
after which monitors are most likely not needed and psi averages can be used
|
||||||
|
instead.
|
||||||
|
|
||||||
|
When activated, psi monitor stays active for at least the duration of one
|
||||||
|
tracking window to avoid repeated activations/deactivations when system is
|
||||||
|
bouncing in and out of the stall state.
|
||||||
|
|
||||||
|
Notifications to the userspace are rate-limited to one per tracking window.
|
||||||
|
|
||||||
|
The trigger will de-register when the file descriptor used to define the
|
||||||
|
trigger is closed.
|
||||||
|
|
||||||
|
Userspace monitor usage example
|
||||||
|
===============================
|
||||||
|
|
||||||
|
#include <errno.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <poll.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Monitor memory partial stall with 1s tracking window size
|
||||||
|
* and 150ms threshold.
|
||||||
|
*/
|
||||||
|
int main() {
|
||||||
|
const char trig[] = "some 150000 1000000";
|
||||||
|
struct pollfd fds;
|
||||||
|
int n;
|
||||||
|
|
||||||
|
fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
|
||||||
|
if (fds.fd < 0) {
|
||||||
|
printf("/proc/pressure/memory open error: %s\n",
|
||||||
|
strerror(errno));
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
fds.events = POLLPRI;
|
||||||
|
|
||||||
|
if (write(fds.fd, trig, strlen(trig) + 1) < 0) {
|
||||||
|
printf("/proc/pressure/memory write error: %s\n",
|
||||||
|
strerror(errno));
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("waiting for events...\n");
|
||||||
|
while (1) {
|
||||||
|
n = poll(&fds, 1, -1);
|
||||||
|
if (n < 0) {
|
||||||
|
printf("poll error: %s\n", strerror(errno));
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (fds.revents & POLLERR) {
|
||||||
|
printf("got POLLERR, event source is gone\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (fds.revents & POLLPRI) {
|
||||||
|
printf("event triggered!\n");
|
||||||
|
} else {
|
||||||
|
printf("unknown event received: 0x%x\n", fds.revents);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
Cgroup2 interface
|
Cgroup2 interface
|
||||||
=================
|
=================
|
||||||
|
|
||||||
@@ -71,3 +175,6 @@ mounted, pressure stall information is also tracked for tasks grouped
|
|||||||
into cgroups. Each subdirectory in the cgroupfs mountpoint contains
|
into cgroups. Each subdirectory in the cgroupfs mountpoint contains
|
||||||
cpu.pressure, memory.pressure, and io.pressure files; the format is
|
cpu.pressure, memory.pressure, and io.pressure files; the format is
|
||||||
the same as the /proc/pressure/ files.
|
the same as the /proc/pressure/ files.
|
||||||
|
|
||||||
|
Per-cgroup psi monitors can be specified and used the same way as
|
||||||
|
system-wide ones.
|
||||||
|
@@ -1,66 +0,0 @@
|
|||||||
The AML Debugger
|
|
||||||
|
|
||||||
Copyright (C) 2016, Intel Corporation
|
|
||||||
Author: Lv Zheng <lv.zheng@intel.com>
|
|
||||||
|
|
||||||
|
|
||||||
This document describes the usage of the AML debugger embedded in the Linux
|
|
||||||
kernel.
|
|
||||||
|
|
||||||
1. Build the debugger
|
|
||||||
|
|
||||||
The following kernel configuration items are required to enable the AML
|
|
||||||
debugger interface from the Linux kernel:
|
|
||||||
|
|
||||||
CONFIG_ACPI_DEBUGGER=y
|
|
||||||
CONFIG_ACPI_DEBUGGER_USER=m
|
|
||||||
|
|
||||||
The userspace utilities can be built from the kernel source tree using
|
|
||||||
the following commands:
|
|
||||||
|
|
||||||
$ cd tools
|
|
||||||
$ make acpi
|
|
||||||
|
|
||||||
The resultant userspace tool binary is then located at:
|
|
||||||
|
|
||||||
tools/power/acpi/acpidbg
|
|
||||||
|
|
||||||
It can be installed to system directories by running "make install" (as a
|
|
||||||
sufficiently privileged user).
|
|
||||||
|
|
||||||
2. Start the userspace debugger interface
|
|
||||||
|
|
||||||
After booting the kernel with the debugger built-in, the debugger can be
|
|
||||||
started by using the following commands:
|
|
||||||
|
|
||||||
# mount -t debugfs none /sys/kernel/debug
|
|
||||||
# modprobe acpi_dbg
|
|
||||||
# tools/power/acpi/acpidbg
|
|
||||||
|
|
||||||
That spawns the interactive AML debugger environment where you can execute
|
|
||||||
debugger commands.
|
|
||||||
|
|
||||||
The commands are documented in the "ACPICA Overview and Programmer Reference"
|
|
||||||
that can be downloaded from
|
|
||||||
|
|
||||||
https://acpica.org/documentation
|
|
||||||
|
|
||||||
The detailed debugger commands reference is located in Chapter 12 "ACPICA
|
|
||||||
Debugger Reference". The "help" command can be used for a quick reference.
|
|
||||||
|
|
||||||
3. Stop the userspace debugger interface
|
|
||||||
|
|
||||||
The interactive debugger interface can be closed by pressing Ctrl+C or using
|
|
||||||
the "quit" or "exit" commands. When finished, unload the module with:
|
|
||||||
|
|
||||||
# rmmod acpi_dbg
|
|
||||||
|
|
||||||
The module unloading may fail if there is an acpidbg instance running.
|
|
||||||
|
|
||||||
4. Run the debugger in a script
|
|
||||||
|
|
||||||
It may be useful to run the AML debugger in a test script. "acpidbg" supports
|
|
||||||
this in a special "batch" mode. For example, the following command outputs
|
|
||||||
the entire ACPI namespace:
|
|
||||||
|
|
||||||
# acpidbg -b "namespace"
|
|
@@ -1,147 +0,0 @@
|
|||||||
APEI output format
|
|
||||||
~~~~~~~~~~~~~~~~~~
|
|
||||||
|
|
||||||
APEI uses printk as hardware error reporting interface, the output
|
|
||||||
format is as follows.
|
|
||||||
|
|
||||||
<error record> :=
|
|
||||||
APEI generic hardware error status
|
|
||||||
severity: <integer>, <severity string>
|
|
||||||
section: <integer>, severity: <integer>, <severity string>
|
|
||||||
flags: <integer>
|
|
||||||
<section flags strings>
|
|
||||||
fru_id: <uuid string>
|
|
||||||
fru_text: <string>
|
|
||||||
section_type: <section type string>
|
|
||||||
<section data>
|
|
||||||
|
|
||||||
<severity string>* := recoverable | fatal | corrected | info
|
|
||||||
|
|
||||||
<section flags strings># :=
|
|
||||||
[primary][, containment warning][, reset][, threshold exceeded]\
|
|
||||||
[, resource not accessible][, latent error]
|
|
||||||
|
|
||||||
<section type string> := generic processor error | memory error | \
|
|
||||||
PCIe error | unknown, <uuid string>
|
|
||||||
|
|
||||||
<section data> :=
|
|
||||||
<generic processor section data> | <memory section data> | \
|
|
||||||
<pcie section data> | <null>
|
|
||||||
|
|
||||||
<generic processor section data> :=
|
|
||||||
[processor_type: <integer>, <proc type string>]
|
|
||||||
[processor_isa: <integer>, <proc isa string>]
|
|
||||||
[error_type: <integer>
|
|
||||||
<proc error type strings>]
|
|
||||||
[operation: <integer>, <proc operation string>]
|
|
||||||
[flags: <integer>
|
|
||||||
<proc flags strings>]
|
|
||||||
[level: <integer>]
|
|
||||||
[version_info: <integer>]
|
|
||||||
[processor_id: <integer>]
|
|
||||||
[target_address: <integer>]
|
|
||||||
[requestor_id: <integer>]
|
|
||||||
[responder_id: <integer>]
|
|
||||||
[IP: <integer>]
|
|
||||||
|
|
||||||
<proc type string>* := IA32/X64 | IA64
|
|
||||||
|
|
||||||
<proc isa string>* := IA32 | IA64 | X64
|
|
||||||
|
|
||||||
<proc error type strings># :=
|
|
||||||
[cache error][, TLB error][, bus error][, micro-architectural error]
|
|
||||||
|
|
||||||
<proc operation string>* := unknown or generic | data read | data write | \
|
|
||||||
instruction execution
|
|
||||||
|
|
||||||
<proc flags strings># :=
|
|
||||||
[restartable][, precise IP][, overflow][, corrected]
|
|
||||||
|
|
||||||
<memory section data> :=
|
|
||||||
[error_status: <integer>]
|
|
||||||
[physical_address: <integer>]
|
|
||||||
[physical_address_mask: <integer>]
|
|
||||||
[node: <integer>]
|
|
||||||
[card: <integer>]
|
|
||||||
[module: <integer>]
|
|
||||||
[bank: <integer>]
|
|
||||||
[device: <integer>]
|
|
||||||
[row: <integer>]
|
|
||||||
[column: <integer>]
|
|
||||||
[bit_position: <integer>]
|
|
||||||
[requestor_id: <integer>]
|
|
||||||
[responder_id: <integer>]
|
|
||||||
[target_id: <integer>]
|
|
||||||
[error_type: <integer>, <mem error type string>]
|
|
||||||
|
|
||||||
<mem error type string>* :=
|
|
||||||
unknown | no error | single-bit ECC | multi-bit ECC | \
|
|
||||||
single-symbol chipkill ECC | multi-symbol chipkill ECC | master abort | \
|
|
||||||
target abort | parity error | watchdog timeout | invalid address | \
|
|
||||||
mirror Broken | memory sparing | scrub corrected error | \
|
|
||||||
scrub uncorrected error
|
|
||||||
|
|
||||||
<pcie section data> :=
|
|
||||||
[port_type: <integer>, <pcie port type string>]
|
|
||||||
[version: <integer>.<integer>]
|
|
||||||
[command: <integer>, status: <integer>]
|
|
||||||
[device_id: <integer>:<integer>:<integer>.<integer>
|
|
||||||
slot: <integer>
|
|
||||||
secondary_bus: <integer>
|
|
||||||
vendor_id: <integer>, device_id: <integer>
|
|
||||||
class_code: <integer>]
|
|
||||||
[serial number: <integer>, <integer>]
|
|
||||||
[bridge: secondary_status: <integer>, control: <integer>]
|
|
||||||
[aer_status: <integer>, aer_mask: <integer>
|
|
||||||
<aer status string>
|
|
||||||
[aer_uncor_severity: <integer>]
|
|
||||||
aer_layer=<aer layer string>, aer_agent=<aer agent string>
|
|
||||||
aer_tlp_header: <integer> <integer> <integer> <integer>]
|
|
||||||
|
|
||||||
<pcie port type string>* := PCIe end point | legacy PCI end point | \
|
|
||||||
unknown | unknown | root port | upstream switch port | \
|
|
||||||
downstream switch port | PCIe to PCI/PCI-X bridge | \
|
|
||||||
PCI/PCI-X to PCIe bridge | root complex integrated endpoint device | \
|
|
||||||
root complex event collector
|
|
||||||
|
|
||||||
if section severity is fatal or recoverable
|
|
||||||
<aer status string># :=
|
|
||||||
unknown | unknown | unknown | unknown | Data Link Protocol | \
|
|
||||||
unknown | unknown | unknown | unknown | unknown | unknown | unknown | \
|
|
||||||
Poisoned TLP | Flow Control Protocol | Completion Timeout | \
|
|
||||||
Completer Abort | Unexpected Completion | Receiver Overflow | \
|
|
||||||
Malformed TLP | ECRC | Unsupported Request
|
|
||||||
else
|
|
||||||
<aer status string># :=
|
|
||||||
Receiver Error | unknown | unknown | unknown | unknown | unknown | \
|
|
||||||
Bad TLP | Bad DLLP | RELAY_NUM Rollover | unknown | unknown | unknown | \
|
|
||||||
Replay Timer Timeout | Advisory Non-Fatal
|
|
||||||
fi
|
|
||||||
|
|
||||||
<aer layer string> :=
|
|
||||||
Physical Layer | Data Link Layer | Transaction Layer
|
|
||||||
|
|
||||||
<aer agent string> :=
|
|
||||||
Receiver ID | Requester ID | Completer ID | Transmitter ID
|
|
||||||
|
|
||||||
Where [] designates that the corresponding content is optional
|
|
||||||
|
|
||||||
All <field string> description with * has the following format:
|
|
||||||
|
|
||||||
field: <integer>, <field string>
|
|
||||||
|
|
||||||
Where value of <integer> should be the position of "string" in <field
|
|
||||||
string> description. Otherwise, <field string> will be "unknown".
|
|
||||||
|
|
||||||
All <field strings> description with # has the following format:
|
|
||||||
|
|
||||||
field: <integer>
|
|
||||||
<field strings>
|
|
||||||
|
|
||||||
Where each string in <field strings> corresponds to one set bit of
|
|
||||||
<integer>. The bit position is the position of "string" in <field
|
|
||||||
strings> description.
|
|
||||||
|
|
||||||
For more detailed explanation of every field, please refer to UEFI
|
|
||||||
specification version 2.3 or later, section Appendix N: Common
|
|
||||||
Platform Error Record.
|
|
99
Documentation/acpi/dsd/leds.txt
Normal file
99
Documentation/acpi/dsd/leds.txt
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
Describing and referring to LEDs in ACPI
|
||||||
|
|
||||||
|
Individual LEDs are described by hierarchical data extension [6] nodes under the
|
||||||
|
device node, the LED driver chip. The "reg" property in the LED specific nodes
|
||||||
|
tells the numerical ID of each individual LED output to which the LEDs are
|
||||||
|
connected. [3] The hierarchical data nodes are named "led@X", where X is the
|
||||||
|
number of the LED output.
|
||||||
|
|
||||||
|
Referring to LEDs in Device tree is documented in [4], in "flash-leds" property
|
||||||
|
documentation. In short, LEDs are directly referred to by using phandles.
|
||||||
|
|
||||||
|
While Device tree allows referring to any node in the tree[1], in ACPI
|
||||||
|
references are limited to device nodes only [2]. For this reason using the same
|
||||||
|
mechanism on ACPI is not possible. A mechanism to refer to non-device ACPI nodes
|
||||||
|
is documented in [7].
|
||||||
|
|
||||||
|
ACPI allows (as does DT) using integer arguments after the reference. A
|
||||||
|
combination of the LED driver device reference and an integer argument,
|
||||||
|
referring to the "reg" property of the relevant LED, is used to identify
|
||||||
|
individual LEDs. The value of the "reg" property is a contract between the
|
||||||
|
firmware and software, it uniquely identifies the LED driver outputs.
|
||||||
|
|
||||||
|
Under the LED driver device, the first hierarchical data extension package list
|
||||||
|
entry shall contain the string "led@" followed by the number of the LED,
|
||||||
|
followed by the referred object name. That object shall be named "LED" followed
|
||||||
|
by the number of the LED.
|
||||||
|
|
||||||
|
An ASL example of a camera sensor device and a LED driver device for two LEDs.
|
||||||
|
Objects not relevant for LEDs or the references to them have been omitted.
|
||||||
|
|
||||||
|
Device (LED)
|
||||||
|
{
|
||||||
|
Name (_DSD, Package () {
|
||||||
|
ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"),
|
||||||
|
Package () {
|
||||||
|
Package () { "led@0", LED0 },
|
||||||
|
Package () { "led@1", LED1 },
|
||||||
|
}
|
||||||
|
})
|
||||||
|
Name (LED0, Package () {
|
||||||
|
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
||||||
|
Package () {
|
||||||
|
Package () { "reg", 0 },
|
||||||
|
Package () { "flash-max-microamp", 1000000 },
|
||||||
|
Package () { "flash-timeout-us", 200000 },
|
||||||
|
Package () { "led-max-microamp", 100000 },
|
||||||
|
Package () { "label", "white:flash" },
|
||||||
|
}
|
||||||
|
})
|
||||||
|
Name (LED1, Package () {
|
||||||
|
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
||||||
|
Package () {
|
||||||
|
Package () { "reg", 1 },
|
||||||
|
Package () { "led-max-microamp", 10000 },
|
||||||
|
Package () { "label", "red:indicator" },
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
Device (SEN)
|
||||||
|
{
|
||||||
|
Name (_DSD, Package () {
|
||||||
|
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
||||||
|
Package () {
|
||||||
|
Package () {
|
||||||
|
"flash-leds",
|
||||||
|
Package () { ^LED, "led@0", ^LED, "led@1" },
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
where
|
||||||
|
|
||||||
|
LED LED driver device
|
||||||
|
LED0 First LED
|
||||||
|
LED1 Second LED
|
||||||
|
SEN Camera sensor device (or another device the LED is
|
||||||
|
related to)
|
||||||
|
|
||||||
|
[1] Device tree. <URL:http://www.devicetree.org>, referenced 2019-02-21.
|
||||||
|
|
||||||
|
[2] Advanced Configuration and Power Interface Specification.
|
||||||
|
<URL:https://uefi.org/sites/default/files/resources/ACPI_6_3_final_Jan30.pdf>,
|
||||||
|
referenced 2019-02-21.
|
||||||
|
|
||||||
|
[3] Documentation/devicetree/bindings/leds/common.txt
|
||||||
|
|
||||||
|
[4] Documentation/devicetree/bindings/media/video-interfaces.txt
|
||||||
|
|
||||||
|
[5] Device Properties UUID For _DSD.
|
||||||
|
<URL:http://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf>,
|
||||||
|
referenced 2019-02-21.
|
||||||
|
|
||||||
|
[6] Hierarchical Data Extension UUID For _DSD.
|
||||||
|
<URL:http://www.uefi.org/sites/default/files/resources/_DSD-hierarchical-data-extension-UUID-v1.1.pdf>,
|
||||||
|
referenced 2019-02-21.
|
||||||
|
|
||||||
|
[7] Documentation/acpi/dsd/data-node-reference.txt
|
@@ -1,58 +0,0 @@
|
|||||||
ACPI I2C Muxes
|
|
||||||
--------------
|
|
||||||
|
|
||||||
Describing an I2C device hierarchy that includes I2C muxes requires an ACPI
|
|
||||||
Device () scope per mux channel.
|
|
||||||
|
|
||||||
Consider this topology:
|
|
||||||
|
|
||||||
+------+ +------+
|
|
||||||
| SMB1 |-->| MUX0 |--CH00--> i2c client A (0x50)
|
|
||||||
| | | 0x70 |--CH01--> i2c client B (0x50)
|
|
||||||
+------+ +------+
|
|
||||||
|
|
||||||
which corresponds to the following ASL:
|
|
||||||
|
|
||||||
Device (SMB1)
|
|
||||||
{
|
|
||||||
Name (_HID, ...)
|
|
||||||
Device (MUX0)
|
|
||||||
{
|
|
||||||
Name (_HID, ...)
|
|
||||||
Name (_CRS, ResourceTemplate () {
|
|
||||||
I2cSerialBus (0x70, ControllerInitiated, I2C_SPEED,
|
|
||||||
AddressingMode7Bit, "^SMB1", 0x00,
|
|
||||||
ResourceConsumer,,)
|
|
||||||
}
|
|
||||||
|
|
||||||
Device (CH00)
|
|
||||||
{
|
|
||||||
Name (_ADR, 0)
|
|
||||||
|
|
||||||
Device (CLIA)
|
|
||||||
{
|
|
||||||
Name (_HID, ...)
|
|
||||||
Name (_CRS, ResourceTemplate () {
|
|
||||||
I2cSerialBus (0x50, ControllerInitiated, I2C_SPEED,
|
|
||||||
AddressingMode7Bit, "^CH00", 0x00,
|
|
||||||
ResourceConsumer,,)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Device (CH01)
|
|
||||||
{
|
|
||||||
Name (_ADR, 1)
|
|
||||||
|
|
||||||
Device (CLIB)
|
|
||||||
{
|
|
||||||
Name (_HID, ...)
|
|
||||||
Name (_CRS, ResourceTemplate () {
|
|
||||||
I2cSerialBus (0x50, ControllerInitiated, I2C_SPEED,
|
|
||||||
AddressingMode7Bit, "^CH01", 0x00,
|
|
||||||
ResourceConsumer,,)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
@@ -1,111 +0,0 @@
|
|||||||
Upgrading ACPI tables via initrd
|
|
||||||
================================
|
|
||||||
|
|
||||||
1) Introduction (What is this about)
|
|
||||||
2) What is this for
|
|
||||||
3) How does it work
|
|
||||||
4) References (Where to retrieve userspace tools)
|
|
||||||
|
|
||||||
1) What is this about
|
|
||||||
---------------------
|
|
||||||
|
|
||||||
If the ACPI_TABLE_UPGRADE compile option is true, it is possible to
|
|
||||||
upgrade the ACPI execution environment that is defined by the ACPI tables
|
|
||||||
via upgrading the ACPI tables provided by the BIOS with an instrumented,
|
|
||||||
modified, more recent version, or installing brand new ACPI tables.
|
|
||||||
|
|
||||||
When building initrd with kernel in a single image, option
|
|
||||||
ACPI_TABLE_OVERRIDE_VIA_BUILTIN_INITRD should also be true for this
|
|
||||||
feature to work.
|
|
||||||
|
|
||||||
For a full list of ACPI tables that can be upgraded/installed, take a look
|
|
||||||
at the char *table_sigs[MAX_ACPI_SIGNATURE]; definition in
|
|
||||||
drivers/acpi/tables.c.
|
|
||||||
All ACPI tables iasl (Intel's ACPI compiler and disassembler) knows should
|
|
||||||
be overridable, except:
|
|
||||||
- ACPI_SIG_RSDP (has a signature of 6 bytes)
|
|
||||||
- ACPI_SIG_FACS (does not have an ordinary ACPI table header)
|
|
||||||
Both could get implemented as well.
|
|
||||||
|
|
||||||
|
|
||||||
2) What is this for
|
|
||||||
-------------------
|
|
||||||
|
|
||||||
Complain to your platform/BIOS vendor if you find a bug which is so severe
|
|
||||||
that a workaround is not accepted in the Linux kernel. And this facility
|
|
||||||
allows you to upgrade the buggy tables before your platform/BIOS vendor
|
|
||||||
releases an upgraded BIOS binary.
|
|
||||||
|
|
||||||
This facility can be used by platform/BIOS vendors to provide a Linux
|
|
||||||
compatible environment without modifying the underlying platform firmware.
|
|
||||||
|
|
||||||
This facility also provides a powerful feature to easily debug and test
|
|
||||||
ACPI BIOS table compatibility with the Linux kernel by modifying old
|
|
||||||
platform provided ACPI tables or inserting new ACPI tables.
|
|
||||||
|
|
||||||
It can and should be enabled in any kernel because there is no functional
|
|
||||||
change with non-instrumented initrds.
|
|
||||||
|
|
||||||
|
|
||||||
3) How does it work
|
|
||||||
-------------------
|
|
||||||
|
|
||||||
# Extract the machine's ACPI tables:
|
|
||||||
cd /tmp
|
|
||||||
acpidump >acpidump
|
|
||||||
acpixtract -a acpidump
|
|
||||||
# Disassemble, modify and recompile them:
|
|
||||||
iasl -d *.dat
|
|
||||||
# For example add this statement into a _PRT (PCI Routing Table) function
|
|
||||||
# of the DSDT:
|
|
||||||
Store("HELLO WORLD", debug)
|
|
||||||
# And increase the OEM Revision. For example, before modification:
|
|
||||||
DefinitionBlock ("DSDT.aml", "DSDT", 2, "INTEL ", "TEMPLATE", 0x00000000)
|
|
||||||
# After modification:
|
|
||||||
DefinitionBlock ("DSDT.aml", "DSDT", 2, "INTEL ", "TEMPLATE", 0x00000001)
|
|
||||||
iasl -sa dsdt.dsl
|
|
||||||
# Add the raw ACPI tables to an uncompressed cpio archive.
|
|
||||||
# They must be put into a /kernel/firmware/acpi directory inside the cpio
|
|
||||||
# archive. Note that if the table put here matches a platform table
|
|
||||||
# (similar Table Signature, and similar OEMID, and similar OEM Table ID)
|
|
||||||
# with a more recent OEM Revision, the platform table will be upgraded by
|
|
||||||
# this table. If the table put here doesn't match a platform table
|
|
||||||
# (dissimilar Table Signature, or dissimilar OEMID, or dissimilar OEM Table
|
|
||||||
# ID), this table will be appended.
|
|
||||||
mkdir -p kernel/firmware/acpi
|
|
||||||
cp dsdt.aml kernel/firmware/acpi
|
|
||||||
# A maximum of "NR_ACPI_INITRD_TABLES (64)" tables are currently allowed
|
|
||||||
# (see osl.c):
|
|
||||||
iasl -sa facp.dsl
|
|
||||||
iasl -sa ssdt1.dsl
|
|
||||||
cp facp.aml kernel/firmware/acpi
|
|
||||||
cp ssdt1.aml kernel/firmware/acpi
|
|
||||||
# The uncompressed cpio archive must be the first. Other, typically
|
|
||||||
# compressed cpio archives, must be concatenated on top of the uncompressed
|
|
||||||
# one. Following command creates the uncompressed cpio archive and
|
|
||||||
# concatenates the original initrd on top:
|
|
||||||
find kernel | cpio -H newc --create > /boot/instrumented_initrd
|
|
||||||
cat /boot/initrd >>/boot/instrumented_initrd
|
|
||||||
# reboot with increased acpi debug level, e.g. boot params:
|
|
||||||
acpi.debug_level=0x2 acpi.debug_layer=0xFFFFFFFF
|
|
||||||
# and check your syslog:
|
|
||||||
[ 1.268089] ACPI: PCI Interrupt Routing Table [\_SB_.PCI0._PRT]
|
|
||||||
[ 1.272091] [ACPI Debug] String [0x0B] "HELLO WORLD"
|
|
||||||
|
|
||||||
iasl is able to disassemble and recompile quite a lot of different,
|
|
||||||
also static ACPI tables.
|
|
||||||
|
|
||||||
|
|
||||||
4) Where to retrieve userspace tools
|
|
||||||
------------------------------------
|
|
||||||
|
|
||||||
iasl and acpixtract are part of Intel's ACPICA project:
|
|
||||||
http://acpica.org/
|
|
||||||
and should be packaged by distributions (for example in the acpica package
|
|
||||||
on SUSE).
|
|
||||||
|
|
||||||
acpidump can be found in Len Brown's pmtools:
|
|
||||||
ftp://kernel.org/pub/linux/kernel/people/lenb/acpi/utils/pmtools/acpidump
|
|
||||||
This tool is also part of the acpica package on SUSE.
|
|
||||||
Alternatively, used ACPI tables can be retrieved via sysfs in latest kernels:
|
|
||||||
/sys/firmware/acpi/tables
|
|
@@ -1,73 +0,0 @@
|
|||||||
Linux ACPI Custom Control Method How To
|
|
||||||
=======================================
|
|
||||||
|
|
||||||
Written by Zhang Rui <rui.zhang@intel.com>
|
|
||||||
|
|
||||||
|
|
||||||
Linux supports customizing ACPI control methods at runtime.
|
|
||||||
|
|
||||||
Users can use this to
|
|
||||||
1. override an existing method which may not work correctly,
|
|
||||||
or just for debugging purposes.
|
|
||||||
2. insert a completely new method in order to create a missing
|
|
||||||
method such as _OFF, _ON, _STA, _INI, etc.
|
|
||||||
For these cases, it is far simpler to dynamically install a single
|
|
||||||
control method rather than override the entire DSDT, because kernel
|
|
||||||
rebuild/reboot is not needed and test results can be obtained in minutes.
|
|
||||||
|
|
||||||
Note: Only ACPI METHOD can be overridden, any other object types like
|
|
||||||
"Device", "OperationRegion", are not recognized. Methods
|
|
||||||
declared inside scope operators are also not supported.
|
|
||||||
Note: The same ACPI control method can be overridden many times,
|
|
||||||
and it's always the latest one that is used by Linux/kernel.
|
|
||||||
Note: To get the ACPI debug object output (Store (AAAA, Debug)),
|
|
||||||
please run "echo 1 > /sys/module/acpi/parameters/aml_debug_output".
|
|
||||||
|
|
||||||
1. override an existing method
|
|
||||||
a) get the ACPI table via ACPI sysfs I/F. e.g. to get the DSDT,
|
|
||||||
just run "cat /sys/firmware/acpi/tables/DSDT > /tmp/dsdt.dat"
|
|
||||||
b) disassemble the table by running "iasl -d dsdt.dat".
|
|
||||||
c) rewrite the ASL code of the method and save it in a new file,
|
|
||||||
d) package the new file (psr.asl) to an ACPI table format.
|
|
||||||
Here is an example of a customized \_SB_.AC._PSR method,
|
|
||||||
|
|
||||||
DefinitionBlock ("", "SSDT", 1, "", "", 0x20080715)
|
|
||||||
{
|
|
||||||
Method (\_SB_.AC._PSR, 0, NotSerialized)
|
|
||||||
{
|
|
||||||
Store ("In AC _PSR", Debug)
|
|
||||||
Return (ACON)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Note that the full pathname of the method in ACPI namespace
|
|
||||||
should be used.
|
|
||||||
e) assemble the file to generate the AML code of the method.
|
|
||||||
e.g. "iasl -vw 6084 psr.asl" (psr.aml is generated as a result)
|
|
||||||
If parameter "-vw 6084" is not supported by your iASL compiler,
|
|
||||||
please try a newer version.
|
|
||||||
f) mount debugfs by "mount -t debugfs none /sys/kernel/debug"
|
|
||||||
g) override the old method via the debugfs by running
|
|
||||||
"cat /tmp/psr.aml > /sys/kernel/debug/acpi/custom_method"
|
|
||||||
|
|
||||||
2. insert a new method
|
|
||||||
This is easier than overriding an existing method.
|
|
||||||
We just need to create the ASL code of the method we want to
|
|
||||||
insert and then follow steps c) ~ g) in section 1.
|
|
||||||
|
|
||||||
3. undo your changes
|
|
||||||
The "undo" operation is not supported for a newly inserted method
|
|
||||||
right now, i.e. we can not remove a method currently.
|
|
||||||
For an overridden method, in order to undo your changes, please
|
|
||||||
save a copy of the method original ASL code in step c) section 1,
|
|
||||||
and redo step c) ~ g) to override the method with the original one.
|
|
||||||
|
|
||||||
|
|
||||||
Note: We can use a kernel with multiple custom ACPI methods running,
|
|
||||||
but each individual write to debugfs can implement a SINGLE
|
|
||||||
method override. i.e. if we want to insert/override multiple
|
|
||||||
ACPI methods, we need to redo steps c) ~ g) multiple times.
|
|
||||||
|
|
||||||
Note: Be aware that root can mis-use this driver to modify arbitrary
|
|
||||||
memory and gain additional rights, if root's privileges got
|
|
||||||
restricted (for example if root is not allowed to load additional
|
|
||||||
modules after boot).
|
|
@@ -1,192 +0,0 @@
|
|||||||
ACPICA Trace Facility
|
|
||||||
|
|
||||||
Copyright (C) 2015, Intel Corporation
|
|
||||||
Author: Lv Zheng <lv.zheng@intel.com>
|
|
||||||
|
|
||||||
|
|
||||||
Abstract:
|
|
||||||
|
|
||||||
This document describes the functions and the interfaces of the method
|
|
||||||
tracing facility.
|
|
||||||
|
|
||||||
1. Functionalities and usage examples:
|
|
||||||
|
|
||||||
ACPICA provides method tracing capability. And two functions are
|
|
||||||
currently implemented using this capability.
|
|
||||||
|
|
||||||
A. Log reducer
|
|
||||||
ACPICA subsystem provides debugging outputs when CONFIG_ACPI_DEBUG is
|
|
||||||
enabled. The debugging messages which are deployed via
|
|
||||||
ACPI_DEBUG_PRINT() macro can be reduced at 2 levels - per-component
|
|
||||||
level (known as debug layer, configured via
|
|
||||||
/sys/module/acpi/parameters/debug_layer) and per-type level (known as
|
|
||||||
debug level, configured via /sys/module/acpi/parameters/debug_level).
|
|
||||||
|
|
||||||
But when the particular layer/level is applied to the control method
|
|
||||||
evaluations, the quantity of the debugging outputs may still be too
|
|
||||||
large to be put into the kernel log buffer. The idea thus is worked out
|
|
||||||
to only enable the particular debug layer/level (normally more detailed)
|
|
||||||
logs when the control method evaluation is started, and disable the
|
|
||||||
detailed logging when the control method evaluation is stopped.
|
|
||||||
|
|
||||||
The following command examples illustrate the usage of the "log reducer"
|
|
||||||
functionality:
|
|
||||||
a. Filter out the debug layer/level matched logs when control methods
|
|
||||||
are being evaluated:
|
|
||||||
# cd /sys/module/acpi/parameters
|
|
||||||
# echo "0xXXXXXXXX" > trace_debug_layer
|
|
||||||
# echo "0xYYYYYYYY" > trace_debug_level
|
|
||||||
# echo "enable" > trace_state
|
|
||||||
b. Filter out the debug layer/level matched logs when the specified
|
|
||||||
control method is being evaluated:
|
|
||||||
# cd /sys/module/acpi/parameters
|
|
||||||
# echo "0xXXXXXXXX" > trace_debug_layer
|
|
||||||
# echo "0xYYYYYYYY" > trace_debug_level
|
|
||||||
# echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name
|
|
||||||
# echo "method" > /sys/module/acpi/parameters/trace_state
|
|
||||||
c. Filter out the debug layer/level matched logs when the specified
|
|
||||||
control method is being evaluated for the first time:
|
|
||||||
# cd /sys/module/acpi/parameters
|
|
||||||
# echo "0xXXXXXXXX" > trace_debug_layer
|
|
||||||
# echo "0xYYYYYYYY" > trace_debug_level
|
|
||||||
# echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name
|
|
||||||
# echo "method-once" > /sys/module/acpi/parameters/trace_state
|
|
||||||
Where:
|
|
||||||
0xXXXXXXXX/0xYYYYYYYY: Refer to Documentation/acpi/debug.txt for
|
|
||||||
possible debug layer/level masking values.
|
|
||||||
\PPPP.AAAA.TTTT.HHHH: Full path of a control method that can be found
|
|
||||||
in the ACPI namespace. It needn't be an entry
|
|
||||||
of a control method evaluation.
|
|
||||||
|
|
||||||
B. AML tracer
|
|
||||||
|
|
||||||
There are special log entries added by the method tracing facility at
|
|
||||||
the "trace points" the AML interpreter starts/stops to execute a control
|
|
||||||
method, or an AML opcode. Note that the format of the log entries is
|
|
||||||
subject to change:
|
|
||||||
[ 0.186427] exdebug-0398 ex_trace_point : Method Begin [0xf58394d8:\_SB.PCI0.LPCB.ECOK] execution.
|
|
||||||
[ 0.186630] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905c88:If] execution.
|
|
||||||
[ 0.186820] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905cc0:LEqual] execution.
|
|
||||||
[ 0.187010] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905a20:-NamePath-] execution.
|
|
||||||
[ 0.187214] exdebug-0398 ex_trace_point : Opcode End [0xf5905a20:-NamePath-] execution.
|
|
||||||
[ 0.187407] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905f60:One] execution.
|
|
||||||
[ 0.187594] exdebug-0398 ex_trace_point : Opcode End [0xf5905f60:One] execution.
|
|
||||||
[ 0.187789] exdebug-0398 ex_trace_point : Opcode End [0xf5905cc0:LEqual] execution.
|
|
||||||
[ 0.187980] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905cc0:Return] execution.
|
|
||||||
[ 0.188146] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905f60:One] execution.
|
|
||||||
[ 0.188334] exdebug-0398 ex_trace_point : Opcode End [0xf5905f60:One] execution.
|
|
||||||
[ 0.188524] exdebug-0398 ex_trace_point : Opcode End [0xf5905cc0:Return] execution.
|
|
||||||
[ 0.188712] exdebug-0398 ex_trace_point : Opcode End [0xf5905c88:If] execution.
|
|
||||||
[ 0.188903] exdebug-0398 ex_trace_point : Method End [0xf58394d8:\_SB.PCI0.LPCB.ECOK] execution.
|
|
||||||
|
|
||||||
Developers can utilize these special log entries to track the AML
|
|
||||||
interpretation, thus can aid issue debugging and performance tuning. Note
|
|
||||||
that, as the "AML tracer" logs are implemented via ACPI_DEBUG_PRINT()
|
|
||||||
macro, CONFIG_ACPI_DEBUG is also required to be enabled for enabling
|
|
||||||
"AML tracer" logs.
|
|
||||||
|
|
||||||
The following command examples illustrate the usage of the "AML tracer"
|
|
||||||
functionality:
|
|
||||||
a. Filter out the method start/stop "AML tracer" logs when control
|
|
||||||
methods are being evaluated:
|
|
||||||
# cd /sys/module/acpi/parameters
|
|
||||||
# echo "0x80" > trace_debug_layer
|
|
||||||
# echo "0x10" > trace_debug_level
|
|
||||||
# echo "enable" > trace_state
|
|
||||||
b. Filter out the method start/stop "AML tracer" when the specified
|
|
||||||
control method is being evaluated:
|
|
||||||
# cd /sys/module/acpi/parameters
|
|
||||||
# echo "0x80" > trace_debug_layer
|
|
||||||
# echo "0x10" > trace_debug_level
|
|
||||||
# echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name
|
|
||||||
# echo "method" > trace_state
|
|
||||||
c. Filter out the method start/stop "AML tracer" logs when the specified
|
|
||||||
control method is being evaluated for the first time:
|
|
||||||
# cd /sys/module/acpi/parameters
|
|
||||||
# echo "0x80" > trace_debug_layer
|
|
||||||
# echo "0x10" > trace_debug_level
|
|
||||||
# echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name
|
|
||||||
# echo "method-once" > trace_state
|
|
||||||
d. Filter out the method/opcode start/stop "AML tracer" when the
|
|
||||||
specified control method is being evaluated:
|
|
||||||
# cd /sys/module/acpi/parameters
|
|
||||||
# echo "0x80" > trace_debug_layer
|
|
||||||
# echo "0x10" > trace_debug_level
|
|
||||||
# echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name
|
|
||||||
# echo "opcode" > trace_state
|
|
||||||
e. Filter out the method/opcode start/stop "AML tracer" when the
|
|
||||||
specified control method is being evaluated for the first time:
|
|
||||||
# cd /sys/module/acpi/parameters
|
|
||||||
# echo "0x80" > trace_debug_layer
|
|
||||||
# echo "0x10" > trace_debug_level
|
|
||||||
# echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name
|
|
||||||
# echo "opcode-once" > trace_state
|
|
||||||
|
|
||||||
Note that all above method tracing facility related module parameters can
|
|
||||||
be used as the boot parameters, for example:
|
|
||||||
acpi.trace_debug_layer=0x80 acpi.trace_debug_level=0x10 \
|
|
||||||
acpi.trace_method_name=\_SB.LID0._LID acpi.trace_state=opcode-once
|
|
||||||
|
|
||||||
2. Interface descriptions:
|
|
||||||
|
|
||||||
All method tracing functions can be configured via ACPI module
|
|
||||||
parameters that are accessible at /sys/module/acpi/parameters/:
|
|
||||||
|
|
||||||
trace_method_name
|
|
||||||
The full path of the AML method that the user wants to trace.
|
|
||||||
Note that the full path shouldn't contain the trailing "_"s in its
|
|
||||||
name segments but may contain "\" to form an absolute path.
|
|
||||||
|
|
||||||
trace_debug_layer
|
|
||||||
The temporary debug_layer used when the tracing feature is enabled.
|
|
||||||
Using ACPI_EXECUTER (0x80) by default, which is the debug_layer
|
|
||||||
used to match all "AML tracer" logs.
|
|
||||||
|
|
||||||
trace_debug_level
|
|
||||||
The temporary debug_level used when the tracing feature is enabled.
|
|
||||||
Using ACPI_LV_TRACE_POINT (0x10) by default, which is the
|
|
||||||
debug_level used to match all "AML tracer" logs.
|
|
||||||
|
|
||||||
trace_state
|
|
||||||
The status of the tracing feature.
|
|
||||||
Users can enable/disable this debug tracing feature by executing
|
|
||||||
the following command:
|
|
||||||
# echo string > /sys/module/acpi/parameters/trace_state
|
|
||||||
Where "string" should be one of the following:
|
|
||||||
"disable"
|
|
||||||
Disable the method tracing feature.
|
|
||||||
"enable"
|
|
||||||
Enable the method tracing feature.
|
|
||||||
ACPICA debugging messages matching
|
|
||||||
"trace_debug_layer/trace_debug_level" during any method
|
|
||||||
execution will be logged.
|
|
||||||
"method"
|
|
||||||
Enable the method tracing feature.
|
|
||||||
ACPICA debugging messages matching
|
|
||||||
"trace_debug_layer/trace_debug_level" during method execution
|
|
||||||
of "trace_method_name" will be logged.
|
|
||||||
"method-once"
|
|
||||||
Enable the method tracing feature.
|
|
||||||
ACPICA debugging messages matching
|
|
||||||
"trace_debug_layer/trace_debug_level" during method execution
|
|
||||||
of "trace_method_name" will be logged only once.
|
|
||||||
"opcode"
|
|
||||||
Enable the method tracing feature.
|
|
||||||
ACPICA debugging messages matching
|
|
||||||
"trace_debug_layer/trace_debug_level" during method/opcode
|
|
||||||
execution of "trace_method_name" will be logged.
|
|
||||||
"opcode-once"
|
|
||||||
Enable the method tracing feature.
|
|
||||||
ACPICA debugging messages matching
|
|
||||||
"trace_debug_layer/trace_debug_level" during method/opcode
|
|
||||||
execution of "trace_method_name" will be logged only once.
|
|
||||||
Note that the differences between the "enable" and other feature
|
|
||||||
enabling options are:
|
|
||||||
1. When "enable" is specified, since
|
|
||||||
"trace_debug_layer/trace_debug_level" shall apply to all control
|
|
||||||
method evaluations, after configuring "trace_state" to "enable",
|
|
||||||
"trace_method_name" will be reset to NULL.
|
|
||||||
2. When "method/opcode" is specified, if
|
|
||||||
"trace_method_name" is NULL when "trace_state" is configured to
|
|
||||||
these options, the "trace_debug_layer/trace_debug_level" will
|
|
||||||
apply to all control method evaluations.
|
|
@@ -1,172 +0,0 @@
|
|||||||
|
|
||||||
In order to support ACPI open-ended hardware configurations (e.g. development
|
|
||||||
boards) we need a way to augment the ACPI configuration provided by the firmware
|
|
||||||
image. A common example is connecting sensors on I2C / SPI buses on development
|
|
||||||
boards.
|
|
||||||
|
|
||||||
Although this can be accomplished by creating a kernel platform driver or
|
|
||||||
recompiling the firmware image with updated ACPI tables, neither is practical:
|
|
||||||
the former proliferates board specific kernel code while the latter requires
|
|
||||||
access to firmware tools which are often not publicly available.
|
|
||||||
|
|
||||||
Because ACPI supports external references in AML code a more practical
|
|
||||||
way to augment firmware ACPI configuration is by dynamically loading
|
|
||||||
user defined SSDT tables that contain the board specific information.
|
|
||||||
|
|
||||||
For example, to enumerate a Bosch BMA222E accelerometer on the I2C bus of the
|
|
||||||
Minnowboard MAX development board exposed via the LSE connector [1], the
|
|
||||||
following ASL code can be used:
|
|
||||||
|
|
||||||
DefinitionBlock ("minnowmax.aml", "SSDT", 1, "Vendor", "Accel", 0x00000003)
|
|
||||||
{
|
|
||||||
External (\_SB.I2C6, DeviceObj)
|
|
||||||
|
|
||||||
Scope (\_SB.I2C6)
|
|
||||||
{
|
|
||||||
Device (STAC)
|
|
||||||
{
|
|
||||||
Name (_ADR, Zero)
|
|
||||||
Name (_HID, "BMA222E")
|
|
||||||
|
|
||||||
Method (_CRS, 0, Serialized)
|
|
||||||
{
|
|
||||||
Name (RBUF, ResourceTemplate ()
|
|
||||||
{
|
|
||||||
I2cSerialBus (0x0018, ControllerInitiated, 0x00061A80,
|
|
||||||
AddressingMode7Bit, "\\_SB.I2C6", 0x00,
|
|
||||||
ResourceConsumer, ,)
|
|
||||||
GpioInt (Edge, ActiveHigh, Exclusive, PullDown, 0x0000,
|
|
||||||
"\\_SB.GPO2", 0x00, ResourceConsumer, , )
|
|
||||||
{ // Pin list
|
|
||||||
0
|
|
||||||
}
|
|
||||||
})
|
|
||||||
Return (RBUF)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
which can then be compiled to AML binary format:
|
|
||||||
|
|
||||||
$ iasl minnowmax.asl
|
|
||||||
|
|
||||||
Intel ACPI Component Architecture
|
|
||||||
ASL Optimizing Compiler version 20140214-64 [Mar 29 2014]
|
|
||||||
Copyright (c) 2000 - 2014 Intel Corporation
|
|
||||||
|
|
||||||
ASL Input: minnomax.asl - 30 lines, 614 bytes, 7 keywords
|
|
||||||
AML Output: minnowmax.aml - 165 bytes, 6 named objects, 1 executable opcodes
|
|
||||||
|
|
||||||
[1] http://wiki.minnowboard.org/MinnowBoard_MAX#Low_Speed_Expansion_Connector_.28Top.29
|
|
||||||
|
|
||||||
The resulting AML code can then be loaded by the kernel using one of the methods
|
|
||||||
below.
|
|
||||||
|
|
||||||
== Loading ACPI SSDTs from initrd ==
|
|
||||||
|
|
||||||
This option allows loading of user defined SSDTs from initrd and it is useful
|
|
||||||
when the system does not support EFI or when there is not enough EFI storage.
|
|
||||||
|
|
||||||
It works in a similar way with initrd based ACPI tables override/upgrade: SSDT
|
|
||||||
aml code must be placed in the first, uncompressed, initrd under the
|
|
||||||
"kernel/firmware/acpi" path. Multiple files can be used and this will translate
|
|
||||||
in loading multiple tables. Only SSDT and OEM tables are allowed. See
|
|
||||||
initrd_table_override.txt for more details.
|
|
||||||
|
|
||||||
Here is an example:
|
|
||||||
|
|
||||||
# Add the raw ACPI tables to an uncompressed cpio archive.
|
|
||||||
# They must be put into a /kernel/firmware/acpi directory inside the
|
|
||||||
# cpio archive.
|
|
||||||
# The uncompressed cpio archive must be the first.
|
|
||||||
# Other, typically compressed cpio archives, must be
|
|
||||||
# concatenated on top of the uncompressed one.
|
|
||||||
mkdir -p kernel/firmware/acpi
|
|
||||||
cp ssdt.aml kernel/firmware/acpi
|
|
||||||
|
|
||||||
# Create the uncompressed cpio archive and concatenate the original initrd
|
|
||||||
# on top:
|
|
||||||
find kernel | cpio -H newc --create > /boot/instrumented_initrd
|
|
||||||
cat /boot/initrd >>/boot/instrumented_initrd
|
|
||||||
|
|
||||||
== Loading ACPI SSDTs from EFI variables ==
|
|
||||||
|
|
||||||
This is the preferred method, when EFI is supported on the platform, because it
|
|
||||||
allows a persistent, OS independent way of storing the user defined SSDTs. There
|
|
||||||
is also work underway to implement EFI support for loading user defined SSDTs
|
|
||||||
and using this method will make it easier to convert to the EFI loading
|
|
||||||
mechanism when that will arrive.
|
|
||||||
|
|
||||||
In order to load SSDTs from an EFI variable the efivar_ssdt kernel command line
|
|
||||||
parameter can be used. The argument for the option is the variable name to
|
|
||||||
use. If there are multiple variables with the same name but with different
|
|
||||||
vendor GUIDs, all of them will be loaded.
|
|
||||||
|
|
||||||
In order to store the AML code in an EFI variable the efivarfs filesystem can be
|
|
||||||
used. It is enabled and mounted by default in /sys/firmware/efi/efivars in all
|
|
||||||
recent distribution.
|
|
||||||
|
|
||||||
Creating a new file in /sys/firmware/efi/efivars will automatically create a new
|
|
||||||
EFI variable. Updating a file in /sys/firmware/efi/efivars will update the EFI
|
|
||||||
variable. Please note that the file name needs to be specially formatted as
|
|
||||||
"Name-GUID" and that the first 4 bytes in the file (little-endian format)
|
|
||||||
represent the attributes of the EFI variable (see EFI_VARIABLE_MASK in
|
|
||||||
include/linux/efi.h). Writing to the file must also be done with one write
|
|
||||||
operation.
|
|
||||||
|
|
||||||
For example, you can use the following bash script to create/update an EFI
|
|
||||||
variable with the content from a given file:
|
|
||||||
|
|
||||||
#!/bin/sh -e
|
|
||||||
|
|
||||||
while ! [ -z "$1" ]; do
|
|
||||||
case "$1" in
|
|
||||||
"-f") filename="$2"; shift;;
|
|
||||||
"-g") guid="$2"; shift;;
|
|
||||||
*) name="$1";;
|
|
||||||
esac
|
|
||||||
shift
|
|
||||||
done
|
|
||||||
|
|
||||||
usage()
|
|
||||||
{
|
|
||||||
echo "Syntax: ${0##*/} -f filename [ -g guid ] name"
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
|
|
||||||
[ -n "$name" -a -f "$filename" ] || usage
|
|
||||||
|
|
||||||
EFIVARFS="/sys/firmware/efi/efivars"
|
|
||||||
|
|
||||||
[ -d "$EFIVARFS" ] || exit 2
|
|
||||||
|
|
||||||
if stat -tf $EFIVARFS | grep -q -v de5e81e4; then
|
|
||||||
mount -t efivarfs none $EFIVARFS
|
|
||||||
fi
|
|
||||||
|
|
||||||
# try to pick up an existing GUID
|
|
||||||
[ -n "$guid" ] || guid=$(find "$EFIVARFS" -name "$name-*" | head -n1 | cut -f2- -d-)
|
|
||||||
|
|
||||||
# use a randomly generated GUID
|
|
||||||
[ -n "$guid" ] || guid="$(cat /proc/sys/kernel/random/uuid)"
|
|
||||||
|
|
||||||
# efivarfs expects all of the data in one write
|
|
||||||
tmp=$(mktemp)
|
|
||||||
/bin/echo -ne "\007\000\000\000" | cat - $filename > $tmp
|
|
||||||
dd if=$tmp of="$EFIVARFS/$name-$guid" bs=$(stat -c %s $tmp)
|
|
||||||
rm $tmp
|
|
||||||
|
|
||||||
== Loading ACPI SSDTs from configfs ==
|
|
||||||
|
|
||||||
This option allows loading of user defined SSDTs from userspace via the configfs
|
|
||||||
interface. The CONFIG_ACPI_CONFIGFS option must be select and configfs must be
|
|
||||||
mounted. In the following examples, we assume that configfs has been mounted in
|
|
||||||
/config.
|
|
||||||
|
|
||||||
New tables can be loading by creating new directories in /config/acpi/table/ and
|
|
||||||
writing the SSDT aml code in the aml attribute:
|
|
||||||
|
|
||||||
cd /config/acpi/table
|
|
||||||
mkdir my_ssdt
|
|
||||||
cat ~/ssdt.aml > my_ssdt/aml
|
|
@@ -1,5 +1,11 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
Collaborative Processor Performance Control (CPPC)
|
==================================================
|
||||||
|
Collaborative Processor Performance Control (CPPC)
|
||||||
|
==================================================
|
||||||
|
|
||||||
|
CPPC
|
||||||
|
====
|
||||||
|
|
||||||
CPPC defined in the ACPI spec describes a mechanism for the OS to manage the
|
CPPC defined in the ACPI spec describes a mechanism for the OS to manage the
|
||||||
performance of a logical processor on a contiguous and abstract performance
|
performance of a logical processor on a contiguous and abstract performance
|
||||||
@@ -10,31 +16,28 @@ For more details on CPPC please refer to the ACPI specification at:
|
|||||||
|
|
||||||
http://uefi.org/specifications
|
http://uefi.org/specifications
|
||||||
|
|
||||||
Some of the CPPC registers are exposed via sysfs under:
|
Some of the CPPC registers are exposed via sysfs under::
|
||||||
|
|
||||||
/sys/devices/system/cpu/cpuX/acpi_cppc/
|
/sys/devices/system/cpu/cpuX/acpi_cppc/
|
||||||
|
|
||||||
for each cpu X
|
for each cpu X::
|
||||||
|
|
||||||
--------------------------------------------------------------------------------
|
$ ls -lR /sys/devices/system/cpu/cpu0/acpi_cppc/
|
||||||
|
/sys/devices/system/cpu/cpu0/acpi_cppc/:
|
||||||
$ ls -lR /sys/devices/system/cpu/cpu0/acpi_cppc/
|
total 0
|
||||||
/sys/devices/system/cpu/cpu0/acpi_cppc/:
|
-r--r--r-- 1 root root 65536 Mar 5 19:38 feedback_ctrs
|
||||||
total 0
|
-r--r--r-- 1 root root 65536 Mar 5 19:38 highest_perf
|
||||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 feedback_ctrs
|
-r--r--r-- 1 root root 65536 Mar 5 19:38 lowest_freq
|
||||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 highest_perf
|
-r--r--r-- 1 root root 65536 Mar 5 19:38 lowest_nonlinear_perf
|
||||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 lowest_freq
|
-r--r--r-- 1 root root 65536 Mar 5 19:38 lowest_perf
|
||||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 lowest_nonlinear_perf
|
-r--r--r-- 1 root root 65536 Mar 5 19:38 nominal_freq
|
||||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 lowest_perf
|
-r--r--r-- 1 root root 65536 Mar 5 19:38 nominal_perf
|
||||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 nominal_freq
|
-r--r--r-- 1 root root 65536 Mar 5 19:38 reference_perf
|
||||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 nominal_perf
|
-r--r--r-- 1 root root 65536 Mar 5 19:38 wraparound_time
|
||||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 reference_perf
|
|
||||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 wraparound_time
|
|
||||||
|
|
||||||
--------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
* highest_perf : Highest performance of this processor (abstract scale).
|
* highest_perf : Highest performance of this processor (abstract scale).
|
||||||
* nominal_perf : Highest sustained performance of this processor (abstract scale).
|
* nominal_perf : Highest sustained performance of this processor
|
||||||
|
(abstract scale).
|
||||||
* lowest_nonlinear_perf : Lowest performance of this processor with nonlinear
|
* lowest_nonlinear_perf : Lowest performance of this processor with nonlinear
|
||||||
power savings (abstract scale).
|
power savings (abstract scale).
|
||||||
* lowest_perf : Lowest performance of this processor (abstract scale).
|
* lowest_perf : Lowest performance of this processor (abstract scale).
|
||||||
@@ -48,22 +51,26 @@ total 0
|
|||||||
* feedback_ctrs : Includes both Reference and delivered performance counter.
|
* feedback_ctrs : Includes both Reference and delivered performance counter.
|
||||||
Reference counter ticks up proportional to processor's reference performance.
|
Reference counter ticks up proportional to processor's reference performance.
|
||||||
Delivered counter ticks up proportional to processor's delivered performance.
|
Delivered counter ticks up proportional to processor's delivered performance.
|
||||||
* wraparound_time: Minimum time for the feedback counters to wraparound (seconds).
|
* wraparound_time: Minimum time for the feedback counters to wraparound
|
||||||
|
(seconds).
|
||||||
* reference_perf : Performance level at which reference performance counter
|
* reference_perf : Performance level at which reference performance counter
|
||||||
accumulates (abstract scale).
|
accumulates (abstract scale).
|
||||||
|
|
||||||
--------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
Computing Average Delivered Performance
|
Computing Average Delivered Performance
|
||||||
|
=======================================
|
||||||
|
|
||||||
Below describes the steps to compute the average performance delivered by taking
|
Below describes the steps to compute the average performance delivered by
|
||||||
two different snapshots of feedback counters at time T1 and T2.
|
taking two different snapshots of feedback counters at time T1 and T2.
|
||||||
|
|
||||||
T1: Read feedback_ctrs as fbc_t1
|
T1: Read feedback_ctrs as fbc_t1
|
||||||
Wait or run some workload
|
Wait or run some workload
|
||||||
T2: Read feedback_ctrs as fbc_t2
|
|
||||||
|
|
||||||
delivered_counter_delta = fbc_t2[del] - fbc_t1[del]
|
T2: Read feedback_ctrs as fbc_t2
|
||||||
reference_counter_delta = fbc_t2[ref] - fbc_t1[ref]
|
|
||||||
|
|
||||||
delivered_perf = (refernce_perf x delivered_counter_delta) / reference_counter_delta
|
::
|
||||||
|
|
||||||
|
delivered_counter_delta = fbc_t2[del] - fbc_t1[del]
|
||||||
|
reference_counter_delta = fbc_t2[ref] - fbc_t1[ref]
|
||||||
|
|
||||||
|
delivered_perf = (reference_perf x delivered_counter_delta) / reference_counter_delta
|
@@ -1,6 +1,12 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
===============
|
||||||
|
Overriding DSDT
|
||||||
|
===============
|
||||||
|
|
||||||
Linux supports a method of overriding the BIOS DSDT:
|
Linux supports a method of overriding the BIOS DSDT:
|
||||||
|
|
||||||
CONFIG_ACPI_CUSTOM_DSDT builds the image into the kernel.
|
CONFIG_ACPI_CUSTOM_DSDT - builds the image into the kernel.
|
||||||
|
|
||||||
When to use this method is described in detail on the
|
When to use this method is described in detail on the
|
||||||
Linux/ACPI home page:
|
Linux/ACPI home page:
|
14
Documentation/admin-guide/acpi/index.rst
Normal file
14
Documentation/admin-guide/acpi/index.rst
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
============
|
||||||
|
ACPI Support
|
||||||
|
============
|
||||||
|
|
||||||
|
Here we document in detail how to interact with various mechanisms in
|
||||||
|
the Linux ACPI support.
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
|
||||||
|
initrd_table_override
|
||||||
|
dsdt-override
|
||||||
|
ssdt-overlays
|
||||||
|
cppc_sysfs
|
115
Documentation/admin-guide/acpi/initrd_table_override.rst
Normal file
115
Documentation/admin-guide/acpi/initrd_table_override.rst
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
================================
|
||||||
|
Upgrading ACPI tables via initrd
|
||||||
|
================================
|
||||||
|
|
||||||
|
What is this about
|
||||||
|
==================
|
||||||
|
|
||||||
|
If the ACPI_TABLE_UPGRADE compile option is true, it is possible to
|
||||||
|
upgrade the ACPI execution environment that is defined by the ACPI tables
|
||||||
|
via upgrading the ACPI tables provided by the BIOS with an instrumented,
|
||||||
|
modified, more recent version one, or installing brand new ACPI tables.
|
||||||
|
|
||||||
|
When building initrd with kernel in a single image, option
|
||||||
|
ACPI_TABLE_OVERRIDE_VIA_BUILTIN_INITRD should also be true for this
|
||||||
|
feature to work.
|
||||||
|
|
||||||
|
For a full list of ACPI tables that can be upgraded/installed, take a look
|
||||||
|
at the char `*table_sigs[MAX_ACPI_SIGNATURE];` definition in
|
||||||
|
drivers/acpi/tables.c.
|
||||||
|
|
||||||
|
All ACPI tables iasl (Intel's ACPI compiler and disassembler) knows should
|
||||||
|
be overridable, except:
|
||||||
|
|
||||||
|
- ACPI_SIG_RSDP (has a signature of 6 bytes)
|
||||||
|
- ACPI_SIG_FACS (does not have an ordinary ACPI table header)
|
||||||
|
|
||||||
|
Both could get implemented as well.
|
||||||
|
|
||||||
|
|
||||||
|
What is this for
|
||||||
|
================
|
||||||
|
|
||||||
|
Complain to your platform/BIOS vendor if you find a bug which is so severe
|
||||||
|
that a workaround is not accepted in the Linux kernel. And this facility
|
||||||
|
allows you to upgrade the buggy tables before your platform/BIOS vendor
|
||||||
|
releases an upgraded BIOS binary.
|
||||||
|
|
||||||
|
This facility can be used by platform/BIOS vendors to provide a Linux
|
||||||
|
compatible environment without modifying the underlying platform firmware.
|
||||||
|
|
||||||
|
This facility also provides a powerful feature to easily debug and test
|
||||||
|
ACPI BIOS table compatibility with the Linux kernel by modifying old
|
||||||
|
platform provided ACPI tables or inserting new ACPI tables.
|
||||||
|
|
||||||
|
It can and should be enabled in any kernel because there is no functional
|
||||||
|
change with not instrumented initrds.
|
||||||
|
|
||||||
|
|
||||||
|
How does it work
|
||||||
|
================
|
||||||
|
::
|
||||||
|
|
||||||
|
# Extract the machine's ACPI tables:
|
||||||
|
cd /tmp
|
||||||
|
acpidump >acpidump
|
||||||
|
acpixtract -a acpidump
|
||||||
|
# Disassemble, modify and recompile them:
|
||||||
|
iasl -d *.dat
|
||||||
|
# For example add this statement into a _PRT (PCI Routing Table) function
|
||||||
|
# of the DSDT:
|
||||||
|
Store("HELLO WORLD", debug)
|
||||||
|
# And increase the OEM Revision. For example, before modification:
|
||||||
|
DefinitionBlock ("DSDT.aml", "DSDT", 2, "INTEL ", "TEMPLATE", 0x00000000)
|
||||||
|
# After modification:
|
||||||
|
DefinitionBlock ("DSDT.aml", "DSDT", 2, "INTEL ", "TEMPLATE", 0x00000001)
|
||||||
|
iasl -sa dsdt.dsl
|
||||||
|
# Add the raw ACPI tables to an uncompressed cpio archive.
|
||||||
|
# They must be put into a /kernel/firmware/acpi directory inside the cpio
|
||||||
|
# archive. Note that if the table put here matches a platform table
|
||||||
|
# (similar Table Signature, and similar OEMID, and similar OEM Table ID)
|
||||||
|
# with a more recent OEM Revision, the platform table will be upgraded by
|
||||||
|
# this table. If the table put here doesn't match a platform table
|
||||||
|
# (dissimilar Table Signature, or dissimilar OEMID, or dissimilar OEM Table
|
||||||
|
# ID), this table will be appended.
|
||||||
|
mkdir -p kernel/firmware/acpi
|
||||||
|
cp dsdt.aml kernel/firmware/acpi
|
||||||
|
# A maximum of "NR_ACPI_INITRD_TABLES (64)" tables are currently allowed
|
||||||
|
# (see osl.c):
|
||||||
|
iasl -sa facp.dsl
|
||||||
|
iasl -sa ssdt1.dsl
|
||||||
|
cp facp.aml kernel/firmware/acpi
|
||||||
|
cp ssdt1.aml kernel/firmware/acpi
|
||||||
|
# The uncompressed cpio archive must be the first. Other, typically
|
||||||
|
# compressed cpio archives, must be concatenated on top of the uncompressed
|
||||||
|
# one. Following command creates the uncompressed cpio archive and
|
||||||
|
# concatenates the original initrd on top:
|
||||||
|
find kernel | cpio -H newc --create > /boot/instrumented_initrd
|
||||||
|
cat /boot/initrd >>/boot/instrumented_initrd
|
||||||
|
# reboot with increased acpi debug level, e.g. boot params:
|
||||||
|
acpi.debug_level=0x2 acpi.debug_layer=0xFFFFFFFF
|
||||||
|
# and check your syslog:
|
||||||
|
[ 1.268089] ACPI: PCI Interrupt Routing Table [\_SB_.PCI0._PRT]
|
||||||
|
[ 1.272091] [ACPI Debug] String [0x0B] "HELLO WORLD"
|
||||||
|
|
||||||
|
iasl is able to disassemble and recompile quite a lot of different,
|
||||||
|
also static ACPI tables.
|
||||||
|
|
||||||
|
|
||||||
|
Where to retrieve userspace tools
|
||||||
|
=================================
|
||||||
|
|
||||||
|
iasl and acpixtract are part of Intel's ACPICA project:
|
||||||
|
http://acpica.org/
|
||||||
|
|
||||||
|
and should be packaged by distributions (for example in the acpica package
|
||||||
|
on SUSE).
|
||||||
|
|
||||||
|
acpidump can be found in Len Brown's pmtools:
|
||||||
|
ftp://kernel.org/pub/linux/kernel/people/lenb/acpi/utils/pmtools/acpidump
|
||||||
|
|
||||||
|
This tool is also part of the acpica package on SUSE.
|
||||||
|
Alternatively, used ACPI tables can be retrieved via sysfs in latest kernels:
|
||||||
|
/sys/firmware/acpi/tables
|
180
Documentation/admin-guide/acpi/ssdt-overlays.rst
Normal file
180
Documentation/admin-guide/acpi/ssdt-overlays.rst
Normal file
@@ -0,0 +1,180 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
=============
|
||||||
|
SSDT Overlays
|
||||||
|
=============
|
||||||
|
|
||||||
|
In order to support ACPI open-ended hardware configurations (e.g. development
|
||||||
|
boards) we need a way to augment the ACPI configuration provided by the firmware
|
||||||
|
image. A common example is connecting sensors on I2C / SPI buses on development
|
||||||
|
boards.
|
||||||
|
|
||||||
|
Although this can be accomplished by creating a kernel platform driver or
|
||||||
|
recompiling the firmware image with updated ACPI tables, neither is practical:
|
||||||
|
the former proliferates board specific kernel code while the latter requires
|
||||||
|
access to firmware tools which are often not publicly available.
|
||||||
|
|
||||||
|
Because ACPI supports external references in AML code a more practical
|
||||||
|
way to augment firmware ACPI configuration is by dynamically loading
|
||||||
|
user defined SSDT tables that contain the board specific information.
|
||||||
|
|
||||||
|
For example, to enumerate a Bosch BMA222E accelerometer on the I2C bus of the
|
||||||
|
Minnowboard MAX development board exposed via the LSE connector [1], the
|
||||||
|
following ASL code can be used::
|
||||||
|
|
||||||
|
DefinitionBlock ("minnowmax.aml", "SSDT", 1, "Vendor", "Accel", 0x00000003)
|
||||||
|
{
|
||||||
|
External (\_SB.I2C6, DeviceObj)
|
||||||
|
|
||||||
|
Scope (\_SB.I2C6)
|
||||||
|
{
|
||||||
|
Device (STAC)
|
||||||
|
{
|
||||||
|
Name (_ADR, Zero)
|
||||||
|
Name (_HID, "BMA222E")
|
||||||
|
|
||||||
|
Method (_CRS, 0, Serialized)
|
||||||
|
{
|
||||||
|
Name (RBUF, ResourceTemplate ()
|
||||||
|
{
|
||||||
|
I2cSerialBus (0x0018, ControllerInitiated, 0x00061A80,
|
||||||
|
AddressingMode7Bit, "\\_SB.I2C6", 0x00,
|
||||||
|
ResourceConsumer, ,)
|
||||||
|
GpioInt (Edge, ActiveHigh, Exclusive, PullDown, 0x0000,
|
||||||
|
"\\_SB.GPO2", 0x00, ResourceConsumer, , )
|
||||||
|
{ // Pin list
|
||||||
|
0
|
||||||
|
}
|
||||||
|
})
|
||||||
|
Return (RBUF)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
which can then be compiled to AML binary format::
|
||||||
|
|
||||||
|
$ iasl minnowmax.asl
|
||||||
|
|
||||||
|
Intel ACPI Component Architecture
|
||||||
|
ASL Optimizing Compiler version 20140214-64 [Mar 29 2014]
|
||||||
|
Copyright (c) 2000 - 2014 Intel Corporation
|
||||||
|
|
||||||
|
ASL Input: minnowmax.asl - 30 lines, 614 bytes, 7 keywords
|
||||||
|
AML Output: minnowmax.aml - 165 bytes, 6 named objects, 1 executable opcodes
|
||||||
|
|
||||||
|
[1] http://wiki.minnowboard.org/MinnowBoard_MAX#Low_Speed_Expansion_Connector_.28Top.29
|
||||||
|
|
||||||
|
The resulting AML code can then be loaded by the kernel using one of the methods
|
||||||
|
below.
|
||||||
|
|
||||||
|
Loading ACPI SSDTs from initrd
|
||||||
|
==============================
|
||||||
|
|
||||||
|
This option allows loading of user defined SSDTs from initrd and it is useful
|
||||||
|
when the system does not support EFI or when there is not enough EFI storage.
|
||||||
|
|
||||||
|
It works in a similar way with initrd based ACPI tables override/upgrade: SSDT
|
||||||
|
aml code must be placed in the first, uncompressed, initrd under the
|
||||||
|
"kernel/firmware/acpi" path. Multiple files can be used and this will translate
|
||||||
|
in loading multiple tables. Only SSDT and OEM tables are allowed. See
|
||||||
|
initrd_table_override.rst for more details.
|
||||||
|
|
||||||
|
Here is an example::
|
||||||
|
|
||||||
|
# Add the raw ACPI tables to an uncompressed cpio archive.
|
||||||
|
# They must be put into a /kernel/firmware/acpi directory inside the
|
||||||
|
# cpio archive.
|
||||||
|
# The uncompressed cpio archive must be the first.
|
||||||
|
# Other, typically compressed cpio archives, must be
|
||||||
|
# concatenated on top of the uncompressed one.
|
||||||
|
mkdir -p kernel/firmware/acpi
|
||||||
|
cp ssdt.aml kernel/firmware/acpi
|
||||||
|
|
||||||
|
# Create the uncompressed cpio archive and concatenate the original initrd
|
||||||
|
# on top:
|
||||||
|
find kernel | cpio -H newc --create > /boot/instrumented_initrd
|
||||||
|
cat /boot/initrd >>/boot/instrumented_initrd
|
||||||
|
|
||||||
|
Loading ACPI SSDTs from EFI variables
|
||||||
|
=====================================
|
||||||
|
|
||||||
|
This is the preferred method, when EFI is supported on the platform, because it
|
||||||
|
allows a persistent, OS independent way of storing the user defined SSDTs. There
|
||||||
|
is also work underway to implement EFI support for loading user defined SSDTs
|
||||||
|
and using this method will make it easier to convert to the EFI loading
|
||||||
|
mechanism when that will arrive.
|
||||||
|
|
||||||
|
In order to load SSDTs from an EFI variable the efivar_ssdt kernel command line
|
||||||
|
parameter can be used. The argument for the option is the variable name to
|
||||||
|
use. If there are multiple variables with the same name but with different
|
||||||
|
vendor GUIDs, all of them will be loaded.
|
||||||
|
|
||||||
|
In order to store the AML code in an EFI variable the efivarfs filesystem can be
|
||||||
|
used. It is enabled and mounted by default in /sys/firmware/efi/efivars in all
|
||||||
|
recent distributions.
|
||||||
|
|
||||||
|
Creating a new file in /sys/firmware/efi/efivars will automatically create a new
|
||||||
|
EFI variable. Updating a file in /sys/firmware/efi/efivars will update the EFI
|
||||||
|
variable. Please note that the file name needs to be specially formatted as
|
||||||
|
"Name-GUID" and that the first 4 bytes in the file (little-endian format)
|
||||||
|
represent the attributes of the EFI variable (see EFI_VARIABLE_MASK in
|
||||||
|
include/linux/efi.h). Writing to the file must also be done with one write
|
||||||
|
operation.
|
||||||
|
|
||||||
|
For example, you can use the following shell script to create/update an EFI
|
||||||
|
variable with the content from a given file::
|
||||||
|
|
||||||
|
#!/bin/sh -e
|
||||||
|
|
||||||
|
while ! [ -z "$1" ]; do
|
||||||
|
case "$1" in
|
||||||
|
"-f") filename="$2"; shift;;
|
||||||
|
"-g") guid="$2"; shift;;
|
||||||
|
*) name="$1";;
|
||||||
|
esac
|
||||||
|
shift
|
||||||
|
done
|
||||||
|
|
||||||
|
usage()
|
||||||
|
{
|
||||||
|
echo "Syntax: ${0##*/} -f filename [ -g guid ] name"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
[ -n "$name" -a -f "$filename" ] || usage
|
||||||
|
|
||||||
|
EFIVARFS="/sys/firmware/efi/efivars"
|
||||||
|
|
||||||
|
[ -d "$EFIVARFS" ] || exit 2
|
||||||
|
|
||||||
|
if stat -tf $EFIVARFS | grep -q -v de5e81e4; then
|
||||||
|
mount -t efivarfs none $EFIVARFS
|
||||||
|
fi
|
||||||
|
|
||||||
|
# try to pick up an existing GUID
|
||||||
|
[ -n "$guid" ] || guid=$(find "$EFIVARFS" -name "$name-*" | head -n1 | cut -f2- -d-)
|
||||||
|
|
||||||
|
# use a randomly generated GUID
|
||||||
|
[ -n "$guid" ] || guid="$(cat /proc/sys/kernel/random/uuid)"
|
||||||
|
|
||||||
|
# efivarfs expects all of the data in one write
|
||||||
|
tmp=$(mktemp)
|
||||||
|
/bin/echo -ne "\007\000\000\000" | cat - $filename > $tmp
|
||||||
|
dd if=$tmp of="$EFIVARFS/$name-$guid" bs=$(stat -c %s $tmp)
|
||||||
|
rm $tmp
|
||||||
|
|
||||||
|
Loading ACPI SSDTs from configfs
|
||||||
|
================================
|
||||||
|
|
||||||
|
This option allows loading of user defined SSDTs from userspace via the configfs
|
||||||
|
interface. The CONFIG_ACPI_CONFIGFS option must be selected and configfs must be
|
||||||
|
mounted. In the following examples, we assume that configfs has been mounted in
|
||||||
|
/config.
|
||||||
|
|
||||||
|
New tables can be loaded by creating new directories in /config/acpi/table/ and
|
||||||
|
writing the SSDT aml code in the aml attribute::
|
||||||
|
|
||||||
|
cd /config/acpi/table
|
||||||
|
mkdir my_ssdt
|
||||||
|
cat ~/ssdt.aml > my_ssdt/aml
|
@@ -864,6 +864,8 @@ All cgroup core files are prefixed with "cgroup."
|
|||||||
populated
|
populated
|
||||||
1 if the cgroup or its descendants contains any live
|
1 if the cgroup or its descendants contains any live
|
||||||
processes; otherwise, 0.
|
processes; otherwise, 0.
|
||||||
|
frozen
|
||||||
|
1 if the cgroup is frozen; otherwise, 0.
|
||||||
|
|
||||||
cgroup.max.descendants
|
cgroup.max.descendants
|
||||||
A read-write single value file. The default is "max".
|
A read-write single value file. The default is "max".
|
||||||
@@ -897,6 +899,31 @@ All cgroup core files are prefixed with "cgroup."
|
|||||||
A dying cgroup can consume system resources not exceeding
|
A dying cgroup can consume system resources not exceeding
|
||||||
limits, which were active at the moment of cgroup deletion.
|
limits, which were active at the moment of cgroup deletion.
|
||||||
|
|
||||||
|
cgroup.freeze
|
||||||
|
A read-write single value file which exists on non-root cgroups.
|
||||||
|
Allowed values are "0" and "1". The default is "0".
|
||||||
|
|
||||||
|
Writing "1" to the file causes freezing of the cgroup and all
|
||||||
|
descendant cgroups. This means that all belonging processes will
|
||||||
|
be stopped and will not run until the cgroup is explicitly
|
||||||
|
unfrozen. Freezing of the cgroup may take some time; when this action
|
||||||
|
is completed, the "frozen" value in the cgroup.events control file
|
||||||
|
will be updated to "1" and the corresponding notification will be
|
||||||
|
issued.
|
||||||
|
|
||||||
|
A cgroup can be frozen either by its own settings, or by settings
|
||||||
|
of any ancestor cgroups. If any of the ancestor cgroups is frozen, the
|
||||||
|
cgroup will remain frozen.
|
||||||
|
|
||||||
|
Processes in the frozen cgroup can be killed by a fatal signal.
|
||||||
|
They also can enter and leave a frozen cgroup: either by an explicit
|
||||||
|
move by a user, or if freezing of the cgroup races with fork().
|
||||||
|
If a process is moved to a frozen cgroup, it stops. If a process is
|
||||||
|
moved out of a frozen cgroup, it becomes running.
|
||||||
|
|
||||||
|
Frozen status of a cgroup doesn't affect any cgroup tree operations:
|
||||||
|
it's possible to delete a frozen (and empty) cgroup, as well as
|
||||||
|
create new sub-cgroups.
|
||||||
|
|
||||||
Controllers
|
Controllers
|
||||||
===========
|
===========
|
||||||
|
@@ -91,10 +91,48 @@ Currently Available
|
|||||||
* large block (up to pagesize) support
|
* large block (up to pagesize) support
|
||||||
* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force
|
* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force
|
||||||
the ordering)
|
the ordering)
|
||||||
|
* Case-insensitive file name lookups
|
||||||
|
|
||||||
[1] Filesystems with a block size of 1k may see a limit imposed by the
|
[1] Filesystems with a block size of 1k may see a limit imposed by the
|
||||||
directory hash tree having a maximum depth of two.
|
directory hash tree having a maximum depth of two.
|
||||||
|
|
||||||
|
case-insensitive file name lookups
|
||||||
|
======================================================
|
||||||
|
|
||||||
|
The case-insensitive file name lookup feature is supported on a
|
||||||
|
per-directory basis, allowing the user to mix case-insensitive and
|
||||||
|
case-sensitive directories in the same filesystem. It is enabled by
|
||||||
|
flipping the +F inode attribute of an empty directory. The
|
||||||
|
case-insensitive string match operation is only defined when we know how
|
||||||
|
text is encoded in a byte sequence. For that reason, in order to enable
|
||||||
|
case-insensitive directories, the filesystem must have the
|
||||||
|
casefold feature, which stores the filesystem-wide encoding
|
||||||
|
model used. By default, the charset adopted is the latest version of
|
||||||
|
Unicode (12.1.0, by the time of this writing), encoded in the UTF-8
|
||||||
|
form. The comparison algorithm is implemented by normalizing the
|
||||||
|
strings to the Canonical decomposition form, as defined by Unicode,
|
||||||
|
followed by a byte per byte comparison.
|
||||||
|
|
||||||
|
The case-awareness is name-preserving on the disk, meaning that the file
|
||||||
|
name provided by userspace is a byte-per-byte match to what is actually
|
||||||
|
written on the disk. The Unicode normalization format used by the
|
||||||
|
kernel is thus an internal representation, and not exposed to the
|
||||||
|
userspace nor to the disk, with the important exception of disk hashes,
|
||||||
|
used on large case-insensitive directories with DX feature. On DX
|
||||||
|
directories, the hash must be calculated using the casefolded version of
|
||||||
|
the filename, meaning that the normalization format used actually has an
|
||||||
|
impact on where the directory entry is stored.
|
||||||
|
|
||||||
|
When we change from viewing filenames as opaque byte sequences to seeing
|
||||||
|
them as encoded strings we need to address what happens when a program
|
||||||
|
tries to create a file with an invalid name. The Unicode subsystem
|
||||||
|
within the kernel leaves the decision of what to do in this case to the
|
||||||
|
filesystem, which selects its preferred behavior by enabling/disabling
|
||||||
|
the strict mode. When Ext4 encounters one of those strings and the
|
||||||
|
filesystem did not require strict mode, it falls back to considering the
|
||||||
|
entire string as an opaque byte sequence, which still allows the user to
|
||||||
|
operate on that file, but the case-insensitive lookups won't work.
|
||||||
|
|
||||||
Options
|
Options
|
||||||
=======
|
=======
|
||||||
|
|
||||||
|
13
Documentation/admin-guide/hw-vuln/index.rst
Normal file
13
Documentation/admin-guide/hw-vuln/index.rst
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
========================
|
||||||
|
Hardware vulnerabilities
|
||||||
|
========================
|
||||||
|
|
||||||
|
This section describes CPU vulnerabilities and provides an overview of the
|
||||||
|
possible mitigations along with guidance for selecting mitigations if they
|
||||||
|
are configurable at compile, boot or run time.
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
|
||||||
|
l1tf
|
||||||
|
mds
|
@@ -445,6 +445,7 @@ The default is 'cond'. If 'l1tf=full,force' is given on the kernel command
|
|||||||
line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush
|
line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush
|
||||||
module parameter is ignored and writes to the sysfs file are rejected.
|
module parameter is ignored and writes to the sysfs file are rejected.
|
||||||
|
|
||||||
|
.. _mitigation_selection:
|
||||||
|
|
||||||
Mitigation selection guide
|
Mitigation selection guide
|
||||||
--------------------------
|
--------------------------
|
308
Documentation/admin-guide/hw-vuln/mds.rst
Normal file
308
Documentation/admin-guide/hw-vuln/mds.rst
Normal file
@@ -0,0 +1,308 @@
|
|||||||
|
MDS - Microarchitectural Data Sampling
|
||||||
|
======================================
|
||||||
|
|
||||||
|
Microarchitectural Data Sampling is a hardware vulnerability which allows
|
||||||
|
unprivileged speculative access to data which is available in various CPU
|
||||||
|
internal buffers.
|
||||||
|
|
||||||
|
Affected processors
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
This vulnerability affects a wide range of Intel processors. The
|
||||||
|
vulnerability is not present on:
|
||||||
|
|
||||||
|
- Processors from AMD, Centaur and other non Intel vendors
|
||||||
|
|
||||||
|
- Older processor models, where the CPU family is < 6
|
||||||
|
|
||||||
|
- Some Atoms (Bonnell, Saltwell, Goldmont, GoldmontPlus)
|
||||||
|
|
||||||
|
- Intel processors which have the ARCH_CAP_MDS_NO bit set in the
|
||||||
|
IA32_ARCH_CAPABILITIES MSR.
|
||||||
|
|
||||||
|
Whether a processor is affected or not can be read out from the MDS
|
||||||
|
vulnerability file in sysfs. See :ref:`mds_sys_info`.
|
||||||
|
|
||||||
|
Not all processors are affected by all variants of MDS, but the mitigation
|
||||||
|
is identical for all of them so the kernel treats them as a single
|
||||||
|
vulnerability.
|
||||||
|
|
||||||
|
Related CVEs
|
||||||
|
------------
|
||||||
|
|
||||||
|
The following CVE entries are related to the MDS vulnerability:
|
||||||
|
|
||||||
|
============== ===== ===================================================
|
||||||
|
CVE-2018-12126 MSBDS Microarchitectural Store Buffer Data Sampling
|
||||||
|
CVE-2018-12130 MFBDS Microarchitectural Fill Buffer Data Sampling
|
||||||
|
CVE-2018-12127 MLPDS Microarchitectural Load Port Data Sampling
|
||||||
|
CVE-2019-11091 MDSUM Microarchitectural Data Sampling Uncacheable Memory
|
||||||
|
============== ===== ===================================================
|
||||||
|
|
||||||
|
Problem
|
||||||
|
-------
|
||||||
|
|
||||||
|
When performing store, load, L1 refill operations, processors write data
|
||||||
|
into temporary microarchitectural structures (buffers). The data in the
|
||||||
|
buffer can be forwarded to load operations as an optimization.
|
||||||
|
|
||||||
|
Under certain conditions, usually a fault/assist caused by a load
|
||||||
|
operation, data unrelated to the load memory address can be speculatively
|
||||||
|
forwarded from the buffers. Because the load operation causes a fault or
|
||||||
|
assist and its result will be discarded, the forwarded data will not cause
|
||||||
|
incorrect program execution or state changes. But a malicious operation
|
||||||
|
may be able to forward this speculative data to a disclosure gadget which
|
||||||
|
allows in turn to infer the value via a cache side channel attack.
|
||||||
|
|
||||||
|
Because the buffers are potentially shared between Hyper-Threads cross
|
||||||
|
Hyper-Thread attacks are possible.
|
||||||
|
|
||||||
|
Deeper technical information is available in the MDS specific x86
|
||||||
|
architecture section: :ref:`Documentation/x86/mds.rst <mds>`.
|
||||||
|
|
||||||
|
|
||||||
|
Attack scenarios
|
||||||
|
----------------
|
||||||
|
|
||||||
|
Attacks against the MDS vulnerabilities can be mounted from malicious non
|
||||||
|
privileged user space applications running on hosts or guest. Malicious
|
||||||
|
guest OSes can obviously mount attacks as well.
|
||||||
|
|
||||||
|
Contrary to other speculation based vulnerabilities the MDS vulnerability
|
||||||
|
does not allow the attacker to control the memory target address. As a
|
||||||
|
consequence the attacks are purely sampling based, but as demonstrated with
|
||||||
|
the TLBleed attack samples can be postprocessed successfully.
|
||||||
|
|
||||||
|
Web-Browsers
|
||||||
|
^^^^^^^^^^^^
|
||||||
|
|
||||||
|
It's unclear whether attacks through Web-Browsers are possible at
|
||||||
|
all. The exploitation through Java-Script is considered very unlikely,
|
||||||
|
but other widely used web technologies like Webassembly could possibly be
|
||||||
|
abused.
|
||||||
|
|
||||||
|
|
||||||
|
.. _mds_sys_info:
|
||||||
|
|
||||||
|
MDS system information
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
The Linux kernel provides a sysfs interface to enumerate the current MDS
|
||||||
|
status of the system: whether the system is vulnerable, and which
|
||||||
|
mitigations are active. The relevant sysfs file is:
|
||||||
|
|
||||||
|
/sys/devices/system/cpu/vulnerabilities/mds
|
||||||
|
|
||||||
|
The possible values in this file are:
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
|
||||||
|
* - 'Not affected'
|
||||||
|
- The processor is not vulnerable
|
||||||
|
* - 'Vulnerable'
|
||||||
|
- The processor is vulnerable, but no mitigation enabled
|
||||||
|
* - 'Vulnerable: Clear CPU buffers attempted, no microcode'
|
||||||
|
- The processor is vulnerable but microcode is not updated.
|
||||||
|
|
||||||
|
The mitigation is enabled on a best effort basis. See :ref:`vmwerv`
|
||||||
|
* - 'Mitigation: Clear CPU buffers'
|
||||||
|
- The processor is vulnerable and the CPU buffer clearing mitigation is
|
||||||
|
enabled.
|
||||||
|
|
||||||
|
If the processor is vulnerable then the following information is appended
|
||||||
|
to the above information:
|
||||||
|
|
||||||
|
======================== ============================================
|
||||||
|
'SMT vulnerable' SMT is enabled
|
||||||
|
'SMT mitigated' SMT is enabled and mitigated
|
||||||
|
'SMT disabled' SMT is disabled
|
||||||
|
'SMT Host state unknown' Kernel runs in a VM, Host SMT state unknown
|
||||||
|
======================== ============================================
|
||||||
|
|
||||||
|
.. _vmwerv:
|
||||||
|
|
||||||
|
Best effort mitigation mode
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
If the processor is vulnerable, but the availability of the microcode based
|
||||||
|
mitigation mechanism is not advertised via CPUID the kernel selects a best
|
||||||
|
effort mitigation mode. This mode invokes the mitigation instructions
|
||||||
|
without a guarantee that they clear the CPU buffers.
|
||||||
|
|
||||||
|
This is done to address virtualization scenarios where the host has the
|
||||||
|
microcode update applied, but the hypervisor is not yet updated to expose
|
||||||
|
the CPUID to the guest. If the host has updated microcode the protection
|
||||||
|
takes effect, otherwise a few CPU cycles are wasted pointlessly.
|
||||||
|
|
||||||
|
The state in the mds sysfs file reflects this situation accordingly.
|
||||||
|
|
||||||
|
|
||||||
|
Mitigation mechanism
|
||||||
|
-------------------------
|
||||||
|
|
||||||
|
The kernel detects the affected CPUs and the presence of the microcode
|
||||||
|
which is required.
|
||||||
|
|
||||||
|
If a CPU is affected and the microcode is available, then the kernel
|
||||||
|
enables the mitigation by default. The mitigation can be controlled at boot
|
||||||
|
time via a kernel command line option. See
|
||||||
|
:ref:`mds_mitigation_control_command_line`.
|
||||||
|
|
||||||
|
.. _cpu_buffer_clear:
|
||||||
|
|
||||||
|
CPU buffer clearing
|
||||||
|
^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The mitigation for MDS clears the affected CPU buffers on return to user
|
||||||
|
space and when entering a guest.
|
||||||
|
|
||||||
|
If SMT is enabled it also clears the buffers on idle entry when the CPU
|
||||||
|
is only affected by MSBDS and not any other MDS variant, because the
|
||||||
|
other variants cannot be protected against cross Hyper-Thread attacks.
|
||||||
|
|
||||||
|
For CPUs which are only affected by MSBDS the user space, guest and idle
|
||||||
|
transition mitigations are sufficient and SMT is not affected.
|
||||||
|
|
||||||
|
.. _virt_mechanism:
|
||||||
|
|
||||||
|
Virtualization mitigation
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The protection for host to guest transition depends on the L1TF
|
||||||
|
vulnerability of the CPU:
|
||||||
|
|
||||||
|
- CPU is affected by L1TF:
|
||||||
|
|
||||||
|
If the L1D flush mitigation is enabled and up to date microcode is
|
||||||
|
available, the L1D flush mitigation is automatically protecting the
|
||||||
|
guest transition.
|
||||||
|
|
||||||
|
If the L1D flush mitigation is disabled then the MDS mitigation is
|
||||||
|
invoked explicitly when the host MDS mitigation is enabled.
|
||||||
|
|
||||||
|
For details on L1TF and virtualization see:
|
||||||
|
:ref:`Documentation/admin-guide/hw-vuln/l1tf.rst <mitigation_control_kvm>`.
|
||||||
|
|
||||||
|
- CPU is not affected by L1TF:
|
||||||
|
|
||||||
|
CPU buffers are flushed before entering the guest when the host MDS
|
||||||
|
mitigation is enabled.
|
||||||
|
|
||||||
|
The resulting MDS protection matrix for the host to guest transition:
|
||||||
|
|
||||||
|
============ ===== ============= ============ =================
|
||||||
|
L1TF MDS VMX-L1FLUSH Host MDS MDS-State
|
||||||
|
|
||||||
|
Don't care No Don't care N/A Not affected
|
||||||
|
|
||||||
|
Yes Yes Disabled Off Vulnerable
|
||||||
|
|
||||||
|
Yes Yes Disabled Full Mitigated
|
||||||
|
|
||||||
|
Yes Yes Enabled Don't care Mitigated
|
||||||
|
|
||||||
|
No Yes N/A Off Vulnerable
|
||||||
|
|
||||||
|
No Yes N/A Full Mitigated
|
||||||
|
============ ===== ============= ============ =================
|
||||||
|
|
||||||
|
This only covers the host to guest transition, i.e. prevents leakage from
|
||||||
|
host to guest, but does not protect the guest internally. Guests need to
|
||||||
|
have their own protections.
|
||||||
|
|
||||||
|
.. _xeon_phi:
|
||||||
|
|
||||||
|
XEON PHI specific considerations
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The XEON PHI processor family is affected by MSBDS which can be exploited
|
||||||
|
cross Hyper-Threads when entering idle states. Some XEON PHI variants allow
|
||||||
|
to use MWAIT in user space (Ring 3) which opens a potential attack vector
|
||||||
|
for malicious user space. The exposure can be disabled on the kernel
|
||||||
|
command line with the 'ring3mwait=disable' command line option.
|
||||||
|
|
||||||
|
XEON PHI is not affected by the other MDS variants and MSBDS is mitigated
|
||||||
|
before the CPU enters an idle state. As XEON PHI is not affected by L1TF
|
||||||
|
either disabling SMT is not required for full protection.
|
||||||
|
|
||||||
|
.. _mds_smt_control:
|
||||||
|
|
||||||
|
SMT control
|
||||||
|
^^^^^^^^^^^
|
||||||
|
|
||||||
|
All MDS variants except MSBDS can be attacked cross Hyper-Threads. That
|
||||||
|
means on CPUs which are affected by MFBDS or MLPDS it is necessary to
|
||||||
|
disable SMT for full protection. These are most of the affected CPUs; the
|
||||||
|
exception is XEON PHI, see :ref:`xeon_phi`.
|
||||||
|
|
||||||
|
Disabling SMT can have a significant performance impact, but the impact
|
||||||
|
depends on the type of workloads.
|
||||||
|
|
||||||
|
See the relevant chapter in the L1TF mitigation documentation for details:
|
||||||
|
:ref:`Documentation/admin-guide/hw-vuln/l1tf.rst <smt_control>`.
|
||||||
|
|
||||||
|
|
||||||
|
.. _mds_mitigation_control_command_line:
|
||||||
|
|
||||||
|
Mitigation control on the kernel command line
|
||||||
|
---------------------------------------------
|
||||||
|
|
||||||
|
The kernel command line allows to control the MDS mitigations at boot
|
||||||
|
time with the option "mds=". The valid arguments for this option are:
|
||||||
|
|
||||||
|
============ =============================================================
|
||||||
|
full If the CPU is vulnerable, enable all available mitigations
|
||||||
|
for the MDS vulnerability, CPU buffer clearing on exit to
|
||||||
|
userspace and when entering a VM. Idle transitions are
|
||||||
|
protected as well if SMT is enabled.
|
||||||
|
|
||||||
|
It does not automatically disable SMT.
|
||||||
|
|
||||||
|
full,nosmt The same as mds=full, with SMT disabled on vulnerable
|
||||||
|
CPUs. This is the complete mitigation.
|
||||||
|
|
||||||
|
off Disables MDS mitigations completely.
|
||||||
|
|
||||||
|
============ =============================================================
|
||||||
|
|
||||||
|
Not specifying this option is equivalent to "mds=full".
|
||||||
|
|
||||||
|
|
||||||
|
Mitigation selection guide
|
||||||
|
--------------------------
|
||||||
|
|
||||||
|
1. Trusted userspace
|
||||||
|
^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
If all userspace applications are from a trusted source and do not
|
||||||
|
execute untrusted code which is supplied externally, then the mitigation
|
||||||
|
can be disabled.
|
||||||
|
|
||||||
|
|
||||||
|
2. Virtualization with trusted guests
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The same considerations as above versus trusted user space apply.
|
||||||
|
|
||||||
|
3. Virtualization with untrusted guests
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The protection depends on the state of the L1TF mitigations.
|
||||||
|
See :ref:`virt_mechanism`.
|
||||||
|
|
||||||
|
If the MDS mitigation is enabled and SMT is disabled, guest to host and
|
||||||
|
guest to guest attacks are prevented.
|
||||||
|
|
||||||
|
.. _mds_default_mitigations:
|
||||||
|
|
||||||
|
Default mitigations
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
The kernel default mitigations for vulnerable processors are:
|
||||||
|
|
||||||
|
- Enable CPU buffer clearing
|
||||||
|
|
||||||
|
The kernel does not by default enforce the disabling of SMT, which leaves
|
||||||
|
SMT systems vulnerable when running untrusted code. The same rationale as
|
||||||
|
for L1TF applies.
|
||||||
|
See :ref:`Documentation/admin-guide/hw-vuln/l1tf.rst <default_mitigations>`.
|
@@ -17,14 +17,12 @@ etc.
|
|||||||
kernel-parameters
|
kernel-parameters
|
||||||
devices
|
devices
|
||||||
|
|
||||||
This section describes CPU vulnerabilities and provides an overview of the
|
This section describes CPU vulnerabilities and their mitigations.
|
||||||
possible mitigations along with guidance for selecting mitigations if they
|
|
||||||
are configurable at compile, boot or run time.
|
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 1
|
:maxdepth: 1
|
||||||
|
|
||||||
l1tf
|
hw-vuln/index
|
||||||
|
|
||||||
Here is a set of documents aimed at users who are trying to track down
|
Here is a set of documents aimed at users who are trying to track down
|
||||||
problems and bugs in particular.
|
problems and bugs in particular.
|
||||||
@@ -77,6 +75,7 @@ configure specific aspects of kernel behavior to your liking.
|
|||||||
LSM/index
|
LSM/index
|
||||||
mm/index
|
mm/index
|
||||||
perf-security
|
perf-security
|
||||||
|
acpi/index
|
||||||
|
|
||||||
.. only:: subproject and html
|
.. only:: subproject and html
|
||||||
|
|
||||||
|
@@ -88,6 +88,7 @@ parameter is applicable::
|
|||||||
APIC APIC support is enabled.
|
APIC APIC support is enabled.
|
||||||
APM Advanced Power Management support is enabled.
|
APM Advanced Power Management support is enabled.
|
||||||
ARM ARM architecture is enabled.
|
ARM ARM architecture is enabled.
|
||||||
|
ARM64 ARM64 architecture is enabled.
|
||||||
AX25 Appropriate AX.25 support is enabled.
|
AX25 Appropriate AX.25 support is enabled.
|
||||||
CLK Common clock infrastructure is enabled.
|
CLK Common clock infrastructure is enabled.
|
||||||
CMA Contiguous Memory Area support is enabled.
|
CMA Contiguous Memory Area support is enabled.
|
||||||
|
@@ -704,8 +704,11 @@
|
|||||||
upon panic. This parameter reserves the physical
|
upon panic. This parameter reserves the physical
|
||||||
memory region [offset, offset + size] for that kernel
|
memory region [offset, offset + size] for that kernel
|
||||||
image. If '@offset' is omitted, then a suitable offset
|
image. If '@offset' is omitted, then a suitable offset
|
||||||
is selected automatically. Check
|
is selected automatically.
|
||||||
Documentation/kdump/kdump.txt for further details.
|
[KNL, x86_64] select a region under 4G first, and
|
||||||
|
fall back to reserve region above 4G when '@offset'
|
||||||
|
hasn't been specified.
|
||||||
|
See Documentation/kdump/kdump.txt for further details.
|
||||||
|
|
||||||
crashkernel=range1:size1[,range2:size2,...][@offset]
|
crashkernel=range1:size1[,range2:size2,...][@offset]
|
||||||
[KNL] Same as above, but depends on the memory
|
[KNL] Same as above, but depends on the memory
|
||||||
@@ -1585,7 +1588,7 @@
|
|||||||
Format: { "off" | "enforce" | "fix" | "log" }
|
Format: { "off" | "enforce" | "fix" | "log" }
|
||||||
default: "enforce"
|
default: "enforce"
|
||||||
|
|
||||||
ima_appraise_tcb [IMA]
|
ima_appraise_tcb [IMA] Deprecated. Use ima_policy= instead.
|
||||||
The builtin appraise policy appraises all files
|
The builtin appraise policy appraises all files
|
||||||
owned by uid=0.
|
owned by uid=0.
|
||||||
|
|
||||||
@@ -1612,8 +1615,7 @@
|
|||||||
uid=0.
|
uid=0.
|
||||||
|
|
||||||
The "appraise_tcb" policy appraises the integrity of
|
The "appraise_tcb" policy appraises the integrity of
|
||||||
all files owned by root. (This is the equivalent
|
all files owned by root.
|
||||||
of ima_appraise_tcb.)
|
|
||||||
|
|
||||||
The "secure_boot" policy appraises the integrity
|
The "secure_boot" policy appraises the integrity
|
||||||
of files (eg. kexec kernel image, kernel modules,
|
of files (eg. kexec kernel image, kernel modules,
|
||||||
@@ -1828,6 +1830,9 @@
|
|||||||
ip= [IP_PNP]
|
ip= [IP_PNP]
|
||||||
See Documentation/filesystems/nfs/nfsroot.txt.
|
See Documentation/filesystems/nfs/nfsroot.txt.
|
||||||
|
|
||||||
|
ipcmni_extend [KNL] Extend the maximum number of unique System V
|
||||||
|
IPC identifiers from 32,768 to 16,777,216.
|
||||||
|
|
||||||
irqaffinity= [SMP] Set the default irq affinity mask
|
irqaffinity= [SMP] Set the default irq affinity mask
|
||||||
The argument is a cpu list, as described above.
|
The argument is a cpu list, as described above.
|
||||||
|
|
||||||
@@ -2141,7 +2146,7 @@
|
|||||||
|
|
||||||
Default is 'flush'.
|
Default is 'flush'.
|
||||||
|
|
||||||
For details see: Documentation/admin-guide/l1tf.rst
|
For details see: Documentation/admin-guide/hw-vuln/l1tf.rst
|
||||||
|
|
||||||
l2cr= [PPC]
|
l2cr= [PPC]
|
||||||
|
|
||||||
@@ -2387,6 +2392,32 @@
|
|||||||
Format: <first>,<last>
|
Format: <first>,<last>
|
||||||
Specifies range of consoles to be captured by the MDA.
|
Specifies range of consoles to be captured by the MDA.
|
||||||
|
|
||||||
|
mds= [X86,INTEL]
|
||||||
|
Control mitigation for the Micro-architectural Data
|
||||||
|
Sampling (MDS) vulnerability.
|
||||||
|
|
||||||
|
Certain CPUs are vulnerable to an exploit against CPU
|
||||||
|
internal buffers which can forward information to a
|
||||||
|
disclosure gadget under certain conditions.
|
||||||
|
|
||||||
|
In vulnerable processors, the speculatively
|
||||||
|
forwarded data can be used in a cache side channel
|
||||||
|
attack, to access data to which the attacker does
|
||||||
|
not have direct access.
|
||||||
|
|
||||||
|
This parameter controls the MDS mitigation. The
|
||||||
|
options are:
|
||||||
|
|
||||||
|
full - Enable MDS mitigation on vulnerable CPUs
|
||||||
|
full,nosmt - Enable MDS mitigation and disable
|
||||||
|
SMT on vulnerable CPUs
|
||||||
|
off - Unconditionally disable MDS mitigation
|
||||||
|
|
||||||
|
Not specifying this option is equivalent to
|
||||||
|
mds=full.
|
||||||
|
|
||||||
|
For details see: Documentation/admin-guide/hw-vuln/mds.rst
|
||||||
|
|
||||||
mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory
|
mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory
|
||||||
Amount of memory to be used when the kernel is not able
|
Amount of memory to be used when the kernel is not able
|
||||||
to see the whole system memory or for test.
|
to see the whole system memory or for test.
|
||||||
@@ -2544,6 +2575,42 @@
|
|||||||
in the "bleeding edge" mini2440 support kernel at
|
in the "bleeding edge" mini2440 support kernel at
|
||||||
http://repo.or.cz/w/linux-2.6/mini2440.git
|
http://repo.or.cz/w/linux-2.6/mini2440.git
|
||||||
|
|
||||||
|
mitigations=
|
||||||
|
[X86,PPC,S390,ARM64] Control optional mitigations for
|
||||||
|
CPU vulnerabilities. This is a set of curated,
|
||||||
|
arch-independent options, each of which is an
|
||||||
|
aggregation of existing arch-specific options.
|
||||||
|
|
||||||
|
off
|
||||||
|
Disable all optional CPU mitigations. This
|
||||||
|
improves system performance, but it may also
|
||||||
|
expose users to several CPU vulnerabilities.
|
||||||
|
Equivalent to: nopti [X86,PPC]
|
||||||
|
kpti=0 [ARM64]
|
||||||
|
nospectre_v1 [PPC]
|
||||||
|
nobp=0 [S390]
|
||||||
|
nospectre_v2 [X86,PPC,S390,ARM64]
|
||||||
|
spectre_v2_user=off [X86]
|
||||||
|
spec_store_bypass_disable=off [X86,PPC]
|
||||||
|
ssbd=force-off [ARM64]
|
||||||
|
l1tf=off [X86]
|
||||||
|
mds=off [X86]
|
||||||
|
|
||||||
|
auto (default)
|
||||||
|
Mitigate all CPU vulnerabilities, but leave SMT
|
||||||
|
enabled, even if it's vulnerable. This is for
|
||||||
|
users who don't want to be surprised by SMT
|
||||||
|
getting disabled across kernel upgrades, or who
|
||||||
|
have other ways of avoiding SMT-based attacks.
|
||||||
|
Equivalent to: (default behavior)
|
||||||
|
|
||||||
|
auto,nosmt
|
||||||
|
Mitigate all CPU vulnerabilities, disabling SMT
|
||||||
|
if needed. This is for users who always want to
|
||||||
|
be fully mitigated, even if it means losing SMT.
|
||||||
|
Equivalent to: l1tf=flush,nosmt [X86]
|
||||||
|
mds=full,nosmt [X86]
|
||||||
|
|
||||||
mminit_loglevel=
|
mminit_loglevel=
|
||||||
[KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
|
[KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
|
||||||
parameter allows control of the logging verbosity for
|
parameter allows control of the logging verbosity for
|
||||||
@@ -2839,11 +2906,11 @@
|
|||||||
noexec=on: enable non-executable mappings (default)
|
noexec=on: enable non-executable mappings (default)
|
||||||
noexec=off: disable non-executable mappings
|
noexec=off: disable non-executable mappings
|
||||||
|
|
||||||
nosmap [X86]
|
nosmap [X86,PPC]
|
||||||
Disable SMAP (Supervisor Mode Access Prevention)
|
Disable SMAP (Supervisor Mode Access Prevention)
|
||||||
even if it is supported by processor.
|
even if it is supported by processor.
|
||||||
|
|
||||||
nosmep [X86]
|
nosmep [X86,PPC]
|
||||||
Disable SMEP (Supervisor Mode Execution Prevention)
|
Disable SMEP (Supervisor Mode Execution Prevention)
|
||||||
even if it is supported by processor.
|
even if it is supported by processor.
|
||||||
|
|
||||||
@@ -2873,10 +2940,10 @@
|
|||||||
check bypass). With this option data leaks are possible
|
check bypass). With this option data leaks are possible
|
||||||
in the system.
|
in the system.
|
||||||
|
|
||||||
nospectre_v2 [X86,PPC_FSL_BOOK3E] Disable all mitigations for the Spectre variant 2
|
nospectre_v2 [X86,PPC_FSL_BOOK3E,ARM64] Disable all mitigations for
|
||||||
(indirect branch prediction) vulnerability. System may
|
the Spectre variant 2 (indirect branch prediction)
|
||||||
allow data leaks with this option, which is equivalent
|
vulnerability. System may allow data leaks with this
|
||||||
to spectre_v2=off.
|
option.
|
||||||
|
|
||||||
nospec_store_bypass_disable
|
nospec_store_bypass_disable
|
||||||
[HW] Disable all mitigations for the Speculative Store Bypass vulnerability
|
[HW] Disable all mitigations for the Speculative Store Bypass vulnerability
|
||||||
@@ -3110,6 +3177,16 @@
|
|||||||
This will also cause panics on machine check exceptions.
|
This will also cause panics on machine check exceptions.
|
||||||
Useful together with panic=30 to trigger a reboot.
|
Useful together with panic=30 to trigger a reboot.
|
||||||
|
|
||||||
|
page_alloc.shuffle=
|
||||||
|
[KNL] Boolean flag to control whether the page allocator
|
||||||
|
should randomize its free lists. The randomization may
|
||||||
|
be automatically enabled if the kernel detects it is
|
||||||
|
running on a platform with a direct-mapped memory-side
|
||||||
|
cache, and this parameter can be used to
|
||||||
|
override/disable that behavior. The state of the flag
|
||||||
|
can be read from sysfs at:
|
||||||
|
/sys/module/page_alloc/parameters/shuffle.
|
||||||
|
|
||||||
page_owner= [KNL] Boot-time page_owner enabling option.
|
page_owner= [KNL] Boot-time page_owner enabling option.
|
||||||
Storage of the information about who allocated
|
Storage of the information about who allocated
|
||||||
each page is disabled in default. With this switch,
|
each page is disabled in default. With this switch,
|
||||||
@@ -3135,6 +3212,7 @@
|
|||||||
bit 2: print timer info
|
bit 2: print timer info
|
||||||
bit 3: print locks info if CONFIG_LOCKDEP is on
|
bit 3: print locks info if CONFIG_LOCKDEP is on
|
||||||
bit 4: print ftrace buffer
|
bit 4: print ftrace buffer
|
||||||
|
bit 5: print all printk messages in buffer
|
||||||
|
|
||||||
panic_on_warn panic() instead of WARN(). Useful to cause kdump
|
panic_on_warn panic() instead of WARN(). Useful to cause kdump
|
||||||
on a WARN().
|
on a WARN().
|
||||||
@@ -3394,6 +3472,8 @@
|
|||||||
bridges without forcing it upstream. Note:
|
bridges without forcing it upstream. Note:
|
||||||
this removes isolation between devices and
|
this removes isolation between devices and
|
||||||
may put more devices in an IOMMU group.
|
may put more devices in an IOMMU group.
|
||||||
|
force_floating [S390] Force usage of floating interrupts.
|
||||||
|
nomio [S390] Do not use MIO instructions.
|
||||||
|
|
||||||
pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power
|
pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power
|
||||||
Management.
|
Management.
|
||||||
@@ -3623,7 +3703,9 @@
|
|||||||
see CONFIG_RAS_CEC help text.
|
see CONFIG_RAS_CEC help text.
|
||||||
|
|
||||||
rcu_nocbs= [KNL]
|
rcu_nocbs= [KNL]
|
||||||
The argument is a cpu list, as described above.
|
The argument is a cpu list, as described above,
|
||||||
|
except that the string "all" can be used to
|
||||||
|
specify every CPU on the system.
|
||||||
|
|
||||||
In kernels built with CONFIG_RCU_NOCB_CPU=y, set
|
In kernels built with CONFIG_RCU_NOCB_CPU=y, set
|
||||||
the specified list of CPUs to be no-callback CPUs.
|
the specified list of CPUs to be no-callback CPUs.
|
||||||
@@ -3986,7 +4068,9 @@
|
|||||||
[[,]s[mp]#### \
|
[[,]s[mp]#### \
|
||||||
[[,]b[ios] | a[cpi] | k[bd] | t[riple] | e[fi] | p[ci]] \
|
[[,]b[ios] | a[cpi] | k[bd] | t[riple] | e[fi] | p[ci]] \
|
||||||
[[,]f[orce]
|
[[,]f[orce]
|
||||||
Where reboot_mode is one of warm (soft) or cold (hard) or gpio,
|
Where reboot_mode is one of warm (soft) or cold (hard) or gpio
|
||||||
|
(prefix with 'panic_' to set mode for panic
|
||||||
|
reboot only),
|
||||||
reboot_type is one of bios, acpi, kbd, triple, efi, or pci,
|
reboot_type is one of bios, acpi, kbd, triple, efi, or pci,
|
||||||
reboot_force is either force or not specified,
|
reboot_force is either force or not specified,
|
||||||
reboot_cpu is s[mp]#### with #### being the processor
|
reboot_cpu is s[mp]#### with #### being the processor
|
||||||
@@ -4703,6 +4787,10 @@
|
|||||||
[x86] unstable: mark the TSC clocksource as unstable, this
|
[x86] unstable: mark the TSC clocksource as unstable, this
|
||||||
marks the TSC unconditionally unstable at bootup and
|
marks the TSC unconditionally unstable at bootup and
|
||||||
avoids any further wobbles once the TSC watchdog notices.
|
avoids any further wobbles once the TSC watchdog notices.
|
||||||
|
[x86] nowatchdog: disable clocksource watchdog. Used
|
||||||
|
in situations with strict latency requirements (where
|
||||||
|
interruptions from clocksource watchdog are not
|
||||||
|
acceptable).
|
||||||
|
|
||||||
turbografx.map[2|3]= [HW,JOY]
|
turbografx.map[2|3]= [HW,JOY]
|
||||||
TurboGraFX parallel port interface
|
TurboGraFX parallel port interface
|
||||||
@@ -5173,6 +5261,13 @@
|
|||||||
with /sys/devices/system/xen_memory/xen_memory0/scrub_pages.
|
with /sys/devices/system/xen_memory/xen_memory0/scrub_pages.
|
||||||
Default value controlled with CONFIG_XEN_SCRUB_PAGES_DEFAULT.
|
Default value controlled with CONFIG_XEN_SCRUB_PAGES_DEFAULT.
|
||||||
|
|
||||||
|
xen_timer_slop= [X86-64,XEN]
|
||||||
|
Set the timer slop (in nanoseconds) for the virtual Xen
|
||||||
|
timers (default is 100000). This adjusts the minimum
|
||||||
|
delta of virtualized Xen timers, where lower values
|
||||||
|
improve timer resolution at the expense of processing
|
||||||
|
more timer interrupts.
|
||||||
|
|
||||||
xirc2ps_cs= [NET,PCMCIA]
|
xirc2ps_cs= [NET,PCMCIA]
|
||||||
Format:
|
Format:
|
||||||
<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
|
<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
|
||||||
|
169
Documentation/admin-guide/mm/numaperf.rst
Normal file
169
Documentation/admin-guide/mm/numaperf.rst
Normal file
@@ -0,0 +1,169 @@
|
|||||||
|
.. _numaperf:
|
||||||
|
|
||||||
|
=============
|
||||||
|
NUMA Locality
|
||||||
|
=============
|
||||||
|
|
||||||
|
Some platforms may have multiple types of memory attached to a compute
|
||||||
|
node. These disparate memory ranges may share some characteristics, such
|
||||||
|
as CPU cache coherence, but may have different performance. For example,
|
||||||
|
different media types and buses affect bandwidth and latency.
|
||||||
|
|
||||||
|
A system supports such heterogeneous memory by grouping each memory type
|
||||||
|
under different domains, or "nodes", based on locality and performance
|
||||||
|
characteristics. Some memory may share the same node as a CPU, and others
|
||||||
|
are provided as memory only nodes. While memory only nodes do not provide
|
||||||
|
CPUs, they may still be local to one or more compute nodes relative to
|
||||||
|
other nodes. The following diagram shows one such example of two compute
|
||||||
|
nodes with local memory and a memory only node for each compute node:
|
||||||
|
|
||||||
|
+------------------+ +------------------+
|
||||||
|
| Compute Node 0 +-----+ Compute Node 1 |
|
||||||
|
| Local Node0 Mem | | Local Node1 Mem |
|
||||||
|
+--------+---------+ +--------+---------+
|
||||||
|
| |
|
||||||
|
+--------+---------+ +--------+---------+
|
||||||
|
| Slower Node2 Mem | | Slower Node3 Mem |
|
||||||
|
+------------------+ +------------------+
|
||||||
|
|
||||||
|
A "memory initiator" is a node containing one or more devices such as
|
||||||
|
CPUs or separate memory I/O devices that can initiate memory requests.
|
||||||
|
A "memory target" is a node containing one or more physical address
|
||||||
|
ranges accessible from one or more memory initiators.
|
||||||
|
|
||||||
|
When multiple memory initiators exist, they may not all have the same
|
||||||
|
performance when accessing a given memory target. Each initiator-target
|
||||||
|
pair may be organized into different ranked access classes to represent
|
||||||
|
this relationship. The highest performing initiator to a given target
|
||||||
|
is considered to be one of that target's local initiators, and given
|
||||||
|
the highest access class, 0. Any given target may have one or more
|
||||||
|
local initiators, and any given initiator may have multiple local
|
||||||
|
memory targets.
|
||||||
|
|
||||||
|
To aid applications matching memory targets with their initiators, the
|
||||||
|
kernel provides symlinks to each other. The following example lists the
|
||||||
|
relationship for the access class "0" memory initiators and targets::
|
||||||
|
|
||||||
|
# symlinks -v /sys/devices/system/node/nodeX/access0/targets/
|
||||||
|
relative: /sys/devices/system/node/nodeX/access0/targets/nodeY -> ../../nodeY
|
||||||
|
|
||||||
|
# symlinks -v /sys/devices/system/node/nodeY/access0/initiators/
|
||||||
|
relative: /sys/devices/system/node/nodeY/access0/initiators/nodeX -> ../../nodeX
|
||||||
|
|
||||||
|
A memory initiator may have multiple memory targets in the same access
|
||||||
|
class. The target memory's initiators in a given class indicate the
|
||||||
|
nodes' access characteristics share the same performance relative to other
|
||||||
|
linked initiator nodes. Each target within an initiator's access class,
|
||||||
|
though, does not necessarily perform the same as each other.
|
||||||
|
|
||||||
|
================
|
||||||
|
NUMA Performance
|
||||||
|
================
|
||||||
|
|
||||||
|
Applications may wish to consider which node they want their memory to
|
||||||
|
be allocated from based on the node's performance characteristics. If
|
||||||
|
the system provides these attributes, the kernel exports them under the
|
||||||
|
node sysfs hierarchy by appending the attributes directory under the
|
||||||
|
memory node's access class 0 initiators as follows::
|
||||||
|
|
||||||
|
/sys/devices/system/node/nodeY/access0/initiators/
|
||||||
|
|
||||||
|
These attributes apply only when accessed from nodes that
|
||||||
|
are linked under this access's initiators.
|
||||||
|
|
||||||
|
The performance characteristics the kernel provides for the local initiators
|
||||||
|
are exported as follows::
|
||||||
|
|
||||||
|
# tree -P "read*|write*" /sys/devices/system/node/nodeY/access0/initiators/
|
||||||
|
/sys/devices/system/node/nodeY/access0/initiators/
|
||||||
|
|-- read_bandwidth
|
||||||
|
|-- read_latency
|
||||||
|
|-- write_bandwidth
|
||||||
|
`-- write_latency
|
||||||
|
|
||||||
|
The bandwidth attributes are provided in MiB/second.
|
||||||
|
|
||||||
|
The latency attributes are provided in nanoseconds.
|
||||||
|
|
||||||
|
The values reported here correspond to the rated latency and bandwidth
|
||||||
|
for the platform.
|
||||||
|
|
||||||
|
==========
|
||||||
|
NUMA Cache
|
||||||
|
==========
|
||||||
|
|
||||||
|
System memory may be constructed in a hierarchy of elements with various
|
||||||
|
performance characteristics in order to provide a large address space of
|
||||||
|
slower performing memory cached by a smaller higher performing memory. The
|
||||||
|
system physical addresses memory initiators are aware of are provided
|
||||||
|
by the last memory level in the hierarchy. The system meanwhile uses
|
||||||
|
higher performing memory to transparently cache access to progressively
|
||||||
|
slower levels.
|
||||||
|
|
||||||
|
The term "far memory" is used to denote the last level memory in the
|
||||||
|
hierarchy. Each increasing cache level provides higher performing
|
||||||
|
initiator access, and the term "near memory" represents the fastest
|
||||||
|
cache provided by the system.
|
||||||
|
|
||||||
|
This numbering is different than CPU caches where the cache level (ex:
|
||||||
|
L1, L2, L3) uses the CPU-side view where each increased level is lower
|
||||||
|
performing. In contrast, the memory cache level is centric to the last
|
||||||
|
level memory, so the higher numbered cache level corresponds to memory
|
||||||
|
nearer to the CPU, and further from far memory.
|
||||||
|
|
||||||
|
The memory-side caches are not directly addressable by software. When
|
||||||
|
software accesses a system address, the system will return it from the
|
||||||
|
near memory cache if it is present. If it is not present, the system
|
||||||
|
accesses the next level of memory until there is either a hit in that
|
||||||
|
cache level, or it reaches far memory.
|
||||||
|
|
||||||
|
An application does not need to know about caching attributes in order
|
||||||
|
to use the system. Software may optionally query the memory cache
|
||||||
|
attributes in order to maximize the performance out of such a setup.
|
||||||
|
If the system provides a way for the kernel to discover this information,
|
||||||
|
for example with ACPI HMAT (Heterogeneous Memory Attribute Table),
|
||||||
|
the kernel will append these attributes to the NUMA node memory target.
|
||||||
|
|
||||||
|
When the kernel first registers a memory cache with a node, the kernel
|
||||||
|
will create the following directory::
|
||||||
|
|
||||||
|
/sys/devices/system/node/nodeX/memory_side_cache/
|
||||||
|
|
||||||
|
If that directory is not present, the system either does not provide
|
||||||
|
a memory-side cache, or that information is not accessible to the kernel.
|
||||||
|
|
||||||
|
The attributes for each level of cache are provided under its cache
|
||||||
|
level index::
|
||||||
|
|
||||||
|
/sys/devices/system/node/nodeX/memory_side_cache/indexA/
|
||||||
|
/sys/devices/system/node/nodeX/memory_side_cache/indexB/
|
||||||
|
/sys/devices/system/node/nodeX/memory_side_cache/indexC/
|
||||||
|
|
||||||
|
Each cache level's directory provides its attributes. For example, the
|
||||||
|
following shows a single cache level and the attributes available for
|
||||||
|
software to query::
|
||||||
|
|
||||||
|
# tree /sys/devices/system/node/node0/memory_side_cache/
|
||||||
|
/sys/devices/system/node/node0/memory_side_cache/
|
||||||
|
|-- index1
|
||||||
|
| |-- indexing
|
||||||
|
| |-- line_size
|
||||||
|
| |-- size
|
||||||
|
| `-- write_policy
|
||||||
|
|
||||||
|
The "indexing" will be 0 if it is a direct-mapped cache, and non-zero
|
||||||
|
for any other index-based, multi-way associativity.
|
||||||
|
|
||||||
|
The "line_size" is the number of bytes accessed from the next cache
|
||||||
|
level on a miss.
|
||||||
|
|
||||||
|
The "size" is the number of bytes provided by this cache level.
|
||||||
|
|
||||||
|
The "write_policy" will be 0 for write-back, and non-zero for
|
||||||
|
write-through caching.
|
||||||
|
|
||||||
|
========
|
||||||
|
See Also
|
||||||
|
========
|
||||||
|
.. [1] https://www.uefi.org/sites/default/files/resources/ACPI_6_2.pdf
|
||||||
|
Section 5.2.27
|
@@ -1,3 +1,6 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
.. include:: <isonum.txt>
|
||||||
|
|
||||||
.. |struct cpufreq_policy| replace:: :c:type:`struct cpufreq_policy <cpufreq_policy>`
|
.. |struct cpufreq_policy| replace:: :c:type:`struct cpufreq_policy <cpufreq_policy>`
|
||||||
.. |intel_pstate| replace:: :doc:`intel_pstate <intel_pstate>`
|
.. |intel_pstate| replace:: :doc:`intel_pstate <intel_pstate>`
|
||||||
|
|
||||||
@@ -5,9 +8,10 @@
|
|||||||
CPU Performance Scaling
|
CPU Performance Scaling
|
||||||
=======================
|
=======================
|
||||||
|
|
||||||
::
|
:Copyright: |copy| 2017 Intel Corporation
|
||||||
|
|
||||||
|
:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||||
|
|
||||||
Copyright (c) 2017 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
|
||||||
|
|
||||||
The Concept of CPU Performance Scaling
|
The Concept of CPU Performance Scaling
|
||||||
======================================
|
======================================
|
||||||
@@ -396,8 +400,8 @@ RT or deadline scheduling classes, the governor will increase the frequency to
|
|||||||
the allowed maximum (that is, the ``scaling_max_freq`` policy limit). In turn,
|
the allowed maximum (that is, the ``scaling_max_freq`` policy limit). In turn,
|
||||||
if it is invoked by the CFS scheduling class, the governor will use the
|
if it is invoked by the CFS scheduling class, the governor will use the
|
||||||
Per-Entity Load Tracking (PELT) metric for the root control group of the
|
Per-Entity Load Tracking (PELT) metric for the root control group of the
|
||||||
given CPU as the CPU utilization estimate (see the `Per-entity load tracking`_
|
given CPU as the CPU utilization estimate (see the *Per-entity load tracking*
|
||||||
LWN.net article for a description of the PELT mechanism). Then, the new
|
LWN.net article [1]_ for a description of the PELT mechanism). Then, the new
|
||||||
CPU frequency to apply is computed in accordance with the formula
|
CPU frequency to apply is computed in accordance with the formula
|
||||||
|
|
||||||
f = 1.25 * ``f_0`` * ``util`` / ``max``
|
f = 1.25 * ``f_0`` * ``util`` / ``max``
|
||||||
@@ -698,4 +702,8 @@ hardware feature (e.g. all Intel ones), even if the
|
|||||||
:c:macro:`CONFIG_X86_ACPI_CPUFREQ_CPB` configuration option is set.
|
:c:macro:`CONFIG_X86_ACPI_CPUFREQ_CPB` configuration option is set.
|
||||||
|
|
||||||
|
|
||||||
.. _Per-entity load tracking: https://lwn.net/Articles/531853/
|
References
|
||||||
|
==========
|
||||||
|
|
||||||
|
.. [1] Jonathan Corbet, *Per-entity load tracking*,
|
||||||
|
https://lwn.net/Articles/531853/
|
||||||
|
@@ -1,3 +1,6 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
.. include:: <isonum.txt>
|
||||||
|
|
||||||
.. |struct cpuidle_state| replace:: :c:type:`struct cpuidle_state <cpuidle_state>`
|
.. |struct cpuidle_state| replace:: :c:type:`struct cpuidle_state <cpuidle_state>`
|
||||||
.. |cpufreq| replace:: :doc:`CPU Performance Scaling <cpufreq>`
|
.. |cpufreq| replace:: :doc:`CPU Performance Scaling <cpufreq>`
|
||||||
|
|
||||||
@@ -5,9 +8,10 @@
|
|||||||
CPU Idle Time Management
|
CPU Idle Time Management
|
||||||
========================
|
========================
|
||||||
|
|
||||||
::
|
:Copyright: |copy| 2018 Intel Corporation
|
||||||
|
|
||||||
|
:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||||
|
|
||||||
Copyright (c) 2018 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
|
||||||
|
|
||||||
Concepts
|
Concepts
|
||||||
========
|
========
|
||||||
|
@@ -1,3 +1,5 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
================
|
================
|
||||||
Power Management
|
Power Management
|
||||||
================
|
================
|
||||||
|
41
Documentation/admin-guide/pm/intel_epb.rst
Normal file
41
Documentation/admin-guide/pm/intel_epb.rst
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
.. include:: <isonum.txt>
|
||||||
|
|
||||||
|
======================================
|
||||||
|
Intel Performance and Energy Bias Hint
|
||||||
|
======================================
|
||||||
|
|
||||||
|
:Copyright: |copy| 2019 Intel Corporation
|
||||||
|
|
||||||
|
:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||||
|
|
||||||
|
|
||||||
|
.. kernel-doc:: arch/x86/kernel/cpu/intel_epb.c
|
||||||
|
:doc: overview
|
||||||
|
|
||||||
|
Intel Performance and Energy Bias Attribute in ``sysfs``
|
||||||
|
========================================================
|
||||||
|
|
||||||
|
The Intel Performance and Energy Bias Hint (EPB) value for a given (logical) CPU
|
||||||
|
can be checked or updated through a ``sysfs`` attribute (file) under
|
||||||
|
:file:`/sys/devices/system/cpu/cpu<N>/power/`, where the CPU number ``<N>``
|
||||||
|
is allocated at the system initialization time:
|
||||||
|
|
||||||
|
``energy_perf_bias``
|
||||||
|
Shows the current EPB value for the CPU in a sliding scale 0 - 15, where
|
||||||
|
a value of 0 corresponds to a hint preference for highest performance
|
||||||
|
and a value of 15 corresponds to the maximum energy savings.
|
||||||
|
|
||||||
|
In order to update the EPB value for the CPU, this attribute can be
|
||||||
|
written to, either with a number in the 0 - 15 sliding scale above, or
|
||||||
|
with one of the strings: "performance", "balance-performance", "normal",
|
||||||
|
"balance-power", "power" that represent values reflected by their
|
||||||
|
meaning.
|
||||||
|
|
||||||
|
This attribute is present for all online CPUs supporting the EPB
|
||||||
|
feature.
|
||||||
|
|
||||||
|
Note that while the EPB interface to the processor is defined at the logical CPU
|
||||||
|
level, the physical register backing it may be shared by multiple CPUs (for
|
||||||
|
example, SMT siblings or cores in one package). For this reason, updating the
|
||||||
|
EPB value for one CPU may cause the EPB values for other CPUs to change.
|
@@ -1,10 +1,13 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
.. include:: <isonum.txt>
|
||||||
|
|
||||||
===============================================
|
===============================================
|
||||||
``intel_pstate`` CPU Performance Scaling Driver
|
``intel_pstate`` CPU Performance Scaling Driver
|
||||||
===============================================
|
===============================================
|
||||||
|
|
||||||
::
|
:Copyright: |copy| 2017 Intel Corporation
|
||||||
|
|
||||||
Copyright (c) 2017 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||||
|
|
||||||
|
|
||||||
General Information
|
General Information
|
||||||
@@ -20,11 +23,10 @@ you have not done that yet.]
|
|||||||
|
|
||||||
For the processors supported by ``intel_pstate``, the P-state concept is broader
|
For the processors supported by ``intel_pstate``, the P-state concept is broader
|
||||||
than just an operating frequency or an operating performance point (see the
|
than just an operating frequency or an operating performance point (see the
|
||||||
`LinuxCon Europe 2015 presentation by Kristen Accardi <LCEU2015_>`_ for more
|
LinuxCon Europe 2015 presentation by Kristen Accardi [1]_ for more
|
||||||
information about that). For this reason, the representation of P-states used
|
information about that). For this reason, the representation of P-states used
|
||||||
by ``intel_pstate`` internally follows the hardware specification (for details
|
by ``intel_pstate`` internally follows the hardware specification (for details
|
||||||
refer to `Intel® 64 and IA-32 Architectures Software Developer’s Manual
|
refer to Intel Software Developer’s Manual [2]_). However, the ``CPUFreq`` core
|
||||||
Volume 3: System Programming Guide <SDM_>`_). However, the ``CPUFreq`` core
|
|
||||||
uses frequencies for identifying operating performance points of CPUs and
|
uses frequencies for identifying operating performance points of CPUs and
|
||||||
frequencies are involved in the user space interface exposed by it, so
|
frequencies are involved in the user space interface exposed by it, so
|
||||||
``intel_pstate`` maps its internal representation of P-states to frequencies too
|
``intel_pstate`` maps its internal representation of P-states to frequencies too
|
||||||
@@ -561,9 +563,9 @@ or to pin every task potentially sensitive to them to a specific CPU.]
|
|||||||
|
|
||||||
On the majority of systems supported by ``intel_pstate``, the ACPI tables
|
On the majority of systems supported by ``intel_pstate``, the ACPI tables
|
||||||
provided by the platform firmware contain ``_PSS`` objects returning information
|
provided by the platform firmware contain ``_PSS`` objects returning information
|
||||||
that can be used for CPU performance scaling (refer to the `ACPI specification`_
|
that can be used for CPU performance scaling (refer to the ACPI specification
|
||||||
for details on the ``_PSS`` objects and the format of the information returned
|
[3]_ for details on the ``_PSS`` objects and the format of the information
|
||||||
by them).
|
returned by them).
|
||||||
|
|
||||||
The information returned by the ACPI ``_PSS`` objects is used by the
|
The information returned by the ACPI ``_PSS`` objects is used by the
|
||||||
``acpi-cpufreq`` scaling driver. On systems supported by ``intel_pstate``
|
``acpi-cpufreq`` scaling driver. On systems supported by ``intel_pstate``
|
||||||
@@ -728,6 +730,14 @@ P-state is called, the ``ftrace`` filter can be set to to
|
|||||||
<idle>-0 [000] ..s. 2537.654843: intel_pstate_set_pstate <-intel_pstate_timer_func
|
<idle>-0 [000] ..s. 2537.654843: intel_pstate_set_pstate <-intel_pstate_timer_func
|
||||||
|
|
||||||
|
|
||||||
.. _LCEU2015: http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf
|
References
|
||||||
.. _SDM: http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-system-programming-manual-325384.html
|
==========
|
||||||
.. _ACPI specification: http://www.uefi.org/sites/default/files/resources/ACPI_6_1.pdf
|
|
||||||
|
.. [1] Kristen Accardi, *Balancing Power and Performance in the Linux Kernel*,
|
||||||
|
http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf
|
||||||
|
|
||||||
|
.. [2] *Intel® 64 and IA-32 Architectures Software Developer’s Manual Volume 3: System Programming Guide*,
|
||||||
|
http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-system-programming-manual-325384.html
|
||||||
|
|
||||||
|
.. [3] *Advanced Configuration and Power Interface Specification*,
|
||||||
|
https://uefi.org/sites/default/files/resources/ACPI_6_3_final_Jan30.pdf
|
||||||
|
@@ -1,10 +1,14 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
.. include:: <isonum.txt>
|
||||||
|
|
||||||
===================
|
===================
|
||||||
System Sleep States
|
System Sleep States
|
||||||
===================
|
===================
|
||||||
|
|
||||||
::
|
:Copyright: |copy| 2017 Intel Corporation
|
||||||
|
|
||||||
|
:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||||
|
|
||||||
Copyright (c) 2017 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
|
||||||
|
|
||||||
Sleep states are global low-power states of the entire system in which user
|
Sleep states are global low-power states of the entire system in which user
|
||||||
space code cannot be executed and the overall system activity is significantly
|
space code cannot be executed and the overall system activity is significantly
|
||||||
|
@@ -1,10 +1,14 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
.. include:: <isonum.txt>
|
||||||
|
|
||||||
===========================
|
===========================
|
||||||
Power Management Strategies
|
Power Management Strategies
|
||||||
===========================
|
===========================
|
||||||
|
|
||||||
::
|
:Copyright: |copy| 2017 Intel Corporation
|
||||||
|
|
||||||
|
:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||||
|
|
||||||
Copyright (c) 2017 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
|
||||||
|
|
||||||
The Linux kernel supports two major high-level power management strategies.
|
The Linux kernel supports two major high-level power management strategies.
|
||||||
|
|
||||||
|
@@ -1,3 +1,5 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
============================
|
============================
|
||||||
System-Wide Power Management
|
System-Wide Power Management
|
||||||
============================
|
============================
|
||||||
|
@@ -1,3 +1,5 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
==============================
|
==============================
|
||||||
Working-State Power Management
|
Working-State Power Management
|
||||||
==============================
|
==============================
|
||||||
@@ -8,3 +10,4 @@ Working-State Power Management
|
|||||||
cpuidle
|
cpuidle
|
||||||
cpufreq
|
cpufreq
|
||||||
intel_pstate
|
intel_pstate
|
||||||
|
intel_epb
|
||||||
|
@@ -209,6 +209,22 @@ infrastructure:
|
|||||||
| AT | [35-32] | y |
|
| AT | [35-32] | y |
|
||||||
x--------------------------------------------------x
|
x--------------------------------------------------x
|
||||||
|
|
||||||
|
6) ID_AA64ZFR0_EL1 - SVE feature ID register 0
|
||||||
|
|
||||||
|
x--------------------------------------------------x
|
||||||
|
| Name | bits | visible |
|
||||||
|
|--------------------------------------------------|
|
||||||
|
| SM4 | [43-40] | y |
|
||||||
|
|--------------------------------------------------|
|
||||||
|
| SHA3 | [35-32] | y |
|
||||||
|
|--------------------------------------------------|
|
||||||
|
| BitPerm | [19-16] | y |
|
||||||
|
|--------------------------------------------------|
|
||||||
|
| AES | [7-4] | y |
|
||||||
|
|--------------------------------------------------|
|
||||||
|
| SVEVer | [3-0] | y |
|
||||||
|
x--------------------------------------------------x
|
||||||
|
|
||||||
Appendix I: Example
|
Appendix I: Example
|
||||||
---------------------------
|
---------------------------
|
||||||
|
|
||||||
|
@@ -13,9 +13,9 @@ architected discovery mechanism available to userspace code at EL0. The
|
|||||||
kernel exposes the presence of these features to userspace through a set
|
kernel exposes the presence of these features to userspace through a set
|
||||||
of flags called hwcaps, exposed in the auxiliary vector.
|
of flags called hwcaps, exposed in the auxiliary vector.
|
||||||
|
|
||||||
Userspace software can test for features by acquiring the AT_HWCAP entry
|
Userspace software can test for features by acquiring the AT_HWCAP or
|
||||||
of the auxilliary vector, and testing whether the relevant flags are
|
AT_HWCAP2 entry of the auxiliary vector, and testing whether the relevant
|
||||||
set, e.g.
|
flags are set, e.g.
|
||||||
|
|
||||||
bool floating_point_is_present(void)
|
bool floating_point_is_present(void)
|
||||||
{
|
{
|
||||||
@@ -135,6 +135,10 @@ HWCAP_DCPOP
|
|||||||
|
|
||||||
Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0001.
|
Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0001.
|
||||||
|
|
||||||
|
HWCAP2_DCPODP
|
||||||
|
|
||||||
|
Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0010.
|
||||||
|
|
||||||
HWCAP_SHA3
|
HWCAP_SHA3
|
||||||
|
|
||||||
Functionality implied by ID_AA64ISAR0_EL1.SHA3 == 0b0001.
|
Functionality implied by ID_AA64ISAR0_EL1.SHA3 == 0b0001.
|
||||||
@@ -159,6 +163,30 @@ HWCAP_SVE
|
|||||||
|
|
||||||
Functionality implied by ID_AA64PFR0_EL1.SVE == 0b0001.
|
Functionality implied by ID_AA64PFR0_EL1.SVE == 0b0001.
|
||||||
|
|
||||||
|
HWCAP2_SVE2
|
||||||
|
|
||||||
|
Functionality implied by ID_AA64ZFR0_EL1.SVEVer == 0b0001.
|
||||||
|
|
||||||
|
HWCAP2_SVEAES
|
||||||
|
|
||||||
|
Functionality implied by ID_AA64ZFR0_EL1.AES == 0b0001.
|
||||||
|
|
||||||
|
HWCAP2_SVEPMULL
|
||||||
|
|
||||||
|
Functionality implied by ID_AA64ZFR0_EL1.AES == 0b0010.
|
||||||
|
|
||||||
|
HWCAP2_SVEBITPERM
|
||||||
|
|
||||||
|
Functionality implied by ID_AA64ZFR0_EL1.BitPerm == 0b0001.
|
||||||
|
|
||||||
|
HWCAP2_SVESHA3
|
||||||
|
|
||||||
|
Functionality implied by ID_AA64ZFR0_EL1.SHA3 == 0b0001.
|
||||||
|
|
||||||
|
HWCAP2_SVESM4
|
||||||
|
|
||||||
|
Functionality implied by ID_AA64ZFR0_EL1.SM4 == 0b0001.
|
||||||
|
|
||||||
HWCAP_ASIMDFHM
|
HWCAP_ASIMDFHM
|
||||||
|
|
||||||
Functionality implied by ID_AA64ISAR0_EL1.FHM == 0b0001.
|
Functionality implied by ID_AA64ISAR0_EL1.FHM == 0b0001.
|
||||||
@@ -194,3 +222,10 @@ HWCAP_PACG
|
|||||||
Functionality implied by ID_AA64ISAR1_EL1.GPA == 0b0001 or
|
Functionality implied by ID_AA64ISAR1_EL1.GPA == 0b0001 or
|
||||||
ID_AA64ISAR1_EL1.GPI == 0b0001, as described by
|
ID_AA64ISAR1_EL1.GPI == 0b0001, as described by
|
||||||
Documentation/arm64/pointer-authentication.txt.
|
Documentation/arm64/pointer-authentication.txt.
|
||||||
|
|
||||||
|
|
||||||
|
4. Unused AT_HWCAP bits
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
For interoperation with userspace, the kernel guarantees that bits 62
|
||||||
|
and 63 of AT_HWCAP will always be returned as 0.
|
||||||
|
85
Documentation/arm64/perf.txt
Normal file
85
Documentation/arm64/perf.txt
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
Perf Event Attributes
|
||||||
|
=====================
|
||||||
|
|
||||||
|
Author: Andrew Murray <andrew.murray@arm.com>
|
||||||
|
Date: 2019-03-06
|
||||||
|
|
||||||
|
exclude_user
|
||||||
|
------------
|
||||||
|
|
||||||
|
This attribute excludes userspace.
|
||||||
|
|
||||||
|
Userspace always runs at EL0 and thus this attribute will exclude EL0.
|
||||||
|
|
||||||
|
|
||||||
|
exclude_kernel
|
||||||
|
--------------
|
||||||
|
|
||||||
|
This attribute excludes the kernel.
|
||||||
|
|
||||||
|
The kernel runs at EL2 with VHE and EL1 without. Guest kernels always run
|
||||||
|
at EL1.
|
||||||
|
|
||||||
|
For the host this attribute will exclude EL1 and additionally EL2 on a VHE
|
||||||
|
system.
|
||||||
|
|
||||||
|
For the guest this attribute will exclude EL1. Please note that EL2 is
|
||||||
|
never counted within a guest.
|
||||||
|
|
||||||
|
|
||||||
|
exclude_hv
|
||||||
|
----------
|
||||||
|
|
||||||
|
This attribute excludes the hypervisor.
|
||||||
|
|
||||||
|
For a VHE host this attribute is ignored as we consider the host kernel to
|
||||||
|
be the hypervisor.
|
||||||
|
|
||||||
|
For a non-VHE host this attribute will exclude EL2 as we consider the
|
||||||
|
hypervisor to be any code that runs at EL2 which is predominantly used for
|
||||||
|
guest/host transitions.
|
||||||
|
|
||||||
|
For the guest this attribute has no effect. Please note that EL2 is
|
||||||
|
never counted within a guest.
|
||||||
|
|
||||||
|
|
||||||
|
exclude_host / exclude_guest
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
These attributes exclude the KVM host and guest, respectively.
|
||||||
|
|
||||||
|
The KVM host may run at EL0 (userspace), EL1 (non-VHE kernel) and EL2 (VHE
|
||||||
|
kernel or non-VHE hypervisor).
|
||||||
|
|
||||||
|
The KVM guest may run at EL0 (userspace) and EL1 (kernel).
|
||||||
|
|
||||||
|
Due to the overlapping exception levels between host and guests we cannot
|
||||||
|
exclusively rely on the PMU's hardware exception filtering - therefore we
|
||||||
|
must enable/disable counting on the entry and exit to the guest. This is
|
||||||
|
performed differently on VHE and non-VHE systems.
|
||||||
|
|
||||||
|
For non-VHE systems we exclude EL2 for exclude_host - upon entering and
|
||||||
|
exiting the guest we disable/enable the event as appropriate based on the
|
||||||
|
exclude_host and exclude_guest attributes.
|
||||||
|
|
||||||
|
For VHE systems we exclude EL1 for exclude_guest and exclude both EL0 and EL2
|
||||||
|
for exclude_host. Upon entering and exiting the guest we modify the event
|
||||||
|
to include/exclude EL0 as appropriate based on the exclude_host and
|
||||||
|
exclude_guest attributes.
|
||||||
|
|
||||||
|
The statements above also apply when these attributes are used within a
|
||||||
|
non-VHE guest; however, please note that EL2 is never counted within a guest.
|
||||||
|
|
||||||
|
|
||||||
|
Accuracy
|
||||||
|
--------
|
||||||
|
|
||||||
|
On non-VHE hosts we enable/disable counters on the entry/exit of host/guest
|
||||||
|
transition at EL2 - however there is a period of time between
|
||||||
|
enabling/disabling the counters and entering/exiting the guest. We are
|
||||||
|
able to eliminate counters counting host events on the boundaries of guest
|
||||||
|
entry/exit when counting guest events by filtering out EL2 for
|
||||||
|
exclude_host. However when using !exclude_hv there is a small blackout
|
||||||
|
window at the guest entry/exit where host events are not captured.
|
||||||
|
|
||||||
|
On VHE systems there are no blackout windows.
|
@@ -87,7 +87,21 @@ used to get and set the keys for a thread.
|
|||||||
Virtualization
|
Virtualization
|
||||||
--------------
|
--------------
|
||||||
|
|
||||||
Pointer authentication is not currently supported in KVM guests. KVM
|
Pointer authentication is enabled in KVM guest when each virtual cpu is
|
||||||
will mask the feature bits from ID_AA64ISAR1_EL1, and attempted use of
|
initialised by passing flags KVM_ARM_VCPU_PTRAUTH_[ADDRESS/GENERIC] and
|
||||||
the feature will result in an UNDEFINED exception being injected into
|
requesting these two separate cpu features to be enabled. The current KVM
|
||||||
the guest.
|
guest implementation works by enabling both features together, so both
|
||||||
|
these userspace flags are checked before enabling pointer authentication.
|
||||||
|
Keeping separate userspace flags means no userspace ABI changes are needed
|
||||||
|
if support is added in the future to allow these two features to be
|
||||||
|
enabled independently of one another.
|
||||||
|
|
||||||
|
As Arm Architecture specifies that Pointer Authentication feature is
|
||||||
|
implemented along with the VHE feature, KVM arm64 ptrauth code relies
|
||||||
|
on VHE mode to be present.
|
||||||
|
|
||||||
|
Additionally, when these vcpu feature flags are not set then KVM will
|
||||||
|
filter out the Pointer Authentication system key registers from
|
||||||
|
KVM_GET/SET_REG_* ioctls and mask those features from cpufeature ID
|
||||||
|
register. Any attempt to use the Pointer Authentication instructions will
|
||||||
|
result in an UNDEFINED exception being injected into the guest.
|
||||||
|
@@ -61,6 +61,7 @@ stable kernels.
|
|||||||
| ARM | Cortex-A76 | #1188873 | ARM64_ERRATUM_1188873 |
|
| ARM | Cortex-A76 | #1188873 | ARM64_ERRATUM_1188873 |
|
||||||
| ARM | Cortex-A76 | #1165522 | ARM64_ERRATUM_1165522 |
|
| ARM | Cortex-A76 | #1165522 | ARM64_ERRATUM_1165522 |
|
||||||
| ARM | Cortex-A76 | #1286807 | ARM64_ERRATUM_1286807 |
|
| ARM | Cortex-A76 | #1286807 | ARM64_ERRATUM_1286807 |
|
||||||
|
| ARM | Neoverse-N1 | #1188873 | ARM64_ERRATUM_1188873 |
|
||||||
| ARM | MMU-500 | #841119,#826419 | N/A |
|
| ARM | MMU-500 | #841119,#826419 | N/A |
|
||||||
| | | | |
|
| | | | |
|
||||||
| Cavium | ThunderX ITS | #22375, #24313 | CAVIUM_ERRATUM_22375 |
|
| Cavium | ThunderX ITS | #22375, #24313 | CAVIUM_ERRATUM_22375 |
|
||||||
@@ -77,6 +78,7 @@ stable kernels.
|
|||||||
| Hisilicon | Hip0{5,6,7} | #161010101 | HISILICON_ERRATUM_161010101 |
|
| Hisilicon | Hip0{5,6,7} | #161010101 | HISILICON_ERRATUM_161010101 |
|
||||||
| Hisilicon | Hip0{6,7} | #161010701 | N/A |
|
| Hisilicon | Hip0{6,7} | #161010701 | N/A |
|
||||||
| Hisilicon | Hip07 | #161600802 | HISILICON_ERRATUM_161600802 |
|
| Hisilicon | Hip07 | #161600802 | HISILICON_ERRATUM_161600802 |
|
||||||
|
| Hisilicon | Hip08 SMMU PMCG | #162001800 | N/A |
|
||||||
| | | | |
|
| | | | |
|
||||||
| Qualcomm Tech. | Kryo/Falkor v1 | E1003 | QCOM_FALKOR_ERRATUM_1003 |
|
| Qualcomm Tech. | Kryo/Falkor v1 | E1003 | QCOM_FALKOR_ERRATUM_1003 |
|
||||||
| Qualcomm Tech. | Falkor v1 | E1009 | QCOM_FALKOR_ERRATUM_1009 |
|
| Qualcomm Tech. | Falkor v1 | E1009 | QCOM_FALKOR_ERRATUM_1009 |
|
||||||
|
@@ -34,6 +34,23 @@ model features for SVE is included in Appendix A.
|
|||||||
following sections: software that needs to verify that those interfaces are
|
following sections: software that needs to verify that those interfaces are
|
||||||
present must check for HWCAP_SVE instead.
|
present must check for HWCAP_SVE instead.
|
||||||
|
|
||||||
|
* On hardware that supports the SVE2 extensions, HWCAP2_SVE2 will also
|
||||||
|
be reported in the AT_HWCAP2 aux vector entry. In addition to this,
|
||||||
|
optional extensions to SVE2 may be reported by the presence of:
|
||||||
|
|
||||||
|
HWCAP2_SVE2
|
||||||
|
HWCAP2_SVEAES
|
||||||
|
HWCAP2_SVEPMULL
|
||||||
|
HWCAP2_SVEBITPERM
|
||||||
|
HWCAP2_SVESHA3
|
||||||
|
HWCAP2_SVESM4
|
||||||
|
|
||||||
|
This list may be extended over time as the SVE architecture evolves.
|
||||||
|
|
||||||
|
These extensions are also reported via the CPU ID register ID_AA64ZFR0_EL1,
|
||||||
|
which userspace can read using an MRS instruction. See elf_hwcaps.txt and
|
||||||
|
cpu-feature-registers.txt for details.
|
||||||
|
|
||||||
* Debuggers should restrict themselves to interacting with the target via the
|
* Debuggers should restrict themselves to interacting with the target via the
|
||||||
NT_ARM_SVE regset. The recommended way of detecting support for this regset
|
NT_ARM_SVE regset. The recommended way of detecting support for this regset
|
||||||
is to connect to a target process first and then attempt a
|
is to connect to a target process first and then attempt a
|
||||||
|
@@ -1,6 +1,6 @@
|
|||||||
|
=============
|
||||||
On atomic bitops.
|
Atomic bitops
|
||||||
|
=============
|
||||||
|
|
||||||
While our bitmap_{}() functions are non-atomic, we have a number of operations
|
While our bitmap_{}() functions are non-atomic, we have a number of operations
|
||||||
operating on single bits in a bitmap that are atomic.
|
operating on single bits in a bitmap that are atomic.
|
||||||
|
@@ -56,6 +56,23 @@ Barriers:
|
|||||||
smp_mb__{before,after}_atomic()
|
smp_mb__{before,after}_atomic()
|
||||||
|
|
||||||
|
|
||||||
|
TYPES (signed vs unsigned)
|
||||||
|
--------------------------
|
||||||
|
|
||||||
|
While atomic_t, atomic_long_t and atomic64_t use int, long and s64
|
||||||
|
respectively (for hysterical raisins), the kernel uses -fno-strict-overflow
|
||||||
|
(which implies -fwrapv) and defines signed overflow to behave like
|
||||||
|
2s-complement.
|
||||||
|
|
||||||
|
Therefore, an explicitly unsigned variant of the atomic ops is strictly
|
||||||
|
unnecessary and we can simply cast, there is no UB.
|
||||||
|
|
||||||
|
There was a bug in UBSAN prior to GCC-8 that would generate UB warnings for
|
||||||
|
signed types.
|
||||||
|
|
||||||
|
With this we also conform to the C/C++ _Atomic behaviour and things like
|
||||||
|
P1236R1.
|
||||||
|
|
||||||
|
|
||||||
SEMANTICS
|
SEMANTICS
|
||||||
---------
|
---------
|
||||||
|
@@ -20,13 +20,26 @@ for that device, by setting low_latency to 0. See Section 3 for
|
|||||||
details on how to configure BFQ for the desired tradeoff between
|
details on how to configure BFQ for the desired tradeoff between
|
||||||
latency and throughput, or on how to maximize throughput.
|
latency and throughput, or on how to maximize throughput.
|
||||||
|
|
||||||
BFQ has a non-null overhead, which limits the maximum IOPS that a CPU
|
Like every I/O scheduler, BFQ adds some overhead to per-I/O-request
|
||||||
can process for a device scheduled with BFQ. To give an idea of the
|
processing. To give an idea of this overhead, the total,
|
||||||
limits on slow or average CPUs, here are, first, the limits of BFQ for
|
single-lock-protected, per-request processing time of BFQ---i.e., the
|
||||||
three different CPUs, on, respectively, an average laptop, an old
|
sum of the execution times of the request insertion, dispatch and
|
||||||
desktop, and a cheap embedded system, in case full hierarchical
|
completion hooks---is, e.g., 1.9 us on an Intel Core i7-2760QM@2.40GHz
|
||||||
support is enabled (i.e., CONFIG_BFQ_GROUP_IOSCHED is set), but
|
(dated CPU for notebooks; time measured with simple code
|
||||||
CONFIG_DEBUG_BLK_CGROUP is not set (Section 4-2):
|
instrumentation, and using the throughput-sync.sh script of the S
|
||||||
|
suite [3], in performance-profiling mode). To put this result into
|
||||||
|
context, the total, single-lock-protected, per-request execution time
|
||||||
|
of the lightest I/O scheduler available in blk-mq, mq-deadline, is 0.7
|
||||||
|
us (mq-deadline is ~800 LOC, against ~10500 LOC for BFQ).
|
||||||
|
|
||||||
|
Scheduling overhead further limits the maximum IOPS that a CPU can
|
||||||
|
process (already limited by the execution of the rest of the I/O
|
||||||
|
stack). To give an idea of the limits with BFQ, on slow or average
|
||||||
|
CPUs, here are, first, the limits of BFQ for three different CPUs, on,
|
||||||
|
respectively, an average laptop, an old desktop, and a cheap embedded
|
||||||
|
system, in case full hierarchical support is enabled (i.e.,
|
||||||
|
CONFIG_BFQ_GROUP_IOSCHED is set), but CONFIG_DEBUG_BLK_CGROUP is not
|
||||||
|
set (Section 4-2):
|
||||||
- Intel i7-4850HQ: 400 KIOPS
|
- Intel i7-4850HQ: 400 KIOPS
|
||||||
- AMD A8-3850: 250 KIOPS
|
- AMD A8-3850: 250 KIOPS
|
||||||
- ARM CortexTM-A53 Octa-core: 80 KIOPS
|
- ARM CortexTM-A53 Octa-core: 80 KIOPS
|
||||||
@@ -566,3 +579,5 @@ applications. Unset this tunable if you need/want to control weights.
|
|||||||
Slightly extended version:
|
Slightly extended version:
|
||||||
http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite-
|
http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite-
|
||||||
results.pdf
|
results.pdf
|
||||||
|
|
||||||
|
[3] https://github.com/Algodev-github/S
|
||||||
|
@@ -93,3 +93,7 @@ zoned=[0/1]: Default: 0
|
|||||||
|
|
||||||
zone_size=[MB]: Default: 256
|
zone_size=[MB]: Default: 256
|
||||||
Per zone size when exposed as a zoned block device. Must be a power of two.
|
Per zone size when exposed as a zoned block device. Must be a power of two.
|
||||||
|
|
||||||
|
zone_nr_conv=[nr_conv]: Default: 0
|
||||||
|
The number of conventional zones to create when block device is zoned. If
|
||||||
|
zone_nr_conv >= nr_zones, it will be reduced to nr_zones - 1.
|
||||||
|
@@ -85,8 +85,33 @@ Q: Can loops be supported in a safe way?
|
|||||||
A: It's not clear yet.
|
A: It's not clear yet.
|
||||||
|
|
||||||
BPF developers are trying to find a way to
|
BPF developers are trying to find a way to
|
||||||
support bounded loops where the verifier can guarantee that
|
support bounded loops.
|
||||||
the program terminates in less than 4096 instructions.
|
|
||||||
|
Q: What are the verifier limits?
|
||||||
|
--------------------------------
|
||||||
|
A: The only limit known to the user space is BPF_MAXINSNS (4096).
|
||||||
|
It's the maximum number of instructions that the unprivileged bpf
|
||||||
|
program can have. The verifier has various internal limits.
|
||||||
|
Like the maximum number of instructions that can be explored during
|
||||||
|
program analysis. Currently, that limit is set to 1 million.
|
||||||
|
Which essentially means that the largest program can consist
|
||||||
|
of 1 million NOP instructions. There is a limit to the maximum number
|
||||||
|
of subsequent branches, a limit to the number of nested bpf-to-bpf
|
||||||
|
calls, a limit to the number of the verifier states per instruction,
|
||||||
|
a limit to the number of maps used by the program.
|
||||||
|
All these limits can be hit with a sufficiently complex program.
|
||||||
|
There are also non-numerical limits that can cause the program
|
||||||
|
to be rejected. The verifier used to recognize only pointer + constant
|
||||||
|
expressions. Now it can recognize pointer + bounded_register.
|
||||||
|
bpf_lookup_map_elem(key) had a requirement that 'key' must be
|
||||||
|
a pointer to the stack. Now, 'key' can be a pointer to map value.
|
||||||
|
The verifier is steadily getting 'smarter'. The limits are
|
||||||
|
being removed. The only way to know that the program is going to
|
||||||
|
be accepted by the verifier is to try to load it.
|
||||||
|
The bpf development process guarantees that the future kernel
|
||||||
|
versions will accept all bpf programs that were accepted by
|
||||||
|
the earlier versions.
|
||||||
|
|
||||||
|
|
||||||
Instruction level questions
|
Instruction level questions
|
||||||
---------------------------
|
---------------------------
|
||||||
|
@@ -82,6 +82,8 @@ sequentially and type id is assigned to each recognized type starting from id
|
|||||||
#define BTF_KIND_RESTRICT 11 /* Restrict */
|
#define BTF_KIND_RESTRICT 11 /* Restrict */
|
||||||
#define BTF_KIND_FUNC 12 /* Function */
|
#define BTF_KIND_FUNC 12 /* Function */
|
||||||
#define BTF_KIND_FUNC_PROTO 13 /* Function Proto */
|
#define BTF_KIND_FUNC_PROTO 13 /* Function Proto */
|
||||||
|
#define BTF_KIND_VAR 14 /* Variable */
|
||||||
|
#define BTF_KIND_DATASEC 15 /* Section */
|
||||||
|
|
||||||
Note that the type section encodes debug info, not just pure types.
|
Note that the type section encodes debug info, not just pure types.
|
||||||
``BTF_KIND_FUNC`` is not a type, and it represents a defined subprogram.
|
``BTF_KIND_FUNC`` is not a type, and it represents a defined subprogram.
|
||||||
@@ -393,6 +395,61 @@ refers to parameter type.
|
|||||||
If the function has variable arguments, the last parameter is encoded with
|
If the function has variable arguments, the last parameter is encoded with
|
||||||
``name_off = 0`` and ``type = 0``.
|
``name_off = 0`` and ``type = 0``.
|
||||||
|
|
||||||
|
2.2.14 BTF_KIND_VAR
|
||||||
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
``struct btf_type`` encoding requirement:
|
||||||
|
* ``name_off``: offset to a valid C identifier
|
||||||
|
* ``info.kind_flag``: 0
|
||||||
|
* ``info.kind``: BTF_KIND_VAR
|
||||||
|
* ``info.vlen``: 0
|
||||||
|
* ``type``: the type of the variable
|
||||||
|
|
||||||
|
``btf_type`` is followed by a single ``struct btf_var`` with the
|
||||||
|
following data::
|
||||||
|
|
||||||
|
struct btf_var {
|
||||||
|
__u32 linkage;
|
||||||
|
};
|
||||||
|
|
||||||
|
``struct btf_var`` encoding:
|
||||||
|
* ``linkage``: currently only static variable 0, or globally allocated
|
||||||
|
variable in ELF sections 1
|
||||||
|
|
||||||
|
Not all types of global variables are supported by LLVM at this point.
|
||||||
|
The following is currently available:
|
||||||
|
|
||||||
|
* static variables with or without section attributes
|
||||||
|
* global variables with section attributes
|
||||||
|
|
||||||
|
The latter is for future extraction of map key/value type id's from a
|
||||||
|
map definition.
|
||||||
|
|
||||||
|
2.2.15 BTF_KIND_DATASEC
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
``struct btf_type`` encoding requirement:
|
||||||
|
* ``name_off``: offset to a valid name associated with a variable or
|
||||||
|
one of .data/.bss/.rodata
|
||||||
|
* ``info.kind_flag``: 0
|
||||||
|
* ``info.kind``: BTF_KIND_DATASEC
|
||||||
|
* ``info.vlen``: # of variables
|
||||||
|
* ``size``: total section size in bytes (0 at compilation time, patched
|
||||||
|
to actual size by BPF loaders such as libbpf)
|
||||||
|
|
||||||
|
``btf_type`` is followed by ``info.vlen`` number of ``struct btf_var_secinfo``::
|
||||||
|
|
||||||
|
struct btf_var_secinfo {
|
||||||
|
__u32 type;
|
||||||
|
__u32 offset;
|
||||||
|
__u32 size;
|
||||||
|
};
|
||||||
|
|
||||||
|
``struct btf_var_secinfo`` encoding:
|
||||||
|
* ``type``: the type of the BTF_KIND_VAR variable
|
||||||
|
* ``offset``: the in-section offset of the variable
|
||||||
|
* ``size``: the size of the variable in bytes
|
||||||
|
|
||||||
3. BTF Kernel API
|
3. BTF Kernel API
|
||||||
*****************
|
*****************
|
||||||
|
|
||||||
@@ -521,6 +578,7 @@ For line_info, the line number and column number are defined as below:
|
|||||||
#define BPF_LINE_INFO_LINE_COL(line_col) ((line_col) & 0x3ff)
|
#define BPF_LINE_INFO_LINE_COL(line_col) ((line_col) & 0x3ff)
|
||||||
|
|
||||||
3.4 BPF_{PROG,MAP}_GET_NEXT_ID
|
3.4 BPF_{PROG,MAP}_GET_NEXT_ID
|
||||||
|
==============================
|
||||||
|
|
||||||
In kernel, every loaded program, map or btf has a unique id. The id won't
|
In kernel, every loaded program, map or btf has a unique id. The id won't
|
||||||
change during the lifetime of a program, map, or btf.
|
change during the lifetime of a program, map, or btf.
|
||||||
@@ -530,6 +588,7 @@ each command, to user space, for bpf program or maps, respectively, so an
|
|||||||
inspection tool can inspect all programs and maps.
|
inspection tool can inspect all programs and maps.
|
||||||
|
|
||||||
3.5 BPF_{PROG,MAP}_GET_FD_BY_ID
|
3.5 BPF_{PROG,MAP}_GET_FD_BY_ID
|
||||||
|
===============================
|
||||||
|
|
||||||
An introspection tool cannot use id to get details about program or maps.
|
An introspection tool cannot use id to get details about program or maps.
|
||||||
A file descriptor needs to be obtained first for reference-counting purpose.
|
A file descriptor needs to be obtained first for reference-counting purpose.
|
||||||
|
@@ -36,6 +36,16 @@ Two sets of Questions and Answers (Q&A) are maintained.
|
|||||||
bpf_devel_QA
|
bpf_devel_QA
|
||||||
|
|
||||||
|
|
||||||
|
Program types
|
||||||
|
=============
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
|
||||||
|
prog_cgroup_sysctl
|
||||||
|
prog_flow_dissector
|
||||||
|
|
||||||
|
|
||||||
.. Links:
|
.. Links:
|
||||||
.. _Documentation/networking/filter.txt: ../networking/filter.txt
|
.. _Documentation/networking/filter.txt: ../networking/filter.txt
|
||||||
.. _man-pages: https://www.kernel.org/doc/man-pages/
|
.. _man-pages: https://www.kernel.org/doc/man-pages/
|
||||||
|
125
Documentation/bpf/prog_cgroup_sysctl.rst
Normal file
125
Documentation/bpf/prog_cgroup_sysctl.rst
Normal file
@@ -0,0 +1,125 @@
|
|||||||
|
.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
|
||||||
|
|
||||||
|
===========================
|
||||||
|
BPF_PROG_TYPE_CGROUP_SYSCTL
|
||||||
|
===========================
|
||||||
|
|
||||||
|
This document describes ``BPF_PROG_TYPE_CGROUP_SYSCTL`` program type that
|
||||||
|
provides cgroup-bpf hook for sysctl.
|
||||||
|
|
||||||
|
The hook has to be attached to a cgroup and will be called every time a
|
||||||
|
process inside that cgroup tries to read from or write to sysctl knob in proc.
|
||||||
|
|
||||||
|
1. Attach type
|
||||||
|
**************
|
||||||
|
|
||||||
|
``BPF_CGROUP_SYSCTL`` attach type has to be used to attach
|
||||||
|
``BPF_PROG_TYPE_CGROUP_SYSCTL`` program to a cgroup.
|
||||||
|
|
||||||
|
2. Context
|
||||||
|
**********
|
||||||
|
|
||||||
|
``BPF_PROG_TYPE_CGROUP_SYSCTL`` provides access to the following context from
|
||||||
|
BPF program::
|
||||||
|
|
||||||
|
struct bpf_sysctl {
|
||||||
|
__u32 write;
|
||||||
|
__u32 file_pos;
|
||||||
|
};
|
||||||
|
|
||||||
|
* ``write`` indicates whether sysctl value is being read (``0``) or written
|
||||||
|
(``1``). This field is read-only.
|
||||||
|
|
||||||
|
* ``file_pos`` indicates file position sysctl is being accessed at, read
|
||||||
|
or written. This field is read-write. Writing to the field sets the starting
|
||||||
|
position in sysctl proc file ``read(2)`` will be reading from or ``write(2)``
|
||||||
|
will be writing to. Writing zero to the field can be used e.g. to override
|
||||||
|
whole sysctl value by ``bpf_sysctl_set_new_value()`` on ``write(2)`` even
|
||||||
|
when it's called by user space on ``file_pos > 0``. Writing non-zero
|
||||||
|
value to the field can be used to access part of sysctl value starting from
|
||||||
|
specified ``file_pos``. Not all sysctl support access with ``file_pos !=
|
||||||
|
0``, e.g. writes to numeric sysctl entries must always be at file position
|
||||||
|
``0``. See also ``kernel.sysctl_writes_strict`` sysctl.
|
||||||
|
|
||||||
|
See `linux/bpf.h`_ for more details on how context field can be accessed.
|
||||||
|
|
||||||
|
3. Return code
|
||||||
|
**************
|
||||||
|
|
||||||
|
``BPF_PROG_TYPE_CGROUP_SYSCTL`` program must return one of the following
|
||||||
|
return codes:
|
||||||
|
|
||||||
|
* ``0`` means "reject access to sysctl";
|
||||||
|
* ``1`` means "proceed with access".
|
||||||
|
|
||||||
|
If program returns ``0`` user space will get ``-1`` from ``read(2)`` or
|
||||||
|
``write(2)`` and ``errno`` will be set to ``EPERM``.
|
||||||
|
|
||||||
|
4. Helpers
|
||||||
|
**********
|
||||||
|
|
||||||
|
Since sysctl knob is represented by a name and a value, sysctl specific BPF
|
||||||
|
helpers focus on providing access to these properties:
|
||||||
|
|
||||||
|
* ``bpf_sysctl_get_name()`` to get sysctl name as it is visible in
|
||||||
|
``/proc/sys`` into a buffer provided by the BPF program;
|
||||||
|
|
||||||
|
* ``bpf_sysctl_get_current_value()`` to get string value currently held by
|
||||||
|
sysctl into a buffer provided by the BPF program. This helper is available on both
|
||||||
|
``read(2)`` from and ``write(2)`` to sysctl;
|
||||||
|
|
||||||
|
* ``bpf_sysctl_get_new_value()`` to get new string value currently being
|
||||||
|
written to sysctl before actual write happens. This helper can be used only
|
||||||
|
on ``ctx->write == 1``;
|
||||||
|
|
||||||
|
* ``bpf_sysctl_set_new_value()`` to override new string value currently being
|
||||||
|
written to sysctl before actual write happens. Sysctl value will be
|
||||||
|
overridden starting from the current ``ctx->file_pos``. If the whole value
|
||||||
|
has to be overridden BPF program can set ``file_pos`` to zero before calling
|
||||||
|
the helper. This helper can be used only on ``ctx->write == 1``. New
|
||||||
|
string value set by the helper is treated and verified by kernel same way as
|
||||||
|
an equivalent string passed by user space.
|
||||||
|
|
||||||
|
BPF program sees sysctl value same way as user space does in proc filesystem,
|
||||||
|
i.e. as a string. Since many sysctl values represent an integer or a vector
|
||||||
|
of integers, the following helpers can be used to get numeric value from the
|
||||||
|
string:
|
||||||
|
|
||||||
|
* ``bpf_strtol()`` to convert initial part of the string to long integer
|
||||||
|
similar to user space `strtol(3)`_;
|
||||||
|
* ``bpf_strtoul()`` to convert initial part of the string to unsigned long
|
||||||
|
integer similar to user space `strtoul(3)`_;
|
||||||
|
|
||||||
|
See `linux/bpf.h`_ for more details on helpers described here.
|
||||||
|
|
||||||
|
5. Examples
|
||||||
|
***********
|
||||||
|
|
||||||
|
See `test_sysctl_prog.c`_ for an example of BPF program in C that accesses
|
||||||
|
sysctl name and value, parses string value to get vector of integers and uses
|
||||||
|
the result to make decision whether to allow or deny access to sysctl.
|
||||||
|
|
||||||
|
6. Notes
|
||||||
|
********
|
||||||
|
|
||||||
|
``BPF_PROG_TYPE_CGROUP_SYSCTL`` is intended to be used in **trusted** root
|
||||||
|
environment, for example to monitor sysctl usage or catch unreasonable values
|
||||||
|
an application, running as root in a separate cgroup, is trying to set.
|
||||||
|
|
||||||
|
Since ``task_dfl_cgroup(current)`` is called at ``sys_read`` / ``sys_write`` time it
|
||||||
|
may return results different from that at ``sys_open`` time, i.e. process that
|
||||||
|
opened sysctl file in proc filesystem may differ from process that is trying
|
||||||
|
to read from / write to it and two such processes may run in different
|
||||||
|
cgroups, which means ``BPF_PROG_TYPE_CGROUP_SYSCTL`` should not be used as a
|
||||||
|
security mechanism to limit sysctl usage.
|
||||||
|
|
||||||
|
As with any cgroup-bpf program additional care should be taken if an
|
||||||
|
application running as root in a cgroup should not be allowed to
|
||||||
|
detach/replace BPF program attached by administrator.
|
||||||
|
|
||||||
|
.. Links
|
||||||
|
.. _linux/bpf.h: ../../include/uapi/linux/bpf.h
|
||||||
|
.. _strtol(3): http://man7.org/linux/man-pages/man3/strtol.3p.html
|
||||||
|
.. _strtoul(3): http://man7.org/linux/man-pages/man3/strtoul.3p.html
|
||||||
|
.. _test_sysctl_prog.c:
|
||||||
|
../../tools/testing/selftests/bpf/progs/test_sysctl_prog.c
|
@@ -1,8 +1,8 @@
|
|||||||
.. SPDX-License-Identifier: GPL-2.0
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
==================
|
============================
|
||||||
BPF Flow Dissector
|
BPF_PROG_TYPE_FLOW_DISSECTOR
|
||||||
==================
|
============================
|
||||||
|
|
||||||
Overview
|
Overview
|
||||||
========
|
========
|
@@ -1,5 +1,7 @@
|
|||||||
|
Clearing WARN_ONCE
|
||||||
|
------------------
|
||||||
|
|
||||||
WARN_ONCE / WARN_ON_ONCE only print a warning once.
|
WARN_ONCE / WARN_ON_ONCE / printk_once only emit a message once.
|
||||||
|
|
||||||
echo 1 > /sys/kernel/debug/clear_warn_once
|
echo 1 > /sys/kernel/debug/clear_warn_once
|
||||||
|
|
||||||
|
@@ -101,16 +101,6 @@ changes occur:
|
|||||||
translations for software managed TLB configurations.
|
translations for software managed TLB configurations.
|
||||||
The sparc64 port currently does this.
|
The sparc64 port currently does this.
|
||||||
|
|
||||||
6) ``void tlb_migrate_finish(struct mm_struct *mm)``
|
|
||||||
|
|
||||||
This interface is called at the end of an explicit
|
|
||||||
process migration. This interface provides a hook
|
|
||||||
to allow a platform to update TLB or context-specific
|
|
||||||
information for the address space.
|
|
||||||
|
|
||||||
The ia64 sn2 platform is one example of a platform
|
|
||||||
that uses this interface.
|
|
||||||
|
|
||||||
Next, we have the cache flushing interfaces. In general, when Linux
|
Next, we have the cache flushing interfaces. In general, when Linux
|
||||||
is changing an existing virtual-->physical mapping to a new value,
|
is changing an existing virtual-->physical mapping to a new value,
|
||||||
the sequence will be in one of the following forms::
|
the sequence will be in one of the following forms::
|
||||||
|
@@ -22,7 +22,6 @@ Core utilities
|
|||||||
workqueue
|
workqueue
|
||||||
genericirq
|
genericirq
|
||||||
xarray
|
xarray
|
||||||
flexible-arrays
|
|
||||||
librs
|
librs
|
||||||
genalloc
|
genalloc
|
||||||
errseq
|
errseq
|
||||||
|
@@ -147,10 +147,10 @@ Division Functions
|
|||||||
.. kernel-doc:: include/linux/math64.h
|
.. kernel-doc:: include/linux/math64.h
|
||||||
:internal:
|
:internal:
|
||||||
|
|
||||||
.. kernel-doc:: lib/div64.c
|
.. kernel-doc:: lib/math/div64.c
|
||||||
:functions: div_s64_rem div64_u64_rem div64_u64 div64_s64
|
:functions: div_s64_rem div64_u64_rem div64_u64 div64_s64
|
||||||
|
|
||||||
.. kernel-doc:: lib/gcd.c
|
.. kernel-doc:: lib/math/gcd.c
|
||||||
:export:
|
:export:
|
||||||
|
|
||||||
UUID/GUID
|
UUID/GUID
|
||||||
|
@@ -58,6 +58,14 @@ A raw pointer value may be printed with %p which will hash the address
|
|||||||
before printing. The kernel also supports extended specifiers for printing
|
before printing. The kernel also supports extended specifiers for printing
|
||||||
pointers of different types.
|
pointers of different types.
|
||||||
|
|
||||||
|
Some of the extended specifiers print the data on the given address instead
|
||||||
|
of printing the address itself. In this case, the following error messages
|
||||||
|
might be printed instead of the unreachable information::
|
||||||
|
|
||||||
|
(null) data on plain NULL address
|
||||||
|
(efault) data on invalid address
|
||||||
|
(einval) invalid data on a valid address
|
||||||
|
|
||||||
Plain Pointers
|
Plain Pointers
|
||||||
--------------
|
--------------
|
||||||
|
|
||||||
|
@@ -3,79 +3,79 @@ How CPU topology info is exported via sysfs
|
|||||||
===========================================
|
===========================================
|
||||||
|
|
||||||
Export CPU topology info via sysfs. Items (attributes) are similar
|
Export CPU topology info via sysfs. Items (attributes) are similar
|
||||||
to /proc/cpuinfo output of some architectures:
|
to /proc/cpuinfo output of some architectures. They reside in
|
||||||
|
/sys/devices/system/cpu/cpuX/topology/:
|
||||||
|
|
||||||
1) /sys/devices/system/cpu/cpuX/topology/physical_package_id:
|
physical_package_id:
|
||||||
|
|
||||||
physical package id of cpuX. Typically corresponds to a physical
|
physical package id of cpuX. Typically corresponds to a physical
|
||||||
socket number, but the actual value is architecture and platform
|
socket number, but the actual value is architecture and platform
|
||||||
dependent.
|
dependent.
|
||||||
|
|
||||||
2) /sys/devices/system/cpu/cpuX/topology/core_id:
|
core_id:
|
||||||
|
|
||||||
the CPU core ID of cpuX. Typically it is the hardware platform's
|
the CPU core ID of cpuX. Typically it is the hardware platform's
|
||||||
identifier (rather than the kernel's). The actual value is
|
identifier (rather than the kernel's). The actual value is
|
||||||
architecture and platform dependent.
|
architecture and platform dependent.
|
||||||
|
|
||||||
3) /sys/devices/system/cpu/cpuX/topology/book_id:
|
book_id:
|
||||||
|
|
||||||
the book ID of cpuX. Typically it is the hardware platform's
|
the book ID of cpuX. Typically it is the hardware platform's
|
||||||
identifier (rather than the kernel's). The actual value is
|
identifier (rather than the kernel's). The actual value is
|
||||||
architecture and platform dependent.
|
architecture and platform dependent.
|
||||||
|
|
||||||
4) /sys/devices/system/cpu/cpuX/topology/drawer_id:
|
drawer_id:
|
||||||
|
|
||||||
the drawer ID of cpuX. Typically it is the hardware platform's
|
the drawer ID of cpuX. Typically it is the hardware platform's
|
||||||
identifier (rather than the kernel's). The actual value is
|
identifier (rather than the kernel's). The actual value is
|
||||||
architecture and platform dependent.
|
architecture and platform dependent.
|
||||||
|
|
||||||
5) /sys/devices/system/cpu/cpuX/topology/thread_siblings:
|
thread_siblings:
|
||||||
|
|
||||||
internal kernel map of cpuX's hardware threads within the same
|
internal kernel map of cpuX's hardware threads within the same
|
||||||
core as cpuX.
|
core as cpuX.
|
||||||
|
|
||||||
6) /sys/devices/system/cpu/cpuX/topology/thread_siblings_list:
|
thread_siblings_list:
|
||||||
|
|
||||||
human-readable list of cpuX's hardware threads within the same
|
human-readable list of cpuX's hardware threads within the same
|
||||||
core as cpuX.
|
core as cpuX.
|
||||||
|
|
||||||
7) /sys/devices/system/cpu/cpuX/topology/core_siblings:
|
core_siblings:
|
||||||
|
|
||||||
internal kernel map of cpuX's hardware threads within the same
|
internal kernel map of cpuX's hardware threads within the same
|
||||||
physical_package_id.
|
physical_package_id.
|
||||||
|
|
||||||
8) /sys/devices/system/cpu/cpuX/topology/core_siblings_list:
|
core_siblings_list:
|
||||||
|
|
||||||
human-readable list of cpuX's hardware threads within the same
|
human-readable list of cpuX's hardware threads within the same
|
||||||
physical_package_id.
|
physical_package_id.
|
||||||
|
|
||||||
9) /sys/devices/system/cpu/cpuX/topology/book_siblings:
|
book_siblings:
|
||||||
|
|
||||||
internal kernel map of cpuX's hardware threads within the same
|
internal kernel map of cpuX's hardware threads within the same
|
||||||
book_id.
|
book_id.
|
||||||
|
|
||||||
10) /sys/devices/system/cpu/cpuX/topology/book_siblings_list:
|
book_siblings_list:
|
||||||
|
|
||||||
human-readable list of cpuX's hardware threads within the same
|
human-readable list of cpuX's hardware threads within the same
|
||||||
book_id.
|
book_id.
|
||||||
|
|
||||||
11) /sys/devices/system/cpu/cpuX/topology/drawer_siblings:
|
drawer_siblings:
|
||||||
|
|
||||||
internal kernel map of cpuX's hardware threads within the same
|
internal kernel map of cpuX's hardware threads within the same
|
||||||
drawer_id.
|
drawer_id.
|
||||||
|
|
||||||
12) /sys/devices/system/cpu/cpuX/topology/drawer_siblings_list:
|
drawer_siblings_list:
|
||||||
|
|
||||||
human-readable list of cpuX's hardware threads within the same
|
human-readable list of cpuX's hardware threads within the same
|
||||||
drawer_id.
|
drawer_id.
|
||||||
|
|
||||||
To implement it in an architecture-neutral way, a new source file,
|
Architecture-neutral, drivers/base/topology.c, exports these attributes.
|
||||||
drivers/base/topology.c, is to export the 6 to 12 attributes. The book
|
However, the book and drawer related sysfs files will only be created if
|
||||||
and drawer related sysfs files will only be created if CONFIG_SCHED_BOOK
|
CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are selected, respectively.
|
||||||
and CONFIG_SCHED_DRAWER are selected.
|
|
||||||
|
|
||||||
CONFIG_SCHED_BOOK and CONFIG_DRAWER are currently only used on s390, where
|
CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are currently only used on s390,
|
||||||
they reflect the cpu and cache hierarchy.
|
where they reflect the cpu and cache hierarchy.
|
||||||
|
|
||||||
For an architecture to support this feature, it must define some of
|
For an architecture to support this feature, it must define some of
|
||||||
these macros in include/asm-XXX/topology.h::
|
these macros in include/asm-XXX/topology.h::
|
||||||
@@ -98,10 +98,10 @@ To be consistent on all architectures, include/linux/topology.h
|
|||||||
provides default definitions for any of the above macros that are
|
provides default definitions for any of the above macros that are
|
||||||
not defined by include/asm-XXX/topology.h:
|
not defined by include/asm-XXX/topology.h:
|
||||||
|
|
||||||
1) physical_package_id: -1
|
1) topology_physical_package_id: -1
|
||||||
2) core_id: 0
|
2) topology_core_id: 0
|
||||||
3) sibling_cpumask: just the given CPU
|
3) topology_sibling_cpumask: just the given CPU
|
||||||
4) core_cpumask: just the given CPU
|
4) topology_core_cpumask: just the given CPU
|
||||||
|
|
||||||
For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
|
For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
|
||||||
default definitions for topology_book_id() and topology_book_cpumask().
|
default definitions for topology_book_id() and topology_book_cpumask().
|
||||||
|
@@ -133,7 +133,6 @@ Code Example For Use of Operational State Memory With SHASH
|
|||||||
if (!sdesc)
|
if (!sdesc)
|
||||||
return ERR_PTR(-ENOMEM);
|
return ERR_PTR(-ENOMEM);
|
||||||
sdesc->shash.tfm = alg;
|
sdesc->shash.tfm = alg;
|
||||||
sdesc->shash.flags = 0x0;
|
|
||||||
return sdesc;
|
return sdesc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -34,10 +34,6 @@ Configure the kernel with::
|
|||||||
CONFIG_DEBUG_FS=y
|
CONFIG_DEBUG_FS=y
|
||||||
CONFIG_GCOV_KERNEL=y
|
CONFIG_GCOV_KERNEL=y
|
||||||
|
|
||||||
select the gcc's gcov format, default is autodetect based on gcc version::
|
|
||||||
|
|
||||||
CONFIG_GCOV_FORMAT_AUTODETECT=y
|
|
||||||
|
|
||||||
and to get coverage data for the entire kernel::
|
and to get coverage data for the entire kernel::
|
||||||
|
|
||||||
CONFIG_GCOV_PROFILE_ALL=y
|
CONFIG_GCOV_PROFILE_ALL=y
|
||||||
@@ -169,6 +165,20 @@ b) gcov is run on the BUILD machine
|
|||||||
[user@build] gcov -o /tmp/coverage/tmp/out/init main.c
|
[user@build] gcov -o /tmp/coverage/tmp/out/init main.c
|
||||||
|
|
||||||
|
|
||||||
|
Note on compilers
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
GCC and LLVM gcov tools are not necessarily compatible. Use gcov_ to work with
|
||||||
|
GCC-generated .gcno and .gcda files, and use llvm-cov_ for Clang.
|
||||||
|
|
||||||
|
.. _gcov: http://gcc.gnu.org/onlinedocs/gcc/Gcov.html
|
||||||
|
.. _llvm-cov: https://llvm.org/docs/CommandGuide/llvm-cov.html
|
||||||
|
|
||||||
|
Build differences between GCC and Clang gcov are handled by Kconfig. It
|
||||||
|
automatically selects the appropriate gcov format depending on the detected
|
||||||
|
toolchain.
|
||||||
|
|
||||||
|
|
||||||
Troubleshooting
|
Troubleshooting
|
||||||
---------------
|
---------------
|
||||||
|
|
||||||
|
@@ -7,6 +7,11 @@ directory. These are intended to be small tests to exercise individual code
|
|||||||
paths in the kernel. Tests are intended to be run after building, installing
|
paths in the kernel. Tests are intended to be run after building, installing
|
||||||
and booting a kernel.
|
and booting a kernel.
|
||||||
|
|
||||||
|
You can find additional information on the Kselftest framework, and on how to
|
||||||
|
write new tests using the framework, on the Kselftest wiki:
|
||||||
|
|
||||||
|
https://kselftest.wiki.kernel.org/
|
||||||
|
|
||||||
On some systems, hot-plug tests could hang forever waiting for cpu and
|
On some systems, hot-plug tests could hang forever waiting for cpu and
|
||||||
memory to be ready to be offlined. A special hot-plug target is created
|
memory to be ready to be offlined. A special hot-plug target is created
|
||||||
to run the full range of hot-plug tests. In default mode, hot-plug tests run
|
to run the full range of hot-plug tests. In default mode, hot-plug tests run
|
||||||
@@ -14,6 +19,10 @@ in safe mode with a limited scope. In limited mode, cpu-hotplug test is
|
|||||||
run on a single cpu as opposed to all hotplug capable cpus, and memory
|
run on a single cpu as opposed to all hotplug capable cpus, and memory
|
||||||
hotplug test is run on 2% of hotplug capable memory instead of 10%.
|
hotplug test is run on 2% of hotplug capable memory instead of 10%.
|
||||||
|
|
||||||
|
kselftest runs as a userspace process. Tests that can be written/run in
|
||||||
|
userspace may wish to use the `Test Harness`_. Tests that need to be
|
||||||
|
run in kernel space may wish to use a `Test Module`_.
|
||||||
|
|
||||||
Running the selftests (hotplug tests are run in limited mode)
|
Running the selftests (hotplug tests are run in limited mode)
|
||||||
=============================================================
|
=============================================================
|
||||||
|
|
||||||
@@ -31,17 +40,32 @@ To build and run the tests with a single command, use::
|
|||||||
|
|
||||||
Note that some tests will require root privileges.
|
Note that some tests will require root privileges.
|
||||||
|
|
||||||
Build and run from user specific object directory (make O=dir)::
|
Kselftest supports saving output files in a separate directory and then
|
||||||
|
running tests. To locate output files in a separate directory two syntaxes
|
||||||
|
are supported. In both cases the working directory must be the root of the
|
||||||
|
kernel src. This is applicable to "Running a subset of selftests" section
|
||||||
|
below.
|
||||||
|
|
||||||
|
To build, save output files in a separate directory with O= ::
|
||||||
|
|
||||||
$ make O=/tmp/kselftest kselftest
|
$ make O=/tmp/kselftest kselftest
|
||||||
|
|
||||||
Build and run KBUILD_OUTPUT directory (make KBUILD_OUTPUT=)::
|
To build, save output files in a separate directory with KBUILD_OUTPUT ::
|
||||||
|
|
||||||
$ make KBUILD_OUTPUT=/tmp/kselftest kselftest
|
$ export KBUILD_OUTPUT=/tmp/kselftest; make kselftest
|
||||||
|
|
||||||
The above commands run the tests and print pass/fail summary to make it
|
The O= assignment takes precedence over the KBUILD_OUTPUT environment
|
||||||
easier to understand the test results. Please find the detailed individual
|
variable.
|
||||||
test results for each test in /tmp/testname file(s).
|
|
||||||
|
The above commands by default run the tests and print full pass/fail report.
|
||||||
|
Kselftest supports "summary" option to make it easier to understand the test
|
||||||
|
results. Please find the detailed individual test results for each test in
|
||||||
|
/tmp/testname file(s) when summary option is specified. This is applicable
|
||||||
|
to "Running a subset of selftests" section below.
|
||||||
|
|
||||||
|
To run kselftest with summary option enabled ::
|
||||||
|
|
||||||
|
$ make summary=1 kselftest
|
||||||
|
|
||||||
Running a subset of selftests
|
Running a subset of selftests
|
||||||
=============================
|
=============================
|
||||||
@@ -57,17 +81,13 @@ You can specify multiple tests to build and run::
|
|||||||
|
|
||||||
$ make TARGETS="size timers" kselftest
|
$ make TARGETS="size timers" kselftest
|
||||||
|
|
||||||
Build and run from user specific object directory (make O=dir)::
|
To build, save output files in a separate directory with O= ::
|
||||||
|
|
||||||
$ make O=/tmp/kselftest TARGETS="size timers" kselftest
|
$ make O=/tmp/kselftest TARGETS="size timers" kselftest
|
||||||
|
|
||||||
Build and run KBUILD_OUTPUT directory (make KBUILD_OUTPUT=)::
|
To build, save output files in a separate directory with KBUILD_OUTPUT ::
|
||||||
|
|
||||||
$ make KBUILD_OUTPUT=/tmp/kselftest TARGETS="size timers" kselftest
|
$ export KBUILD_OUTPUT=/tmp/kselftest; make TARGETS="size timers" kselftest
|
||||||
|
|
||||||
The above commands run the tests and print pass/fail summary to make it
|
|
||||||
easier to understand the test results. Please find the detailed individual
|
|
||||||
test results for each test in /tmp/testname file(s).
|
|
||||||
|
|
||||||
See the top-level tools/testing/selftests/Makefile for the list of all
|
See the top-level tools/testing/selftests/Makefile for the list of all
|
||||||
possible targets.
|
possible targets.
|
||||||
@@ -161,11 +181,97 @@ Contributing new tests (details)
|
|||||||
|
|
||||||
e.g: tools/testing/selftests/android/config
|
e.g: tools/testing/selftests/android/config
|
||||||
|
|
||||||
|
Test Module
|
||||||
|
===========
|
||||||
|
|
||||||
|
Kselftest tests the kernel from userspace. Sometimes things need
|
||||||
|
testing from within the kernel, one method of doing this is to create a
|
||||||
|
test module. We can tie the module into the kselftest framework by
|
||||||
|
using a shell script test runner. ``kselftest_module.sh`` is designed
|
||||||
|
to facilitate this process. There is also a header file provided to
|
||||||
|
assist writing kernel modules that are for use with kselftest:
|
||||||
|
|
||||||
|
- ``tools/testing/selftests/kselftest_module.h``
|
||||||
|
- ``tools/testing/selftests/kselftest_module.sh``
|
||||||
|
|
||||||
|
How to use
|
||||||
|
----------
|
||||||
|
|
||||||
|
Here we show the typical steps to create a test module and tie it into
|
||||||
|
kselftest. We use kselftests for lib/ as an example.
|
||||||
|
|
||||||
|
1. Create the test module
|
||||||
|
|
||||||
|
2. Create the test script that will run (load/unload) the module
|
||||||
|
e.g. ``tools/testing/selftests/lib/printf.sh``
|
||||||
|
|
||||||
|
3. Add line to config file e.g. ``tools/testing/selftests/lib/config``
|
||||||
|
|
||||||
|
4. Add test script to makefile e.g. ``tools/testing/selftests/lib/Makefile``
|
||||||
|
|
||||||
|
5. Verify it works:
|
||||||
|
|
||||||
|
.. code-block:: sh
|
||||||
|
|
||||||
|
# Assumes you have booted a fresh build of this kernel tree
|
||||||
|
cd /path/to/linux/tree
|
||||||
|
make kselftest-merge
|
||||||
|
make modules
|
||||||
|
sudo make modules_install
|
||||||
|
make TARGETS=lib kselftest
|
||||||
|
|
||||||
|
Example Module
|
||||||
|
--------------
|
||||||
|
|
||||||
|
A bare bones test module might look like this:
|
||||||
|
|
||||||
|
.. code-block:: c
|
||||||
|
|
||||||
|
// SPDX-License-Identifier: GPL-2.0+
|
||||||
|
|
||||||
|
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||||
|
|
||||||
|
#include "../tools/testing/selftests/kselftest_module.h"
|
||||||
|
|
||||||
|
KSTM_MODULE_GLOBALS();
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Kernel module for testing the foobinator
|
||||||
|
*/
|
||||||
|
|
||||||
|
static int __init test_function()
|
||||||
|
{
|
||||||
|
...
|
||||||
|
}
|
||||||
|
|
||||||
|
static void __init selftest(void)
|
||||||
|
{
|
||||||
|
KSTM_CHECK_ZERO(do_test_case("", 0));
|
||||||
|
}
|
||||||
|
|
||||||
|
KSTM_MODULE_LOADERS(test_foo);
|
||||||
|
MODULE_AUTHOR("John Developer <jd@fooman.org>");
|
||||||
|
MODULE_LICENSE("GPL");
|
||||||
|
|
||||||
|
Example test script
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
.. code-block:: sh
|
||||||
|
|
||||||
|
#!/bin/bash
|
||||||
|
# SPDX-License-Identifier: GPL-2.0+
|
||||||
|
$(dirname $0)/../kselftest_module.sh "foo" test_foo
|
||||||
|
|
||||||
|
|
||||||
Test Harness
|
Test Harness
|
||||||
============
|
============
|
||||||
|
|
||||||
The kselftest_harness.h file contains useful helpers to build tests. The tests
|
The kselftest_harness.h file contains useful helpers to build tests. The
|
||||||
from tools/testing/selftests/seccomp/seccomp_bpf.c can be used as example.
|
test harness is for userspace testing, for kernel space testing see `Test
|
||||||
|
Module`_ above.
|
||||||
|
|
||||||
|
The tests from tools/testing/selftests/seccomp/seccomp_bpf.c can be used as
|
||||||
|
example.
|
||||||
|
|
||||||
Example
|
Example
|
||||||
-------
|
-------
|
||||||
|
272
Documentation/device-mapper/dm-dust.txt
Normal file
272
Documentation/device-mapper/dm-dust.txt
Normal file
@@ -0,0 +1,272 @@
|
|||||||
|
dm-dust
|
||||||
|
=======
|
||||||
|
|
||||||
|
This target emulates the behavior of bad sectors at arbitrary
|
||||||
|
locations, and the ability to enable the emulation of the failures
|
||||||
|
at an arbitrary time.
|
||||||
|
|
||||||
|
This target behaves similarly to a linear target. At a given time,
|
||||||
|
the user can send a message to the target to start failing read
|
||||||
|
requests on specific blocks (to emulate the behavior of a hard disk
|
||||||
|
drive with bad sectors).
|
||||||
|
|
||||||
|
When the failure behavior is enabled (i.e.: when the output of
|
||||||
|
"dmsetup status" displays "fail_read_on_bad_block"), reads of blocks
|
||||||
|
in the "bad block list" will fail with EIO ("Input/output error").
|
||||||
|
|
||||||
|
Writes of blocks in the "bad block list" will result in the following:
|
||||||
|
|
||||||
|
1. Remove the block from the "bad block list".
|
||||||
|
2. Successfully complete the write.
|
||||||
|
|
||||||
|
This emulates the "remapped sector" behavior of a drive with bad
|
||||||
|
sectors.
|
||||||
|
|
||||||
|
Normally, a drive that is encountering bad sectors will most likely
|
||||||
|
encounter more bad sectors, at an unknown time or location.
|
||||||
|
With dm-dust, the user can use the "addbadblock" and "removebadblock"
|
||||||
|
messages to add arbitrary bad blocks at new locations, and the
|
||||||
|
"enable" and "disable" messages to modulate the state of whether the
|
||||||
|
configured "bad blocks" will be treated as bad, or bypassed.
|
||||||
|
This allows the pre-writing of test data and metadata prior to
|
||||||
|
simulating a "failure" event where bad sectors start to appear.
|
||||||
|
|
||||||
|
Table parameters:
|
||||||
|
-----------------
|
||||||
|
<device_path> <offset> <blksz>
|
||||||
|
|
||||||
|
Mandatory parameters:
|
||||||
|
<device_path>: path to the block device.
|
||||||
|
<offset>: offset to data area from start of device_path
|
||||||
|
<blksz>: block size in bytes
|
||||||
|
(minimum 512, maximum 1073741824, must be a power of 2)
|
||||||
|
|
||||||
|
Usage instructions:
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
First, find the size (in 512-byte sectors) of the device to be used:
|
||||||
|
|
||||||
|
$ sudo blockdev --getsz /dev/vdb1
|
||||||
|
33552384
|
||||||
|
|
||||||
|
Create the dm-dust device:
|
||||||
|
(For a device with a block size of 512 bytes)
|
||||||
|
$ sudo dmsetup create dust1 --table '0 33552384 dust /dev/vdb1 0 512'
|
||||||
|
|
||||||
|
(For a device with a block size of 4096 bytes)
|
||||||
|
$ sudo dmsetup create dust1 --table '0 33552384 dust /dev/vdb1 0 4096'
|
||||||
|
|
||||||
|
Check the status of the read behavior ("bypass" indicates that all I/O
|
||||||
|
will be passed through to the underlying device):
|
||||||
|
$ sudo dmsetup status dust1
|
||||||
|
0 33552384 dust 252:17 bypass
|
||||||
|
|
||||||
|
$ sudo dd if=/dev/mapper/dust1 of=/dev/null bs=512 count=128 iflag=direct
|
||||||
|
128+0 records in
|
||||||
|
128+0 records out
|
||||||
|
|
||||||
|
$ sudo dd if=/dev/zero of=/dev/mapper/dust1 bs=512 count=128 oflag=direct
|
||||||
|
128+0 records in
|
||||||
|
128+0 records out
|
||||||
|
|
||||||
|
Adding and removing bad blocks:
|
||||||
|
-------------------------------
|
||||||
|
|
||||||
|
At any time (i.e.: whether the device has the "bad block" emulation
|
||||||
|
enabled or disabled), bad blocks may be added or removed from the
|
||||||
|
device via the "addbadblock" and "removebadblock" messages:
|
||||||
|
|
||||||
|
$ sudo dmsetup message dust1 0 addbadblock 60
|
||||||
|
kernel: device-mapper: dust: badblock added at block 60
|
||||||
|
|
||||||
|
$ sudo dmsetup message dust1 0 addbadblock 67
|
||||||
|
kernel: device-mapper: dust: badblock added at block 67
|
||||||
|
|
||||||
|
$ sudo dmsetup message dust1 0 addbadblock 72
|
||||||
|
kernel: device-mapper: dust: badblock added at block 72
|
||||||
|
|
||||||
|
These bad blocks will be stored in the "bad block list".
|
||||||
|
While the device is in "bypass" mode, reads and writes will succeed:
|
||||||
|
|
||||||
|
$ sudo dmsetup status dust1
|
||||||
|
0 33552384 dust 252:17 bypass
|
||||||
|
|
||||||
|
Enabling block read failures:
|
||||||
|
-----------------------------
|
||||||
|
|
||||||
|
To enable the "fail read on bad block" behavior, send the "enable" message:
|
||||||
|
|
||||||
|
$ sudo dmsetup message dust1 0 enable
|
||||||
|
kernel: device-mapper: dust: enabling read failures on bad sectors
|
||||||
|
|
||||||
|
$ sudo dmsetup status dust1
|
||||||
|
0 33552384 dust 252:17 fail_read_on_bad_block
|
||||||
|
|
||||||
|
With the device in "fail read on bad block" mode, attempting to read a
|
||||||
|
block will encounter an "Input/output error":
|
||||||
|
|
||||||
|
$ sudo dd if=/dev/mapper/dust1 of=/dev/null bs=512 count=1 skip=67 iflag=direct
|
||||||
|
dd: error reading '/dev/mapper/dust1': Input/output error
|
||||||
|
0+0 records in
|
||||||
|
0+0 records out
|
||||||
|
0 bytes copied, 0.00040651 s, 0.0 kB/s
|
||||||
|
|
||||||
|
...and writing to the bad blocks will remove the blocks from the list,
|
||||||
|
therefore emulating the "remap" behavior of hard disk drives:
|
||||||
|
|
||||||
|
$ sudo dd if=/dev/zero of=/dev/mapper/dust1 bs=512 count=128 oflag=direct
|
||||||
|
128+0 records in
|
||||||
|
128+0 records out
|
||||||
|
|
||||||
|
kernel: device-mapper: dust: block 60 removed from badblocklist by write
|
||||||
|
kernel: device-mapper: dust: block 67 removed from badblocklist by write
|
||||||
|
kernel: device-mapper: dust: block 72 removed from badblocklist by write
|
||||||
|
kernel: device-mapper: dust: block 87 removed from badblocklist by write
|
||||||
|
|
||||||
|
Bad block add/remove error handling:
|
||||||
|
------------------------------------
|
||||||
|
|
||||||
|
Attempting to add a bad block that already exists in the list will
|
||||||
|
result in an "Invalid argument" error, as well as a helpful message:
|
||||||
|
|
||||||
|
$ sudo dmsetup message dust1 0 addbadblock 88
|
||||||
|
device-mapper: message ioctl on dust1 failed: Invalid argument
|
||||||
|
kernel: device-mapper: dust: block 88 already in badblocklist
|
||||||
|
|
||||||
|
Attempting to remove a bad block that doesn't exist in the list will
|
||||||
|
result in an "Invalid argument" error, as well as a helpful message:
|
||||||
|
|
||||||
|
$ sudo dmsetup message dust1 0 removebadblock 87
|
||||||
|
device-mapper: message ioctl on dust1 failed: Invalid argument
|
||||||
|
kernel: device-mapper: dust: block 87 not found in badblocklist
|
||||||
|
|
||||||
|
Counting the number of bad blocks in the bad block list:
|
||||||
|
--------------------------------------------------------
|
||||||
|
|
||||||
|
To count the number of bad blocks configured in the device, run the
|
||||||
|
following message command:
|
||||||
|
|
||||||
|
$ sudo dmsetup message dust1 0 countbadblocks
|
||||||
|
|
||||||
|
A message will print with the number of bad blocks currently
|
||||||
|
configured on the device:
|
||||||
|
|
||||||
|
kernel: device-mapper: dust: countbadblocks: 895 badblock(s) found
|
||||||
|
|
||||||
|
Querying for specific bad blocks:
|
||||||
|
---------------------------------
|
||||||
|
|
||||||
|
To find out if a specific block is in the bad block list, run the
|
||||||
|
following message command:
|
||||||
|
|
||||||
|
$ sudo dmsetup message dust1 0 queryblock 72
|
||||||
|
|
||||||
|
The following message will print if the block is in the list:
|
||||||
|
device-mapper: dust: queryblock: block 72 found in badblocklist
|
||||||
|
|
||||||
|
The following message will print if the block is not in the list:
|
||||||
|
device-mapper: dust: queryblock: block 72 not found in badblocklist
|
||||||
|
|
||||||
|
The "queryblock" message command will work in both the "enabled"
|
||||||
|
and "disabled" modes, allowing the verification of whether a block
|
||||||
|
will be treated as "bad" without having to issue I/O to the device,
|
||||||
|
or having to "enable" the bad block emulation.
|
||||||
|
|
||||||
|
Clearing the bad block list:
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
To clear the bad block list (without needing to individually run
|
||||||
|
a "removebadblock" message command for every block), run the
|
||||||
|
following message command:
|
||||||
|
|
||||||
|
$ sudo dmsetup message dust1 0 clearbadblocks
|
||||||
|
|
||||||
|
After clearing the bad block list, the following message will appear:
|
||||||
|
|
||||||
|
kernel: device-mapper: dust: clearbadblocks: badblocks cleared
|
||||||
|
|
||||||
|
If there were no bad blocks to clear, the following message will
|
||||||
|
appear:
|
||||||
|
|
||||||
|
kernel: device-mapper: dust: clearbadblocks: no badblocks found
|
||||||
|
|
||||||
|
Message commands list:
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
Below is a list of the messages that can be sent to a dust device:
|
||||||
|
|
||||||
|
Operations on blocks (requires a <blknum> argument):
|
||||||
|
|
||||||
|
addbadblock <blknum>
|
||||||
|
queryblock <blknum>
|
||||||
|
removebadblock <blknum>
|
||||||
|
|
||||||
|
...where <blknum> is a block number within range of the device
|
||||||
|
(corresponding to the block size of the device.)
|
||||||
|
|
||||||
|
Single argument message commands:
|
||||||
|
|
||||||
|
countbadblocks
|
||||||
|
clearbadblocks
|
||||||
|
disable
|
||||||
|
enable
|
||||||
|
quiet
|
||||||
|
|
||||||
|
Device removal:
|
||||||
|
---------------
|
||||||
|
|
||||||
|
When finished, remove the device via the "dmsetup remove" command:
|
||||||
|
|
||||||
|
$ sudo dmsetup remove dust1
|
||||||
|
|
||||||
|
Quiet mode:
|
||||||
|
-----------
|
||||||
|
|
||||||
|
On test runs with many bad blocks, it may be desirable to avoid
|
||||||
|
excessive logging (from bad blocks added, removed, or "remapped").
|
||||||
|
This can be done by enabling "quiet mode" via the following message:
|
||||||
|
|
||||||
|
$ sudo dmsetup message dust1 0 quiet
|
||||||
|
|
||||||
|
This will suppress log messages from add / remove / removed by write
|
||||||
|
operations. Log messages from "countbadblocks" or "queryblock"
|
||||||
|
message commands will still print in quiet mode.
|
||||||
|
|
||||||
|
The status of quiet mode can be seen by running "dmsetup status":
|
||||||
|
|
||||||
|
$ sudo dmsetup status dust1
|
||||||
|
0 33552384 dust 252:17 fail_read_on_bad_block quiet
|
||||||
|
|
||||||
|
To disable quiet mode, send the "quiet" message again:
|
||||||
|
|
||||||
|
$ sudo dmsetup message dust1 0 quiet
|
||||||
|
|
||||||
|
$ sudo dmsetup status dust1
|
||||||
|
0 33552384 dust 252:17 fail_read_on_bad_block verbose
|
||||||
|
|
||||||
|
(The presence of "verbose" indicates normal logging.)
|
||||||
|
|
||||||
|
"Why not...?"
|
||||||
|
-------------
|
||||||
|
|
||||||
|
scsi_debug has a "medium error" mode that can fail reads on one
|
||||||
|
specified sector (sector 0x1234, hardcoded in the source code), but
|
||||||
|
it uses RAM for the persistent storage, which drastically decreases
|
||||||
|
the potential device size.
|
||||||
|
|
||||||
|
dm-flakey fails all I/O from all block locations at a specified time
|
||||||
|
frequency, and not a given point in time.
|
||||||
|
|
||||||
|
When a bad sector occurs on a hard disk drive, reads to that sector
|
||||||
|
are failed by the device, usually resulting in an error code of EIO
|
||||||
|
("I/O error") or ENODATA ("No data available"). However, a write to
|
||||||
|
the sector may succeed, and result in the sector becoming readable
|
||||||
|
after the device controller no longer experiences errors reading the
|
||||||
|
sector (or after a reallocation of the sector). However, there may
|
||||||
|
be bad sectors that occur on the device in the future, in a different,
|
||||||
|
unpredictable location.
|
||||||
|
|
||||||
|
This target seeks to provide a device that can exhibit the behavior
|
||||||
|
of a bad sector at a known sector location, at a known time, based
|
||||||
|
on a large storage device (at least tens of gigabytes, not occupying
|
||||||
|
system memory).
|
@@ -21,6 +21,13 @@ mode it calculates and verifies the integrity tag internally. In this
|
|||||||
mode, the dm-integrity target can be used to detect silent data
|
mode, the dm-integrity target can be used to detect silent data
|
||||||
corruption on the disk or in the I/O path.
|
corruption on the disk or in the I/O path.
|
||||||
|
|
||||||
|
There's an alternate mode of operation where dm-integrity uses bitmap
|
||||||
|
instead of a journal. If a bit in the bitmap is 1, the corresponding
|
||||||
|
region's data and integrity tags are not synchronized - if the machine
|
||||||
|
crashes, the unsynchronized regions will be recalculated. The bitmap mode
|
||||||
|
is faster than the journal mode, because we don't have to write the data
|
||||||
|
twice, but it is also less reliable, because if data corruption happens
|
||||||
|
when the machine crashes, it may not be detected.
|
||||||
|
|
||||||
When loading the target for the first time, the kernel driver will format
|
When loading the target for the first time, the kernel driver will format
|
||||||
the device. But it will only format the device if the superblock contains
|
the device. But it will only format the device if the superblock contains
|
||||||
@@ -59,6 +66,10 @@ Target arguments:
|
|||||||
either both data and tag or none of them are written. The
|
either both data and tag or none of them are written. The
|
||||||
journaled mode degrades write throughput twice because the
|
journaled mode degrades write throughput twice because the
|
||||||
data have to be written twice.
|
data have to be written twice.
|
||||||
|
B - bitmap mode - data and metadata are written without any
|
||||||
|
synchronization, the driver maintains a bitmap of dirty
|
||||||
|
regions where data and metadata don't match. This mode can
|
||||||
|
only be used with internal hash.
|
||||||
R - recovery mode - in this mode, journal is not replayed,
|
R - recovery mode - in this mode, journal is not replayed,
|
||||||
checksums are not checked and writes to the device are not
|
checksums are not checked and writes to the device are not
|
||||||
allowed. This mode is useful for data recovery if the
|
allowed. This mode is useful for data recovery if the
|
||||||
@@ -79,6 +90,10 @@ interleave_sectors:number
|
|||||||
a power of two. If the device is already formatted, the value from
|
a power of two. If the device is already formatted, the value from
|
||||||
the superblock is used.
|
the superblock is used.
|
||||||
|
|
||||||
|
meta_device:device
|
||||||
|
Don't interleave the data and metadata on the device. Use a
|
||||||
|
separate device for metadata.
|
||||||
|
|
||||||
buffer_sectors:number
|
buffer_sectors:number
|
||||||
The number of sectors in one buffer. The value is rounded down to
|
The number of sectors in one buffer. The value is rounded down to
|
||||||
a power of two.
|
a power of two.
|
||||||
@@ -146,6 +161,15 @@ block_size:number
|
|||||||
Supported values are 512, 1024, 2048 and 4096 bytes. If not
|
Supported values are 512, 1024, 2048 and 4096 bytes. If not
|
||||||
specified the default block size is 512 bytes.
|
specified the default block size is 512 bytes.
|
||||||
|
|
||||||
|
sectors_per_bit:number
|
||||||
|
In the bitmap mode, this parameter specifies the number of
|
||||||
|
512-byte sectors that corresponds to one bitmap bit.
|
||||||
|
|
||||||
|
bitmap_flush_interval:number
|
||||||
|
The bitmap flush interval in milliseconds. The metadata buffers
|
||||||
|
are synchronized when this interval expires.
|
||||||
|
|
||||||
|
|
||||||
The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can
|
The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can
|
||||||
be changed when reloading the target (load an inactive table and swap the
|
be changed when reloading the target (load an inactive table and swap the
|
||||||
tables with suspend and resume). The other arguments should not be changed
|
tables with suspend and resume). The other arguments should not be changed
|
||||||
@@ -167,7 +191,13 @@ The layout of the formatted block device:
|
|||||||
provides (i.e. the size of the device minus the size of all
|
provides (i.e. the size of the device minus the size of all
|
||||||
metadata and padding). The user of this target should not send
|
metadata and padding). The user of this target should not send
|
||||||
bios that access data beyond the "provided data sectors" limit.
|
bios that access data beyond the "provided data sectors" limit.
|
||||||
* flags - a flag is set if journal_mac is used
|
* flags
|
||||||
|
SB_FLAG_HAVE_JOURNAL_MAC - a flag is set if journal_mac is used
|
||||||
|
SB_FLAG_RECALCULATING - recalculating is in progress
|
||||||
|
SB_FLAG_DIRTY_BITMAP - journal area contains the bitmap of dirty
|
||||||
|
blocks
|
||||||
|
* log2(sectors per block)
|
||||||
|
* a position where recalculating finished
|
||||||
* journal
|
* journal
|
||||||
The journal is divided into sections, each section contains:
|
The journal is divided into sections, each section contains:
|
||||||
* metadata area (4kiB), it contains journal entries
|
* metadata area (4kiB), it contains journal entries
|
||||||
|
@@ -11,3 +11,15 @@ Example:
|
|||||||
reg = <0xffd08000 0x1000>;
|
reg = <0xffd08000 0x1000>;
|
||||||
cpu1-start-addr = <0xffd080c4>;
|
cpu1-start-addr = <0xffd080c4>;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
ARM64 - Stratix10
|
||||||
|
Required properties:
|
||||||
|
- compatible : "altr,sys-mgr-s10"
|
||||||
|
- reg : Should contain 1 register range (address and length)
|
||||||
|
for system manager register.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
sysmgr@ffd12000 {
|
||||||
|
compatible = "altr,sys-mgr-s10";
|
||||||
|
reg = <0xffd12000 0x228>;
|
||||||
|
};
|
||||||
|
@@ -110,6 +110,7 @@ Board compatible values (alphabetically, grouped by SoC):
|
|||||||
|
|
||||||
- "amlogic,u200" (Meson g12a s905d2)
|
- "amlogic,u200" (Meson g12a s905d2)
|
||||||
- "amediatech,x96-max" (Meson g12a s905x2)
|
- "amediatech,x96-max" (Meson g12a s905x2)
|
||||||
|
- "seirobotics,sei510" (Meson g12a s905x2)
|
||||||
|
|
||||||
Amlogic Meson Firmware registers Interface
|
Amlogic Meson Firmware registers Interface
|
||||||
------------------------------------------
|
------------------------------------------
|
||||||
|
@@ -25,6 +25,7 @@ compatible: must be one of:
|
|||||||
o "atmel,at91sam9n12"
|
o "atmel,at91sam9n12"
|
||||||
o "atmel,at91sam9rl"
|
o "atmel,at91sam9rl"
|
||||||
o "atmel,at91sam9xe"
|
o "atmel,at91sam9xe"
|
||||||
|
o "microchip,sam9x60"
|
||||||
* "atmel,sama5" for SoCs using a Cortex-A5, shall be extended with the specific
|
* "atmel,sama5" for SoCs using a Cortex-A5, shall be extended with the specific
|
||||||
SoC family:
|
SoC family:
|
||||||
o "atmel,sama5d2" shall be extended with the specific SoC compatible:
|
o "atmel,sama5d2" shall be extended with the specific SoC compatible:
|
||||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user