net: dsa: sja1105: Implement state machine for TAS with PTP clock source

Tested using the following bash script and the tc from iproute2-next:

	#!/bin/bash

	set -e -u -o pipefail

	NSEC_PER_SEC="1000000000"

	gatemask() {
		local tc_list="$1"
		local mask=0

		for tc in ${tc_list}; do
			mask=$((${mask} | (1 << ${tc})))
		done

		printf "%02x" ${mask}
	}

	if ! systemctl is-active --quiet ptp4l; then
		echo "Please start the ptp4l service"
		exit
	fi

	now=$(phc_ctl /dev/ptp1 get | gawk '/clock time is/ { print $5; }')
	# Phase-align the base time to the start of the next second.
	sec=$(echo "${now}" | gawk -F. '{ print $1; }')
	base_time="$(((${sec} + 1) * ${NSEC_PER_SEC}))"

	tc qdisc add dev swp5 parent root handle 100 taprio \
		num_tc 8 \
		map 0 1 2 3 5 6 7 \
		queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 \
		base-time ${base_time} \
		sched-entry S $(gatemask 7) 100000 \
		sched-entry S $(gatemask "0 1 2 3 4 5 6") 400000 \
		clockid CLOCK_TAI flags 2

The "state machine" is a workqueue invoked after each manipulation
command on the PTP clock (reset, adjust time, set time, adjust
frequency) which checks over the state of the time-aware scheduler.
So it is not monitored periodically, only in reaction to a PTP command
typically triggered from a userspace daemon (linuxptp). Otherwise there
is no reason for things to go wrong.

Now that the timecounter/cyclecounter has been replaced with hardware
operations on the PTP clock, the TAS Kconfig now depends upon PTP and
the standalone clocksource operating mode has been removed.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Vladimir Oltean
2019-11-12 02:11:54 +02:00
committed by David S. Miller
parent 41603d78b3
commit 86db36a347
7 changed files with 487 additions and 17 deletions

View File

@@ -10,6 +10,11 @@
#define SJA1105_TAS_MAX_DELTA BIT(19)
#define SJA1105_GATE_MASK GENMASK_ULL(SJA1105_NUM_TC - 1, 0)
#define work_to_sja1105_tas(d) \
container_of((d), struct sja1105_tas_data, tas_work)
#define tas_to_sja1105(d) \
container_of((d), struct sja1105_private, tas_data)
/* This is not a preprocessor macro because the "ns" argument may or may not be
* s64 at caller side. This ensures it is properly type-cast before div_s64.
*/
@@ -18,6 +23,100 @@ static s64 ns_to_sja1105_delta(s64 ns)
return div_s64(ns, 200);
}
static s64 sja1105_delta_to_ns(s64 delta)
{
return delta * 200;
}
/* Calculate the first base_time in the future that satisfies this
* relationship:
*
* future_base_time = base_time + N x cycle_time >= now, or
*
* now - base_time
* N >= ---------------
* cycle_time
*
* Because N is an integer, the ceiling value of the above "a / b" ratio
* is in fact precisely the floor value of "(a + b - 1) / b", which is
* easier to calculate only having integer division tools.
*/
static s64 future_base_time(s64 base_time, s64 cycle_time, s64 now)
{
s64 a, b, n;
if (base_time >= now)
return base_time;
a = now - base_time;
b = cycle_time;
n = div_s64(a + b - 1, b);
return base_time + n * cycle_time;
}
static int sja1105_tas_set_runtime_params(struct sja1105_private *priv)
{
struct sja1105_tas_data *tas_data = &priv->tas_data;
struct dsa_switch *ds = priv->ds;
s64 earliest_base_time = S64_MAX;
s64 latest_base_time = 0;
s64 its_cycle_time = 0;
s64 max_cycle_time = 0;
int port;
tas_data->enabled = false;
for (port = 0; port < SJA1105_NUM_PORTS; port++) {
const struct tc_taprio_qopt_offload *offload;
offload = tas_data->offload[port];
if (!offload)
continue;
tas_data->enabled = true;
if (max_cycle_time < offload->cycle_time)
max_cycle_time = offload->cycle_time;
if (latest_base_time < offload->base_time)
latest_base_time = offload->base_time;
if (earliest_base_time > offload->base_time) {
earliest_base_time = offload->base_time;
its_cycle_time = offload->cycle_time;
}
}
if (!tas_data->enabled)
return 0;
/* Roll the earliest base time over until it is in a comparable
* time base with the latest, then compare their deltas.
* We want to enforce that all ports' base times are within
* SJA1105_TAS_MAX_DELTA 200ns cycles of one another.
*/
earliest_base_time = future_base_time(earliest_base_time,
its_cycle_time,
latest_base_time);
while (earliest_base_time > latest_base_time)
earliest_base_time -= its_cycle_time;
if (latest_base_time - earliest_base_time >
sja1105_delta_to_ns(SJA1105_TAS_MAX_DELTA)) {
dev_err(ds->dev,
"Base times too far apart: min %llu max %llu\n",
earliest_base_time, latest_base_time);
return -ERANGE;
}
tas_data->earliest_base_time = earliest_base_time;
tas_data->max_cycle_time = max_cycle_time;
dev_dbg(ds->dev, "earliest base time %lld ns\n", earliest_base_time);
dev_dbg(ds->dev, "latest base time %lld ns\n", latest_base_time);
dev_dbg(ds->dev, "longest cycle time %lld ns\n", max_cycle_time);
return 0;
}
/* Lo and behold: the egress scheduler from hell.
*
* At the hardware level, the Time-Aware Shaper holds a global linear arrray of
@@ -99,7 +198,11 @@ static int sja1105_init_scheduling(struct sja1105_private *priv)
int num_cycles = 0;
int cycle = 0;
int i, k = 0;
int port;
int port, rc;
rc = sja1105_tas_set_runtime_params(priv);
if (rc < 0)
return rc;
/* Discard previous Schedule Table */
table = &priv->static_config.tables[BLK_IDX_SCHEDULE];
@@ -184,11 +287,13 @@ static int sja1105_init_scheduling(struct sja1105_private *priv)
schedule_entry_points = table->entries;
/* Finally start populating the static config tables */
schedule_entry_points_params->clksrc = SJA1105_TAS_CLKSRC_STANDALONE;
schedule_entry_points_params->clksrc = SJA1105_TAS_CLKSRC_PTP;
schedule_entry_points_params->actsubsch = num_cycles - 1;
for (port = 0; port < SJA1105_NUM_PORTS; port++) {
const struct tc_taprio_qopt_offload *offload;
/* Relative base time */
s64 rbt;
offload = tas_data->offload[port];
if (!offload)
@@ -196,15 +301,21 @@ static int sja1105_init_scheduling(struct sja1105_private *priv)
schedule_start_idx = k;
schedule_end_idx = k + offload->num_entries - 1;
/* TODO this is the base time for the port's subschedule,
* relative to PTPSCHTM. But as we're using the standalone
* clock source and not PTP clock as time reference, there's
* little point in even trying to put more logic into this,
* like preserving the phases between the subschedules of
* different ports. We'll get all of that when switching to the
* PTP clock source.
/* This is the base time expressed as a number of TAS ticks
* relative to PTPSCHTM, which we'll (perhaps improperly) call
* the operational base time.
*/
entry_point_delta = 1;
rbt = future_base_time(offload->base_time,
offload->cycle_time,
tas_data->earliest_base_time);
rbt -= tas_data->earliest_base_time;
/* UM10944.pdf 4.2.2. Schedule Entry Points table says that
* delta cannot be zero, which is shitty. Advance all relative
* base times by 1 TAS delta, so that even the earliest base
* time becomes 1 in relative terms. Then start the operational
* base time (PTPSCHTM) one TAS delta earlier than planned.
*/
entry_point_delta = ns_to_sja1105_delta(rbt) + 1;
schedule_entry_points[cycle].subschindx = cycle;
schedule_entry_points[cycle].delta = entry_point_delta;
@@ -403,8 +514,303 @@ int sja1105_setup_tc_taprio(struct dsa_switch *ds, int port,
return sja1105_static_config_reload(priv, SJA1105_SCHEDULING);
}
static int sja1105_tas_check_running(struct sja1105_private *priv)
{
struct sja1105_tas_data *tas_data = &priv->tas_data;
struct dsa_switch *ds = priv->ds;
struct sja1105_ptp_cmd cmd = {0};
int rc;
rc = sja1105_ptp_commit(ds, &cmd, SPI_READ);
if (rc < 0)
return rc;
if (cmd.ptpstrtsch == 1)
/* Schedule successfully started */
tas_data->state = SJA1105_TAS_STATE_RUNNING;
else if (cmd.ptpstopsch == 1)
/* Schedule is stopped */
tas_data->state = SJA1105_TAS_STATE_DISABLED;
else
/* Schedule is probably not configured with PTP clock source */
rc = -EINVAL;
return rc;
}
/* Write to PTPCLKCORP */
static int sja1105_tas_adjust_drift(struct sja1105_private *priv,
u64 correction)
{
const struct sja1105_regs *regs = priv->info->regs;
u32 ptpclkcorp = ns_to_sja1105_ticks(correction);
return sja1105_xfer_u32(priv, SPI_WRITE, regs->ptpclkcorp,
&ptpclkcorp, NULL);
}
/* Write to PTPSCHTM */
static int sja1105_tas_set_base_time(struct sja1105_private *priv,
u64 base_time)
{
const struct sja1105_regs *regs = priv->info->regs;
u64 ptpschtm = ns_to_sja1105_ticks(base_time);
return sja1105_xfer_u64(priv, SPI_WRITE, regs->ptpschtm,
&ptpschtm, NULL);
}
static int sja1105_tas_start(struct sja1105_private *priv)
{
struct sja1105_tas_data *tas_data = &priv->tas_data;
struct sja1105_ptp_cmd *cmd = &priv->ptp_data.cmd;
struct dsa_switch *ds = priv->ds;
int rc;
dev_dbg(ds->dev, "Starting the TAS\n");
if (tas_data->state == SJA1105_TAS_STATE_ENABLED_NOT_RUNNING ||
tas_data->state == SJA1105_TAS_STATE_RUNNING) {
dev_err(ds->dev, "TAS already started\n");
return -EINVAL;
}
cmd->ptpstrtsch = 1;
cmd->ptpstopsch = 0;
rc = sja1105_ptp_commit(ds, cmd, SPI_WRITE);
if (rc < 0)
return rc;
tas_data->state = SJA1105_TAS_STATE_ENABLED_NOT_RUNNING;
return 0;
}
static int sja1105_tas_stop(struct sja1105_private *priv)
{
struct sja1105_tas_data *tas_data = &priv->tas_data;
struct sja1105_ptp_cmd *cmd = &priv->ptp_data.cmd;
struct dsa_switch *ds = priv->ds;
int rc;
dev_dbg(ds->dev, "Stopping the TAS\n");
if (tas_data->state == SJA1105_TAS_STATE_DISABLED) {
dev_err(ds->dev, "TAS already disabled\n");
return -EINVAL;
}
cmd->ptpstopsch = 1;
cmd->ptpstrtsch = 0;
rc = sja1105_ptp_commit(ds, cmd, SPI_WRITE);
if (rc < 0)
return rc;
tas_data->state = SJA1105_TAS_STATE_DISABLED;
return 0;
}
/* The schedule engine and the PTP clock are driven by the same oscillator, and
* they run in parallel. But whilst the PTP clock can keep an absolute
* time-of-day, the schedule engine is only running in 'ticks' (25 ticks make
* up a delta, which is 200ns), and wrapping around at the end of each cycle.
* The schedule engine is started when the PTP clock reaches the PTPSCHTM time
* (in PTP domain).
* Because the PTP clock can be rate-corrected (accelerated or slowed down) by
* a software servo, and the schedule engine clock runs in parallel to the PTP
* clock, there is logic internal to the switch that periodically keeps the
* schedule engine from drifting away. The frequency with which this internal
* syntonization happens is the PTP clock correction period (PTPCLKCORP). It is
* a value also in the PTP clock domain, and is also rate-corrected.
* To be precise, during a correction period, there is logic to determine by
* how many scheduler clock ticks has the PTP clock drifted. At the end of each
* correction period/beginning of new one, the length of a delta is shrunk or
* expanded with an integer number of ticks, compared with the typical 25.
* So a delta lasts for 200ns (or 25 ticks) only on average.
* Sometimes it is longer, sometimes it is shorter. The internal syntonization
* logic can adjust for at most 5 ticks each 20 ticks.
*
* The first implication is that you should choose your schedule correction
* period to be an integer multiple of the schedule length. Preferably one.
* In case there are schedules of multiple ports active, then the correction
* period needs to be a multiple of them all. Given the restriction that the
* cycle times have to be multiples of one another anyway, this means the
* correction period can simply be the largest cycle time, hence the current
* choice. This way, the updates are always synchronous to the transmission
* cycle, and therefore predictable.
*
* The second implication is that at the beginning of a correction period, the
* first few deltas will be modulated in time, until the schedule engine is
* properly phase-aligned with the PTP clock. For this reason, you should place
* your best-effort traffic at the beginning of a cycle, and your
* time-triggered traffic afterwards.
*
* The third implication is that once the schedule engine is started, it can
* only adjust for so much drift within a correction period. In the servo you
* can only change the PTPCLKRATE, but not step the clock (PTPCLKADD). If you
* want to do the latter, you need to stop and restart the schedule engine,
* which is what the state machine handles.
*/
static void sja1105_tas_state_machine(struct work_struct *work)
{
struct sja1105_tas_data *tas_data = work_to_sja1105_tas(work);
struct sja1105_private *priv = tas_to_sja1105(tas_data);
struct sja1105_ptp_data *ptp_data = &priv->ptp_data;
struct timespec64 base_time_ts, now_ts;
struct dsa_switch *ds = priv->ds;
struct timespec64 diff;
s64 base_time, now;
int rc = 0;
mutex_lock(&ptp_data->lock);
switch (tas_data->state) {
case SJA1105_TAS_STATE_DISABLED:
/* Can't do anything at all if clock is still being stepped */
if (tas_data->last_op != SJA1105_PTP_ADJUSTFREQ)
break;
rc = sja1105_tas_adjust_drift(priv, tas_data->max_cycle_time);
if (rc < 0)
break;
rc = __sja1105_ptp_gettimex(ds, &now, NULL);
if (rc < 0)
break;
/* Plan to start the earliest schedule first. The others
* will be started in hardware, by way of their respective
* entry points delta.
* Try our best to avoid fringe cases (race condition between
* ptpschtm and ptpstrtsch) by pushing the oper_base_time at
* least one second in the future from now. This is not ideal,
* but this only needs to buy us time until the
* sja1105_tas_start command below gets executed.
*/
base_time = future_base_time(tas_data->earliest_base_time,
tas_data->max_cycle_time,
now + 1ull * NSEC_PER_SEC);
base_time -= sja1105_delta_to_ns(1);
rc = sja1105_tas_set_base_time(priv, base_time);
if (rc < 0)
break;
tas_data->oper_base_time = base_time;
rc = sja1105_tas_start(priv);
if (rc < 0)
break;
base_time_ts = ns_to_timespec64(base_time);
now_ts = ns_to_timespec64(now);
dev_dbg(ds->dev, "OPER base time %lld.%09ld (now %lld.%09ld)\n",
base_time_ts.tv_sec, base_time_ts.tv_nsec,
now_ts.tv_sec, now_ts.tv_nsec);
break;
case SJA1105_TAS_STATE_ENABLED_NOT_RUNNING:
if (tas_data->last_op != SJA1105_PTP_ADJUSTFREQ) {
/* Clock was stepped.. bad news for TAS */
sja1105_tas_stop(priv);
break;
}
/* Check if TAS has actually started, by comparing the
* scheduled start time with the SJA1105 PTP clock
*/
rc = __sja1105_ptp_gettimex(ds, &now, NULL);
if (rc < 0)
break;
if (now < tas_data->oper_base_time) {
/* TAS has not started yet */
diff = ns_to_timespec64(tas_data->oper_base_time - now);
dev_dbg(ds->dev, "time to start: [%lld.%09ld]",
diff.tv_sec, diff.tv_nsec);
break;
}
/* Time elapsed, what happened? */
rc = sja1105_tas_check_running(priv);
if (rc < 0)
break;
if (tas_data->state != SJA1105_TAS_STATE_RUNNING)
/* TAS has started */
dev_err(ds->dev,
"TAS not started despite time elapsed\n");
break;
case SJA1105_TAS_STATE_RUNNING:
/* Clock was stepped.. bad news for TAS */
if (tas_data->last_op != SJA1105_PTP_ADJUSTFREQ) {
sja1105_tas_stop(priv);
break;
}
rc = sja1105_tas_check_running(priv);
if (rc < 0)
break;
if (tas_data->state != SJA1105_TAS_STATE_RUNNING)
dev_err(ds->dev, "TAS surprisingly stopped\n");
break;
default:
if (net_ratelimit())
dev_err(ds->dev, "TAS in an invalid state (incorrect use of API)!\n");
}
if (rc && net_ratelimit())
dev_err(ds->dev, "An operation returned %d\n", rc);
mutex_unlock(&ptp_data->lock);
}
void sja1105_tas_clockstep(struct dsa_switch *ds)
{
struct sja1105_private *priv = ds->priv;
struct sja1105_tas_data *tas_data = &priv->tas_data;
if (!tas_data->enabled)
return;
tas_data->last_op = SJA1105_PTP_CLOCKSTEP;
schedule_work(&tas_data->tas_work);
}
void sja1105_tas_adjfreq(struct dsa_switch *ds)
{
struct sja1105_private *priv = ds->priv;
struct sja1105_tas_data *tas_data = &priv->tas_data;
if (!tas_data->enabled)
return;
/* No reason to schedule the workqueue, nothing changed */
if (tas_data->state == SJA1105_TAS_STATE_RUNNING)
return;
tas_data->last_op = SJA1105_PTP_ADJUSTFREQ;
schedule_work(&tas_data->tas_work);
}
void sja1105_tas_setup(struct dsa_switch *ds)
{
struct sja1105_private *priv = ds->priv;
struct sja1105_tas_data *tas_data = &priv->tas_data;
INIT_WORK(&tas_data->tas_work, sja1105_tas_state_machine);
tas_data->state = SJA1105_TAS_STATE_DISABLED;
tas_data->last_op = SJA1105_PTP_NONE;
}
void sja1105_tas_teardown(struct dsa_switch *ds)
@@ -413,6 +819,8 @@ void sja1105_tas_teardown(struct dsa_switch *ds)
struct tc_taprio_qopt_offload *offload;
int port;
cancel_work_sync(&priv->tas_data.tas_work);
for (port = 0; port < SJA1105_NUM_PORTS; port++) {
offload = priv->tas_data.offload[port];
if (!offload)