Skip to content

Commit

Permalink
Mux forensics + sensor error counters (oxidecomputer#1013)
Browse files Browse the repository at this point in the history
This adds some forensics to better aid debuggability of an I2C hang on
a selected segment of a mux.  Specifically this work:

- Adds an interface to the I2C server to get the last selected
  mux/segment
- Modifies the I2C server to clear the mux/segment state *after* that
  state has been cleared successfully to preserve mux/segment state in
  the event of an unresponsive mux or hung bus
- Adds some rough error counters to the Sensor API
  • Loading branch information
bcantrill authored Dec 14, 2022
1 parent 9d8369d commit 3cdb1cd
Show file tree
Hide file tree
Showing 13 changed files with 247 additions and 18 deletions.
2 changes: 1 addition & 1 deletion app/psc/rev-a.toml
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ features = ["psc"]
[tasks.sensor]
name = "task-sensor"
priority = 3
max-sizes = {flash = 8192, ram = 2048 }
max-sizes = {flash = 8192, ram = 4096 }
stacksize = 1024
start = true

Expand Down
2 changes: 1 addition & 1 deletion app/psc/rev-b.toml
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ features = ["psc"]
[tasks.sensor]
name = "task-sensor"
priority = 3
max-sizes = {flash = 8192, ram = 2048 }
max-sizes = {flash = 8192, ram = 4096 }
stacksize = 1024
start = true

Expand Down
2 changes: 1 addition & 1 deletion app/sidecar/rev-a.toml
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ task-slots = ["sys", "i2c_driver"]
name = "task-sensor"
features = ["itm"]
priority = 4
max-sizes = {flash = 8192, ram = 2048 }
max-sizes = {flash = 8192, ram = 4096 }
stacksize = 1024
start = true

Expand Down
2 changes: 1 addition & 1 deletion app/sidecar/rev-b.toml
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ task-slots = ["sys", "i2c_driver", "sprot"]
name = "task-sensor"
features = ["itm"]
priority = 4
max-sizes = {flash = 8192, ram = 2048 }
max-sizes = {flash = 8192, ram = 4096 }
stacksize = 1024
start = true

Expand Down
39 changes: 39 additions & 0 deletions drv/i2c-api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ use userlib::*;
pub enum Op {
WriteRead = 1,
WriteReadBlock = 2,
SelectedMuxSegment = 3,
}

/// The response code returned from the I2C server. These response codes pretty
Expand Down Expand Up @@ -84,6 +85,8 @@ pub enum ResponseCode {
BusError = 22,
/// Bad device state of unknown origin
BadDeviceState = 23,
/// Bad return value for selected mux/segment
BadSelectedMux = 24,
}

///
Expand Down Expand Up @@ -472,4 +475,40 @@ impl I2cDevice {
Ok(())
}
}

pub fn selected_mux_segment(
&self,
) -> Result<Option<(Mux, Segment)>, ResponseCode> {
let mut response = [0u8; 4];

let (code, _) = sys_send(
self.task,
Op::SelectedMuxSegment as u16,
&Marshal::marshal(&(
self.address,
self.controller,
self.port,
None,
)),
response.as_bytes_mut(),
&[],
);

if code != 0 {
Err(ResponseCode::from_u32(code)
.ok_or(ResponseCode::BadResponse)?)
} else {
let (address, controller, port, mux) =
Marshal::unmarshal(&response)?;

if controller != self.controller
|| address != self.address
|| port != self.port
{
Err(ResponseCode::BadSelectedMux)
} else {
Ok(mux)
}
}
}
}
38 changes: 30 additions & 8 deletions drv/stm32xx-i2c-server/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,16 +121,18 @@ fn configure_mux(
},
)?;
}

// Beyond this point, we want any failure to set our new
// mux+segment to leave our mux+segment unset rather than having
// it point to the old mux+segment.
map.remove((controller.controller, port));
}

// If we're here, our mux is valid, but the current segment is
// not the specfied segment; we will now call upon our
// driver to enable this segment.
//
// If we're here, our mux is valid, but the current segment is not the
// specified segment; we will now call upon our driver to enable this
// segment. Note that if we have an existing mux/segment, and we fail
// to enable the new mux/segment, the map will not be updated. This
// is deliberate: if we cannot enable a new mux/segment, it may very
// well be because an errant device on the old segment is locking the
// bus; if only for forensic purposes, we want to know what this mux +
// segment was.
//
mux.driver
.enable_segment(mux, controller, Some(segment), ctrl)?;
map.insert((controller.controller, port), (id, segment));
Expand Down Expand Up @@ -315,6 +317,26 @@ fn main() -> ! {
}
}
}
Op::SelectedMuxSegment => {
let (payload, caller) = msg
.fixed::<[u8; 4], [u8; 4]>()
.ok_or(ResponseCode::BadArg)?;

let (address, controller, port, _) =
Marshal::unmarshal(payload)?;

let controller = lookup_controller(&controllers, controller)?;
validate_port(&pins, controller.controller, port)?;

caller.reply(Marshal::marshal(&(
address,
controller.controller,
port,
muxmap.get((controller.controller, port)),
)));

Ok(())
}
});
}
}
Expand Down
12 changes: 12 additions & 0 deletions idl/sensor.idol
Original file line number Diff line number Diff line change
Expand Up @@ -44,5 +44,17 @@ Interface(
err: CLike("SensorError"),
),
),
"get_nerrors": (
args: {
"id": (
type: "SensorId",
recv: From("usize", None),
)
},
reply: Result(
ok: "u32",
err: CLike("SensorError"),
),
),
},
)
7 changes: 7 additions & 0 deletions idl/thermal.idol
Original file line number Diff line number Diff line change
Expand Up @@ -110,5 +110,12 @@ Interface(
err: CLike("ThermalError"),
),
),
"get_runtime": (
doc: "Get the most recent runtime of the thermal loop, in milliseconds",
reply: Result(
ok: "u64",
err: CLike("ThermalError"),
),
),
},
)
55 changes: 55 additions & 0 deletions task/hiffy/src/stm32h7.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ pub enum Functions {
(Controller, PortIndex, Mux, Segment, u8, u8, usize, usize),
ResponseCode,
),
#[cfg(feature = "i2c")]
I2cSelectedMuxSegment((Controller, PortIndex), ResponseCode),
#[cfg(feature = "gpio")]
GpioInput(drv_stm32xx_sys_api::Port, drv_stm32xx_sys_api::GpioError),
#[cfg(feature = "gpio")]
Expand Down Expand Up @@ -372,6 +374,57 @@ fn i2c_bulk_write(
}
}

#[cfg(feature = "i2c")]
fn i2c_selected_mux_segment(
stack: &[Option<u32>],
_data: &[u8],
rval: &mut [u8],
) -> Result<usize, Failure> {
if stack.len() < 2 {
return Err(Failure::Fault(Fault::MissingParameters));
}

let fp = stack.len() - 2;

let controller = match stack[fp] {
Some(controller) => match Controller::from_u32(controller) {
Some(controller) => controller,
None => return Err(Failure::Fault(Fault::BadParameter(0))),
},
None => return Err(Failure::Fault(Fault::EmptyParameter(0))),
};

let port = match stack[fp + 1] {
Some(port) => {
if port > core::u8::MAX.into() {
return Err(Failure::Fault(Fault::BadParameter(1)));
}

PortIndex(port as u8)
}
None => {
return Err(Failure::Fault(Fault::EmptyParameter(1)));
}
};

if rval.len() < 2 {
return Err(Failure::Fault(Fault::ReturnValueOverflow));
}

let task = I2C.get_task_id();
let device = I2cDevice::new(task, controller, port, None, 0);

match device.selected_mux_segment() {
Ok(None) => Ok(0),
Ok(Some((mux, segment))) => {
rval[0] = mux as u8;
rval[1] = segment as u8;
Ok(2)
}
Err(err) => Err(Failure::FunctionError(err.into())),
}
}

#[cfg(feature = "gpio")]
fn gpio_args(
stack: &[Option<u32>],
Expand Down Expand Up @@ -570,6 +623,8 @@ pub(crate) static HIFFY_FUNCS: &[Function] = &[
i2c_write,
#[cfg(feature = "i2c")]
i2c_bulk_write,
#[cfg(feature = "i2c")]
i2c_selected_mux_segment,
#[cfg(feature = "gpio")]
gpio_input,
#[cfg(feature = "gpio")]
Expand Down
3 changes: 3 additions & 0 deletions task/sensor-api/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,10 @@ fn main() -> Result<()> {
include!(concat!(env!("OUT_DIR"), "/i2c_config.rs"));
pub mod other_sensors {{
#[allow(unused_imports)]
use crate::SensorId;
#[allow(unused_imports)]
use super::NUM_I2C_SENSORS; // Used for offsetting
#[allow(dead_code)]
Expand Down
41 changes: 36 additions & 5 deletions task/sensor-api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,47 @@ pub enum Reading {
NoData(NoData),
}

//
// Note that [`counter_encoding`] relies on [`NoData`] being numbered from 0 and
// being numbered sequentially.
//
#[derive(
zerocopy::AsBytes, Copy, Clone, Debug, FromPrimitive, Eq, PartialEq,
)]
#[repr(u8)]
pub enum NoData {
DeviceOff,
DeviceError,
DeviceNotPresent,
DeviceUnavailable,
DeviceTimeout,
DeviceOff = 0,
DeviceError = 1,
DeviceNotPresent = 2,
DeviceUnavailable = 3,
DeviceTimeout = 4,
}

impl NoData {
///
/// Routine to determine the number of bits and size of shift
/// to pack a counter for each [`NoData`] variant into type `T`.
///
pub fn counter_encoding<T>(self) -> (usize, usize) {
//
// We need to encode the number of variants in [`NoData`] here. There
// is a very convenient core::mem::variant_count() that does exactly
// this, but it's currently unstable -- so instead we have an
// exhaustive match to assure that the enum can't be updated without
// modifying this code.
//
let nbits = (core::mem::size_of::<T>() * 8)
/ match self {
NoData::DeviceOff
| NoData::DeviceError
| NoData::DeviceNotPresent
| NoData::DeviceUnavailable
| NoData::DeviceTimeout => 5,
};

let shift = (self as usize) * nbits;
(nbits, shift)
}
}

impl From<ResponseCode> for NoData {
Expand Down
42 changes: 41 additions & 1 deletion task/sensor/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ use task_sensor_api::config::NUM_SENSORS;

struct ServerImpl {
data: &'static mut [Reading; NUM_SENSORS],
nerrors: &'static mut [u32; NUM_SENSORS],
deadline: u64,
}

Expand Down Expand Up @@ -69,11 +70,42 @@ impl idl::InOrderSensorImpl for ServerImpl {

if index < NUM_SENSORS {
self.data[index] = Reading::NoData(nodata);

//
// We pack per-`NoData` counters into a u32.
//
let (nbits, shift) = nodata.counter_encoding::<u32>();
let mask = (1 << nbits) - 1;
let bitmask = mask << shift;
let incr = 1 << shift;

//
// Perform a saturating increment by checking our current value
// against our bitmask: if we have unset bits, we can safely add.
//
if self.nerrors[index] & bitmask != bitmask {
self.nerrors[index] += incr;
}

Ok(())
} else {
Err(SensorError::InvalidSensor.into())
}
}

fn get_nerrors(
&mut self,
_: &RecvMessage,
id: SensorId,
) -> Result<u32, RequestError<SensorError>> {
let index = id.0;

if index < NUM_SENSORS {
Ok(self.nerrors[index])
} else {
Err(SensorError::InvalidSensor.into())
}
}
}

impl NotificationHandler for ServerImpl {
Expand All @@ -100,7 +132,15 @@ fn main() -> ! {
static mut SENSOR_DATA: [Reading; NUM_SENSORS] = [|| Reading::Absent; _];
};

let mut server = ServerImpl { data, deadline };
let nerrors = mutable_statics::mutable_statics! {
static mut SENSOR_NERRORS: [u32; NUM_SENSORS] = [|| 0; _];
};

let mut server = ServerImpl {
data,
nerrors,
deadline,
};

let mut buffer = [0; idl::INCOMING_SIZE];

Expand Down
Loading

0 comments on commit 3cdb1cd

Please sign in to comment.