#!/bin/sh
# SPDX-License-Identifier: GPL-2.0-only
#
# Copyright 2015, Daniel Axtens, IBM Corporation
#
  # Locate the xscom utilities and record them in GETSCOM / PUTSCOM.
  # Executables in the current directory win; otherwise both tools must be
  # resolvable through $PATH. `command -v` is the POSIX lookup (unlike `which`),
  # and both names are checked so PUTSCOM can never end up empty.
  if [ -x ./getscom ] && [ -x ./putscom ]; then
    GETSCOM=./getscom
    PUTSCOM=./putscom
  elif ! { GETSCOM=$(command -v getscom) && PUTSCOM=$(command -v putscom); }; then
    # Neither lookup strategy produced both tools: explain and give up.
    printf '%s\n' \
      "Can't find getscom/putscom in . or \$PATH." \
      "See https://github.com/open-power/skiboot." \
      "The tool is in external/xscom-utils"
    exit 1
  fi
  # We will get 8 HMI events per injection
  # todo: deal with things being offline
  # The default of 8 can be overridden by exporting EXPECTED_HMIS, e.g. when
  # fewer threads per core are able to report the event.
  expected_hmis=${EXPECTED_HMIS:-8}
  case $expected_hmis in
    *[!0-9]*)
      echo "EXPECTED_HMIS must be a non-negative integer, got '$expected_hmis'"
      exit 1
      ;;
  esac

  # Print the number of kernel log lines that report a harmless HMI.
  # Outputs: the count on stdout ("0" when there are none).
  # Returns: grep's status, i.e. non-zero when the count is 0; callers only
  #          use the printed number.
  COUNT_HMIS() {
    # -F: the message is a literal string, not a regular expression.
    dmesg | grep -c -F 'Harmless Hypervisor Maintenance interrupt'
  }
  # massively expand snooze delay, allowing injection on all cores
  ppc64_cpu --smt-snooze-delay=1000000000
  # when we exit, restore it
  # The restore lives on the EXIT trap only. HUP/INT/TERM are turned into a
  # plain `exit` so that (a) the script really stops instead of carrying on
  # with the short delay back in place, and (b) the EXIT trap also runs in
  # shells that skip it when killed by an untrapped signal.
  trap 'ppc64_cpu --smt-snooze-delay=100' EXIT
  trap 'exit 129' HUP
  trap 'exit 130' INT
  trap 'exit 143' TERM
  # Wait for the HMIs caused by one injection to show up in the kernel log.
  # Globals:   expected_hmis (read), COUNT_HMIS (called);
  #            hmi_target, polls, new_hmis (written; POSIX sh has no `local`)
  # Arguments: $1 - HMI count sampled just before the injection
  # Outputs:   one progress line on stdout per poll
  # Returns:   0 once at least $1 + expected_hmis events have been counted,
  #            non-zero if that has not happened after 12 polls of 5 s (1 min)
  #            or if the count could not be read.
  wait_for_hmis() {
    hmi_target=$(($1 + expected_hmis))
    polls=0
    new_hmis=$(COUNT_HMIS)
    while [ "$new_hmis" -lt "$hmi_target" ] && [ "$polls" -lt 12 ]; do
      echo "Seen $((new_hmis - $1)) HMI(s) out of $expected_hmis expected, sleeping"
      sleep 5
      polls=$((polls + 1))
      new_hmis=$(COUNT_HMIS)
    done
    # Decide on the count itself, not on the poll counter: events that arrive
    # on the very last poll are a success, and an unreadable count is not.
    [ "$new_hmis" -ge "$hmi_target" ]
  }

  # for each chip+core combination
  # todo - less fragile parsing
  # Each matched line looks like "OCC: Chip <hex> Core <hex>"; `read` splits it
  # so that the third and fifth words land in $chip and $core.
  grep -E -o 'OCC: Chip [0-9a-f]+ Core [0-9a-f]' < /sys/firmware/opal/msglog |
  while read -r occ_word chip_word chip core_word core; do
    fir="0x1${core}013100"

    # verify that Core FIR is zero as expected
    # Any all-zero string counts as zero, in case getscom zero-pads its output;
    # empty output (getscom failed) is treated as non-zero.
    fir_value=$("$GETSCOM" -c "0x${chip}" "$fir")
    case $fir_value in
      '' | *[!0]*)
        echo "FIR was not zero before injection for chip $chip, core $core. Aborting!"
        echo "Result of $GETSCOM -c 0x${chip} $fir:"
        "$GETSCOM" -c "0x${chip}" "$fir"
        echo "If you get a -5 error, the core may be in idle state. Try stress-ng."
        echo "Otherwise, try $PUTSCOM -c 0x${chip} $fir 0"
        exit 1
        ;;
    esac

    # keep track of the number of HMIs handled
    old_hmis=$(COUNT_HMIS)

    # do injection, adding a marker to dmesg for clarity
    echo "Injecting HMI on core $core, chip $chip" | tee /dev/kmsg

    # inject a RegFile recoverable error
    if ! "$PUTSCOM" -c "0x${chip}" "$fir" 2000000000000000 > /dev/null; then
      echo "Error injecting. Aborting!"
      exit 1
    fi

    # now we want to wait for all the HMIs to be processed
    # we expect one per thread on the core
    if ! wait_for_hmis "$old_hmis"; then
      echo "Haven't seen expected $expected_hmis recoveries after 1 min. Aborting."
      exit 1
    fi

    echo "Processed $expected_hmis events; presumed success. Check dmesg."
    echo ""
  done