edac_device.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621
  1. /*
  2. * edac_device.c
  3. * (C) 2007 www.douglaskthompson.com
  4. *
  5. * This file may be distributed under the terms of the
  6. * GNU General Public License.
  7. *
  8. * Written by Doug Thompson <[email protected]>
  9. *
  10. * edac_device API implementation
  11. * 19 Jan 2007
  12. */
  13. #include <asm/page.h>
  14. #include <linux/uaccess.h>
  15. #include <linux/ctype.h>
  16. #include <linux/highmem.h>
  17. #include <linux/init.h>
  18. #include <linux/jiffies.h>
  19. #include <linux/module.h>
  20. #include <linux/slab.h>
  21. #include <linux/smp.h>
  22. #include <linux/spinlock.h>
  23. #include <linux/sysctl.h>
  24. #include <linux/timer.h>
  25. #include "edac_device.h"
  26. #include "edac_module.h"
/*
 * 'edac_device_list' is the global list of edac_device control structures.
 * Every manipulation of this list is protected by 'device_ctls_mutex'.
 */
  30. static DEFINE_MUTEX(device_ctls_mutex);
  31. static LIST_HEAD(edac_device_list);
  32. /* Default workqueue processing interval on this instance, in msecs */
  33. #define DEFAULT_POLL_INTERVAL 1000
  34. #ifdef CONFIG_EDAC_DEBUG
  35. static void edac_device_dump_device(struct edac_device_ctl_info *edac_dev)
  36. {
  37. edac_dbg(3, "\tedac_dev = %p dev_idx=%d\n",
  38. edac_dev, edac_dev->dev_idx);
  39. edac_dbg(4, "\tedac_dev->edac_check = %p\n", edac_dev->edac_check);
  40. edac_dbg(3, "\tdev = %p\n", edac_dev->dev);
  41. edac_dbg(3, "\tmod_name:ctl_name = %s:%s\n",
  42. edac_dev->mod_name, edac_dev->ctl_name);
  43. edac_dbg(3, "\tpvt_info = %p\n\n", edac_dev->pvt_info);
  44. }
  45. #endif /* CONFIG_EDAC_DEBUG */
  46. /*
  47. * @off_val: zero, 1, or other based offset
  48. */
  49. struct edac_device_ctl_info *
  50. edac_device_alloc_ctl_info(unsigned pvt_sz, char *dev_name, unsigned nr_instances,
  51. char *blk_name, unsigned nr_blocks, unsigned off_val,
  52. struct edac_dev_sysfs_block_attribute *attrib_spec,
  53. unsigned nr_attrib, int device_index)
  54. {
  55. struct edac_dev_sysfs_block_attribute *dev_attrib, *attrib_p, *attrib;
  56. struct edac_device_block *dev_blk, *blk_p, *blk;
  57. struct edac_device_instance *dev_inst, *inst;
  58. struct edac_device_ctl_info *dev_ctl;
  59. unsigned instance, block, attr;
  60. void *pvt;
  61. int err;
  62. edac_dbg(4, "instances=%d blocks=%d\n", nr_instances, nr_blocks);
  63. dev_ctl = kzalloc(sizeof(struct edac_device_ctl_info), GFP_KERNEL);
  64. if (!dev_ctl)
  65. return NULL;
  66. dev_inst = kcalloc(nr_instances, sizeof(struct edac_device_instance), GFP_KERNEL);
  67. if (!dev_inst)
  68. goto free;
  69. dev_ctl->instances = dev_inst;
  70. dev_blk = kcalloc(nr_instances * nr_blocks, sizeof(struct edac_device_block), GFP_KERNEL);
  71. if (!dev_blk)
  72. goto free;
  73. dev_ctl->blocks = dev_blk;
  74. if (nr_attrib) {
  75. dev_attrib = kcalloc(nr_attrib, sizeof(struct edac_dev_sysfs_block_attribute),
  76. GFP_KERNEL);
  77. if (!dev_attrib)
  78. goto free;
  79. dev_ctl->attribs = dev_attrib;
  80. }
  81. if (pvt_sz) {
  82. pvt = kzalloc(pvt_sz, GFP_KERNEL);
  83. if (!pvt)
  84. goto free;
  85. dev_ctl->pvt_info = pvt;
  86. }
  87. dev_ctl->dev_idx = device_index;
  88. dev_ctl->nr_instances = nr_instances;
  89. /* Default logging of CEs and UEs */
  90. dev_ctl->log_ce = 1;
  91. dev_ctl->log_ue = 1;
  92. /* Name of this edac device */
  93. snprintf(dev_ctl->name, sizeof(dev_ctl->name),"%s", dev_name);
  94. /* Initialize every Instance */
  95. for (instance = 0; instance < nr_instances; instance++) {
  96. inst = &dev_inst[instance];
  97. inst->ctl = dev_ctl;
  98. inst->nr_blocks = nr_blocks;
  99. blk_p = &dev_blk[instance * nr_blocks];
  100. inst->blocks = blk_p;
  101. /* name of this instance */
  102. snprintf(inst->name, sizeof(inst->name), "%s%u", dev_name, instance);
  103. /* Initialize every block in each instance */
  104. for (block = 0; block < nr_blocks; block++) {
  105. blk = &blk_p[block];
  106. blk->instance = inst;
  107. snprintf(blk->name, sizeof(blk->name),
  108. "%s%d", blk_name, block + off_val);
  109. edac_dbg(4, "instance=%d inst_p=%p block=#%d block_p=%p name='%s'\n",
  110. instance, inst, block, blk, blk->name);
  111. /* if there are NO attributes OR no attribute pointer
  112. * then continue on to next block iteration
  113. */
  114. if ((nr_attrib == 0) || (attrib_spec == NULL))
  115. continue;
  116. /* setup the attribute array for this block */
  117. blk->nr_attribs = nr_attrib;
  118. attrib_p = &dev_attrib[block*nr_instances*nr_attrib];
  119. blk->block_attributes = attrib_p;
  120. edac_dbg(4, "THIS BLOCK_ATTRIB=%p\n",
  121. blk->block_attributes);
  122. /* Initialize every user specified attribute in this
  123. * block with the data the caller passed in
  124. * Each block gets its own copy of pointers,
  125. * and its unique 'value'
  126. */
  127. for (attr = 0; attr < nr_attrib; attr++) {
  128. attrib = &attrib_p[attr];
  129. /* populate the unique per attrib
  130. * with the code pointers and info
  131. */
  132. attrib->attr = attrib_spec[attr].attr;
  133. attrib->show = attrib_spec[attr].show;
  134. attrib->store = attrib_spec[attr].store;
  135. attrib->block = blk; /* up link */
  136. edac_dbg(4, "alloc-attrib=%p attrib_name='%s' attrib-spec=%p spec-name=%s\n",
  137. attrib, attrib->attr.name,
  138. &attrib_spec[attr],
  139. attrib_spec[attr].attr.name
  140. );
  141. }
  142. }
  143. }
  144. /* Mark this instance as merely ALLOCATED */
  145. dev_ctl->op_state = OP_ALLOC;
  146. /*
  147. * Initialize the 'root' kobj for the edac_device controller
  148. */
  149. err = edac_device_register_sysfs_main_kobj(dev_ctl);
  150. if (err)
  151. goto free;
  152. /* at this point, the root kobj is valid, and in order to
  153. * 'free' the object, then the function:
  154. * edac_device_unregister_sysfs_main_kobj() must be called
  155. * which will perform kobj unregistration and the actual free
  156. * will occur during the kobject callback operation
  157. */
  158. return dev_ctl;
  159. free:
  160. __edac_device_free_ctl_info(dev_ctl);
  161. return NULL;
  162. }
  163. EXPORT_SYMBOL_GPL(edac_device_alloc_ctl_info);
  164. void edac_device_free_ctl_info(struct edac_device_ctl_info *ctl_info)
  165. {
  166. edac_device_unregister_sysfs_main_kobj(ctl_info);
  167. }
  168. EXPORT_SYMBOL_GPL(edac_device_free_ctl_info);
  169. /*
  170. * find_edac_device_by_dev
  171. * scans the edac_device list for a specific 'struct device *'
  172. *
  173. * lock to be held prior to call: device_ctls_mutex
  174. *
  175. * Return:
  176. * pointer to control structure managing 'dev'
  177. * NULL if not found on list
  178. */
  179. static struct edac_device_ctl_info *find_edac_device_by_dev(struct device *dev)
  180. {
  181. struct edac_device_ctl_info *edac_dev;
  182. struct list_head *item;
  183. edac_dbg(0, "\n");
  184. list_for_each(item, &edac_device_list) {
  185. edac_dev = list_entry(item, struct edac_device_ctl_info, link);
  186. if (edac_dev->dev == dev)
  187. return edac_dev;
  188. }
  189. return NULL;
  190. }
  191. /*
  192. * add_edac_dev_to_global_list
  193. * Before calling this function, caller must
  194. * assign a unique value to edac_dev->dev_idx.
  195. *
  196. * lock to be held prior to call: device_ctls_mutex
  197. *
  198. * Return:
  199. * 0 on success
  200. * 1 on failure.
  201. */
  202. static int add_edac_dev_to_global_list(struct edac_device_ctl_info *edac_dev)
  203. {
  204. struct list_head *item, *insert_before;
  205. struct edac_device_ctl_info *rover;
  206. insert_before = &edac_device_list;
  207. /* Determine if already on the list */
  208. rover = find_edac_device_by_dev(edac_dev->dev);
  209. if (unlikely(rover != NULL))
  210. goto fail0;
  211. /* Insert in ascending order by 'dev_idx', so find position */
  212. list_for_each(item, &edac_device_list) {
  213. rover = list_entry(item, struct edac_device_ctl_info, link);
  214. if (rover->dev_idx >= edac_dev->dev_idx) {
  215. if (unlikely(rover->dev_idx == edac_dev->dev_idx))
  216. goto fail1;
  217. insert_before = item;
  218. break;
  219. }
  220. }
  221. list_add_tail_rcu(&edac_dev->link, insert_before);
  222. return 0;
  223. fail0:
  224. edac_printk(KERN_WARNING, EDAC_MC,
  225. "%s (%s) %s %s already assigned %d\n",
  226. dev_name(rover->dev), edac_dev_name(rover),
  227. rover->mod_name, rover->ctl_name, rover->dev_idx);
  228. return 1;
  229. fail1:
  230. edac_printk(KERN_WARNING, EDAC_MC,
  231. "bug in low-level driver: attempt to assign\n"
  232. " duplicate dev_idx %d in %s()\n", rover->dev_idx,
  233. __func__);
  234. return 1;
  235. }
  236. /*
  237. * del_edac_device_from_global_list
  238. */
  239. static void del_edac_device_from_global_list(struct edac_device_ctl_info
  240. *edac_device)
  241. {
  242. list_del_rcu(&edac_device->link);
  243. /* these are for safe removal of devices from global list while
  244. * NMI handlers may be traversing list
  245. */
  246. synchronize_rcu();
  247. INIT_LIST_HEAD(&edac_device->link);
  248. }
  249. /*
  250. * edac_device_workq_function
  251. * performs the operation scheduled by a workq request
  252. *
  253. * this workq is embedded within an edac_device_ctl_info
  254. * structure, that needs to be polled for possible error events.
  255. *
  256. * This operation is to acquire the list mutex lock
  257. * (thus preventing insertation or deletion)
  258. * and then call the device's poll function IFF this device is
  259. * running polled and there is a poll function defined.
  260. */
  261. static void edac_device_workq_function(struct work_struct *work_req)
  262. {
  263. struct delayed_work *d_work = to_delayed_work(work_req);
  264. struct edac_device_ctl_info *edac_dev = to_edac_device_ctl_work(d_work);
  265. mutex_lock(&device_ctls_mutex);
  266. /* If we are being removed, bail out immediately */
  267. if (edac_dev->op_state == OP_OFFLINE) {
  268. mutex_unlock(&device_ctls_mutex);
  269. return;
  270. }
  271. /* Only poll controllers that are running polled and have a check */
  272. if ((edac_dev->op_state == OP_RUNNING_POLL) &&
  273. (edac_dev->edac_check != NULL)) {
  274. edac_dev->edac_check(edac_dev);
  275. }
  276. mutex_unlock(&device_ctls_mutex);
  277. /* Reschedule the workq for the next time period to start again
  278. * if the number of msec is for 1 sec, then adjust to the next
  279. * whole one second to save timers firing all over the period
  280. * between integral seconds
  281. */
  282. if (edac_dev->poll_msec == DEFAULT_POLL_INTERVAL)
  283. edac_queue_work(&edac_dev->work, round_jiffies_relative(edac_dev->delay));
  284. else
  285. edac_queue_work(&edac_dev->work, edac_dev->delay);
  286. }
  287. /*
  288. * edac_device_workq_setup
  289. * initialize a workq item for this edac_device instance
  290. * passing in the new delay period in msec
  291. */
  292. static void edac_device_workq_setup(struct edac_device_ctl_info *edac_dev,
  293. unsigned msec)
  294. {
  295. edac_dbg(0, "\n");
  296. /* take the arg 'msec' and set it into the control structure
  297. * to used in the time period calculation
  298. * then calc the number of jiffies that represents
  299. */
  300. edac_dev->poll_msec = msec;
  301. edac_dev->delay = msecs_to_jiffies(msec);
  302. INIT_DELAYED_WORK(&edac_dev->work, edac_device_workq_function);
  303. /* optimize here for the 1 second case, which will be normal value, to
  304. * fire ON the 1 second time event. This helps reduce all sorts of
  305. * timers firing on sub-second basis, while they are happy
  306. * to fire together on the 1 second exactly
  307. */
  308. if (edac_dev->poll_msec == DEFAULT_POLL_INTERVAL)
  309. edac_queue_work(&edac_dev->work, round_jiffies_relative(edac_dev->delay));
  310. else
  311. edac_queue_work(&edac_dev->work, edac_dev->delay);
  312. }
  313. /*
  314. * edac_device_workq_teardown
  315. * stop the workq processing on this edac_dev
  316. */
  317. static void edac_device_workq_teardown(struct edac_device_ctl_info *edac_dev)
  318. {
  319. if (!edac_dev->edac_check)
  320. return;
  321. edac_dev->op_state = OP_OFFLINE;
  322. edac_stop_work(&edac_dev->work);
  323. }
  324. /*
  325. * edac_device_reset_delay_period
  326. *
  327. * need to stop any outstanding workq queued up at this time
  328. * because we will be resetting the sleep time.
  329. * Then restart the workq on the new delay
  330. */
  331. void edac_device_reset_delay_period(struct edac_device_ctl_info *edac_dev,
  332. unsigned long msec)
  333. {
  334. edac_dev->poll_msec = msec;
  335. edac_dev->delay = msecs_to_jiffies(msec);
  336. /* See comment in edac_device_workq_setup() above */
  337. if (edac_dev->poll_msec == DEFAULT_POLL_INTERVAL)
  338. edac_mod_work(&edac_dev->work, round_jiffies_relative(edac_dev->delay));
  339. else
  340. edac_mod_work(&edac_dev->work, edac_dev->delay);
  341. }
  342. int edac_device_alloc_index(void)
  343. {
  344. static atomic_t device_indexes = ATOMIC_INIT(0);
  345. return atomic_inc_return(&device_indexes) - 1;
  346. }
  347. EXPORT_SYMBOL_GPL(edac_device_alloc_index);
  348. int edac_device_add_device(struct edac_device_ctl_info *edac_dev)
  349. {
  350. edac_dbg(0, "\n");
  351. #ifdef CONFIG_EDAC_DEBUG
  352. if (edac_debug_level >= 3)
  353. edac_device_dump_device(edac_dev);
  354. #endif
  355. mutex_lock(&device_ctls_mutex);
  356. if (add_edac_dev_to_global_list(edac_dev))
  357. goto fail0;
  358. /* set load time so that error rate can be tracked */
  359. edac_dev->start_time = jiffies;
  360. /* create this instance's sysfs entries */
  361. if (edac_device_create_sysfs(edac_dev)) {
  362. edac_device_printk(edac_dev, KERN_WARNING,
  363. "failed to create sysfs device\n");
  364. goto fail1;
  365. }
  366. /* If there IS a check routine, then we are running POLLED */
  367. if (edac_dev->edac_check != NULL) {
  368. /* This instance is NOW RUNNING */
  369. edac_dev->op_state = OP_RUNNING_POLL;
  370. edac_device_workq_setup(edac_dev, edac_dev->poll_msec ?: DEFAULT_POLL_INTERVAL);
  371. } else {
  372. edac_dev->op_state = OP_RUNNING_INTERRUPT;
  373. }
  374. /* Report action taken */
  375. edac_device_printk(edac_dev, KERN_INFO,
  376. "Giving out device to module %s controller %s: DEV %s (%s)\n",
  377. edac_dev->mod_name, edac_dev->ctl_name, edac_dev->dev_name,
  378. edac_op_state_to_string(edac_dev->op_state));
  379. mutex_unlock(&device_ctls_mutex);
  380. return 0;
  381. fail1:
  382. /* Some error, so remove the entry from the lsit */
  383. del_edac_device_from_global_list(edac_dev);
  384. fail0:
  385. mutex_unlock(&device_ctls_mutex);
  386. return 1;
  387. }
  388. EXPORT_SYMBOL_GPL(edac_device_add_device);
  389. struct edac_device_ctl_info *edac_device_del_device(struct device *dev)
  390. {
  391. struct edac_device_ctl_info *edac_dev;
  392. edac_dbg(0, "\n");
  393. mutex_lock(&device_ctls_mutex);
  394. /* Find the structure on the list, if not there, then leave */
  395. edac_dev = find_edac_device_by_dev(dev);
  396. if (edac_dev == NULL) {
  397. mutex_unlock(&device_ctls_mutex);
  398. return NULL;
  399. }
  400. /* mark this instance as OFFLINE */
  401. edac_dev->op_state = OP_OFFLINE;
  402. /* deregister from global list */
  403. del_edac_device_from_global_list(edac_dev);
  404. mutex_unlock(&device_ctls_mutex);
  405. /* clear workq processing on this instance */
  406. edac_device_workq_teardown(edac_dev);
  407. /* Tear down the sysfs entries for this instance */
  408. edac_device_remove_sysfs(edac_dev);
  409. edac_printk(KERN_INFO, EDAC_MC,
  410. "Removed device %d for %s %s: DEV %s\n",
  411. edac_dev->dev_idx,
  412. edac_dev->mod_name, edac_dev->ctl_name, edac_dev_name(edac_dev));
  413. return edac_dev;
  414. }
  415. EXPORT_SYMBOL_GPL(edac_device_del_device);
  416. static inline int edac_device_get_log_ce(struct edac_device_ctl_info *edac_dev)
  417. {
  418. return edac_dev->log_ce;
  419. }
  420. static inline int edac_device_get_log_ue(struct edac_device_ctl_info *edac_dev)
  421. {
  422. return edac_dev->log_ue;
  423. }
  424. static inline int edac_device_get_panic_on_ue(struct edac_device_ctl_info
  425. *edac_dev)
  426. {
  427. return edac_dev->panic_on_ue;
  428. }
  429. void edac_device_handle_ce_count(struct edac_device_ctl_info *edac_dev,
  430. unsigned int count, int inst_nr, int block_nr,
  431. const char *msg)
  432. {
  433. struct edac_device_instance *instance;
  434. struct edac_device_block *block = NULL;
  435. if (!count)
  436. return;
  437. if ((inst_nr >= edac_dev->nr_instances) || (inst_nr < 0)) {
  438. edac_device_printk(edac_dev, KERN_ERR,
  439. "INTERNAL ERROR: 'instance' out of range "
  440. "(%d >= %d)\n", inst_nr,
  441. edac_dev->nr_instances);
  442. return;
  443. }
  444. instance = edac_dev->instances + inst_nr;
  445. if ((block_nr >= instance->nr_blocks) || (block_nr < 0)) {
  446. edac_device_printk(edac_dev, KERN_ERR,
  447. "INTERNAL ERROR: instance %d 'block' "
  448. "out of range (%d >= %d)\n",
  449. inst_nr, block_nr,
  450. instance->nr_blocks);
  451. return;
  452. }
  453. if (instance->nr_blocks > 0) {
  454. block = instance->blocks + block_nr;
  455. block->counters.ce_count += count;
  456. }
  457. /* Propagate the count up the 'totals' tree */
  458. instance->counters.ce_count += count;
  459. edac_dev->counters.ce_count += count;
  460. if (edac_device_get_log_ce(edac_dev))
  461. edac_device_printk(edac_dev, KERN_WARNING,
  462. "CE: %s instance: %s block: %s count: %d '%s'\n",
  463. edac_dev->ctl_name, instance->name,
  464. block ? block->name : "N/A", count, msg);
  465. }
  466. EXPORT_SYMBOL_GPL(edac_device_handle_ce_count);
  467. void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev,
  468. unsigned int count, int inst_nr, int block_nr,
  469. const char *msg)
  470. {
  471. struct edac_device_instance *instance;
  472. struct edac_device_block *block = NULL;
  473. if (!count)
  474. return;
  475. if ((inst_nr >= edac_dev->nr_instances) || (inst_nr < 0)) {
  476. edac_device_printk(edac_dev, KERN_ERR,
  477. "INTERNAL ERROR: 'instance' out of range "
  478. "(%d >= %d)\n", inst_nr,
  479. edac_dev->nr_instances);
  480. return;
  481. }
  482. instance = edac_dev->instances + inst_nr;
  483. if ((block_nr >= instance->nr_blocks) || (block_nr < 0)) {
  484. edac_device_printk(edac_dev, KERN_ERR,
  485. "INTERNAL ERROR: instance %d 'block' "
  486. "out of range (%d >= %d)\n",
  487. inst_nr, block_nr,
  488. instance->nr_blocks);
  489. return;
  490. }
  491. if (instance->nr_blocks > 0) {
  492. block = instance->blocks + block_nr;
  493. block->counters.ue_count += count;
  494. }
  495. /* Propagate the count up the 'totals' tree */
  496. instance->counters.ue_count += count;
  497. edac_dev->counters.ue_count += count;
  498. if (edac_device_get_log_ue(edac_dev))
  499. edac_device_printk(edac_dev, KERN_EMERG,
  500. "UE: %s instance: %s block: %s count: %d '%s'\n",
  501. edac_dev->ctl_name, instance->name,
  502. block ? block->name : "N/A", count, msg);
  503. if (edac_device_get_panic_on_ue(edac_dev))
  504. panic("EDAC %s: UE instance: %s block %s count: %d '%s'\n",
  505. edac_dev->ctl_name, instance->name,
  506. block ? block->name : "N/A", count, msg);
  507. }
  508. EXPORT_SYMBOL_GPL(edac_device_handle_ue_count);