habanalabs_drv.c 17 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright 2016-2021 HabanaLabs, Ltd.
  4. * All Rights Reserved.
  5. *
  6. */
  7. #define pr_fmt(fmt) "habanalabs: " fmt
  8. #include "habanalabs.h"
  9. #include <linux/pci.h>
  10. #include <linux/aer.h>
  11. #include <linux/module.h>
  12. #define CREATE_TRACE_POINTS
  13. #include <trace/events/habanalabs.h>
  14. #define HL_DRIVER_AUTHOR "HabanaLabs Kernel Driver Team"
  15. #define HL_DRIVER_DESC "Driver for HabanaLabs's AI Accelerators"
  16. MODULE_AUTHOR(HL_DRIVER_AUTHOR);
  17. MODULE_DESCRIPTION(HL_DRIVER_DESC);
  18. MODULE_LICENSE("GPL v2");
  19. static int hl_major;
  20. static struct class *hl_class;
  21. static DEFINE_IDR(hl_devs_idr);
  22. static DEFINE_MUTEX(hl_devs_idr_lock);
  23. #define HL_DEFAULT_TIMEOUT_LOCKED 30 /* 30 seconds */
  24. #define GAUDI_DEFAULT_TIMEOUT_LOCKED 600 /* 10 minutes */
  25. static int timeout_locked = HL_DEFAULT_TIMEOUT_LOCKED;
  26. static int reset_on_lockup = 1;
  27. static int memory_scrub;
  28. static ulong boot_error_status_mask = ULONG_MAX;
  29. module_param(timeout_locked, int, 0444);
  30. MODULE_PARM_DESC(timeout_locked,
  31. "Device lockup timeout in seconds (0 = disabled, default 30s)");
  32. module_param(reset_on_lockup, int, 0444);
  33. MODULE_PARM_DESC(reset_on_lockup,
  34. "Do device reset on lockup (0 = no, 1 = yes, default yes)");
  35. module_param(memory_scrub, int, 0444);
  36. MODULE_PARM_DESC(memory_scrub,
  37. "Scrub device memory in various states (0 = no, 1 = yes, default no)");
  38. module_param(boot_error_status_mask, ulong, 0444);
  39. MODULE_PARM_DESC(boot_error_status_mask,
  40. "Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");
  41. #define PCI_IDS_GOYA 0x0001
  42. #define PCI_IDS_GAUDI 0x1000
  43. #define PCI_IDS_GAUDI_SEC 0x1010
  44. #define PCI_IDS_GAUDI2 0x1020
  45. static const struct pci_device_id ids[] = {
  46. { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
  47. { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), },
  48. { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI_SEC), },
  49. { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI2), },
  50. { 0, }
  51. };
  52. MODULE_DEVICE_TABLE(pci, ids);
  53. /*
  54. * get_asic_type - translate device id to asic type
  55. *
  56. * @device: id of the PCI device
  57. *
  58. * Translate device id to asic type.
  59. * In case of unidentified device, return -1
  60. */
  61. static enum hl_asic_type get_asic_type(u16 device)
  62. {
  63. enum hl_asic_type asic_type;
  64. switch (device) {
  65. case PCI_IDS_GOYA:
  66. asic_type = ASIC_GOYA;
  67. break;
  68. case PCI_IDS_GAUDI:
  69. asic_type = ASIC_GAUDI;
  70. break;
  71. case PCI_IDS_GAUDI_SEC:
  72. asic_type = ASIC_GAUDI_SEC;
  73. break;
  74. case PCI_IDS_GAUDI2:
  75. asic_type = ASIC_GAUDI2;
  76. break;
  77. default:
  78. asic_type = ASIC_INVALID;
  79. break;
  80. }
  81. return asic_type;
  82. }
  83. static bool is_asic_secured(enum hl_asic_type asic_type)
  84. {
  85. switch (asic_type) {
  86. case ASIC_GAUDI_SEC:
  87. return true;
  88. default:
  89. return false;
  90. }
  91. }
  92. /*
  93. * hl_device_open - open function for habanalabs device
  94. *
  95. * @inode: pointer to inode structure
  96. * @filp: pointer to file structure
  97. *
  98. * Called when process opens an habanalabs device.
  99. */
  100. int hl_device_open(struct inode *inode, struct file *filp)
  101. {
  102. enum hl_device_status status;
  103. struct hl_device *hdev;
  104. struct hl_fpriv *hpriv;
  105. int rc;
  106. mutex_lock(&hl_devs_idr_lock);
  107. hdev = idr_find(&hl_devs_idr, iminor(inode));
  108. mutex_unlock(&hl_devs_idr_lock);
  109. if (!hdev) {
  110. pr_err("Couldn't find device %d:%d\n",
  111. imajor(inode), iminor(inode));
  112. return -ENXIO;
  113. }
  114. hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
  115. if (!hpriv)
  116. return -ENOMEM;
  117. hpriv->hdev = hdev;
  118. filp->private_data = hpriv;
  119. hpriv->filp = filp;
  120. mutex_init(&hpriv->notifier_event.lock);
  121. mutex_init(&hpriv->restore_phase_mutex);
  122. mutex_init(&hpriv->ctx_lock);
  123. kref_init(&hpriv->refcount);
  124. nonseekable_open(inode, filp);
  125. hl_ctx_mgr_init(&hpriv->ctx_mgr);
  126. hl_mem_mgr_init(hpriv->hdev->dev, &hpriv->mem_mgr);
  127. hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
  128. mutex_lock(&hdev->fpriv_list_lock);
  129. if (!hl_device_operational(hdev, &status)) {
  130. dev_dbg_ratelimited(hdev->dev,
  131. "Can't open %s because it is %s\n",
  132. dev_name(hdev->dev), hdev->status[status]);
  133. if (status == HL_DEVICE_STATUS_IN_RESET ||
  134. status == HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE)
  135. rc = -EAGAIN;
  136. else
  137. rc = -EPERM;
  138. goto out_err;
  139. }
  140. if (hdev->is_in_dram_scrub) {
  141. dev_dbg_ratelimited(hdev->dev,
  142. "Can't open %s during dram scrub\n",
  143. dev_name(hdev->dev));
  144. rc = -EAGAIN;
  145. goto out_err;
  146. }
  147. if (hdev->compute_ctx_in_release) {
  148. dev_dbg_ratelimited(hdev->dev,
  149. "Can't open %s because another user is still releasing it\n",
  150. dev_name(hdev->dev));
  151. rc = -EAGAIN;
  152. goto out_err;
  153. }
  154. if (hdev->is_compute_ctx_active) {
  155. dev_dbg_ratelimited(hdev->dev,
  156. "Can't open %s because another user is working on it\n",
  157. dev_name(hdev->dev));
  158. rc = -EBUSY;
  159. goto out_err;
  160. }
  161. rc = hl_ctx_create(hdev, hpriv);
  162. if (rc) {
  163. dev_err(hdev->dev, "Failed to create context %d\n", rc);
  164. goto out_err;
  165. }
  166. list_add(&hpriv->dev_node, &hdev->fpriv_list);
  167. mutex_unlock(&hdev->fpriv_list_lock);
  168. hdev->asic_funcs->send_device_activity(hdev, true);
  169. hl_debugfs_add_file(hpriv);
  170. atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1);
  171. atomic_set(&hdev->captured_err_info.razwi.write_enable, 1);
  172. hdev->captured_err_info.undef_opcode.write_enable = true;
  173. hdev->open_counter++;
  174. hdev->last_successful_open_jif = jiffies;
  175. hdev->last_successful_open_ktime = ktime_get();
  176. return 0;
  177. out_err:
  178. mutex_unlock(&hdev->fpriv_list_lock);
  179. hl_mem_mgr_fini(&hpriv->mem_mgr);
  180. hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
  181. filp->private_data = NULL;
  182. mutex_destroy(&hpriv->ctx_lock);
  183. mutex_destroy(&hpriv->restore_phase_mutex);
  184. mutex_destroy(&hpriv->notifier_event.lock);
  185. put_pid(hpriv->taskpid);
  186. kfree(hpriv);
  187. return rc;
  188. }
  189. int hl_device_open_ctrl(struct inode *inode, struct file *filp)
  190. {
  191. struct hl_device *hdev;
  192. struct hl_fpriv *hpriv;
  193. int rc;
  194. mutex_lock(&hl_devs_idr_lock);
  195. hdev = idr_find(&hl_devs_idr, iminor(inode));
  196. mutex_unlock(&hl_devs_idr_lock);
  197. if (!hdev) {
  198. pr_err("Couldn't find device %d:%d\n",
  199. imajor(inode), iminor(inode));
  200. return -ENXIO;
  201. }
  202. hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
  203. if (!hpriv)
  204. return -ENOMEM;
  205. /* Prevent other routines from reading partial hpriv data by
  206. * initializing hpriv fields before inserting it to the list
  207. */
  208. hpriv->hdev = hdev;
  209. filp->private_data = hpriv;
  210. hpriv->filp = filp;
  211. mutex_init(&hpriv->notifier_event.lock);
  212. nonseekable_open(inode, filp);
  213. hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
  214. mutex_lock(&hdev->fpriv_ctrl_list_lock);
  215. if (!hl_device_operational(hdev, NULL)) {
  216. dev_dbg_ratelimited(hdev->dev_ctrl,
  217. "Can't open %s because it is disabled or in reset\n",
  218. dev_name(hdev->dev_ctrl));
  219. rc = -EPERM;
  220. goto out_err;
  221. }
  222. list_add(&hpriv->dev_node, &hdev->fpriv_ctrl_list);
  223. mutex_unlock(&hdev->fpriv_ctrl_list_lock);
  224. return 0;
  225. out_err:
  226. mutex_unlock(&hdev->fpriv_ctrl_list_lock);
  227. filp->private_data = NULL;
  228. put_pid(hpriv->taskpid);
  229. kfree(hpriv);
  230. return rc;
  231. }
  232. static void set_driver_behavior_per_device(struct hl_device *hdev)
  233. {
  234. hdev->nic_ports_mask = 0;
  235. hdev->fw_components = FW_TYPE_ALL_TYPES;
  236. hdev->mmu_enable = MMU_EN_ALL;
  237. hdev->cpu_queues_enable = 1;
  238. hdev->pldm = 0;
  239. hdev->hard_reset_on_fw_events = 1;
  240. hdev->bmc_enable = 1;
  241. hdev->reset_on_preboot_fail = 1;
  242. hdev->heartbeat = 1;
  243. }
  244. static void copy_kernel_module_params_to_device(struct hl_device *hdev)
  245. {
  246. hdev->asic_prop.fw_security_enabled = is_asic_secured(hdev->asic_type);
  247. hdev->major = hl_major;
  248. hdev->memory_scrub = memory_scrub;
  249. hdev->reset_on_lockup = reset_on_lockup;
  250. hdev->boot_error_status_mask = boot_error_status_mask;
  251. }
  252. static void fixup_device_params_per_asic(struct hl_device *hdev, int timeout)
  253. {
  254. switch (hdev->asic_type) {
  255. case ASIC_GAUDI:
  256. case ASIC_GAUDI_SEC:
  257. /* If user didn't request a different timeout than the default one, we have
  258. * a different default timeout for Gaudi
  259. */
  260. if (timeout == HL_DEFAULT_TIMEOUT_LOCKED)
  261. hdev->timeout_jiffies = msecs_to_jiffies(GAUDI_DEFAULT_TIMEOUT_LOCKED *
  262. MSEC_PER_SEC);
  263. hdev->reset_upon_device_release = 0;
  264. break;
  265. case ASIC_GOYA:
  266. hdev->reset_upon_device_release = 0;
  267. break;
  268. default:
  269. hdev->reset_upon_device_release = 1;
  270. break;
  271. }
  272. }
  273. static int fixup_device_params(struct hl_device *hdev)
  274. {
  275. int tmp_timeout;
  276. tmp_timeout = timeout_locked;
  277. hdev->fw_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
  278. hdev->fw_comms_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
  279. if (tmp_timeout)
  280. hdev->timeout_jiffies = msecs_to_jiffies(tmp_timeout * MSEC_PER_SEC);
  281. else
  282. hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
  283. hdev->stop_on_err = true;
  284. hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
  285. hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
  286. /* Enable only after the initialization of the device */
  287. hdev->disabled = true;
  288. if (!(hdev->fw_components & FW_TYPE_PREBOOT_CPU) &&
  289. (hdev->fw_components & ~FW_TYPE_PREBOOT_CPU)) {
  290. pr_err("Preboot must be set along with other components");
  291. return -EINVAL;
  292. }
  293. /* If CPU queues not enabled, no way to do heartbeat */
  294. if (!hdev->cpu_queues_enable)
  295. hdev->heartbeat = 0;
  296. fixup_device_params_per_asic(hdev, tmp_timeout);
  297. return 0;
  298. }
  299. /**
  300. * create_hdev - create habanalabs device instance
  301. *
  302. * @dev: will hold the pointer to the new habanalabs device structure
  303. * @pdev: pointer to the pci device
  304. *
  305. * Allocate memory for habanalabs device and initialize basic fields
  306. * Identify the ASIC type
  307. * Allocate ID (minor) for the device (only for real devices)
  308. */
  309. static int create_hdev(struct hl_device **dev, struct pci_dev *pdev)
  310. {
  311. int main_id, ctrl_id = 0, rc = 0;
  312. struct hl_device *hdev;
  313. *dev = NULL;
  314. hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
  315. if (!hdev)
  316. return -ENOMEM;
  317. /* Will be NULL in case of simulator device */
  318. hdev->pdev = pdev;
  319. /* Assign status description string */
  320. strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], "operational", HL_STR_MAX);
  321. strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], "in reset", HL_STR_MAX);
  322. strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], "disabled", HL_STR_MAX);
  323. strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], "needs reset", HL_STR_MAX);
  324. strncpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION],
  325. "in device creation", HL_STR_MAX);
  326. strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE],
  327. "in reset after device release", HL_STR_MAX);
  328. /* First, we must find out which ASIC are we handling. This is needed
  329. * to configure the behavior of the driver (kernel parameters)
  330. */
  331. hdev->asic_type = get_asic_type(pdev->device);
  332. if (hdev->asic_type == ASIC_INVALID) {
  333. dev_err(&pdev->dev, "Unsupported ASIC\n");
  334. rc = -ENODEV;
  335. goto free_hdev;
  336. }
  337. copy_kernel_module_params_to_device(hdev);
  338. set_driver_behavior_per_device(hdev);
  339. fixup_device_params(hdev);
  340. mutex_lock(&hl_devs_idr_lock);
  341. /* Always save 2 numbers, 1 for main device and 1 for control.
  342. * They must be consecutive
  343. */
  344. main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS, GFP_KERNEL);
  345. if (main_id >= 0)
  346. ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1,
  347. main_id + 2, GFP_KERNEL);
  348. mutex_unlock(&hl_devs_idr_lock);
  349. if ((main_id < 0) || (ctrl_id < 0)) {
  350. if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC))
  351. pr_err("too many devices in the system\n");
  352. if (main_id >= 0) {
  353. mutex_lock(&hl_devs_idr_lock);
  354. idr_remove(&hl_devs_idr, main_id);
  355. mutex_unlock(&hl_devs_idr_lock);
  356. }
  357. rc = -EBUSY;
  358. goto free_hdev;
  359. }
  360. hdev->id = main_id;
  361. hdev->id_control = ctrl_id;
  362. *dev = hdev;
  363. return 0;
  364. free_hdev:
  365. kfree(hdev);
  366. return rc;
  367. }
  368. /*
  369. * destroy_hdev - destroy habanalabs device instance
  370. *
  371. * @dev: pointer to the habanalabs device structure
  372. *
  373. */
  374. static void destroy_hdev(struct hl_device *hdev)
  375. {
  376. /* Remove device from the device list */
  377. mutex_lock(&hl_devs_idr_lock);
  378. idr_remove(&hl_devs_idr, hdev->id);
  379. idr_remove(&hl_devs_idr, hdev->id_control);
  380. mutex_unlock(&hl_devs_idr_lock);
  381. kfree(hdev);
  382. }
  /*
   * hl_pmops_suspend - power-management suspend callback
   *
   * @dev: pointer to the generic device structure
   *
   * Return: the result of hl_device_suspend(), or 0 when no habanalabs
   * device is attached to @dev.
   */
  static int hl_pmops_suspend(struct device *dev)
  {
      struct hl_device *hdev;

      pr_debug("Going to suspend PCI device\n");

      hdev = dev_get_drvdata(dev);
      if (hdev)
          return hl_device_suspend(hdev);

      pr_err("device pointer is NULL in suspend\n");
      return 0;
  }
  /*
   * hl_pmops_resume - power-management resume callback
   *
   * @dev: pointer to the generic device structure
   *
   * Return: the result of hl_device_resume(), or 0 when no habanalabs
   * device is attached to @dev.
   */
  static int hl_pmops_resume(struct device *dev)
  {
      struct hl_device *hdev;

      pr_debug("Going to resume PCI device\n");

      hdev = dev_get_drvdata(dev);
      if (hdev)
          return hl_device_resume(hdev);

      pr_err("device pointer is NULL in resume\n");
      return 0;
  }
  403. /**
  404. * hl_pci_probe - probe PCI habanalabs devices
  405. *
  406. * @pdev: pointer to pci device
  407. * @id: pointer to pci device id structure
  408. *
  409. * Standard PCI probe function for habanalabs device.
  410. * Create a new habanalabs device and initialize it according to the
  411. * device's type
  412. */
  413. static int hl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
  414. {
  415. struct hl_device *hdev;
  416. int rc;
  417. dev_info(&pdev->dev, HL_NAME
  418. " device found [%04x:%04x] (rev %x)\n",
  419. (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
  420. rc = create_hdev(&hdev, pdev);
  421. if (rc)
  422. return rc;
  423. pci_set_drvdata(pdev, hdev);
  424. pci_enable_pcie_error_reporting(pdev);
  425. rc = hl_device_init(hdev, hl_class);
  426. if (rc) {
  427. dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
  428. rc = -ENODEV;
  429. goto disable_device;
  430. }
  431. return 0;
  432. disable_device:
  433. pci_disable_pcie_error_reporting(pdev);
  434. pci_set_drvdata(pdev, NULL);
  435. destroy_hdev(hdev);
  436. return rc;
  437. }
  438. /*
  439. * hl_pci_remove - remove PCI habanalabs devices
  440. *
  441. * @pdev: pointer to pci device
  442. *
  443. * Standard PCI remove function for habanalabs device
  444. */
  445. static void hl_pci_remove(struct pci_dev *pdev)
  446. {
  447. struct hl_device *hdev;
  448. hdev = pci_get_drvdata(pdev);
  449. if (!hdev)
  450. return;
  451. hl_device_fini(hdev);
  452. pci_disable_pcie_error_reporting(pdev);
  453. pci_set_drvdata(pdev, NULL);
  454. destroy_hdev(hdev);
  455. }
  456. /**
  457. * hl_pci_err_detected - a PCI bus error detected on this device
  458. *
  459. * @pdev: pointer to pci device
  460. * @state: PCI error type
  461. *
  462. * Called by the PCI subsystem whenever a non-correctable
  463. * PCI bus error is detected
  464. */
  465. static pci_ers_result_t
  466. hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
  467. {
  468. struct hl_device *hdev = pci_get_drvdata(pdev);
  469. enum pci_ers_result result;
  470. switch (state) {
  471. case pci_channel_io_normal:
  472. return PCI_ERS_RESULT_CAN_RECOVER;
  473. case pci_channel_io_frozen:
  474. dev_warn(hdev->dev, "frozen state error detected\n");
  475. result = PCI_ERS_RESULT_NEED_RESET;
  476. break;
  477. case pci_channel_io_perm_failure:
  478. dev_warn(hdev->dev, "failure state error detected\n");
  479. result = PCI_ERS_RESULT_DISCONNECT;
  480. break;
  481. default:
  482. result = PCI_ERS_RESULT_NONE;
  483. }
  484. hdev->asic_funcs->halt_engines(hdev, true, false);
  485. return result;
  486. }
  487. /**
  488. * hl_pci_err_resume - resume after a PCI slot reset
  489. *
  490. * @pdev: pointer to pci device
  491. *
  492. */
  493. static void hl_pci_err_resume(struct pci_dev *pdev)
  494. {
  495. struct hl_device *hdev = pci_get_drvdata(pdev);
  496. dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
  497. hl_device_resume(hdev);
  498. }
  499. /**
  500. * hl_pci_err_slot_reset - a PCI slot reset has just happened
  501. *
  502. * @pdev: pointer to pci device
  503. *
  504. * Determine if the driver can recover from the PCI slot reset
  505. */
  506. static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
  507. {
  508. return PCI_ERS_RESULT_RECOVERED;
  509. }
  510. static const struct dev_pm_ops hl_pm_ops = {
  511. .suspend = hl_pmops_suspend,
  512. .resume = hl_pmops_resume,
  513. };
  514. static const struct pci_error_handlers hl_pci_err_handler = {
  515. .error_detected = hl_pci_err_detected,
  516. .slot_reset = hl_pci_err_slot_reset,
  517. .resume = hl_pci_err_resume,
  518. };
  519. static struct pci_driver hl_pci_driver = {
  520. .name = HL_NAME,
  521. .id_table = ids,
  522. .probe = hl_pci_probe,
  523. .remove = hl_pci_remove,
  524. .shutdown = hl_pci_remove,
  525. .driver = {
  526. .name = HL_NAME,
  527. .pm = &hl_pm_ops,
  528. .probe_type = PROBE_PREFER_ASYNCHRONOUS,
  529. },
  530. .err_handler = &hl_pci_err_handler,
  531. };
  532. /*
  533. * hl_init - Initialize the habanalabs kernel driver
  534. */
  535. static int __init hl_init(void)
  536. {
  537. int rc;
  538. dev_t dev;
  539. pr_info("loading driver\n");
  540. rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
  541. if (rc < 0) {
  542. pr_err("unable to get major\n");
  543. return rc;
  544. }
  545. hl_major = MAJOR(dev);
  546. hl_class = class_create(THIS_MODULE, HL_NAME);
  547. if (IS_ERR(hl_class)) {
  548. pr_err("failed to allocate class\n");
  549. rc = PTR_ERR(hl_class);
  550. goto remove_major;
  551. }
  552. hl_debugfs_init();
  553. rc = pci_register_driver(&hl_pci_driver);
  554. if (rc) {
  555. pr_err("failed to register pci device\n");
  556. goto remove_debugfs;
  557. }
  558. pr_debug("driver loaded\n");
  559. return 0;
  560. remove_debugfs:
  561. hl_debugfs_fini();
  562. class_destroy(hl_class);
  563. remove_major:
  564. unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
  565. return rc;
  566. }
  567. /*
  568. * hl_exit - Release all resources of the habanalabs kernel driver
  569. */
  570. static void __exit hl_exit(void)
  571. {
  572. pci_unregister_driver(&hl_pci_driver);
  573. /*
  574. * Removing debugfs must be after all devices or simulator devices
  575. * have been removed because otherwise we get a bug in the
  576. * debugfs module for referencing NULL objects
  577. */
  578. hl_debugfs_fini();
  579. class_destroy(hl_class);
  580. unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
  581. idr_destroy(&hl_devs_idr);
  582. pr_debug("driver removed\n");
  583. }
  584. module_init(hl_init);
  585. module_exit(hl_exit);