/* siw_main.c */
  1. // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
  2. /* Authors: Bernard Metzler <[email protected]> */
  3. /* Copyright (c) 2008-2019, IBM Corporation */
  4. #include <linux/init.h>
  5. #include <linux/errno.h>
  6. #include <linux/netdevice.h>
  7. #include <linux/inetdevice.h>
  8. #include <net/net_namespace.h>
  9. #include <linux/rtnetlink.h>
  10. #include <linux/if_arp.h>
  11. #include <linux/list.h>
  12. #include <linux/kernel.h>
  13. #include <linux/sched.h>
  14. #include <linux/module.h>
  15. #include <linux/dma-mapping.h>
  16. #include <net/addrconf.h>
  17. #include <rdma/ib_verbs.h>
  18. #include <rdma/ib_user_verbs.h>
  19. #include <rdma/rdma_netlink.h>
  20. #include <linux/kthread.h>
  21. #include "siw.h"
  22. #include "siw_verbs.h"
  23. MODULE_AUTHOR("Bernard Metzler");
  24. MODULE_DESCRIPTION("Software iWARP Driver");
  25. MODULE_LICENSE("Dual BSD/GPL");
/* Module tunables (compile-time constants in this build). */

/* transmit from user buffer, if possible */
const bool zcopy_tx = true;

/* Restrict usage of GSO, if hardware peer iwarp is unable to process
 * large packets. try_gso = true lets siw try to use local GSO,
 * if peer agrees. Not using GSO severely limits siw maximum tx bandwidth.
 */
const bool try_gso;

/* Attach siw also with loopback devices */
const bool loopback_enabled = true;

/* We try to negotiate CRC on, if true */
const bool mpa_crc_required;

/* MPA CRC on/off enforced */
const bool mpa_crc_strict;

/* Control TCP_NODELAY socket option */
const bool siw_tcp_nagle;

/* Select MPA version to be used during connection setup */
u_char mpa_version = MPA_REVISION_2;

/* Selects MPA P2P mode (additional handshake during connection
 * setup, if true.
 */
const bool peer_to_peer;

/* Per-CPU TX kthreads; NULL entries mean no thread on that CPU. */
struct task_struct *siw_tx_thread[NR_CPUS];

/* Shared CRC32c transform, allocated once at module init (may stay NULL
 * if the algorithm is unavailable and CRC is not required).
 */
struct crypto_shash *siw_crypto_shash;
/*
 * Register a new siw device with the RDMA core under the given name.
 *
 * Assigns a unique (module-lifetime, monotonically increasing)
 * vendor_part_id before registration.
 *
 * Returns 0 on success or the negative error from ib_register_device().
 */
static int siw_device_register(struct siw_device *sdev, const char *name)
{
	struct ib_device *base_dev = &sdev->base_dev;
	/* static: counter persists across calls so each device gets a new id */
	static int dev_id = 1;
	int rv;

	sdev->vendor_part_id = dev_id++;

	rv = ib_register_device(base_dev, name, NULL);
	if (rv) {
		pr_warn("siw: device registration error %d\n", rv);
		return rv;
	}

	siw_dbg(base_dev, "HWaddr=%pM\n", sdev->raw_gid);
	return 0;
}
  63. static void siw_device_cleanup(struct ib_device *base_dev)
  64. {
  65. struct siw_device *sdev = to_siw_dev(base_dev);
  66. xa_destroy(&sdev->qp_xa);
  67. xa_destroy(&sdev->mem_xa);
  68. }
/*
 * Start one TX worker kthread per physical core; hyper-threaded sibling
 * CPUs are skipped. A thread that fails to start is silently skipped
 * (its slot in siw_tx_thread[] stays NULL).
 *
 * Returns the number of threads successfully started (may be 0).
 */
static int siw_create_tx_threads(void)
{
	int cpu, assigned = 0;

	for_each_online_cpu(cpu) {
		/* Skip HT cores: only the first CPU of each sibling group
		 * (cpu evenly divisible by the sibling-mask weight) gets
		 * a thread.
		 */
		if (cpu % cpumask_weight(topology_sibling_cpumask(cpu)))
			continue;

		/* The CPU number is smuggled through the data pointer. */
		siw_tx_thread[cpu] =
			kthread_run_on_cpu(siw_run_sq,
					   (unsigned long *)(long)cpu,
					   cpu, "siw_tx/%u");
		if (IS_ERR(siw_tx_thread[cpu])) {
			siw_tx_thread[cpu] = NULL;
			continue;
		}
		assigned++;
	}
	return assigned;
}
  88. static int siw_dev_qualified(struct net_device *netdev)
  89. {
  90. /*
  91. * Additional hardware support can be added here
  92. * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see
  93. * <linux/if_arp.h> for type identifiers.
  94. */
  95. if (netdev->type == ARPHRD_ETHER || netdev->type == ARPHRD_IEEE802 ||
  96. netdev->type == ARPHRD_NONE ||
  97. (netdev->type == ARPHRD_LOOPBACK && loopback_enabled))
  98. return 1;
  99. return 0;
  100. }
/* Per-CPU count of QPs whose TX work is assigned to that CPU. */
static DEFINE_PER_CPU(atomic_t, siw_use_cnt);

/* Per-NUMA-node masks of CPUs eligible for TX thread placement. */
static struct {
	struct cpumask **tx_valid_cpus; /* one mask per node, kzalloc'ed */
	int num_nodes;                  /* number of entries; 0 if unset */
} siw_cpu_info;
  106. static int siw_init_cpulist(void)
  107. {
  108. int i, num_nodes = nr_node_ids;
  109. memset(siw_tx_thread, 0, sizeof(siw_tx_thread));
  110. siw_cpu_info.num_nodes = num_nodes;
  111. siw_cpu_info.tx_valid_cpus =
  112. kcalloc(num_nodes, sizeof(struct cpumask *), GFP_KERNEL);
  113. if (!siw_cpu_info.tx_valid_cpus) {
  114. siw_cpu_info.num_nodes = 0;
  115. return -ENOMEM;
  116. }
  117. for (i = 0; i < siw_cpu_info.num_nodes; i++) {
  118. siw_cpu_info.tx_valid_cpus[i] =
  119. kzalloc(sizeof(struct cpumask), GFP_KERNEL);
  120. if (!siw_cpu_info.tx_valid_cpus[i])
  121. goto out_err;
  122. cpumask_clear(siw_cpu_info.tx_valid_cpus[i]);
  123. }
  124. for_each_possible_cpu(i)
  125. cpumask_set_cpu(i, siw_cpu_info.tx_valid_cpus[cpu_to_node(i)]);
  126. return 0;
  127. out_err:
  128. siw_cpu_info.num_nodes = 0;
  129. while (--i >= 0)
  130. kfree(siw_cpu_info.tx_valid_cpus[i]);
  131. kfree(siw_cpu_info.tx_valid_cpus);
  132. siw_cpu_info.tx_valid_cpus = NULL;
  133. return -ENOMEM;
  134. }
  135. static void siw_destroy_cpulist(void)
  136. {
  137. int i = 0;
  138. while (i < siw_cpu_info.num_nodes)
  139. kfree(siw_cpu_info.tx_valid_cpus[i++]);
  140. kfree(siw_cpu_info.tx_valid_cpus);
  141. }
/*
 * Choose CPU with least number of active QP's from NUMA node of
 * TX interface.
 *
 * Falls back to the set of all online CPUs if the device has no NUMA
 * node or the node has no CPUs. On success the chosen CPU's use count
 * is incremented (undone later via siw_put_tx_cpu()).
 *
 * Returns the selected CPU number, or -1 if no CPU with a running TX
 * thread could be found.
 */
int siw_get_tx_cpu(struct siw_device *sdev)
{
	const struct cpumask *tx_cpumask;
	int i, num_cpus, cpu, min_use, node = sdev->numa_node, tx_cpu = -1;

	if (node < 0)
		tx_cpumask = cpu_online_mask;
	else
		tx_cpumask = siw_cpu_info.tx_valid_cpus[node];

	num_cpus = cpumask_weight(tx_cpumask);
	if (!num_cpus) {
		/* no CPU on this NUMA node */
		tx_cpumask = cpu_online_mask;
		num_cpus = cpumask_weight(tx_cpumask);
	}
	if (!num_cpus)
		goto out;

	cpu = cpumask_first(tx_cpumask);

	/* Linear scan; "<=" keeps the last CPU among equally loaded ones. */
	for (i = 0, min_use = SIW_MAX_QP; i < num_cpus;
	     i++, cpu = cpumask_next(cpu, tx_cpumask)) {
		int usage;

		/* Skip any cores which have no TX thread */
		if (!siw_tx_thread[cpu])
			continue;

		usage = atomic_read(&per_cpu(siw_use_cnt, cpu));
		if (usage <= min_use) {
			tx_cpu = cpu;
			min_use = usage;
		}
	}
	siw_dbg(&sdev->base_dev,
		"tx cpu %d, node %d, %d qp's\n", tx_cpu, node, min_use);

out:
	if (tx_cpu >= 0)
		atomic_inc(&per_cpu(siw_use_cnt, tx_cpu));
	else
		pr_warn("siw: no tx cpu found\n");

	return tx_cpu;
}
/* Release a CPU previously claimed by siw_get_tx_cpu(). */
void siw_put_tx_cpu(int cpu)
{
	atomic_dec(&per_cpu(siw_use_cnt, cpu));
}
  188. static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id)
  189. {
  190. struct siw_qp *qp = siw_qp_id2obj(to_siw_dev(base_dev), id);
  191. if (qp) {
  192. /*
  193. * siw_qp_id2obj() increments object reference count
  194. */
  195. siw_qp_put(qp);
  196. return &qp->base_qp;
  197. }
  198. return NULL;
  199. }
/* Verb and iWARP-CM entry points exported to the RDMA core. */
static const struct ib_device_ops siw_device_ops = {
	.owner = THIS_MODULE,
	.uverbs_abi_ver = SIW_ABI_VERSION,
	.driver_id = RDMA_DRIVER_SIW,
	.alloc_mr = siw_alloc_mr,
	.alloc_pd = siw_alloc_pd,
	.alloc_ucontext = siw_alloc_ucontext,
	.create_cq = siw_create_cq,
	.create_qp = siw_create_qp,
	.create_srq = siw_create_srq,
	.dealloc_driver = siw_device_cleanup,
	.dealloc_pd = siw_dealloc_pd,
	.dealloc_ucontext = siw_dealloc_ucontext,
	.dereg_mr = siw_dereg_mr,
	.destroy_cq = siw_destroy_cq,
	.destroy_qp = siw_destroy_qp,
	.destroy_srq = siw_destroy_srq,
	.get_dma_mr = siw_get_dma_mr,
	.get_port_immutable = siw_get_port_immutable,
	.iw_accept = siw_accept,
	.iw_add_ref = siw_qp_get_ref,
	.iw_connect = siw_connect,
	.iw_create_listen = siw_create_listen,
	.iw_destroy_listen = siw_destroy_listen,
	.iw_get_qp = siw_get_base_qp,
	.iw_reject = siw_reject,
	.iw_rem_ref = siw_qp_put_ref,
	.map_mr_sg = siw_map_mr_sg,
	.mmap = siw_mmap,
	.mmap_free = siw_mmap_free,
	.modify_qp = siw_verbs_modify_qp,
	.modify_srq = siw_modify_srq,
	.poll_cq = siw_poll_cq,
	.post_recv = siw_post_receive,
	.post_send = siw_post_send,
	.post_srq_recv = siw_post_srq_recv,
	.query_device = siw_query_device,
	.query_gid = siw_query_gid,
	.query_port = siw_query_port,
	.query_qp = siw_query_qp,
	.query_srq = siw_query_srq,
	.req_notify_cq = siw_req_notify_cq,
	.reg_user_mr = siw_reg_user_mr,

	/* Tell the core how much to allocate for each wrapped object. */
	INIT_RDMA_OBJ_SIZE(ib_cq, siw_cq, base_cq),
	INIT_RDMA_OBJ_SIZE(ib_pd, siw_pd, base_pd),
	INIT_RDMA_OBJ_SIZE(ib_qp, siw_qp, base_qp),
	INIT_RDMA_OBJ_SIZE(ib_srq, siw_srq, base_srq),
	INIT_RDMA_OBJ_SIZE(ib_ucontext, siw_ucontext, base_ucontext),
};
/*
 * Allocate and initialize a siw device bound to the given net_device.
 * The device is fully set up but NOT yet registered with the RDMA core
 * (see siw_device_register()).
 *
 * Returns the new device, or NULL on allocation/setup failure.
 */
static struct siw_device *siw_device_create(struct net_device *netdev)
{
	struct siw_device *sdev = NULL;
	struct ib_device *base_dev;
	int rv;

	sdev = ib_alloc_device(siw_device, base_dev);
	if (!sdev)
		return NULL;

	base_dev = &sdev->base_dev;
	sdev->netdev = netdev;

	if (netdev->addr_len) {
		memcpy(sdev->raw_gid, netdev->dev_addr,
		       min_t(unsigned int, netdev->addr_len, ETH_ALEN));
	} else {
		/*
		 * This device does not have a HW address, but
		 * connection management requires a unique gid.
		 */
		eth_random_addr(sdev->raw_gid);
	}
	/* Derive the 64-bit node GUID from the (possibly random) MAC. */
	addrconf_addr_eui48((u8 *)&base_dev->node_guid, sdev->raw_gid);

	base_dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND);

	base_dev->node_type = RDMA_NODE_RNIC;
	memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON,
	       sizeof(SIW_NODE_DESC_COMMON));

	/*
	 * Current model (one-to-one device association):
	 * One Softiwarp device per net_device or, equivalently,
	 * per physical port.
	 */
	base_dev->phys_port_cnt = 1;
	base_dev->num_comp_vectors = num_possible_cpus();

	xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1);
	xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1);

	ib_set_device_ops(base_dev, &siw_device_ops);
	rv = ib_device_set_netdev(base_dev, netdev, 1);
	if (rv)
		goto error;

	memcpy(base_dev->iw_ifname, netdev->name,
	       sizeof(base_dev->iw_ifname));

	/* Disable TCP port mapping */
	base_dev->iw_driver_flags = IW_F_NO_PORT_MAP;

	/* Advertised device limits (compile-time siw maxima). */
	sdev->attrs.max_qp = SIW_MAX_QP;
	sdev->attrs.max_qp_wr = SIW_MAX_QP_WR;
	sdev->attrs.max_ord = SIW_MAX_ORD_QP;
	sdev->attrs.max_ird = SIW_MAX_IRD_QP;
	sdev->attrs.max_sge = SIW_MAX_SGE;
	sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD;
	sdev->attrs.max_cq = SIW_MAX_CQ;
	sdev->attrs.max_cqe = SIW_MAX_CQE;
	sdev->attrs.max_mr = SIW_MAX_MR;
	sdev->attrs.max_pd = SIW_MAX_PD;
	sdev->attrs.max_mw = SIW_MAX_MW;
	sdev->attrs.max_srq = SIW_MAX_SRQ;
	sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR;
	sdev->attrs.max_srq_sge = SIW_MAX_SGE;

	INIT_LIST_HEAD(&sdev->cep_list);
	INIT_LIST_HEAD(&sdev->qp_list);

	atomic_set(&sdev->num_ctx, 0);
	atomic_set(&sdev->num_srq, 0);
	atomic_set(&sdev->num_qp, 0);
	atomic_set(&sdev->num_cq, 0);
	atomic_set(&sdev->num_mr, 0);
	atomic_set(&sdev->num_pd, 0);

	/* Remember the NUMA node for TX thread placement. */
	sdev->numa_node = dev_to_node(&netdev->dev);
	spin_lock_init(&sdev->lock);

	return sdev;

error:
	ib_dealloc_device(base_dev);

	return NULL;
}
/*
 * Network link becomes unavailable. Mark all
 * affected QP's accordingly.
 *
 * Runs as deferred work (scheduled by siw_device_goes_down()) and
 * drops the device reference taken there when done.
 */
static void siw_netdev_down(struct work_struct *work)
{
	struct siw_device *sdev =
		container_of(work, struct siw_device, netdev_down);

	struct siw_qp_attrs qp_attrs;
	struct list_head *pos, *tmp;

	memset(&qp_attrs, 0, sizeof(qp_attrs));
	qp_attrs.state = SIW_QP_STATE_ERROR;

	/* _safe variant: moving a QP to ERROR may unlink it from qp_list */
	list_for_each_safe(pos, tmp, &sdev->qp_list) {
		struct siw_qp *qp = list_entry(pos, struct siw_qp, devq);

		down_write(&qp->state_lock);
		WARN_ON(siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE));
		up_write(&qp->state_lock);
	}
	ib_device_put(&sdev->base_dev);
}
/*
 * Schedule deferred link-down handling. The device reference taken
 * here keeps sdev alive until siw_netdev_down() releases it; if the
 * device is already being unregistered, nothing is scheduled.
 */
static void siw_device_goes_down(struct siw_device *sdev)
{
	if (ib_device_try_get(&sdev->base_dev)) {
		INIT_WORK(&sdev->netdev_down, siw_netdev_down);
		schedule_work(&sdev->netdev_down);
	}
}
/*
 * Netdevice notifier callback: propagate link state and lifecycle
 * events of the attached net_device to the corresponding siw device.
 * Events for net_devices without a siw device are ignored.
 *
 * Always returns NOTIFY_OK.
 */
static int siw_netdev_event(struct notifier_block *nb, unsigned long event,
			    void *arg)
{
	struct net_device *netdev = netdev_notifier_info_to_dev(arg);
	struct ib_device *base_dev;
	struct siw_device *sdev;

	dev_dbg(&netdev->dev, "siw: event %lu\n", event);

	/* Takes a device reference, released below. */
	base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW);
	if (!base_dev)
		return NOTIFY_OK;

	sdev = to_siw_dev(base_dev);

	switch (event) {
	case NETDEV_UP:
		sdev->state = IB_PORT_ACTIVE;
		siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE);
		break;

	case NETDEV_GOING_DOWN:
		siw_device_goes_down(sdev);
		break;

	case NETDEV_DOWN:
		sdev->state = IB_PORT_DOWN;
		siw_port_event(sdev, 1, IB_EVENT_PORT_ERR);
		break;

	case NETDEV_REGISTER:
		/*
		 * Device registration now handled only by
		 * rdma netlink commands. So it shall be impossible
		 * to end up here with a valid siw device.
		 */
		siw_dbg(base_dev, "unexpected NETDEV_REGISTER event\n");
		break;

	case NETDEV_UNREGISTER:
		ib_unregister_device_queued(&sdev->base_dev);
		break;

	case NETDEV_CHANGEADDR:
		siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE);
		break;
	/*
	 * Todo: Below netdev events are currently not handled.
	 */
	case NETDEV_CHANGEMTU:
	case NETDEV_CHANGE:
		break;

	default:
		break;
	}
	ib_device_put(&sdev->base_dev);

	return NOTIFY_OK;
}
/* Notifier block registered with the netdev subsystem at module init. */
static struct notifier_block siw_netdev_nb = {
	.notifier_call = siw_netdev_event,
};
  399. static int siw_newlink(const char *basedev_name, struct net_device *netdev)
  400. {
  401. struct ib_device *base_dev;
  402. struct siw_device *sdev = NULL;
  403. int rv = -ENOMEM;
  404. if (!siw_dev_qualified(netdev))
  405. return -EINVAL;
  406. base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW);
  407. if (base_dev) {
  408. ib_device_put(base_dev);
  409. return -EEXIST;
  410. }
  411. sdev = siw_device_create(netdev);
  412. if (sdev) {
  413. dev_dbg(&netdev->dev, "siw: new device\n");
  414. if (netif_running(netdev) && netif_carrier_ok(netdev))
  415. sdev->state = IB_PORT_ACTIVE;
  416. else
  417. sdev->state = IB_PORT_DOWN;
  418. rv = siw_device_register(sdev, basedev_name);
  419. if (rv)
  420. ib_dealloc_device(&sdev->base_dev);
  421. }
  422. return rv;
  423. }
/* Hook "rdma link add ... type siw" netlink commands to siw_newlink(). */
static struct rdma_link_ops siw_link_ops = {
	.type = "siw",
	.newlink = siw_newlink,
};
/*
 * siw_init_module - Initialize Softiwarp module and register with netdev
 * subsystem.
 *
 * Setup order: sanity check, CPU lists, CM state, TX threads, CRC32c
 * transform, netdev notifier, rdma link ops. On any failure the error
 * path unwinds everything started so far (stopping TX threads, freeing
 * the shash, CM state and CPU lists) and returns the error.
 */
static __init int siw_init_module(void)
{
	int rv;
	int nr_cpu;

	if (SENDPAGE_THRESH < SIW_MAX_INLINE) {
		pr_info("siw: sendpage threshold too small: %u\n",
			(int)SENDPAGE_THRESH);
		rv = -EINVAL;
		goto out_error;
	}
	rv = siw_init_cpulist();
	if (rv)
		goto out_error;

	rv = siw_cm_init();
	if (rv)
		goto out_error;

	if (!siw_create_tx_threads()) {
		pr_info("siw: Could not start any TX thread\n");
		rv = -ENOMEM;
		goto out_error;
	}
	/*
	 * Locate CRC32 algorithm. If unsuccessful, fail
	 * loading siw only, if CRC is required.
	 */
	siw_crypto_shash = crypto_alloc_shash("crc32c", 0, 0);
	if (IS_ERR(siw_crypto_shash)) {
		pr_info("siw: Loading CRC32c failed: %ld\n",
			PTR_ERR(siw_crypto_shash));
		siw_crypto_shash = NULL;
		if (mpa_crc_required) {
			rv = -EOPNOTSUPP;
			goto out_error;
		}
	}
	rv = register_netdevice_notifier(&siw_netdev_nb);
	if (rv)
		goto out_error;

	rdma_link_register(&siw_link_ops);

	pr_info("SoftiWARP attached\n");
	return 0;

out_error:
	/* Stop any TX threads that were already started. */
	for (nr_cpu = 0; nr_cpu < nr_cpu_ids; nr_cpu++) {
		if (siw_tx_thread[nr_cpu]) {
			siw_stop_tx_thread(nr_cpu);
			siw_tx_thread[nr_cpu] = NULL;
		}
	}
	if (siw_crypto_shash)
		crypto_free_shash(siw_crypto_shash);

	pr_info("SoftIWARP attach failed. Error: %d\n", rv);

	/* Both are safe to call even if their init never ran. */
	siw_cm_exit();
	siw_destroy_cpulist();

	return rv;
}
/*
 * Module unload: stop all TX threads, detach from netdev and rdma
 * netlink, unregister all siw devices, then tear down CM state, the
 * CPU lists and the CRC32c transform.
 */
static void __exit siw_exit_module(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		if (siw_tx_thread[cpu]) {
			siw_stop_tx_thread(cpu);
			siw_tx_thread[cpu] = NULL;
		}
	}
	unregister_netdevice_notifier(&siw_netdev_nb);
	rdma_link_unregister(&siw_link_ops);
	/* Unregisters every device created by this driver. */
	ib_unregister_driver(RDMA_DRIVER_SIW);

	siw_cm_exit();

	siw_destroy_cpulist();

	if (siw_crypto_shash)
		crypto_free_shash(siw_crypto_shash);

	pr_info("SoftiWARP detached\n");
}
module_init(siw_init_module);
module_exit(siw_exit_module);

/* Allow auto-loading via "rdma link add ... type siw". */
MODULE_ALIAS_RDMA_LINK("siw");