  1. // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
  2. /*
  3. * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
  4. */
  5. #include "cmd.h"
/* Status codes returned by the page-tracker CQ polling helpers. */
enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };

/* Forward declarations for helpers defined later in this file */
static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
				  u16 *vhca_id);
static void
_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);
  11. int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
  12. {
  13. u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
  14. u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
  15. lockdep_assert_held(&mvdev->state_mutex);
  16. if (mvdev->mdev_detach)
  17. return -ENOTCONN;
  18. MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
  19. MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
  20. MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);
  21. return mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
  22. }
  23. int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
  24. {
  25. u32 out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};
  26. u32 in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};
  27. lockdep_assert_held(&mvdev->state_mutex);
  28. if (mvdev->mdev_detach)
  29. return -ENOTCONN;
  30. MLX5_SET(resume_vhca_in, in, opcode, MLX5_CMD_OP_RESUME_VHCA);
  31. MLX5_SET(resume_vhca_in, in, vhca_id, mvdev->vhca_id);
  32. MLX5_SET(resume_vhca_in, in, op_mod, op_mod);
  33. return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, in, out);
  34. }
  35. int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
  36. size_t *state_size)
  37. {
  38. u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
  39. u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
  40. int ret;
  41. lockdep_assert_held(&mvdev->state_mutex);
  42. if (mvdev->mdev_detach)
  43. return -ENOTCONN;
  44. MLX5_SET(query_vhca_migration_state_in, in, opcode,
  45. MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
  46. MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
  47. MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
  48. ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
  49. out);
  50. if (ret)
  51. return ret;
  52. *state_size = MLX5_GET(query_vhca_migration_state_out, out,
  53. required_umem_size);
  54. return 0;
  55. }
/*
 * Flag the dirty-page tracker as failed and wake any waiter blocked on
 * tracker_comp so it can observe the error and bail out.
 */
static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
{
	/* Mark the tracker under an error and wake it up if it's running */
	mvdev->tracker.is_err = true;
	complete(&mvdev->tracker_comp);
}
  62. static int mlx5fv_vf_event(struct notifier_block *nb,
  63. unsigned long event, void *data)
  64. {
  65. struct mlx5vf_pci_core_device *mvdev =
  66. container_of(nb, struct mlx5vf_pci_core_device, nb);
  67. switch (event) {
  68. case MLX5_PF_NOTIFY_ENABLE_VF:
  69. mutex_lock(&mvdev->state_mutex);
  70. mvdev->mdev_detach = false;
  71. mlx5vf_state_mutex_unlock(mvdev);
  72. break;
  73. case MLX5_PF_NOTIFY_DISABLE_VF:
  74. mlx5vf_cmd_close_migratable(mvdev);
  75. mutex_lock(&mvdev->state_mutex);
  76. mvdev->mdev_detach = true;
  77. mlx5vf_state_mutex_unlock(mvdev);
  78. break;
  79. default:
  80. break;
  81. }
  82. return 0;
  83. }
/*
 * Tear down active migration state (open migration FDs and page-tracker
 * resources) for a migration-capable device. No-op otherwise.
 */
void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
{
	if (!mvdev->migrate_cap)
		return;

	/* Must be done outside the lock to let it progress */
	set_tracker_error(mvdev);
	mutex_lock(&mvdev->state_mutex);
	mlx5vf_disable_fds(mvdev);
	_mlx5vf_free_page_tracker_resources(mvdev);
	mlx5vf_state_mutex_unlock(mvdev);
}
/*
 * Undo mlx5vf_cmd_set_migratable(): unregister the SR-IOV notifier and
 * destroy the async-callback workqueue. No-op for non-migratable devices.
 */
void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev)
{
	if (!mvdev->migrate_cap)
		return;

	mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id,
						&mvdev->nb);
	destroy_workqueue(mvdev->cb_wq);
}
/*
 * Probe and enable live-migration support on an mlx5 VF. On success the
 * device is marked migrate_cap and the VFIO migration ops (and, when the
 * adv_virtualization capability is present, the dirty-log ops) are wired
 * up. On any failure the device is silently left non-migratable.
 */
void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
			       const struct vfio_migration_ops *mig_ops,
			       const struct vfio_log_ops *log_ops)
{
	struct pci_dev *pdev = mvdev->core_device.pdev;
	int ret;

	if (!pdev->is_virtfn)
		return;

	mvdev->mdev = mlx5_vf_get_core_dev(pdev);
	if (!mvdev->mdev)
		return;

	if (!MLX5_CAP_GEN(mvdev->mdev, migration))
		goto end;

	mvdev->vf_id = pci_iov_vf_id(pdev);
	if (mvdev->vf_id < 0)
		goto end;

	/* vf_id + 1: FW function-id of the VF (per mlx5 convention) */
	if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1,
				   &mvdev->vhca_id))
		goto end;

	/* Ordered workqueue serializes the async save-completion cleanups */
	mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0);
	if (!mvdev->cb_wq)
		goto end;

	mutex_init(&mvdev->state_mutex);
	spin_lock_init(&mvdev->reset_lock);
	mvdev->nb.notifier_call = mlx5fv_vf_event;
	ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id,
						    &mvdev->nb);
	if (ret) {
		destroy_workqueue(mvdev->cb_wq);
		goto end;
	}

	mvdev->migrate_cap = 1;
	mvdev->core_device.vdev.migration_flags =
		VFIO_MIGRATION_STOP_COPY |
		VFIO_MIGRATION_P2P;
	mvdev->core_device.vdev.mig_ops = mig_ops;
	init_completion(&mvdev->tracker_comp);
	if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
		mvdev->core_device.vdev.log_ops = log_ops;

end:
	/* Drop the reference taken by mlx5_vf_get_core_dev() on every path */
	mlx5_vf_put_core_dev(mvdev->mdev);
}
  145. static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
  146. u16 *vhca_id)
  147. {
  148. u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
  149. int out_size;
  150. void *out;
  151. int ret;
  152. out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out);
  153. out = kzalloc(out_size, GFP_KERNEL);
  154. if (!out)
  155. return -ENOMEM;
  156. MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
  157. MLX5_SET(query_hca_cap_in, in, other_function, 1);
  158. MLX5_SET(query_hca_cap_in, in, function_id, function_id);
  159. MLX5_SET(query_hca_cap_in, in, op_mod,
  160. MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 |
  161. HCA_CAP_OPMOD_GET_CUR);
  162. ret = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
  163. if (ret)
  164. goto err_exec;
  165. *vhca_id = MLX5_GET(query_hca_cap_out, out,
  166. capability.cmd_hca_cap.vhca_id);
  167. err_exec:
  168. kfree(out);
  169. return ret;
  170. }
/*
 * Create an MTT-based mkey covering either a migration file's DMA-mapped
 * sg table (@migf != NULL) or a page-tracker receive buffer (@recv_buf).
 * Exactly one source is used. On success the new mkey id is stored in
 * *@mkey. Returns 0 or a negative errno.
 */
static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
			struct mlx5_vf_migration_file *migf,
			struct mlx5_vhca_recv_buf *recv_buf,
			u32 *mkey)
{
	/* Number of PAGE_SIZE translation entries needed for the region */
	size_t npages = migf ? DIV_ROUND_UP(migf->total_length, PAGE_SIZE) :
				recv_buf->npages;
	int err = 0, inlen;
	__be64 *mtt;
	void *mkc;
	u32 *in;

	/* MTT entries are consumed in pairs (octwords) — round up to 2 */
	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
		sizeof(*mtt) * round_up(npages, 2);
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
		 DIV_ROUND_UP(npages, 2));
	mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);

	if (migf) {
		struct sg_dma_page_iter dma_iter;

		/* One MTT entry per DMA-mapped page of the sg table */
		for_each_sgtable_dma_page(&migf->table.sgt, &dma_iter, 0)
			*mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
	} else {
		int i;

		/* Receive buffer pages were DMA-mapped by the caller */
		for (i = 0; i < npages; i++)
			*mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]);
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
	/* Local + remote read/write access */
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, lw, 1);
	MLX5_SET(mkc, mkc, rr, 1);
	MLX5_SET(mkc, mkc, rw, 1);
	MLX5_SET(mkc, mkc, pd, pdn);
	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
	MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
	MLX5_SET64(mkc, mkc, len,
		   migf ? migf->total_length : (npages * PAGE_SIZE));
	err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
	kvfree(in);
	return err;
}
/*
 * Deferred cleanup for an async SAVE_VHCA_STATE command. Runs on the
 * ordered cb_wq in process context because mkey/PD teardown and fput()
 * can't run from the command-completion (interrupt) context — see
 * mlx5vf_save_callback().
 */
void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
{
	struct mlx5vf_async_data *async_data = container_of(_work,
		struct mlx5vf_async_data, work);
	struct mlx5_vf_migration_file *migf = container_of(async_data,
		struct mlx5_vf_migration_file, async_data);
	struct mlx5_core_dev *mdev = migf->mvdev->mdev;

	mutex_lock(&migf->lock);
	if (async_data->status) {
		/* Propagate the FW failure to readers/pollers of the file */
		migf->is_err = true;
		wake_up_interruptible(&migf->poll_wait);
	}
	mutex_unlock(&migf->lock);

	/* Undo the resources set up by mlx5vf_cmd_save_vhca_state() */
	mlx5_core_destroy_mkey(mdev, async_data->mkey);
	dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0);
	mlx5_core_dealloc_pd(mdev, async_data->pdn);
	kvfree(async_data->out);
	/* Drop the reference taken before issuing the async command */
	fput(migf->filp);
}
/*
 * Completion callback of the async SAVE_VHCA_STATE command. On success,
 * publishes the actual image size to total_length and wakes pollers.
 * Resource teardown is always deferred to mlx5vf_mig_file_cleanup_cb()
 * on the workqueue, regardless of status.
 */
static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
{
	struct mlx5vf_async_data *async_data = container_of(context,
		struct mlx5vf_async_data, cb_work);
	struct mlx5_vf_migration_file *migf = container_of(async_data,
		struct mlx5_vf_migration_file, async_data);

	if (!status) {
		/* WRITE_ONCE pairs with lockless readers of total_length */
		WRITE_ONCE(migf->total_length,
			   MLX5_GET(save_vhca_state_out, async_data->out,
				    actual_image_size));
		wake_up_interruptible(&migf->poll_wait);
	}

	/*
	 * The error and the cleanup flows can't run from an
	 * interrupt context
	 */
	async_data->status = status;
	queue_work(migf->mvdev->cb_wq, &async_data->work);
}
/*
 * Kick off an asynchronous SAVE_VHCA_STATE into @migf's sg table.
 * Allocates a PD, DMA-maps the table, creates an mkey over it and issues
 * the command via mlx5_cmd_exec_cb(); completion is handled by
 * mlx5vf_save_callback() / mlx5vf_mig_file_cleanup_cb(), which own the
 * teardown on success. On failure everything is unwound here.
 * Must be called with state_mutex held.
 */
int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
			       struct mlx5_vf_migration_file *migf)
{
	u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
	u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
	struct mlx5vf_async_data *async_data;
	struct mlx5_core_dev *mdev;
	u32 pdn, mkey;
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	mdev = mvdev->mdev;
	err = mlx5_core_alloc_pd(mdev, &pdn);
	if (err)
		return err;

	err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE,
			      0);
	if (err)
		goto err_dma_map;

	err = _create_mkey(mdev, pdn, migf, NULL, &mkey);
	if (err)
		goto err_create_mkey;

	MLX5_SET(save_vhca_state_in, in, opcode,
		 MLX5_CMD_OP_SAVE_VHCA_STATE);
	MLX5_SET(save_vhca_state_in, in, op_mod, 0);
	MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(save_vhca_state_in, in, mkey, mkey);
	MLX5_SET(save_vhca_state_in, in, size, migf->total_length);

	async_data = &migf->async_data;
	async_data->out = kvzalloc(out_size, GFP_KERNEL);
	if (!async_data->out) {
		err = -ENOMEM;
		goto err_out;
	}

	/* no data exists till the callback comes back */
	migf->total_length = 0;
	/* Hold the file while the async command is in flight */
	get_file(migf->filp);
	async_data->mkey = mkey;
	async_data->pdn = pdn;
	err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
			       async_data->out,
			       out_size, mlx5vf_save_callback,
			       &async_data->cb_work);
	if (err)
		goto err_exec;

	return 0;

err_exec:
	fput(migf->filp);
	kvfree(async_data->out);
err_out:
	mlx5_core_destroy_mkey(mdev, mkey);
err_create_mkey:
	dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0);
err_dma_map:
	mlx5_core_dealloc_pd(mdev, pdn);
	return err;
}
  312. int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
  313. struct mlx5_vf_migration_file *migf)
  314. {
  315. struct mlx5_core_dev *mdev;
  316. u32 out[MLX5_ST_SZ_DW(save_vhca_state_out)] = {};
  317. u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
  318. u32 pdn, mkey;
  319. int err;
  320. lockdep_assert_held(&mvdev->state_mutex);
  321. if (mvdev->mdev_detach)
  322. return -ENOTCONN;
  323. mutex_lock(&migf->lock);
  324. if (!migf->total_length) {
  325. err = -EINVAL;
  326. goto end;
  327. }
  328. mdev = mvdev->mdev;
  329. err = mlx5_core_alloc_pd(mdev, &pdn);
  330. if (err)
  331. goto end;
  332. err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0);
  333. if (err)
  334. goto err_reg;
  335. err = _create_mkey(mdev, pdn, migf, NULL, &mkey);
  336. if (err)
  337. goto err_mkey;
  338. MLX5_SET(load_vhca_state_in, in, opcode,
  339. MLX5_CMD_OP_LOAD_VHCA_STATE);
  340. MLX5_SET(load_vhca_state_in, in, op_mod, 0);
  341. MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
  342. MLX5_SET(load_vhca_state_in, in, mkey, mkey);
  343. MLX5_SET(load_vhca_state_in, in, size, migf->total_length);
  344. err = mlx5_cmd_exec_inout(mdev, load_vhca_state, in, out);
  345. mlx5_core_destroy_mkey(mdev, mkey);
  346. err_mkey:
  347. dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0);
  348. err_reg:
  349. mlx5_core_dealloc_pd(mdev, pdn);
  350. end:
  351. mutex_unlock(&migf->lock);
  352. return err;
  353. }
  354. static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes,
  355. u32 req_nodes)
  356. {
  357. struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
  358. unsigned long min_gap;
  359. unsigned long curr_gap;
  360. /* Special shortcut when a single range is required */
  361. if (req_nodes == 1) {
  362. unsigned long last;
  363. curr = comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
  364. while (curr) {
  365. last = curr->last;
  366. prev = curr;
  367. curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
  368. if (prev != comb_start)
  369. interval_tree_remove(prev, root);
  370. }
  371. comb_start->last = last;
  372. return;
  373. }
  374. /* Combine ranges which have the smallest gap */
  375. while (cur_nodes > req_nodes) {
  376. prev = NULL;
  377. min_gap = ULONG_MAX;
  378. curr = interval_tree_iter_first(root, 0, ULONG_MAX);
  379. while (curr) {
  380. if (prev) {
  381. curr_gap = curr->start - prev->last;
  382. if (curr_gap < min_gap) {
  383. min_gap = curr_gap;
  384. comb_start = prev;
  385. comb_end = curr;
  386. }
  387. }
  388. prev = curr;
  389. curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
  390. }
  391. comb_start->last = comb_end->last;
  392. interval_tree_remove(comb_end, root);
  393. cur_nodes--;
  394. }
  395. }
/*
 * Create the FW page-tracker object over the given IOVA @ranges.
 * If there are more ranges than the device supports they are first
 * merged down to the cap. Validates that the total tracked address
 * space fits the device's min/max limits. On success stores the new
 * object id in tracker->id.
 */
static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
				 struct mlx5vf_pci_core_device *mvdev,
				 struct rb_root_cached *ranges, u32 nnodes)
{
	int max_num_range =
		MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	int record_size = MLX5_ST_SZ_BYTES(page_track_range);
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	struct interval_tree_node *node = NULL;
	u64 total_ranges_len = 0;
	u32 num_ranges = nnodes;
	u8 log_addr_space_size;
	void *range_list_ptr;
	void *obj_context;
	void *cmd_hdr;
	int inlen;
	void *in;
	int err;
	int i;

	/* Merge ranges down to the device's supported count if needed */
	if (num_ranges > max_num_range) {
		combine_ranges(ranges, nnodes, max_num_range);
		num_ranges = max_num_range;
	}

	/* Command buffer: fixed header + one range record per range */
	inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) +
		record_size * num_ranges;
	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in,
			       general_obj_in_cmd_hdr);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,
		 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,
		 MLX5_OBJ_TYPE_PAGE_TRACK);
	obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context);
	MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id);
	MLX5_SET(page_track, obj_context, track_type, 1);
	MLX5_SET(page_track, obj_context, log_page_size,
		 ilog2(tracker->host_qp->tracked_page_size));
	MLX5_SET(page_track, obj_context, log_msg_size,
		 ilog2(tracker->host_qp->max_msg_size));
	/* Reports are delivered through the FW-side QP */
	MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn);
	MLX5_SET(page_track, obj_context, num_ranges, num_ranges);

	/* Serialize the (start, length) records from the interval tree */
	range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range);
	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
	for (i = 0; i < num_ranges; i++) {
		void *addr_range_i_base = range_list_ptr + record_size * i;
		unsigned long length = node->last - node->start;

		MLX5_SET64(page_track_range, addr_range_i_base, start_address,
			   node->start);
		MLX5_SET64(page_track_range, addr_range_i_base, length, length);
		total_ranges_len += length;
		node = interval_tree_iter_next(node, 0, ULONG_MAX);
	}
	/* The tree must have held exactly num_ranges nodes at this point */
	WARN_ON(node);

	log_addr_space_size = ilog2(total_ranges_len);
	if (log_addr_space_size <
	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
	    log_addr_space_size >
	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
		err = -EOPNOTSUPP;
		goto out;
	}

	MLX5_SET(page_track, obj_context, log_addr_space_size,
		 log_addr_space_size);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	if (err)
		goto out;

	tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
out:
	kfree(in);
	return err;
}
  470. static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev,
  471. u32 tracker_id)
  472. {
  473. u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
  474. u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
  475. MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
  476. MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
  477. MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id);
  478. return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
  479. }
/*
 * Modify a FW page-tracker object: set the reported range (@iova,
 * @length) and the tracker @tracker_state.
 * NOTE(review): modify_field_select 0x3 presumably selects the two
 * modified field groups (range and state) — verify against the PRM.
 */
static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
				     u32 tracker_id, unsigned long iova,
				     unsigned long length, u32 tracker_state)
{
	u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {};
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	void *obj_context;
	void *cmd_hdr;

	cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker_id);

	obj_context = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context);
	MLX5_SET64(page_track, obj_context, modify_field_select, 0x3);
	MLX5_SET64(page_track, obj_context, range_start_address, iova);
	MLX5_SET64(page_track, obj_context, length, length);
	MLX5_SET(page_track, obj_context, state, tracker_state);

	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
}
/*
 * Allocate a fragmented buffer sized for @nent CQEs of @cqe_size bytes
 * and initialize its frag-buffer control (fbc) over it.
 */
static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
			     struct mlx5_vhca_cq_buf *buf, int nent,
			     int cqe_size)
{
	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
	/* Stride is log2 of the CQE size: 64B -> 6, 128B -> 7 */
	u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0);
	/*
	 * NOTE(review): log_wq_sz derives from cqe_size, not nent, so the
	 * fbc's element count doesn't match the nent*cqe_size allocation
	 * below — confirm against mlx5_init_fbc()'s use of log_sz.
	 */
	u8 log_wq_sz = ilog2(cqe_size);
	int err;

	err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf,
				       mdev->priv.numa_node);
	if (err)
		return err;

	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
	buf->cqe_size = cqe_size;
	buf->nent = nent;
	return 0;
}
  516. static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf)
  517. {
  518. struct mlx5_cqe64 *cqe64;
  519. void *cqe;
  520. int i;
  521. for (i = 0; i < buf->nent; i++) {
  522. cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i);
  523. cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64;
  524. cqe64->op_own = MLX5_CQE_INVALID << 4;
  525. }
  526. }
/*
 * Destroy a tracker CQ: FW object first, then its buffer and doorbell
 * (reverse of the creation order in mlx5vf_create_cq()).
 */
static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev,
			      struct mlx5_vhca_cq *cq)
{
	mlx5_core_destroy_cq(mdev, &cq->mcq);
	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
	mlx5_db_free(mdev, &cq->db);
}
/* CQ async-event hook: a CQ error marks the whole tracker as failed. */
static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
{
	if (type != MLX5_EVENT_TYPE_CQ_ERROR)
		return;

	/* Recover the owning device from the embedded mcq */
	set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device,
				       tracker.cq.mcq));
}
/*
 * Async-event notifier for the tracker: any WQ error event that targets
 * either the host QP or the FW QP marks the tracker as failed. Other
 * events are ignored.
 */
static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
				 void *data)
{
	struct mlx5_vhca_page_tracker *tracker =
		mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
	struct mlx5vf_pci_core_device *mvdev = container_of(
		tracker, struct mlx5vf_pci_core_device, tracker);
	struct mlx5_eqe *eqe = data;
	u8 event_type = (u8)type;
	u8 queue_type;
	int qp_num;

	switch (event_type) {
	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
		queue_type = eqe->data.qp_srq.type;
		if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP)
			break;
		/* QP number occupies the low 24 bits of qp_srq_n */
		qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
		if (qp_num != tracker->host_qp->qpn &&
		    qp_num != tracker->fw_qp->qpn)
			break;
		set_tracker_error(mvdev);
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}
/*
 * CQ completion hook: wake the tracker waiter blocked on tracker_comp
 * so it polls the CQ for new dirty-page reports.
 */
static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq,
			       struct mlx5_eqe *eqe)
{
	struct mlx5vf_pci_core_device *mvdev =
		container_of(mcq, struct mlx5vf_pci_core_device,
			     tracker.cq.mcq);

	complete(&mvdev->tracker_comp);
}
/*
 * Create the tracker completion queue with (at least) @ncqe entries,
 * wire up its completion/event handlers and arm it. On failure all
 * partially created resources are unwound. Returns 0 or negative errno.
 */
static int mlx5vf_create_cq(struct mlx5_core_dev *mdev,
			    struct mlx5_vhca_page_tracker *tracker,
			    size_t ncqe)
{
	/* Use 128B CQEs on 128B cache-line systems to avoid line sharing */
	int cqe_size = cache_line_size() == 128 ? 128 : 64;
	/*
	 * NOTE(review): out[] is not zero-initialized, unlike the other
	 * commands in this file — confirm mlx5_core_create_cq() fully
	 * writes it.
	 */
	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
	struct mlx5_vhca_cq *cq;
	int inlen, err, eqn;
	void *cqc, *in;
	__be64 *pas;
	int vector;

	cq = &tracker->cq;
	/* HW requires a power-of-two CQ size */
	ncqe = roundup_pow_of_two(ncqe);
	err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node);
	if (err)
		return err;

	cq->ncqe = ncqe;
	cq->mcq.set_ci_db = cq->db.db;
	cq->mcq.arm_db = cq->db.db + 1;
	cq->mcq.cqe_sz = cqe_size;
	err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size);
	if (err)
		goto err_db_free;

	init_cq_frag_buf(&cq->buf);

	/* Command buffer: fixed part + one PAS entry per buffer page */
	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
		cq->buf.frag_buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_buff;
	}

	/* Spread CQs across completion vectors by current CPU */
	vector = raw_smp_processor_id() % mlx5_comp_vectors_count(mdev);
	err = mlx5_vector2eqn(mdev, vector, &eqn);
	if (err)
		goto err_vec;

	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
	MLX5_SET(cqc, cqc, uar_page, tracker->uar->index);
	MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift -
		 MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
	mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas);

	cq->mcq.comp = mlx5vf_cq_complete;
	cq->mcq.event = mlx5vf_cq_event;
	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
	if (err)
		goto err_vec;

	/* Request a completion notification for the first CQE */
	mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
		    cq->mcq.cons_index);
	kvfree(in);
	return 0;

err_vec:
	kvfree(in);
err_buff:
	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
err_db_free:
	mlx5_db_free(mdev, &cq->db);
	return err;
}
  640. static struct mlx5_vhca_qp *
  641. mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev,
  642. struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr)
  643. {
  644. u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
  645. struct mlx5_vhca_qp *qp;
  646. u8 log_rq_stride;
  647. u8 log_rq_sz;
  648. void *qpc;
  649. int inlen;
  650. void *in;
  651. int err;
  652. qp = kzalloc(sizeof(*qp), GFP_KERNEL);
  653. if (!qp)
  654. return ERR_PTR(-ENOMEM);
  655. qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr);
  656. log_rq_stride = ilog2(MLX5_SEND_WQE_DS);
  657. log_rq_sz = ilog2(qp->rq.wqe_cnt);
  658. err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node);
  659. if (err)
  660. goto err_free;
  661. if (max_recv_wr) {
  662. err = mlx5_frag_buf_alloc_node(mdev,
  663. wq_get_byte_sz(log_rq_sz, log_rq_stride),
  664. &qp->buf, mdev->priv.numa_node);
  665. if (err)
  666. goto err_db_free;
  667. mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc);
  668. }
  669. qp->rq.db = &qp->db.db[MLX5_RCV_DBR];
  670. inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
  671. MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
  672. qp->buf.npages;
  673. in = kvzalloc(inlen, GFP_KERNEL);
  674. if (!in) {
  675. err = -ENOMEM;
  676. goto err_in;
  677. }
  678. qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
  679. MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
  680. MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
  681. MLX5_SET(qpc, qpc, pd, tracker->pdn);
  682. MLX5_SET(qpc, qpc, uar_page, tracker->uar->index);
  683. MLX5_SET(qpc, qpc, log_page_size,
  684. qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
  685. MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
  686. if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
  687. MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
  688. MLX5_SET(qpc, qpc, no_sq, 1);
  689. if (max_recv_wr) {
  690. MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn);
  691. MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4);
  692. MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz);
  693. MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
  694. MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
  695. mlx5_fill_page_frag_array(&qp->buf,
  696. (__be64 *)MLX5_ADDR_OF(create_qp_in,
  697. in, pas));
  698. } else {
  699. MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
  700. }
  701. MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
  702. err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
  703. kvfree(in);
  704. if (err)
  705. goto err_in;
  706. qp->qpn = MLX5_GET(create_qp_out, out, qpn);
  707. return qp;
  708. err_in:
  709. if (max_recv_wr)
  710. mlx5_frag_buf_free(mdev, &qp->buf);
  711. err_db_free:
  712. mlx5_db_free(mdev, &qp->db);
  713. err_free:
  714. kfree(qp);
  715. return ERR_PTR(err);
  716. }
/*
 * Post one receive WQE on the tracker host QP, pointing at the next
 * slice of the shared receive buffer, and ring the RQ doorbell.
 */
static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp)
{
	struct mlx5_wqe_data_seg *data;
	unsigned int ix;

	/* Producer must never lap the consumer */
	WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt);
	ix = qp->rq.pc & (qp->rq.wqe_cnt - 1);
	data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix);
	data->byte_count = cpu_to_be32(qp->max_msg_size);
	data->lkey = cpu_to_be32(qp->recv_buf.mkey);
	data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset);
	qp->rq.pc++;
	/* Make sure that descriptors are written before doorbell record. */
	dma_wmb();
	*qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff);
}
/*
 * Drive @qp through the RC QP state machine: RST -> INIT -> RTR and, for
 * the firmware-side QP (@host_qp == false), further to RTS.
 *
 * For the host QP the entire RQ is pre-posted (one WQE per max_msg_size
 * chunk of the receive buffer) before the RTR transition, so the device
 * can start delivering reports immediately. The host QP only receives and
 * therefore stays in RTR; the send-only FW QP is taken all the way to RTS.
 *
 * Returns 0 on success or a firmware command error.
 */
static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev,
			      struct mlx5_vhca_qp *qp, u32 remote_qpn,
			      bool host_qp)
{
	u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
	u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
	u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
	void *qpc;
	int ret;

	/* Init */
	qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc);
	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
	/* Enable remote read and write access on the QP. */
	MLX5_SET(qpc, qpc, rre, 1);
	MLX5_SET(qpc, qpc, rwe, 1);
	MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP);
	MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn);
	ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in);
	if (ret)
		return ret;

	if (host_qp) {
		struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
		int i;

		/* Pre-post the whole RQ, one WQE per receive-buffer chunk. */
		for (i = 0; i < qp->rq.wqe_cnt; i++) {
			mlx5vf_post_recv(qp);
			recv_buf->next_rq_offset += qp->max_msg_size;
		}
	}

	/* RTR */
	qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc);
	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
	MLX5_SET(qpc, qpc, mtu, IB_MTU_4096);
	MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg));
	MLX5_SET(qpc, qpc, remote_qpn, remote_qpn);
	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
	/* fl: force loopback — both QPs are created on the same device. */
	MLX5_SET(qpc, qpc, primary_address_path.fl, 1);
	MLX5_SET(qpc, qpc, min_rnr_nak, 1);
	MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
	ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in);
	/* The receive-only host QP stays in RTR. */
	if (ret || host_qp)
		return ret;

	/* RTS */
	qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc);
	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
	MLX5_SET(qpc, qpc, retry_count, 7);
	MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */
	MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */
	MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
	return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in);
}
  784. static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
  785. struct mlx5_vhca_qp *qp)
  786. {
  787. u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
  788. MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
  789. MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
  790. mlx5_cmd_exec_in(mdev, destroy_qp, in);
  791. mlx5_frag_buf_free(mdev, &qp->buf);
  792. mlx5_db_free(mdev, &qp->db);
  793. kfree(qp);
  794. }
  795. static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf)
  796. {
  797. int i;
  798. /* Undo alloc_pages_bulk_array() */
  799. for (i = 0; i < recv_buf->npages; i++)
  800. __free_page(recv_buf->page_list[i]);
  801. kvfree(recv_buf->page_list);
  802. }
/*
 * Allocate @npages pages for the tracker receive buffer into
 * recv_buf->page_list. Returns 0 on success or -ENOMEM, with any
 * partial allocation fully undone.
 */
static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
			    unsigned int npages)
{
	unsigned int filled = 0, done = 0;
	int i;

	recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list),
				       GFP_KERNEL);
	if (!recv_buf->page_list)
		return -ENOMEM;

	/* The bulk allocator may return fewer pages than requested; retry
	 * until the whole list is filled, bailing only on total failure.
	 */
	for (;;) {
		filled = alloc_pages_bulk_array(GFP_KERNEL, npages - done,
						recv_buf->page_list + done);
		if (!filled)
			goto err;

		done += filled;
		if (done == npages)
			break;
	}

	recv_buf->npages = npages;
	return 0;

err:
	/* Scanning the full array is safe: kvcalloc() zeroed unused slots. */
	for (i = 0; i < npages; i++) {
		if (recv_buf->page_list[i])
			__free_page(recv_buf->page_list[i]);
	}

	kvfree(recv_buf->page_list);
	return -ENOMEM;
}
  831. static int register_dma_recv_pages(struct mlx5_core_dev *mdev,
  832. struct mlx5_vhca_recv_buf *recv_buf)
  833. {
  834. int i, j;
  835. recv_buf->dma_addrs = kvcalloc(recv_buf->npages,
  836. sizeof(*recv_buf->dma_addrs),
  837. GFP_KERNEL);
  838. if (!recv_buf->dma_addrs)
  839. return -ENOMEM;
  840. for (i = 0; i < recv_buf->npages; i++) {
  841. recv_buf->dma_addrs[i] = dma_map_page(mdev->device,
  842. recv_buf->page_list[i],
  843. 0, PAGE_SIZE,
  844. DMA_FROM_DEVICE);
  845. if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i]))
  846. goto error;
  847. }
  848. return 0;
  849. error:
  850. for (j = 0; j < i; j++)
  851. dma_unmap_single(mdev->device, recv_buf->dma_addrs[j],
  852. PAGE_SIZE, DMA_FROM_DEVICE);
  853. kvfree(recv_buf->dma_addrs);
  854. return -ENOMEM;
  855. }
  856. static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev,
  857. struct mlx5_vhca_recv_buf *recv_buf)
  858. {
  859. int i;
  860. for (i = 0; i < recv_buf->npages; i++)
  861. dma_unmap_single(mdev->device, recv_buf->dma_addrs[i],
  862. PAGE_SIZE, DMA_FROM_DEVICE);
  863. kvfree(recv_buf->dma_addrs);
  864. }
  865. static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
  866. struct mlx5_vhca_qp *qp)
  867. {
  868. struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
  869. mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
  870. unregister_dma_recv_pages(mdev, recv_buf);
  871. free_recv_pages(&qp->recv_buf);
  872. }
  873. static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
  874. struct mlx5_vhca_qp *qp, u32 pdn,
  875. u64 rq_size)
  876. {
  877. unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE);
  878. struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
  879. int err;
  880. err = alloc_recv_pages(recv_buf, npages);
  881. if (err < 0)
  882. return err;
  883. err = register_dma_recv_pages(mdev, recv_buf);
  884. if (err)
  885. goto end;
  886. err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey);
  887. if (err)
  888. goto err_create_mkey;
  889. return 0;
  890. err_create_mkey:
  891. unregister_dma_recv_pages(mdev, recv_buf);
  892. end:
  893. free_recv_pages(recv_buf);
  894. return err;
  895. }
/*
 * Tear down every page-tracker resource in reverse order of creation:
 * notifier, tracker object, FW QP, host QP receive resources, host QP,
 * CQ, PD and UAR. Caller must hold mvdev->state_mutex; no-op when
 * tracking is not active.
 */
static void
_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	struct mlx5_core_dev *mdev = mvdev->mdev;

	lockdep_assert_held(&mvdev->state_mutex);
	if (!mvdev->log_active)
		return;

	/* Teardown must not race a device detach. */
	WARN_ON(mvdev->mdev_detach);

	mlx5_eq_notifier_unregister(mdev, &tracker->nb);
	mlx5vf_cmd_destroy_tracker(mdev, tracker->id);
	mlx5vf_destroy_qp(mdev, tracker->fw_qp);
	mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp);
	mlx5vf_destroy_qp(mdev, tracker->host_qp);
	mlx5vf_destroy_cq(mdev, &tracker->cq);
	mlx5_core_dealloc_pd(mdev, tracker->pdn);
	mlx5_put_uars_page(mdev, tracker->uar);
	mvdev->log_active = false;
}
  915. int mlx5vf_stop_page_tracker(struct vfio_device *vdev)
  916. {
  917. struct mlx5vf_pci_core_device *mvdev = container_of(
  918. vdev, struct mlx5vf_pci_core_device, core_device.vdev);
  919. mutex_lock(&mvdev->state_mutex);
  920. if (!mvdev->log_active)
  921. goto end;
  922. _mlx5vf_free_page_tracker_resources(mvdev);
  923. mvdev->log_active = false;
  924. end:
  925. mlx5vf_state_mutex_unlock(mvdev);
  926. return 0;
  927. }
/*
 * Start device-assisted dirty page tracking for @vdev over @ranges.
 *
 * Builds the whole reporting pipeline: UAR page and PD, a CQ sized for
 * the receive queue, a host RC QP backed by an SZ_2M receive buffer, and
 * a send-only firmware RC QP; the two QPs are activated and
 * cross-connected so the device can deliver dirty-page reports to the
 * host QP. Finally the firmware tracker object is created and the EQ
 * notifier registered.
 *
 * *page_size is in/out: the requested tracking granularity is clamped to
 * the device-supported range and the value actually used is returned.
 *
 * Returns 0 on success or a negative errno.
 */
int mlx5vf_start_page_tracker(struct vfio_device *vdev,
			      struct rb_root_cached *ranges, u32 nnodes,
			      u64 *page_size)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	u8 log_tracked_page = ilog2(*page_size);
	struct mlx5_vhca_qp *host_qp;
	struct mlx5_vhca_qp *fw_qp;
	struct mlx5_core_dev *mdev;
	u32 max_msg_size = PAGE_SIZE;
	u64 rq_size = SZ_2M;
	u32 max_recv_wr;
	int err;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mdev_detach) {
		err = -ENOTCONN;
		goto end;
	}

	/* Only one tracking session may be active at a time. */
	if (mvdev->log_active) {
		err = -EINVAL;
		goto end;
	}

	mdev = mvdev->mdev;
	memset(tracker, 0, sizeof(*tracker));
	tracker->uar = mlx5_get_uars_page(mdev);
	if (IS_ERR(tracker->uar)) {
		err = PTR_ERR(tracker->uar);
		goto end;
	}

	err = mlx5_core_alloc_pd(mdev, &tracker->pdn);
	if (err)
		goto err_uar;

	/* One receive WQE per max_msg_size chunk of the receive buffer. */
	max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size);
	err = mlx5vf_create_cq(mdev, tracker, max_recv_wr);
	if (err)
		goto err_dealloc_pd;

	host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr);
	if (IS_ERR(host_qp)) {
		err = PTR_ERR(host_qp);
		goto err_cq;
	}

	host_qp->max_msg_size = max_msg_size;
	/* Clamp the requested page size to the device-supported range. */
	if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_min_page_size)) {
		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_min_page_size);
	} else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_max_page_size)) {
		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_max_page_size);
	}

	host_qp->tracked_page_size = (1ULL << log_tracked_page);
	err = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn,
					     rq_size);
	if (err)
		goto err_host_qp;

	/* The firmware-side QP only sends, so it needs no receive queue. */
	fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0);
	if (IS_ERR(fw_qp)) {
		err = PTR_ERR(fw_qp);
		goto err_recv_resources;
	}

	/* Cross-connect the two QPs. */
	err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true);
	if (err)
		goto err_activate;

	err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false);
	if (err)
		goto err_activate;

	tracker->host_qp = host_qp;
	tracker->fw_qp = fw_qp;
	err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes);
	if (err)
		goto err_activate;

	MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY);
	mlx5_eq_notifier_register(mdev, &tracker->nb);
	*page_size = host_qp->tracked_page_size;
	mvdev->log_active = true;
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;

err_activate:
	mlx5vf_destroy_qp(mdev, fw_qp);
err_recv_resources:
	mlx5vf_free_qp_recv_resources(mdev, host_qp);
err_host_qp:
	mlx5vf_destroy_qp(mdev, host_qp);
err_cq:
	mlx5vf_destroy_cq(mdev, &tracker->cq);
err_dealloc_pd:
	mlx5_core_dealloc_pd(mdev, tracker->pdn);
err_uar:
	mlx5_put_uars_page(mdev, tracker->uar);
end:
	mlx5vf_state_mutex_unlock(mvdev);
	return err;
}
/*
 * Parse one dirty-page report message of @size bytes, received into page
 * @index of the QP's receive buffer, and mark each reported address in
 * @dirty at the QP's tracked_page_size granularity.
 */
static void
set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp,
		  struct iova_bitmap *dirty)
{
	u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry);
	u32 nent = size / entry_size;
	struct page *page;
	u64 addr;
	u64 *buf;
	int i;

	/* The message must fit inside a single receive-buffer page. */
	if (WARN_ON(index >= qp->recv_buf.npages ||
		    (nent > qp->max_msg_size / entry_size)))
		return;

	page = qp->recv_buf.page_list[index];
	buf = kmap_local_page(page);
	for (i = 0; i < nent; i++) {
		/* Each entry splits a 64-bit IOVA into low/high fields. */
		addr = MLX5_GET(page_track_report_entry, buf + i,
				dirty_address_low);
		addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
				      dirty_address_high) << 32;
		iova_bitmap_set(dirty, addr, qp->tracked_page_size);
	}
	kunmap_local(buf);
}
/*
 * Handle one receive completion: extract the tracker status from the
 * immediate data, parse any dirty-page report into @dirty, then repost
 * the consumed receive WQE over the same buffer chunk.
 */
static void
mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe,
	      struct iova_bitmap *dirty, int *tracker_status)
{
	u32 size;
	int ix;

	qp->rq.cc++;
	/* The top nibble of the immediate data carries the tracker state. */
	*tracker_status = be32_to_cpu(cqe->immediate) >> 28;
	size = be32_to_cpu(cqe->byte_cnt);
	/* wqe_counter identifies which RQ slot (and buffer page) completed. */
	ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1);

	/* zero length CQE, no data */
	WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING);
	if (size)
		set_report_output(size, ix, qp, dirty);

	/* Recycle the just-consumed buffer chunk for the reposted WQE. */
	qp->recv_buf.next_rq_offset = ix * qp->max_msg_size;
	mlx5vf_post_recv(qp);
}
  1065. static void *get_cqe(struct mlx5_vhca_cq *cq, int n)
  1066. {
  1067. return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n);
  1068. }
/*
 * Return the CQE at consumer index @n if it is valid and owned by
 * software, otherwise NULL. Ownership alternates on every pass around
 * the ring, so the CQE owner bit is compared against the parity of
 * n / ncqe (the ncqe bit of @n).
 */
static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n)
{
	void *cqe = get_cqe(cq, n & (cq->ncqe - 1));
	struct mlx5_cqe64 *cqe64;

	/* With 128-byte CQEs the 64-byte CQE sits in the second half. */
	cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;

	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) {
		return cqe64;
	} else {
		return NULL;
	}
}
/*
 * Poll a single CQE from @cq. Returns CQ_EMPTY if no software-owned CQE
 * is available, CQ_OK after consuming a receive completion (updating
 * @dirty and @tracker_status via mlx5vf_rq_cqe()), or CQ_POLL_ERR for
 * any other completion opcode.
 */
static int
mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp,
		   struct iova_bitmap *dirty, int *tracker_status)
{
	struct mlx5_cqe64 *cqe;
	u8 opcode;

	cqe = get_sw_cqe(cq, cq->mcq.cons_index);
	if (!cqe)
		return CQ_EMPTY;

	++cq->mcq.cons_index;
	/*
	 * Make sure we read CQ entry contents after we've checked the
	 * ownership bit.
	 */
	rmb();
	opcode = get_cqe_opcode(cqe);
	switch (opcode) {
	case MLX5_CQE_RESP_SEND_IMM:
		mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status);
		return CQ_OK;
	default:
		return CQ_POLL_ERR;
	}
}
/*
 * Report dirty pages in [iova, iova + length) into @dirty.
 *
 * Puts the firmware tracker into REPORTING state for the range, then
 * polls the CQ for report completions until the tracker leaves REPORTING
 * or flags an error. When the CQ runs empty it is armed and polled once
 * more to close the race with a completion landing between poll and arm;
 * only then does the thread block on tracker_comp (presumably completed
 * by the event notifier registered at tracker start — confirm).
 *
 * Returns 0 on success, -EINVAL if tracking is inactive, -ENOTCONN if
 * the device is detached, or -EIO on tracker/CQ errors.
 */
int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
				  unsigned long length,
				  struct iova_bitmap *dirty)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	struct mlx5_vhca_cq *cq = &tracker->cq;
	struct mlx5_core_dev *mdev;
	int poll_err, err;

	mutex_lock(&mvdev->state_mutex);
	if (!mvdev->log_active) {
		err = -EINVAL;
		goto end;
	}

	if (mvdev->mdev_detach) {
		err = -ENOTCONN;
		goto end;
	}

	mdev = mvdev->mdev;
	err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
					MLX5_PAGE_TRACK_STATE_REPORTING);
	if (err)
		goto end;

	tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING;
	while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING &&
	       !tracker->is_err) {
		poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty,
					      &tracker->status);
		if (poll_err == CQ_EMPTY) {
			/* Arm, then re-poll to avoid missing a completion. */
			mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
				    cq->mcq.cons_index);
			poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp,
						      dirty, &tracker->status);
			if (poll_err == CQ_EMPTY) {
				wait_for_completion(&mvdev->tracker_comp);
				continue;
			}
		}
		if (poll_err == CQ_POLL_ERR) {
			err = -EIO;
			goto end;
		}
		mlx5_cq_set_ci(&cq->mcq);
	}

	/* An ERROR status is sticky: latch it so later calls fail fast. */
	if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR)
		tracker->is_err = true;

	if (tracker->is_err)
		err = -EIO;

end:
	mlx5vf_state_mutex_unlock(mvdev);
	return err;
}