file.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * (C) 2001 Clemson University and The University of Chicago
  4. * Copyright 2018 Omnibond Systems, L.L.C.
  5. *
  6. * See COPYING in top-level directory.
  7. */
  8. /*
  9. * Linux VFS file operations.
  10. */
  11. #include "protocol.h"
  12. #include "orangefs-kernel.h"
  13. #include "orangefs-bufmap.h"
  14. #include <linux/fs.h>
  15. #include <linux/pagemap.h>
  16. static int flush_racache(struct inode *inode)
  17. {
  18. struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
  19. struct orangefs_kernel_op_s *new_op;
  20. int ret;
  21. gossip_debug(GOSSIP_UTILS_DEBUG,
  22. "%s: %pU: Handle is %pU | fs_id %d\n", __func__,
  23. get_khandle_from_ino(inode), &orangefs_inode->refn.khandle,
  24. orangefs_inode->refn.fs_id);
  25. new_op = op_alloc(ORANGEFS_VFS_OP_RA_FLUSH);
  26. if (!new_op)
  27. return -ENOMEM;
  28. new_op->upcall.req.ra_cache_flush.refn = orangefs_inode->refn;
  29. ret = service_operation(new_op, "orangefs_flush_racache",
  30. get_interruptible_flag(inode));
  31. gossip_debug(GOSSIP_UTILS_DEBUG, "%s: got return value of %d\n",
  32. __func__, ret);
  33. op_release(new_op);
  34. return ret;
  35. }
  36. /*
  37. * Post and wait for the I/O upcall to finish
  38. */
  39. ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
  40. loff_t *offset, struct iov_iter *iter, size_t total_size,
  41. loff_t readahead_size, struct orangefs_write_range *wr,
  42. int *index_return, struct file *file)
  43. {
  44. struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
  45. struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
  46. struct orangefs_kernel_op_s *new_op = NULL;
  47. int buffer_index;
  48. ssize_t ret;
  49. size_t copy_amount;
  50. int open_for_read;
  51. int open_for_write;
  52. new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO);
  53. if (!new_op)
  54. return -ENOMEM;
  55. /* synchronous I/O */
  56. new_op->upcall.req.io.readahead_size = readahead_size;
  57. new_op->upcall.req.io.io_type = type;
  58. new_op->upcall.req.io.refn = orangefs_inode->refn;
  59. populate_shared_memory:
  60. /* get a shared buffer index */
  61. buffer_index = orangefs_bufmap_get();
  62. if (buffer_index < 0) {
  63. ret = buffer_index;
  64. gossip_debug(GOSSIP_FILE_DEBUG,
  65. "%s: orangefs_bufmap_get failure (%zd)\n",
  66. __func__, ret);
  67. goto out;
  68. }
  69. gossip_debug(GOSSIP_FILE_DEBUG,
  70. "%s(%pU): GET op %p -> buffer_index %d\n",
  71. __func__,
  72. handle,
  73. new_op,
  74. buffer_index);
  75. new_op->uses_shared_memory = 1;
  76. new_op->upcall.req.io.buf_index = buffer_index;
  77. new_op->upcall.req.io.count = total_size;
  78. new_op->upcall.req.io.offset = *offset;
  79. if (type == ORANGEFS_IO_WRITE && wr) {
  80. new_op->upcall.uid = from_kuid(&init_user_ns, wr->uid);
  81. new_op->upcall.gid = from_kgid(&init_user_ns, wr->gid);
  82. }
  83. /*
  84. * Orangefs has no open, and orangefs checks file permissions
  85. * on each file access. Posix requires that file permissions
  86. * be checked on open and nowhere else. Orangefs-through-the-kernel
  87. * needs to seem posix compliant.
  88. *
  89. * The VFS opens files, even if the filesystem provides no
  90. * method. We can see if a file was successfully opened for
  91. * read and or for write by looking at file->f_mode.
  92. *
  93. * When writes are flowing from the page cache, file is no
  94. * longer available. We can trust the VFS to have checked
  95. * file->f_mode before writing to the page cache.
  96. *
  97. * The mode of a file might change between when it is opened
  98. * and IO commences, or it might be created with an arbitrary mode.
  99. *
  100. * We'll make sure we don't hit EACCES during the IO stage by
  101. * using UID 0. Some of the time we have access without changing
  102. * to UID 0 - how to check?
  103. */
  104. if (file) {
  105. open_for_write = file->f_mode & FMODE_WRITE;
  106. open_for_read = file->f_mode & FMODE_READ;
  107. } else {
  108. open_for_write = 1;
  109. open_for_read = 0; /* not relevant? */
  110. }
  111. if ((type == ORANGEFS_IO_WRITE) && open_for_write)
  112. new_op->upcall.uid = 0;
  113. if ((type == ORANGEFS_IO_READ) && open_for_read)
  114. new_op->upcall.uid = 0;
  115. gossip_debug(GOSSIP_FILE_DEBUG,
  116. "%s(%pU): offset: %llu total_size: %zd\n",
  117. __func__,
  118. handle,
  119. llu(*offset),
  120. total_size);
  121. /*
  122. * Stage 1: copy the buffers into client-core's address space
  123. */
  124. if (type == ORANGEFS_IO_WRITE && total_size) {
  125. ret = orangefs_bufmap_copy_from_iovec(iter, buffer_index,
  126. total_size);
  127. if (ret < 0) {
  128. gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
  129. __func__, (long)ret);
  130. goto out;
  131. }
  132. }
  133. gossip_debug(GOSSIP_FILE_DEBUG,
  134. "%s(%pU): Calling post_io_request with tag (%llu)\n",
  135. __func__,
  136. handle,
  137. llu(new_op->tag));
  138. /* Stage 2: Service the I/O operation */
  139. ret = service_operation(new_op,
  140. type == ORANGEFS_IO_WRITE ?
  141. "file_write" :
  142. "file_read",
  143. get_interruptible_flag(inode));
  144. /*
  145. * If service_operation() returns -EAGAIN #and# the operation was
  146. * purged from orangefs_request_list or htable_ops_in_progress, then
  147. * we know that the client was restarted, causing the shared memory
  148. * area to be wiped clean. To restart a write operation in this
  149. * case, we must re-copy the data from the user's iovec to a NEW
  150. * shared memory location. To restart a read operation, we must get
  151. * a new shared memory location.
  152. */
  153. if (ret == -EAGAIN && op_state_purged(new_op)) {
  154. orangefs_bufmap_put(buffer_index);
  155. if (type == ORANGEFS_IO_WRITE)
  156. iov_iter_revert(iter, total_size);
  157. gossip_debug(GOSSIP_FILE_DEBUG,
  158. "%s:going to repopulate_shared_memory.\n",
  159. __func__);
  160. goto populate_shared_memory;
  161. }
  162. if (ret < 0) {
  163. if (ret == -EINTR) {
  164. /*
  165. * We can't return EINTR if any data was written,
  166. * it's not POSIX. It is minimally acceptable
  167. * to give a partial write, the way NFS does.
  168. *
  169. * It would be optimal to return all or nothing,
  170. * but if a userspace write is bigger than
  171. * an IO buffer, and the interrupt occurs
  172. * between buffer writes, that would not be
  173. * possible.
  174. */
  175. switch (new_op->op_state - OP_VFS_STATE_GIVEN_UP) {
  176. /*
  177. * If the op was waiting when the interrupt
  178. * occurred, then the client-core did not
  179. * trigger the write.
  180. */
  181. case OP_VFS_STATE_WAITING:
  182. if (*offset == 0)
  183. ret = -EINTR;
  184. else
  185. ret = 0;
  186. break;
  187. /*
  188. * If the op was in progress when the interrupt
  189. * occurred, then the client-core was able to
  190. * trigger the write.
  191. */
  192. case OP_VFS_STATE_INPROGR:
  193. if (type == ORANGEFS_IO_READ)
  194. ret = -EINTR;
  195. else
  196. ret = total_size;
  197. break;
  198. default:
  199. gossip_err("%s: unexpected op state :%d:.\n",
  200. __func__,
  201. new_op->op_state);
  202. ret = 0;
  203. break;
  204. }
  205. gossip_debug(GOSSIP_FILE_DEBUG,
  206. "%s: got EINTR, state:%d: %p\n",
  207. __func__,
  208. new_op->op_state,
  209. new_op);
  210. } else {
  211. gossip_err("%s: error in %s handle %pU, returning %zd\n",
  212. __func__,
  213. type == ORANGEFS_IO_READ ?
  214. "read from" : "write to",
  215. handle, ret);
  216. }
  217. if (orangefs_cancel_op_in_progress(new_op))
  218. return ret;
  219. goto out;
  220. }
  221. /*
  222. * Stage 3: Post copy buffers from client-core's address space
  223. */
  224. if (type == ORANGEFS_IO_READ && new_op->downcall.resp.io.amt_complete) {
  225. /*
  226. * NOTE: the iovector can either contain addresses which
  227. * can futher be kernel-space or user-space addresses.
  228. * or it can pointers to struct page's
  229. */
  230. copy_amount = new_op->downcall.resp.io.amt_complete;
  231. ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index,
  232. copy_amount);
  233. if (ret < 0) {
  234. gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n",
  235. __func__, (long)ret);
  236. goto out;
  237. }
  238. }
  239. gossip_debug(GOSSIP_FILE_DEBUG,
  240. "%s(%pU): Amount %s, returned by the sys-io call:%d\n",
  241. __func__,
  242. handle,
  243. type == ORANGEFS_IO_READ ? "read" : "written",
  244. (int)new_op->downcall.resp.io.amt_complete);
  245. ret = new_op->downcall.resp.io.amt_complete;
  246. out:
  247. if (buffer_index >= 0) {
  248. orangefs_bufmap_put(buffer_index);
  249. gossip_debug(GOSSIP_FILE_DEBUG,
  250. "%s(%pU): PUT buffer_index %d\n",
  251. __func__, handle, buffer_index);
  252. buffer_index = -1;
  253. }
  254. op_release(new_op);
  255. return ret;
  256. }
  257. int orangefs_revalidate_mapping(struct inode *inode)
  258. {
  259. struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
  260. struct address_space *mapping = inode->i_mapping;
  261. unsigned long *bitlock = &orangefs_inode->bitlock;
  262. int ret;
  263. while (1) {
  264. ret = wait_on_bit(bitlock, 1, TASK_KILLABLE);
  265. if (ret)
  266. return ret;
  267. spin_lock(&inode->i_lock);
  268. if (test_bit(1, bitlock)) {
  269. spin_unlock(&inode->i_lock);
  270. continue;
  271. }
  272. if (!time_before(jiffies, orangefs_inode->mapping_time))
  273. break;
  274. spin_unlock(&inode->i_lock);
  275. return 0;
  276. }
  277. set_bit(1, bitlock);
  278. smp_wmb();
  279. spin_unlock(&inode->i_lock);
  280. unmap_mapping_range(mapping, 0, 0, 0);
  281. ret = filemap_write_and_wait(mapping);
  282. if (!ret)
  283. ret = invalidate_inode_pages2(mapping);
  284. orangefs_inode->mapping_time = jiffies +
  285. orangefs_cache_timeout_msecs*HZ/1000;
  286. clear_bit(1, bitlock);
  287. smp_mb__after_atomic();
  288. wake_up_bit(bitlock, 1);
  289. return ret;
  290. }
  291. static ssize_t orangefs_file_read_iter(struct kiocb *iocb,
  292. struct iov_iter *iter)
  293. {
  294. int ret;
  295. orangefs_stats.reads++;
  296. down_read(&file_inode(iocb->ki_filp)->i_rwsem);
  297. ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp));
  298. if (ret)
  299. goto out;
  300. ret = generic_file_read_iter(iocb, iter);
  301. out:
  302. up_read(&file_inode(iocb->ki_filp)->i_rwsem);
  303. return ret;
  304. }
  305. static ssize_t orangefs_file_write_iter(struct kiocb *iocb,
  306. struct iov_iter *iter)
  307. {
  308. int ret;
  309. orangefs_stats.writes++;
  310. if (iocb->ki_pos > i_size_read(file_inode(iocb->ki_filp))) {
  311. ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp));
  312. if (ret)
  313. return ret;
  314. }
  315. ret = generic_file_write_iter(iocb, iter);
  316. return ret;
  317. }
  318. static vm_fault_t orangefs_fault(struct vm_fault *vmf)
  319. {
  320. struct file *file = vmf->vma->vm_file;
  321. int ret;
  322. ret = orangefs_inode_getattr(file->f_mapping->host,
  323. ORANGEFS_GETATTR_SIZE);
  324. if (ret == -ESTALE)
  325. ret = -EIO;
  326. if (ret) {
  327. gossip_err("%s: orangefs_inode_getattr failed, "
  328. "ret:%d:.\n", __func__, ret);
  329. return VM_FAULT_SIGBUS;
  330. }
  331. return filemap_fault(vmf);
  332. }
  333. static const struct vm_operations_struct orangefs_file_vm_ops = {
  334. .fault = orangefs_fault,
  335. .map_pages = filemap_map_pages,
  336. .page_mkwrite = orangefs_page_mkwrite,
  337. };
  338. /*
  339. * Memory map a region of a file.
  340. */
  341. static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma)
  342. {
  343. int ret;
  344. ret = orangefs_revalidate_mapping(file_inode(file));
  345. if (ret)
  346. return ret;
  347. gossip_debug(GOSSIP_FILE_DEBUG,
  348. "orangefs_file_mmap: called on %pD\n", file);
  349. /* set the sequential readahead hint */
  350. vm_flags_mod(vma, VM_SEQ_READ, VM_RAND_READ);
  351. file_accessed(file);
  352. vma->vm_ops = &orangefs_file_vm_ops;
  353. return 0;
  354. }
  355. #define mapping_nrpages(idata) ((idata)->nrpages)
  356. /*
  357. * Called to notify the module that there are no more references to
  358. * this file (i.e. no processes have it open).
  359. *
  360. * \note Not called when each file is closed.
  361. */
  362. static int orangefs_file_release(struct inode *inode, struct file *file)
  363. {
  364. gossip_debug(GOSSIP_FILE_DEBUG,
  365. "orangefs_file_release: called on %pD\n",
  366. file);
  367. /*
  368. * remove all associated inode pages from the page cache and
  369. * readahead cache (if any); this forces an expensive refresh of
  370. * data for the next caller of mmap (or 'get_block' accesses)
  371. */
  372. if (mapping_nrpages(file->f_mapping)) {
  373. if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) {
  374. gossip_debug(GOSSIP_INODE_DEBUG,
  375. "calling flush_racache on %pU\n",
  376. get_khandle_from_ino(inode));
  377. flush_racache(inode);
  378. gossip_debug(GOSSIP_INODE_DEBUG,
  379. "flush_racache finished\n");
  380. }
  381. }
  382. return 0;
  383. }
  384. /*
  385. * Push all data for a specific file onto permanent storage.
  386. */
  387. static int orangefs_fsync(struct file *file,
  388. loff_t start,
  389. loff_t end,
  390. int datasync)
  391. {
  392. int ret;
  393. struct orangefs_inode_s *orangefs_inode =
  394. ORANGEFS_I(file_inode(file));
  395. struct orangefs_kernel_op_s *new_op = NULL;
  396. ret = filemap_write_and_wait_range(file_inode(file)->i_mapping,
  397. start, end);
  398. if (ret < 0)
  399. return ret;
  400. new_op = op_alloc(ORANGEFS_VFS_OP_FSYNC);
  401. if (!new_op)
  402. return -ENOMEM;
  403. new_op->upcall.req.fsync.refn = orangefs_inode->refn;
  404. ret = service_operation(new_op,
  405. "orangefs_fsync",
  406. get_interruptible_flag(file_inode(file)));
  407. gossip_debug(GOSSIP_FILE_DEBUG,
  408. "orangefs_fsync got return value of %d\n",
  409. ret);
  410. op_release(new_op);
  411. return ret;
  412. }
  413. /*
  414. * Change the file pointer position for an instance of an open file.
  415. *
  416. * \note If .llseek is overriden, we must acquire lock as described in
  417. * Documentation/filesystems/locking.rst.
  418. *
  419. * Future upgrade could support SEEK_DATA and SEEK_HOLE but would
  420. * require much changes to the FS
  421. */
  422. static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin)
  423. {
  424. int ret = -EINVAL;
  425. struct inode *inode = file_inode(file);
  426. if (origin == SEEK_END) {
  427. /*
  428. * revalidate the inode's file size.
  429. * NOTE: We are only interested in file size here,
  430. * so we set mask accordingly.
  431. */
  432. ret = orangefs_inode_getattr(file->f_mapping->host,
  433. ORANGEFS_GETATTR_SIZE);
  434. if (ret == -ESTALE)
  435. ret = -EIO;
  436. if (ret) {
  437. gossip_debug(GOSSIP_FILE_DEBUG,
  438. "%s:%s:%d calling make bad inode\n",
  439. __FILE__,
  440. __func__,
  441. __LINE__);
  442. return ret;
  443. }
  444. }
  445. gossip_debug(GOSSIP_FILE_DEBUG,
  446. "orangefs_file_llseek: offset is %ld | origin is %d"
  447. " | inode size is %lu\n",
  448. (long)offset,
  449. origin,
  450. (unsigned long)i_size_read(inode));
  451. return generic_file_llseek(file, offset, origin);
  452. }
  453. /*
  454. * Support local locks (locks that only this kernel knows about)
  455. * if Orangefs was mounted -o local_lock.
  456. */
  457. static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl)
  458. {
  459. int rc = -EINVAL;
  460. if (ORANGEFS_SB(file_inode(filp)->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) {
  461. if (cmd == F_GETLK) {
  462. rc = 0;
  463. posix_test_lock(filp, fl);
  464. } else {
  465. rc = posix_lock_file(filp, fl, NULL);
  466. }
  467. }
  468. return rc;
  469. }
  470. static int orangefs_flush(struct file *file, fl_owner_t id)
  471. {
  472. /*
  473. * This is vfs_fsync_range(file, 0, LLONG_MAX, 0) without the
  474. * service_operation in orangefs_fsync.
  475. *
  476. * Do not send fsync to OrangeFS server on a close. Do send fsync
  477. * on an explicit fsync call. This duplicates historical OrangeFS
  478. * behavior.
  479. */
  480. int r;
  481. r = filemap_write_and_wait_range(file->f_mapping, 0, LLONG_MAX);
  482. if (r > 0)
  483. return 0;
  484. else
  485. return r;
  486. }
  487. /** ORANGEFS implementation of VFS file operations */
  488. const struct file_operations orangefs_file_operations = {
  489. .llseek = orangefs_file_llseek,
  490. .read_iter = orangefs_file_read_iter,
  491. .write_iter = orangefs_file_write_iter,
  492. .lock = orangefs_lock,
  493. .mmap = orangefs_file_mmap,
  494. .open = generic_file_open,
  495. .splice_read = generic_file_splice_read,
  496. .splice_write = iter_file_splice_write,
  497. .flush = orangefs_flush,
  498. .release = orangefs_file_release,
  499. .fsync = orangefs_fsync,
  500. };