tcp_mmap.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Copyright 2018 Google Inc.
  4. * Author: Eric Dumazet ([email protected])
  5. *
  6. * Reference program demonstrating tcp mmap() usage,
  7. * and SO_RCVLOWAT hints for receiver.
  8. *
  9. * Note : NIC with header split is needed to use mmap() on TCP :
  10. * Each incoming frame must be a multiple of PAGE_SIZE bytes of TCP payload.
  11. *
  12. * How to use on loopback interface :
  13. *
  14. * ifconfig lo mtu 61512 # 15*4096 + 40 (ipv6 header) + 32 (TCP with TS option header)
  15. * tcp_mmap -s -z &
  16. * tcp_mmap -H ::1 -z
  17. *
  18. * Or leave default lo mtu, but use -M option to set TCP_MAXSEG option to (4096 + 12)
  19. * (4096 : page size on x86, 12: TCP TS option length)
  20. * tcp_mmap -s -z -M $((4096+12)) &
  21. * tcp_mmap -H ::1 -z -M $((4096+12))
  22. *
  23. * Note: -z option on sender uses MSG_ZEROCOPY, which forces a copy when packets go through loopback interface.
  24. * We might use sendfile() instead, but really this test program is about mmap(), for receivers ;)
  25. *
  26. * $ ./tcp_mmap -s & # Without mmap()
  27. * $ for i in {1..4}; do ./tcp_mmap -H ::1 -z ; done
  28. * received 32768 MB (0 % mmap'ed) in 14.1157 s, 19.4732 Gbit
  29. * cpu usage user:0.057 sys:7.815, 240.234 usec per MB, 65531 c-switches
  30. * received 32768 MB (0 % mmap'ed) in 14.6833 s, 18.7204 Gbit
  31. * cpu usage user:0.043 sys:8.103, 248.596 usec per MB, 65524 c-switches
  32. * received 32768 MB (0 % mmap'ed) in 11.143 s, 24.6682 Gbit
  33. * cpu usage user:0.044 sys:6.576, 202.026 usec per MB, 65519 c-switches
  34. * received 32768 MB (0 % mmap'ed) in 14.9056 s, 18.4413 Gbit
  35. * cpu usage user:0.036 sys:8.193, 251.129 usec per MB, 65530 c-switches
  36. * $ kill %1 # kill tcp_mmap server
  37. *
  38. * $ ./tcp_mmap -s -z & # With mmap()
  39. * $ for i in {1..4}; do ./tcp_mmap -H ::1 -z ; done
  40. * received 32768 MB (99.9939 % mmap'ed) in 6.73792 s, 40.7956 Gbit
  41. * cpu usage user:0.045 sys:2.827, 87.6465 usec per MB, 65532 c-switches
  42. * received 32768 MB (99.9939 % mmap'ed) in 7.26732 s, 37.8238 Gbit
  43. * cpu usage user:0.037 sys:3.087, 95.3369 usec per MB, 65532 c-switches
  44. * received 32768 MB (99.9939 % mmap'ed) in 7.61661 s, 36.0893 Gbit
  45. * cpu usage user:0.046 sys:3.559, 110.016 usec per MB, 65529 c-switches
  46. * received 32768 MB (99.9939 % mmap'ed) in 7.43764 s, 36.9577 Gbit
  47. * cpu usage user:0.035 sys:3.467, 106.873 usec per MB, 65530 c-switches
  48. */
  49. #define _GNU_SOURCE
  50. #include <pthread.h>
  51. #include <sys/types.h>
  52. #include <fcntl.h>
  53. #include <error.h>
  54. #include <sys/socket.h>
  55. #include <sys/mman.h>
  56. #include <sys/resource.h>
  57. #include <unistd.h>
  58. #include <string.h>
  59. #include <stdlib.h>
  60. #include <stdio.h>
  61. #include <errno.h>
  62. #include <time.h>
  63. #include <sys/time.h>
  64. #include <netinet/in.h>
  65. #include <arpa/inet.h>
  66. #include <poll.h>
  67. #include <linux/tcp.h>
  68. #include <assert.h>
  /* Fallback definition for libc headers that do not provide MSG_ZEROCOPY. */
  #ifndef MSG_ZEROCOPY
  #define MSG_ZEROCOPY 0x4000000
  #endif

  /* Number of bytes the sender transmits per run: 1 << 35 (32 GiB). */
  #define FILE_SZ (1ULL << 35)

  /* Address family and matching sockaddr length, switched by -4 / -6. */
  static int cfg_family = AF_INET6;
  static socklen_t cfg_alen = sizeof(struct sockaddr_in6);

  /* TCP port used by both sides, changed with -p <port>. */
  static int cfg_port = 8787;

  /* Default: autotuning. Can be set with -r <integer> option. */
  static int rcvbuf;
  /* Default: autotuning. Can be set with -w <integer> option. */
  static int sndbuf;

  /* -z option: zero copy (MSG_ZEROCOPY for sender, mmap() for receiver). */
  static int zflg;
  /* -x option: hash received data (simple xor). */
  static int xflg;
  /*
   * -k option: receiver shall keep all received file in memory
   * (no munmap() calls).
   * NOTE(review): only assigned by option parsing in the visible code.
   */
  static int keepflag;

  /* Bytes handled per send()/read()/zero-copy receive call, set with -C. */
  static size_t chunk_size = 512 * 1024;
  /* Alignment used for the large mappings, set with -a or derived in main(). */
  static size_t map_align;

  /* Running xor accumulator maintained by hash_zone(). */
  unsigned long htotal;
  /*
   * Issue a prefetcht0 hint for the memory at @x.
   * Only x86_64 is implemented; on other architectures this is a no-op.
   */
  static inline void prefetch(const void *x)
  {
  #if defined(__x86_64__)
      const char *target = x;

      __asm__ __volatile__("prefetcht0 %P0" : : "m" (*target));
  #endif
  }
  90. void hash_zone(void *zone, unsigned int length)
  91. {
  92. unsigned long temp = htotal;
  93. while (length >= 8*sizeof(long)) {
  94. prefetch(zone + 384);
  95. temp ^= *(unsigned long *)zone;
  96. temp ^= *(unsigned long *)(zone + sizeof(long));
  97. temp ^= *(unsigned long *)(zone + 2*sizeof(long));
  98. temp ^= *(unsigned long *)(zone + 3*sizeof(long));
  99. temp ^= *(unsigned long *)(zone + 4*sizeof(long));
  100. temp ^= *(unsigned long *)(zone + 5*sizeof(long));
  101. temp ^= *(unsigned long *)(zone + 6*sizeof(long));
  102. temp ^= *(unsigned long *)(zone + 7*sizeof(long));
  103. zone += 8*sizeof(long);
  104. length -= 8*sizeof(long);
  105. }
  106. while (length >= 1) {
  107. temp ^= *(unsigned char *)zone;
  108. zone += 1;
  109. length--;
  110. }
  111. htotal = temp;
  112. }
  /*
   * Round x up to a multiple of align_to. The mask arithmetic only rounds
   * correctly when align_to is a power of two.
   */
  #define ALIGN_UP(x, align_to) \
      (((x) + ((align_to) - 1)) & ~((align_to) - 1))
  /* Same rounding applied to a pointer; the result keeps the type of p. */
  #define ALIGN_PTR_UP(p, ptr_align_to) \
      ((__typeof__(p))ALIGN_UP((unsigned long)(p), ptr_align_to))
  115. static void *mmap_large_buffer(size_t need, size_t *allocated)
  116. {
  117. void *buffer;
  118. size_t sz;
  119. /* Attempt to use huge pages if possible. */
  120. sz = ALIGN_UP(need, map_align);
  121. buffer = mmap(NULL, sz, PROT_READ | PROT_WRITE,
  122. MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
  123. if (buffer == (void *)-1) {
  124. sz = need;
  125. buffer = mmap(NULL, sz, PROT_READ | PROT_WRITE,
  126. MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  127. if (buffer != (void *)-1)
  128. fprintf(stderr, "MAP_HUGETLB attempt failed, look at /sys/kernel/mm/hugepages for optimal performance\n");
  129. }
  130. *allocated = sz;
  131. return buffer;
  132. }
  133. void *child_thread(void *arg)
  134. {
  135. unsigned long total_mmap = 0, total = 0;
  136. struct tcp_zerocopy_receive zc;
  137. unsigned long delta_usec;
  138. int flags = MAP_SHARED;
  139. struct timeval t0, t1;
  140. char *buffer = NULL;
  141. void *raddr = NULL;
  142. void *addr = NULL;
  143. double throughput;
  144. struct rusage ru;
  145. size_t buffer_sz;
  146. int lu, fd;
  147. fd = (int)(unsigned long)arg;
  148. gettimeofday(&t0, NULL);
  149. fcntl(fd, F_SETFL, O_NDELAY);
  150. buffer = mmap_large_buffer(chunk_size, &buffer_sz);
  151. if (buffer == (void *)-1) {
  152. perror("mmap");
  153. goto error;
  154. }
  155. if (zflg) {
  156. raddr = mmap(NULL, chunk_size + map_align, PROT_READ, flags, fd, 0);
  157. if (raddr == (void *)-1) {
  158. perror("mmap");
  159. zflg = 0;
  160. } else {
  161. addr = ALIGN_PTR_UP(raddr, map_align);
  162. }
  163. }
  164. while (1) {
  165. struct pollfd pfd = { .fd = fd, .events = POLLIN, };
  166. int sub;
  167. poll(&pfd, 1, 10000);
  168. if (zflg) {
  169. socklen_t zc_len = sizeof(zc);
  170. int res;
  171. memset(&zc, 0, sizeof(zc));
  172. zc.address = (__u64)((unsigned long)addr);
  173. zc.length = chunk_size;
  174. res = getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE,
  175. &zc, &zc_len);
  176. if (res == -1)
  177. break;
  178. if (zc.length) {
  179. assert(zc.length <= chunk_size);
  180. total_mmap += zc.length;
  181. if (xflg)
  182. hash_zone(addr, zc.length);
  183. /* It is more efficient to unmap the pages right now,
  184. * instead of doing this in next TCP_ZEROCOPY_RECEIVE.
  185. */
  186. madvise(addr, zc.length, MADV_DONTNEED);
  187. total += zc.length;
  188. }
  189. if (zc.recv_skip_hint) {
  190. assert(zc.recv_skip_hint <= chunk_size);
  191. lu = read(fd, buffer, zc.recv_skip_hint);
  192. if (lu > 0) {
  193. if (xflg)
  194. hash_zone(buffer, lu);
  195. total += lu;
  196. }
  197. }
  198. continue;
  199. }
  200. sub = 0;
  201. while (sub < chunk_size) {
  202. lu = read(fd, buffer + sub, chunk_size - sub);
  203. if (lu == 0)
  204. goto end;
  205. if (lu < 0)
  206. break;
  207. if (xflg)
  208. hash_zone(buffer + sub, lu);
  209. total += lu;
  210. sub += lu;
  211. }
  212. }
  213. end:
  214. gettimeofday(&t1, NULL);
  215. delta_usec = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
  216. throughput = 0;
  217. if (delta_usec)
  218. throughput = total * 8.0 / (double)delta_usec / 1000.0;
  219. getrusage(RUSAGE_THREAD, &ru);
  220. if (total > 1024*1024) {
  221. unsigned long total_usec;
  222. unsigned long mb = total >> 20;
  223. total_usec = 1000000*ru.ru_utime.tv_sec + ru.ru_utime.tv_usec +
  224. 1000000*ru.ru_stime.tv_sec + ru.ru_stime.tv_usec;
  225. printf("received %lg MB (%lg %% mmap'ed) in %lg s, %lg Gbit\n"
  226. " cpu usage user:%lg sys:%lg, %lg usec per MB, %lu c-switches\n",
  227. total / (1024.0 * 1024.0),
  228. 100.0*total_mmap/total,
  229. (double)delta_usec / 1000000.0,
  230. throughput,
  231. (double)ru.ru_utime.tv_sec + (double)ru.ru_utime.tv_usec / 1000000.0,
  232. (double)ru.ru_stime.tv_sec + (double)ru.ru_stime.tv_usec / 1000000.0,
  233. (double)total_usec/mb,
  234. ru.ru_nvcsw);
  235. }
  236. error:
  237. munmap(buffer, buffer_sz);
  238. close(fd);
  239. if (zflg)
  240. munmap(raddr, chunk_size + map_align);
  241. pthread_exit(0);
  242. }
  243. static void apply_rcvsnd_buf(int fd)
  244. {
  245. if (rcvbuf && setsockopt(fd, SOL_SOCKET,
  246. SO_RCVBUF, &rcvbuf, sizeof(rcvbuf)) == -1) {
  247. perror("setsockopt SO_RCVBUF");
  248. }
  249. if (sndbuf && setsockopt(fd, SOL_SOCKET,
  250. SO_SNDBUF, &sndbuf, sizeof(sndbuf)) == -1) {
  251. perror("setsockopt SO_SNDBUF");
  252. }
  253. }
  254. static void setup_sockaddr(int domain, const char *str_addr,
  255. struct sockaddr_storage *sockaddr)
  256. {
  257. struct sockaddr_in6 *addr6 = (void *) sockaddr;
  258. struct sockaddr_in *addr4 = (void *) sockaddr;
  259. switch (domain) {
  260. case PF_INET:
  261. memset(addr4, 0, sizeof(*addr4));
  262. addr4->sin_family = AF_INET;
  263. addr4->sin_port = htons(cfg_port);
  264. if (str_addr &&
  265. inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1)
  266. error(1, 0, "ipv4 parse error: %s", str_addr);
  267. break;
  268. case PF_INET6:
  269. memset(addr6, 0, sizeof(*addr6));
  270. addr6->sin6_family = AF_INET6;
  271. addr6->sin6_port = htons(cfg_port);
  272. if (str_addr &&
  273. inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1)
  274. error(1, 0, "ipv6 parse error: %s", str_addr);
  275. break;
  276. default:
  277. error(1, 0, "illegal domain");
  278. }
  279. }
  280. static void do_accept(int fdlisten)
  281. {
  282. pthread_attr_t attr;
  283. int rcvlowat;
  284. pthread_attr_init(&attr);
  285. pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
  286. rcvlowat = chunk_size;
  287. if (setsockopt(fdlisten, SOL_SOCKET, SO_RCVLOWAT,
  288. &rcvlowat, sizeof(rcvlowat)) == -1) {
  289. perror("setsockopt SO_RCVLOWAT");
  290. }
  291. apply_rcvsnd_buf(fdlisten);
  292. while (1) {
  293. struct sockaddr_in addr;
  294. socklen_t addrlen = sizeof(addr);
  295. pthread_t th;
  296. int fd, res;
  297. fd = accept(fdlisten, (struct sockaddr *)&addr, &addrlen);
  298. if (fd == -1) {
  299. perror("accept");
  300. continue;
  301. }
  302. res = pthread_create(&th, &attr, child_thread,
  303. (void *)(unsigned long)fd);
  304. if (res) {
  305. errno = res;
  306. perror("pthread_create");
  307. close(fd);
  308. }
  309. }
  310. }
  /* Each thread should reserve a big enough vma to avoid
   * spinlock collisions in ptl locks.
   * This size is 2MB on x86_64, and is exported in /proc/meminfo.
   *
   * Returns the "Hugepagesize:" value of /proc/meminfo converted from kB
   * to bytes, or 0 when the file cannot be opened or has no such entry.
   */
  static unsigned long default_huge_page_size(void)
  {
      unsigned long kb = 0, bytes = 0;
      size_t cap = 0;
      char *row = NULL;
      FILE *meminfo;

      meminfo = fopen("/proc/meminfo", "r");
      if (meminfo == NULL)
          return 0;

      while (getline(&row, &cap, meminfo) > 0) {
          if (sscanf(row, "Hugepagesize: %lu kB", &kb) != 1)
              continue;
          bytes = kb << 10;
          break;
      }

      /* getline() allocates the line buffer; release it in every case. */
      free(row);
      fclose(meminfo);
      return bytes;
  }
  333. int main(int argc, char *argv[])
  334. {
  335. struct sockaddr_storage listenaddr, addr;
  336. unsigned int max_pacing_rate = 0;
  337. uint64_t total = 0;
  338. char *host = NULL;
  339. int fd, c, on = 1;
  340. size_t buffer_sz;
  341. char *buffer;
  342. int sflg = 0;
  343. int mss = 0;
  344. while ((c = getopt(argc, argv, "46p:svr:w:H:zxkP:M:C:a:")) != -1) {
  345. switch (c) {
  346. case '4':
  347. cfg_family = PF_INET;
  348. cfg_alen = sizeof(struct sockaddr_in);
  349. break;
  350. case '6':
  351. cfg_family = PF_INET6;
  352. cfg_alen = sizeof(struct sockaddr_in6);
  353. break;
  354. case 'p':
  355. cfg_port = atoi(optarg);
  356. break;
  357. case 'H':
  358. host = optarg;
  359. break;
  360. case 's': /* server : listen for incoming connections */
  361. sflg++;
  362. break;
  363. case 'r':
  364. rcvbuf = atoi(optarg);
  365. break;
  366. case 'w':
  367. sndbuf = atoi(optarg);
  368. break;
  369. case 'z':
  370. zflg = 1;
  371. break;
  372. case 'M':
  373. mss = atoi(optarg);
  374. break;
  375. case 'x':
  376. xflg = 1;
  377. break;
  378. case 'k':
  379. keepflag = 1;
  380. break;
  381. case 'P':
  382. max_pacing_rate = atoi(optarg) ;
  383. break;
  384. case 'C':
  385. chunk_size = atol(optarg);
  386. break;
  387. case 'a':
  388. map_align = atol(optarg);
  389. break;
  390. default:
  391. exit(1);
  392. }
  393. }
  394. if (!map_align) {
  395. map_align = default_huge_page_size();
  396. /* if really /proc/meminfo is not helping,
  397. * we use the default x86_64 hugepagesize.
  398. */
  399. if (!map_align)
  400. map_align = 2*1024*1024;
  401. }
  402. if (sflg) {
  403. int fdlisten = socket(cfg_family, SOCK_STREAM, 0);
  404. if (fdlisten == -1) {
  405. perror("socket");
  406. exit(1);
  407. }
  408. apply_rcvsnd_buf(fdlisten);
  409. setsockopt(fdlisten, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
  410. setup_sockaddr(cfg_family, host, &listenaddr);
  411. if (mss &&
  412. setsockopt(fdlisten, IPPROTO_TCP, TCP_MAXSEG,
  413. &mss, sizeof(mss)) == -1) {
  414. perror("setsockopt TCP_MAXSEG");
  415. exit(1);
  416. }
  417. if (bind(fdlisten, (const struct sockaddr *)&listenaddr, cfg_alen) == -1) {
  418. perror("bind");
  419. exit(1);
  420. }
  421. if (listen(fdlisten, 128) == -1) {
  422. perror("listen");
  423. exit(1);
  424. }
  425. do_accept(fdlisten);
  426. }
  427. buffer = mmap_large_buffer(chunk_size, &buffer_sz);
  428. if (buffer == (char *)-1) {
  429. perror("mmap");
  430. exit(1);
  431. }
  432. fd = socket(cfg_family, SOCK_STREAM, 0);
  433. if (fd == -1) {
  434. perror("socket");
  435. exit(1);
  436. }
  437. apply_rcvsnd_buf(fd);
  438. setup_sockaddr(cfg_family, host, &addr);
  439. if (mss &&
  440. setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) {
  441. perror("setsockopt TCP_MAXSEG");
  442. exit(1);
  443. }
  444. if (connect(fd, (const struct sockaddr *)&addr, cfg_alen) == -1) {
  445. perror("connect");
  446. exit(1);
  447. }
  448. if (max_pacing_rate &&
  449. setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE,
  450. &max_pacing_rate, sizeof(max_pacing_rate)) == -1)
  451. perror("setsockopt SO_MAX_PACING_RATE");
  452. if (zflg && setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY,
  453. &on, sizeof(on)) == -1) {
  454. perror("setsockopt SO_ZEROCOPY, (-z option disabled)");
  455. zflg = 0;
  456. }
  457. while (total < FILE_SZ) {
  458. int64_t wr = FILE_SZ - total;
  459. if (wr > chunk_size)
  460. wr = chunk_size;
  461. /* Note : we just want to fill the pipe with 0 bytes */
  462. wr = send(fd, buffer, (size_t)wr, zflg ? MSG_ZEROCOPY : 0);
  463. if (wr <= 0)
  464. break;
  465. total += wr;
  466. }
  467. close(fd);
  468. munmap(buffer, buffer_sz);
  469. return 0;
  470. }