tcp_metrics.c
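/* tcp_metrics.c - per-destination cache of TCP metrics.
 *
 * Remembers RTT, RTTVAR, ssthresh, cwnd and reordering estimates, the last
 * timestamp seen from a peer, and TCP Fast Open state, keyed by peer address
 * in a per-network-namespace hash table.
 */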

#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/jiffies.h>
#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/cache.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/tcp.h>
#include <net/inet_connection_sock.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ipv6.h>
#include <net/dst.h>
#include <net/tcp.h>

int sysctl_tcp_nometrics_save __read_mostly;
enum tcp_metric_index {
	TCP_METRIC_RTT,
	TCP_METRIC_RTTVAR,
	TCP_METRIC_SSTHRESH,
	TCP_METRIC_CWND,
	TCP_METRIC_REORDERING,

	/* Always last. */
	TCP_METRIC_MAX,
};

struct tcp_fastopen_metrics {
	u16	mss;
	struct tcp_fastopen_cookie	cookie;
};
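/* One cache entry per destination address.  Entries are chained per hash
 * bucket, looked up under RCU and inserted/reclaimed under tcp_metrics_lock.
 */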
struct tcp_metrics_block {
	struct tcp_metrics_block __rcu	*tcpm_next;
	struct inetpeer_addr		tcpm_addr;
	unsigned long			tcpm_stamp;
	u32				tcpm_ts;
	u32				tcpm_ts_stamp;
	u32				tcpm_lock;
	u32				tcpm_vals[TCP_METRIC_MAX];
	struct tcp_fastopen_metrics	tcpm_fastopen;
};
static bool tcp_metric_locked(struct tcp_metrics_block *tm,
			      enum tcp_metric_index idx)
{
	return tm->tcpm_lock & (1 << idx);
}

static u32 tcp_metric_get(struct tcp_metrics_block *tm,
			  enum tcp_metric_index idx)
{
	return tm->tcpm_vals[idx];
}

static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm,
				  enum tcp_metric_index idx)
{
	return msecs_to_jiffies(tm->tcpm_vals[idx]);
}

static void tcp_metric_set(struct tcp_metrics_block *tm,
			   enum tcp_metric_index idx,
			   u32 val)
{
	tm->tcpm_vals[idx] = val;
}

static void tcp_metric_set_msecs(struct tcp_metrics_block *tm,
				 enum tcp_metric_index idx,
				 u32 val)
{
	tm->tcpm_vals[idx] = jiffies_to_msecs(val);
}

static bool addr_same(const struct inetpeer_addr *a,
		      const struct inetpeer_addr *b)
{
	const struct in6_addr *a6, *b6;

	if (a->family != b->family)
		return false;
	if (a->family == AF_INET)
		return a->addr.a4 == b->addr.a4;

	a6 = (const struct in6_addr *) &a->addr.a6[0];
	b6 = (const struct in6_addr *) &b->addr.a6[0];

	return ipv6_addr_equal(a6, b6);
}
struct tcpm_hash_bucket {
	struct tcp_metrics_block __rcu	*chain;
};

static DEFINE_SPINLOCK(tcp_metrics_lock);
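/* Seed a cache entry from the route: record which metrics are locked on the
 * dst, copy the raw RTT/RTTVAR/SSTHRESH/CWND/REORDERING values, and clear the
 * remembered timestamp and Fast Open state.
 */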
static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
{
	u32 val;

	val = 0;
	if (dst_metric_locked(dst, RTAX_RTT))
		val |= 1 << TCP_METRIC_RTT;
	if (dst_metric_locked(dst, RTAX_RTTVAR))
		val |= 1 << TCP_METRIC_RTTVAR;
	if (dst_metric_locked(dst, RTAX_SSTHRESH))
		val |= 1 << TCP_METRIC_SSTHRESH;
	if (dst_metric_locked(dst, RTAX_CWND))
		val |= 1 << TCP_METRIC_CWND;
	if (dst_metric_locked(dst, RTAX_REORDERING))
		val |= 1 << TCP_METRIC_REORDERING;
	tm->tcpm_lock = val;

	tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT);
	tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR);
	tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
	tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
	tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
	tm->tcpm_ts = 0;
	tm->tcpm_ts_stamp = 0;
	tm->tcpm_fastopen.mss = 0;
	tm->tcpm_fastopen.cookie.len = 0;
}
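/* Allocate a new entry for @addr seeded from @dst, or, when the caller asks
 * for reclaim, recycle the oldest entry in the bucket instead of growing the
 * chain.  Runs under tcp_metrics_lock.
 */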
static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
					  struct inetpeer_addr *addr,
					  unsigned int hash,
					  bool reclaim)
{
	struct tcp_metrics_block *tm;
	struct net *net;

	spin_lock_bh(&tcp_metrics_lock);
	net = dev_net(dst->dev);
	if (unlikely(reclaim)) {
		struct tcp_metrics_block *oldest;

		oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain);
		for (tm = rcu_dereference(oldest->tcpm_next); tm;
		     tm = rcu_dereference(tm->tcpm_next)) {
			if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
				oldest = tm;
		}
		tm = oldest;
	} else {
		tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
		if (!tm)
			goto out_unlock;
	}
	tm->tcpm_addr = *addr;
	tm->tcpm_stamp = jiffies;

	tcpm_suck_dst(tm, dst);

	if (likely(!reclaim)) {
		tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
		rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm);
	}

out_unlock:
	spin_unlock_bh(&tcp_metrics_lock);
	return tm;
}
#define TCP_METRICS_TIMEOUT		(60 * 60 * HZ)

static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
{
	if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
		tcpm_suck_dst(tm, dst);
}

#define TCP_METRICS_RECLAIM_DEPTH	5
#define TCP_METRICS_RECLAIM_PTR		(struct tcp_metrics_block *) 0x1UL
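/* A lookup that walks more than TCP_METRICS_RECLAIM_DEPTH entries without a
 * match returns the TCP_METRICS_RECLAIM_PTR sentinel rather than NULL,
 * telling tcp_get_metrics() to recycle the oldest entry in that bucket.
 */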
static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
{
	if (tm)
		return tm;
	if (depth > TCP_METRICS_RECLAIM_DEPTH)
		return TCP_METRICS_RECLAIM_PTR;
	return NULL;
}
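/* Look up @addr in bucket @hash of the per-namespace table; caller must hold
 * rcu_read_lock().
 */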
static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr,
						   struct net *net, unsigned int hash)
{
	struct tcp_metrics_block *tm;
	int depth = 0;

	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
	     tm = rcu_dereference(tm->tcpm_next)) {
		if (addr_same(&tm->tcpm_addr, addr))
			break;
		depth++;
	}
	return tcp_get_encode(tm, depth);
}
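/* Look up the entry for a request socket's peer address, refreshing it from
 * @dst if it has gone stale.
 */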
static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
						       struct dst_entry *dst)
{
	struct tcp_metrics_block *tm;
	struct inetpeer_addr addr;
	unsigned int hash;
	struct net *net;

	addr.family = req->rsk_ops->family;
	switch (addr.family) {
	case AF_INET:
		addr.addr.a4 = inet_rsk(req)->rmt_addr;
		hash = (__force unsigned int) addr.addr.a4;
		break;
	case AF_INET6:
		*(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr;
		hash = ipv6_addr_hash(&inet6_rsk(req)->rmt_addr);
		break;
	default:
		return NULL;
	}

	hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);

	net = dev_net(dst->dev);
	hash &= net->ipv4.tcp_metrics_hash_mask;

	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
	     tm = rcu_dereference(tm->tcpm_next)) {
		if (addr_same(&tm->tcpm_addr, &addr))
			break;
	}
	tcpm_check_stamp(tm, dst);
	return tm;
}
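/* Look up the entry for a TIME-WAIT socket's peer address. */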
static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
{
	struct inet6_timewait_sock *tw6;
	struct tcp_metrics_block *tm;
	struct inetpeer_addr addr;
	unsigned int hash;
	struct net *net;

	addr.family = tw->tw_family;
	switch (addr.family) {
	case AF_INET:
		addr.addr.a4 = tw->tw_daddr;
		hash = (__force unsigned int) addr.addr.a4;
		break;
	case AF_INET6:
		tw6 = inet6_twsk((struct sock *)tw);
		*(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr;
		hash = ipv6_addr_hash(&tw6->tw_v6_daddr);
		break;
	default:
		return NULL;
	}

	hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);

	net = twsk_net(tw);
	hash &= net->ipv4.tcp_metrics_hash_mask;

	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
	     tm = rcu_dereference(tm->tcpm_next)) {
		if (addr_same(&tm->tcpm_addr, &addr))
			break;
	}
	return tm;
}
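/* Look up the entry for a full socket's peer address.  With @create, a
 * missing entry is allocated (or the oldest one reclaimed); otherwise a stale
 * entry is merely refreshed from @dst.
 */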
static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
						 struct dst_entry *dst,
						 bool create)
{
	struct tcp_metrics_block *tm;
	struct inetpeer_addr addr;
	unsigned int hash;
	struct net *net;
	bool reclaim;

	addr.family = sk->sk_family;
	switch (addr.family) {
	case AF_INET:
		addr.addr.a4 = inet_sk(sk)->inet_daddr;
		hash = (__force unsigned int) addr.addr.a4;
		break;
	case AF_INET6:
		*(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr;
		hash = ipv6_addr_hash(&inet6_sk(sk)->daddr);
		break;
	default:
		return NULL;
	}

	hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);

	net = dev_net(dst->dev);
	hash &= net->ipv4.tcp_metrics_hash_mask;

	tm = __tcp_get_metrics(&addr, net, hash);
	reclaim = false;
	if (tm == TCP_METRICS_RECLAIM_PTR) {
		reclaim = true;
		tm = NULL;
	}
	if (!tm && create)
		tm = tcpm_new(dst, &addr, hash, reclaim);
	else
		tcpm_check_stamp(tm, dst);

	return tm;
}
/* Save metrics learned by this TCP session.  This function is called
 * only when TCP finishes successfully, i.e. when it enters TIME-WAIT
 * or goes from LAST-ACK to CLOSE.
 */
void tcp_update_metrics(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_metrics_block *tm;
	unsigned long rtt;
	u32 val;
	int m;

	if (sysctl_tcp_nometrics_save || !dst)
		return;

	if (dst->flags & DST_HOST)
		dst_confirm(dst);

	rcu_read_lock();
	if (icsk->icsk_backoff || !tp->srtt) {
		/* This session failed to estimate rtt. Why?
		 * Probably, no packets returned in time.  Reset our
		 * results.
		 */
		tm = tcp_get_metrics(sk, dst, false);
		if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
			tcp_metric_set(tm, TCP_METRIC_RTT, 0);
		goto out_unlock;
	} else
		tm = tcp_get_metrics(sk, dst, true);

	if (!tm)
		goto out_unlock;

	rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
	m = rtt - tp->srtt;

	/* If newly calculated rtt larger than stored one, store new
	 * one. Otherwise, use EWMA. Remember, rtt overestimation is
	 * always better than underestimation.
	 */
	if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
		if (m <= 0)
			rtt = tp->srtt;
		else
			rtt -= (m >> 3);
		tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt);
	}

	if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
		unsigned long var;

		if (m < 0)
			m = -m;

		/* Scale deviation to rttvar fixed point */
		m >>= 1;
		if (m < tp->mdev)
			m = tp->mdev;

		var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
		if (m >= var)
			var = m;
		else
			var -= (var - m) >> 2;

		tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var);
	}

	if (tcp_in_initial_slowstart(tp)) {
		/* Slow start still did not finish. */
		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
			val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
			if (val && (tp->snd_cwnd >> 1) > val)
				tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
					       tp->snd_cwnd >> 1);
		}
		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
			val = tcp_metric_get(tm, TCP_METRIC_CWND);
			if (tp->snd_cwnd > val)
				tcp_metric_set(tm, TCP_METRIC_CWND,
					       tp->snd_cwnd);
		}
	} else if (tp->snd_cwnd > tp->snd_ssthresh &&
		   icsk->icsk_ca_state == TCP_CA_Open) {
		/* Cong. avoidance phase, cwnd is reliable. */
		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
			tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
				       max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
			val = tcp_metric_get(tm, TCP_METRIC_CWND);
			tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1);
		}
	} else {
		/* Else slow start did not finish, cwnd is non-sense,
		 * ssthresh may be also invalid.
		 */
		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
			val = tcp_metric_get(tm, TCP_METRIC_CWND);
			tcp_metric_set(tm, TCP_METRIC_CWND,
				       (val + tp->snd_ssthresh) >> 1);
		}
		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
			val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
			if (val && tp->snd_ssthresh > val)
				tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
					       tp->snd_ssthresh);
		}
		if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
			val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
			if (val < tp->reordering &&
			    tp->reordering != sysctl_tcp_reordering)
				tcp_metric_set(tm, TCP_METRIC_REORDERING,
					       tp->reordering);
		}
	}
	tm->tcpm_stamp = jiffies;
out_unlock:
	rcu_read_unlock();
}
/* Initialize metrics on socket. */
void tcp_init_metrics(struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_metrics_block *tm;
	u32 val;

	if (dst == NULL)
		goto reset;

	dst_confirm(dst);

	rcu_read_lock();
	tm = tcp_get_metrics(sk, dst, true);
	if (!tm) {
		rcu_read_unlock();
		goto reset;
	}

	if (tcp_metric_locked(tm, TCP_METRIC_CWND))
		tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);

	val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
	if (val) {
		tp->snd_ssthresh = val;
		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
			tp->snd_ssthresh = tp->snd_cwnd_clamp;
	} else {
		/* ssthresh may have been reduced unnecessarily during
		 * 3WHS.  Restore it back to its initial default.
		 */
		tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	}
	val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
	if (val && tp->reordering != val) {
		tcp_disable_fack(tp);
		tcp_disable_early_retrans(tp);
		tp->reordering = val;
	}

	val = tcp_metric_get(tm, TCP_METRIC_RTT);
	if (val == 0 || tp->srtt == 0) {
		rcu_read_unlock();
		goto reset;
	}
	/* Initial rtt is determined from SYN,SYN-ACK.
	 * The segment is small and rtt may appear much
	 * less than real one. Use per-dst memory
	 * to make it more realistic.
	 *
	 * A bit of theory. RTT is time passed after "normal" sized packet
	 * is sent until it is ACKed. In normal circumstances sending small
	 * packets force peer to delay ACKs and calculation is correct too.
	 * The algorithm is adaptive and, provided we follow specs, it
	 * NEVER underestimate RTT. BUT! If peer tries to make some clever
	 * tricks sort of "quick acks" for time long enough to decrease RTT
	 * to low value, and then abruptly stops to do it and starts to delay
	 * ACKs, wait for troubles.
	 */
	val = msecs_to_jiffies(val);
	if (val > tp->srtt) {
		tp->srtt = val;
		tp->rtt_seq = tp->snd_nxt;
	}
	val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
	if (val > tp->mdev) {
		tp->mdev = val;
		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
	}
	rcu_read_unlock();

	tcp_set_rto(sk);
reset:
	if (tp->srtt == 0) {
		/* RFC6298: 5.7 We've failed to get a valid RTT sample from
		 * 3WHS. This is most likely due to retransmission,
		 * including spurious one. Reset the RTO back to 3secs
		 * from the more aggressive 1sec to avoid more spurious
		 * retransmission.
		 */
		tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
		inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
	}
	/* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
	 * retransmitted. In light of RFC6298 more aggressive 1sec
	 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
	 * retransmission has occurred.
	 */
	if (tp->total_retrans > 1)
		tp->snd_cwnd = 1;
	else
		tp->snd_cwnd = tcp_init_cwnd(tp, dst);
	tp->snd_cwnd_stamp = tcp_time_stamp;
}
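/* Decide whether the peer behind @req has "proven" itself.  With @paws_check,
 * reject the peer if the remembered timestamp is recent and ahead of the one
 * carried in the SYN (a PAWS violation); without it, require both a cached
 * RTT and a remembered timestamp.
 */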
bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check)
{
	struct tcp_metrics_block *tm;
	bool ret;

	if (!dst)
		return false;

	rcu_read_lock();
	tm = __tcp_get_metrics_req(req, dst);
	if (paws_check) {
		if (tm &&
		    (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
		    (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW)
			ret = false;
		else
			ret = true;
	} else {
		if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp)
			ret = true;
		else
			ret = false;
	}
	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
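/* Prime a new socket's PAWS state from the timestamp remembered for this
 * destination, provided the remembered stamp is recent enough.
 */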
void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
{
	struct tcp_metrics_block *tm;

	rcu_read_lock();
	tm = tcp_get_metrics(sk, dst, true);
	if (tm) {
		struct tcp_sock *tp = tcp_sk(sk);

		if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) {
			tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp;
			tp->rx_opt.ts_recent = tm->tcpm_ts;
		}
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp);
/* VJ's idea. Save last timestamp seen from this destination and hold
 * it at least for normal timewait interval to use for duplicate
 * segment detection in subsequent connections, before they enter
 * synchronized state.
 */
bool tcp_remember_stamp(struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_get(sk);
	bool ret = false;

	if (dst) {
		struct tcp_metrics_block *tm;

		rcu_read_lock();
		tm = tcp_get_metrics(sk, dst, true);
		if (tm) {
			struct tcp_sock *tp = tcp_sk(sk);

			if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 ||
			    ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
			     tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
				tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
				tm->tcpm_ts = tp->rx_opt.ts_recent;
			}
			ret = true;
		}
		rcu_read_unlock();
	}

	return ret;
}
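/* As tcp_remember_stamp(), but for a connection already in TIME-WAIT. */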
bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
{
	struct tcp_metrics_block *tm;
	bool ret = false;

	rcu_read_lock();
	tm = __tcp_get_metrics_tw(tw);
	if (tm) {
		const struct tcp_timewait_sock *tcptw;
		struct sock *sk = (struct sock *) tw;

		tcptw = tcp_twsk(sk);
		if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 ||
		    ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
		     tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
			tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
			tm->tcpm_ts = tcptw->tw_ts_recent;
		}
		ret = true;
	}
	rcu_read_unlock();

	return ret;
}
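/* TCP Fast Open: the MSS and cookie learned from a destination are cached in
 * the same metrics entry; fastopen_seqlock keeps readers consistent against
 * the writer below.
 */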
static DEFINE_SEQLOCK(fastopen_seqlock);

void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
			    struct tcp_fastopen_cookie *cookie)
{
	struct tcp_metrics_block *tm;

	rcu_read_lock();
	tm = tcp_get_metrics(sk, __sk_dst_get(sk), false);
	if (tm) {
		struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
		unsigned int seq;

		do {
			seq = read_seqbegin(&fastopen_seqlock);
			if (tfom->mss)
				*mss = tfom->mss;
			*cookie = tfom->cookie;
		} while (read_seqretry(&fastopen_seqlock, seq));
	}
	rcu_read_unlock();
}
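/* Remember the MSS and (when present) the cookie the peer offered, for later
 * Fast Open attempts to the same destination.
 */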
void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
			    struct tcp_fastopen_cookie *cookie)
{
	struct tcp_metrics_block *tm;

	rcu_read_lock();
	tm = tcp_get_metrics(sk, __sk_dst_get(sk), true);
	if (tm) {
		struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;

		write_seqlock_bh(&fastopen_seqlock);
		tfom->mss = mss;
		if (cookie->len > 0)
			tfom->cookie = *cookie;
		write_sequnlock_bh(&fastopen_seqlock);
	}
	rcu_read_unlock();
}
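/* "tcpmhash_entries=" boot parameter: overrides the hash table sizing done in
 * tcp_net_metrics_init() below.
 */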
static unsigned long tcpmhash_entries;
static int __init set_tcpmhash_entries(char *str)
{
	ssize_t ret;

	if (!str)
		return 0;

	ret = kstrtoul(str, 0, &tcpmhash_entries);
	if (ret)
		return 0;

	return 1;
}
__setup("tcpmhash_entries=", set_tcpmhash_entries);
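/* Per-namespace setup: size the bucket array from the boot parameter or from
 * available memory, allocate it zeroed, and derive the lookup mask.
 */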
static int __net_init tcp_net_metrics_init(struct net *net)
{
	int slots, size;

	slots = tcpmhash_entries;
	if (!slots) {
		if (totalram_pages >= 128 * 1024)
			slots = 16 * 1024;
		else
			slots = 8 * 1024;
	}

	size = slots * sizeof(struct tcpm_hash_bucket);

	net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL);
	if (!net->ipv4.tcp_metrics_hash)
		return -ENOMEM;

	net->ipv4.tcp_metrics_hash_mask = (slots - 1);

	return 0;
}

static void __net_exit tcp_net_metrics_exit(struct net *net)
{
	kfree(net->ipv4.tcp_metrics_hash);
}

static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
	.init	=	tcp_net_metrics_init,
	.exit	=	tcp_net_metrics_exit,
};

void __init tcp_metrics_init(void)
{
	register_pernet_subsys(&tcp_net_metrics_ops);
}