Index: sys/netinet/tcp_output.c =================================================================== diff -u -N -r16a0682b2cb2db4daee841586b59eb7d4eab751e -re20125a686f6f7425367b52cd360e323bb403da4 --- sys/netinet/tcp_output.c (.../tcp_output.c) (revision 16a0682b2cb2db4daee841586b59eb7d4eab751e) +++ sys/netinet/tcp_output.c (.../tcp_output.c) (revision e20125a686f6f7425367b52cd360e323bb403da4) @@ -1,4 +1,3 @@ -/* $MidnightBSD$ */ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. @@ -31,12 +30,11 @@ */ #include -__FBSDID("$FreeBSD: stable/10/sys/netinet/tcp_output.c 317375 2017-04-24 16:31:28Z smh $"); +__FBSDID("$FreeBSD: stable/11/sys/netinet/tcp_output.c 333627 2018-05-15 11:43:05Z ae $"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" -#include "opt_kdtrace.h" #include "opt_tcpdebug.h" #include @@ -57,7 +55,6 @@ #include #include -#include #include #include #include @@ -73,58 +70,61 @@ #ifdef TCP_RFC7413 #include #endif +#include #define TCPOUTFLAGS #include #include #include #include #include +#include +#ifdef TCPPCAP +#include +#endif #ifdef TCPDEBUG #include #endif #ifdef TCP_OFFLOAD #include #endif -#ifdef IPSEC -#include -#endif /*IPSEC*/ +#include #include #include VNET_DEFINE(int, path_mtu_discovery) = 1; -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(path_mtu_discovery), 1, "Enable Path MTU Discovery"); VNET_DEFINE(int, tcp_do_tso) = 1; #define V_tcp_do_tso VNET(tcp_do_tso) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_tso), 0, "Enable TCP Segmentation Offload"); VNET_DEFINE(int, tcp_sendspace) = 1024*32; #define V_tcp_sendspace VNET(tcp_sendspace) -SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size"); VNET_DEFINE(int, tcp_do_autosndbuf) = 1; #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_autosndbuf), 0, "Enable automatic send buffer sizing"); VNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024; #define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_inc), 0, "Incrementor step size of automatic send buffer"); VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024; #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_max), 0, "Max size of automatic send buffer"); @@ -191,7 +191,7 @@ struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; -#ifdef IPSEC +#if defined(IPSEC) || defined(IPSEC_SUPPORT) unsigned ipsec_optlen = 0; #endif int idle, sendalot; @@ -223,7 +223,7 @@ */ if ((tp->t_flags & TF_FASTOPEN) && (tp->t_state == TCPS_SYN_RECEIVED) && - SEQ_GT(tp->snd_max, tp->snd_una) && /* inital SYN|ACK sent */ + SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ (tp->snd_nxt != tp->snd_una)) /* not a retransmit */ return (0); #endif @@ -348,7 +348,7 @@ * to send then the probe will be the FIN * itself. */ - if (off < so->so_snd.sb_cc) + if (off < sbused(&so->so_snd)) flags &= ~TH_FIN; sendwin = 1; } else { @@ -374,7 +374,8 @@ */ if (sack_rxmit == 0) { if (sack_bytes_rxmt == 0) - len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off); + len = ((long)ulmin(sbavail(&so->so_snd), sendwin) - + off); else { long cwin; @@ -383,8 +384,8 @@ * sending new data, having retransmitted all the * data possible in the scoreboard. */ - len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd) - - off); + len = ((long)ulmin(sbavail(&so->so_snd), tp->snd_wnd) - + off); /* * Don't remove this (len > 0) check ! * We explicitly check for len > 0 here (although it @@ -464,7 +465,7 @@ */ len = 0; if ((sendwin == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) && - (off < (int) so->so_snd.sb_cc)) { + (off < (int) sbavail(&so->so_snd))) { tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; @@ -502,20 +503,23 @@ * and does at most one step per received ACK. This fast * scaling has the drawback of growing the send buffer beyond * what is strictly necessary to make full use of a given - * delay*bandwith product. However testing has shown this not + * delay*bandwidth product. However testing has shown this not * to be much of an problem. At worst we are trading wasting - * of available bandwith (the non-use of it) for wasting some + * of available bandwidth (the non-use of it) for wasting some * socket buffer memory. * * TODO: Shrink send buffer during idle periods together * with congestion window. Requires another timer. Has to * wait for upcoming tcp timer rewrite. + * + * XXXGL: should there be used sbused() or sbavail()? */ if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && - so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) && - so->so_snd.sb_cc < V_tcp_autosndbuf_max && - sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) { + sbused(&so->so_snd) >= (so->so_snd.sb_hiwat / 8 * 7) && + sbused(&so->so_snd) < V_tcp_autosndbuf_max && + sendwin >= (sbused(&so->so_snd) - + (tp->snd_nxt - tp->snd_una))) { if (!sbreserve_locked(&so->so_snd, min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc, V_tcp_autosndbuf_max), so, curthread)) @@ -533,29 +537,55 @@ * (except for the sequence number) for all generated packets. This * makes it impossible to transmit any options which vary per generated * segment or packet. + * + * IPv4 handling has a clear separation of ip options and ip header + * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does + * the right thing below to provide length of just ip options and thus + * checking for ipoptlen is enough to decide if ip options are present. */ -#ifdef IPSEC +#if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Pre-calculate here as we save another lookup into the darknesses * of IPsec that way and can actually decide if TSO is ok. */ - ipsec_optlen = ipsec_hdrsiz_tcp(tp); +#ifdef INET6 + if (isipv6 && IPSEC_ENABLED(ipv6)) + ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb); +#ifdef INET + else #endif +#endif /* INET6 */ +#ifdef INET + if (IPSEC_ENABLED(ipv4)) + ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); +#endif /* INET */ +#endif /* IPSEC */ +#ifdef INET6 + if (isipv6) + ipoptlen = ip6_optlen(tp->t_inpcb); + else +#endif + if (tp->t_inpcb->inp_options) + ipoptlen = tp->t_inpcb->inp_options->m_len - + offsetof(struct ipoption, ipopt_list); + else + ipoptlen = 0; +#if defined(IPSEC) || defined(IPSEC_SUPPORT) + ipoptlen += ipsec_optlen; +#endif + if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && ((tp->t_flags & TF_SIGNATURE) == 0) && tp->rcv_numsacks == 0 && sack_rxmit == 0 && -#ifdef IPSEC - ipsec_optlen == 0 && -#endif - tp->t_inpcb->inp_options == NULL && - tp->t_inpcb->in6p_options == NULL) + ipoptlen == 0) tso = 1; if (sack_rxmit) { - if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc)) + if (SEQ_LT(p->rxmit + len, tp->snd_una + sbused(&so->so_snd))) flags &= ~TH_FIN; } else { - if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) + if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + + sbused(&so->so_snd))) flags &= ~TH_FIN; } @@ -585,7 +615,7 @@ */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && - len + off >= so->so_snd.sb_cc && + len + off >= sbavail(&so->so_snd) && (tp->t_flags & TF_NOPUSH) == 0) { goto send; } @@ -647,17 +677,20 @@ oldwin = 0; /* - * If the new window size ends up being the same as the old - * size when it is scaled, then don't force a window update. + * If the new window size ends up being the same as or less + * than the old size when it is scaled, then don't force + * a window update. */ - if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale) + if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale) goto dontupdate; if (adv >= (long)(2 * tp->t_maxseg) && (adv >= (long)(so->so_rcv.sb_hiwat / 4) || recwin <= (long)(so->so_rcv.sb_hiwat / 8) || so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) goto send; + if (2 * adv >= (int32_t)so->so_rcv.sb_hiwat) + goto send; } dontupdate: @@ -713,7 +746,7 @@ * if window is nonzero, transmit what we can, * otherwise force out a byte. */ - if (so->so_snd.sb_cc && !tcp_timer_active(tp, TT_REXMT) && + if (sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { tp->t_rxtshift = 0; tcp_setpersist(tp); @@ -810,8 +843,12 @@ to.to_sacks = (u_char *)tp->sackblks; } } -#ifdef TCP_SIGNATURE +#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* TCP-MD5 (RFC2385). */ + /* + * Check that TCP_MD5SIG is enabled in tcpcb to + * account the size needed to set this TCP option. + */ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* TCP_SIGNATURE */ @@ -820,27 +857,13 @@ hdrlen += optlen = tcp_addoptions(&to, opt); } -#ifdef INET6 - if (isipv6) - ipoptlen = ip6_optlen(tp->t_inpcb); - else -#endif - if (tp->t_inpcb->inp_options) - ipoptlen = tp->t_inpcb->inp_options->m_len - - offsetof(struct ipoption, ipopt_list); - else - ipoptlen = 0; -#ifdef IPSEC - ipoptlen += ipsec_optlen; -#endif - /* * Adjust data length if insertion of options will - * bump the packet length beyond the t_maxopd length. + * bump the packet length beyond the t_maxseg length. * Clear the FIN bit because we cut off the tail of * the segment. */ - if (len + optlen + ipoptlen > tp->t_maxopd) { + if (len + optlen + ipoptlen > tp->t_maxseg) { flags &= ~TH_FIN; if (tso) { @@ -943,8 +966,8 @@ * fractional unless the send sockbuf can be * emptied: */ - max_len = (tp->t_maxopd - optlen); - if ((off + len) < so->so_snd.sb_cc) { + max_len = (tp->t_maxseg - optlen); + if ((off + len) < sbavail(&so->so_snd)) { moff = len % max_len; if (moff != 0) { len -= moff; @@ -973,7 +996,7 @@ sendalot = 1; } else { - len = tp->t_maxopd - optlen - ipoptlen; + len = tp->t_maxseg - optlen - ipoptlen; sendalot = 1; } } else @@ -1060,7 +1083,7 @@ * give data to the user when a buffer fills or * a PUSH comes in.) */ - if ((off + len == so->so_snd.sb_cc) && !(flags & TH_SYN)) + if ((off + len == sbused(&so->so_snd)) && !(flags & TH_SYN)) flags |= TH_PUSH; SOCKBUF_UNLOCK(&so->so_snd); } else { @@ -1083,7 +1106,7 @@ #ifdef INET6 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && MHLEN >= hdrlen) { - MH_ALIGN(m, hdrlen); + M_ALIGN(m, hdrlen); } else #endif m->m_data += max_linkhdr; @@ -1238,20 +1261,32 @@ */ tp->snd_up = tp->snd_una; /* drag it along */ -#ifdef TCP_SIGNATURE - if (to.to_flags & TOF_SIGNATURE) { - int sigoff = to.to_signature - opt; - tcp_signature_compute(m, 0, len, optlen, - (u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND); - } -#endif - /* * Put TCP length in extended header, and then * checksum extended header and data. */ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + +#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) + if (to.to_flags & TOF_SIGNATURE) { + /* + * Calculate MD5 signature and put it into the place + * determined before. + * NOTE: since TCP options buffer doesn't point into + * mbuf's data, calculate offset and use it. + */ + if (!TCPMD5_ENABLED() || (error = TCPMD5_OUTPUT(m, th, + (u_char *)(th + 1) + (to.to_signature - opt))) != 0) { + /* + * Do not send segment if the calculation of MD5 + * digest has failed. + */ + m_freem(m); + goto out; + } + } +#endif #ifdef INET6 if (isipv6) { /* @@ -1281,16 +1316,15 @@ /* * Enable TSO and specify the size of the segments. * The TCP pseudo header checksum is always provided. - * XXX: Fixme: This is currently not the case for IPv6. */ if (tso) { - KASSERT(len > tp->t_maxopd - optlen, + KASSERT(len > tp->t_maxseg - optlen, ("%s: len <= tso_segsz", __func__)); m->m_pkthdr.csum_flags |= CSUM_TSO; - m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen; + m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; } -#ifdef IPSEC +#if defined(IPSEC) || defined(IPSEC_SUPPORT) KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL), ("%s: mbuf chain shorter than expected: %ld + %u + %u - %u != %u", __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL))); @@ -1323,6 +1357,7 @@ ipov->ih_len = save; } #endif /* TCPDEBUG */ + TCP_PROBE3(debug__output, tp, th, m); /* * Fill in IP length and desired time to live and @@ -1331,14 +1366,11 @@ * the template, but need a way to checksum without them. */ /* - * m->m_pkthdr.len should have been set before cksum calcuration, + * m->m_pkthdr.len should have been set before checksum calculation, * because in6_cksum() need it. */ #ifdef INET6 if (isipv6) { - struct route_in6 ro; - - bzero(&ro, sizeof(ro)); /* * we separately set hoplimit for every segment, since the * user might want to change the value via setsockopt. @@ -1354,7 +1386,7 @@ */ ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); - if (V_path_mtu_discovery && tp->t_maxopd > V_tcp_minmss) + if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) tp->t_flags2 |= TF2_PLPMTU_PMTUD; else tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; @@ -1364,24 +1396,26 @@ TCP_PROBE5(send, NULL, tp, ip6, tp, th); +#ifdef TCPPCAP + /* Save packet, if requested. */ + tcp_pcap_add(th, m, &(tp->t_outpkts)); +#endif + /* TODO: IPv6 IP6TOS_ECT bit on */ - error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &ro, + error = ip6_output(m, tp->t_inpcb->in6p_outputopts, + &tp->t_inpcb->inp_route6, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, NULL, tp->t_inpcb); - if (error == EMSGSIZE && ro.ro_rt != NULL) - mtu = ro.ro_rt->rt_mtu; - RO_RTFREE(&ro); + if (error == EMSGSIZE && tp->t_inpcb->inp_route6.ro_rt != NULL) + mtu = tp->t_inpcb->inp_route6.ro_rt->rt_mtu; } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { - struct route ro; - - bzero(&ro, sizeof(ro)); ip->ip_len = htons(m->m_pkthdr.len); #ifdef INET6 if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO) @@ -1395,7 +1429,7 @@ * * NB: Don't set DF on small MTU/MSS to have a safe fallback. */ - if (V_path_mtu_discovery && tp->t_maxopd > V_tcp_minmss) { + if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { ip->ip_off |= htons(IP_DF); tp->t_flags2 |= TF2_PLPMTU_PMTUD; } else { @@ -1407,13 +1441,17 @@ TCP_PROBE5(send, NULL, tp, ip, tp, th); - error = ip_output(m, tp->t_inpcb->inp_options, &ro, +#ifdef TCPPCAP + /* Save packet, if requested. */ + tcp_pcap_add(th, m, &(tp->t_outpkts)); +#endif + + error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, tp->t_inpcb); - if (error == EMSGSIZE && ro.ro_rt != NULL) - mtu = ro.ro_rt->rt_mtu; - RO_RTFREE(&ro); + if (error == EMSGSIZE && tp->t_inpcb->inp_route.ro_rt != NULL) + mtu = tp->t_inpcb->inp_route.ro_rt->rt_mtu; } #endif /* INET */ @@ -1470,7 +1508,7 @@ tp->t_rxtshift = 0; } tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); - } else if (len == 0 && so->so_snd.sb_cc && + } else if (len == 0 && sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { /* @@ -1508,7 +1546,7 @@ tp->t_flags |= TF_SENTFIN; } if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) - tp->snd_max = tp->snd_nxt + len; + tp->snd_max = tp->snd_nxt + xlen; } if (error) { @@ -1540,6 +1578,7 @@ } SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */ switch (error) { + case EACCES: case EPERM: tp->t_softerror = error; return (error); @@ -1616,7 +1655,7 @@ if (tcp_timer_active(tp, TT_REXMT)) panic("tcp_setpersist: retransmit pending"); /* - * Start/restart persistance timer. + * Start/restart persistence timer. */ TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], tcp_persmin, tcp_persmax); @@ -1645,7 +1684,7 @@ int tcp_addoptions(struct tcpopt *to, u_char *optp) { - u_int mask, optlen = 0; + u_int32_t mask, optlen = 0; for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) { if ((to->to_flags & mask) != mask) @@ -1707,7 +1746,6 @@ bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr)); optp += sizeof(to->to_tsecr); break; -#ifdef TCP_SIGNATURE case TOF_SIGNATURE: { int siglen = TCPOLEN_SIGNATURE - 2; @@ -1716,8 +1754,10 @@ optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } - if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE) + if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE) { + to->to_flags &= ~TOF_SIGNATURE; continue; + } optlen += TCPOLEN_SIGNATURE; *optp++ = TCPOPT_SIGNATURE; *optp++ = TCPOLEN_SIGNATURE; @@ -1726,7 +1766,6 @@ *optp++ = 0; break; } -#endif case TOF_SACK: { int sackblks = 0;