rfc9040xml2.original.xml | rfc9040.xml | |||
---|---|---|---|---|
<?xml version='1.0' encoding='utf-8'?> | <?xml version="1.0" encoding="UTF-8"?> | |||
<!-- [rfced] Change log section removed from draft-ietf-tcpm-2140bis-11-manual.txt --> | ||||
<!DOCTYPE rfc SYSTEM "rfc2629.dtd" [ | ||||
<!ENTITY RFC0793 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.0793.xml"> | ||||
<!ENTITY RFC1122 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.1122.xml"> | ||||
<!ENTITY RFC1191 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.1191.xml"> | ||||
<!ENTITY RFC2119 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.2119.xml"> | ||||
<!ENTITY RFC4821 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.4821.xml"> | ||||
<!ENTITY RFC5681 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.5681.xml"> | ||||
<!ENTITY RFC6298 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.6298.xml"> | ||||
<!ENTITY RFC7413 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.7413.xml"> | ||||
<!ENTITY RFC8174 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.8174.xml"> | ||||
<!ENTITY RFC8201 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.8201.xml"> | ||||
<!ENTITY I-D.allman-tcpm-bump-initcwnd SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml3/reference.I-D.draft-allman-tcpm-bump-initcwnd-00.xml"> | ||||
<!ENTITY I-D.ietf-tcpm-generalized-ecn SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml3/reference.I-D.draft-ietf-tcpm-generalized-ecn-07.xml"> | ||||
<!ENTITY I-D.hughes-restart SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml3/reference.I-D.draft-hughes-restart-00.xml"> | ||||
<!ENTITY RFC1644 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.1644.xml"> | ||||
<!ENTITY RFC1379 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.1379.xml"> | ||||
<!ENTITY RFC2001 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.2001.xml"> | ||||
<!ENTITY RFC2140 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.2140.xml"> | ||||
<!ENTITY RFC2414 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.2414.xml"> | ||||
<!ENTITY RFC2663 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.2663.xml"> | ||||
<!ENTITY RFC3390 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.3390.xml"> | ||||
<!ENTITY RFC3124 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.3124.xml"> | ||||
<!ENTITY RFC4340 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.4340.xml"> | ||||
<!ENTITY RFC4960 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.4960.xml"> | ||||
<!ENTITY RFC5925 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.5925.xml"> | ||||
<!ENTITY RFC6437 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.6437.xml"> | ||||
<!ENTITY RFC6691 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.6691.xml"> | ||||
<!ENTITY RFC6928 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.6928.xml"> | ||||
<!ENTITY RFC7231 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.7231.xml"> | ||||
<!ENTITY RFC7323 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.7323.xml"> | ||||
<!ENTITY RFC7424 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.7424.xml"> | ||||
<!ENTITY RFC7540 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.7540.xml"> | ||||
<!ENTITY RFC7661 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.7661.xml"> | ||||
<!ENTITY RFC8684 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.8684.xml"> | ||||
]> | ||||
<rfc submissionType="IETF" docName="draft-ietf-tcpm-2140bis-11" category="info" | ||||
obsoletes="2140" ipr="trust200902"> | ||||
<!-- Generated by id2xml 1.5.0 on 2021-05-03T23:46:00Z --> | ||||
<?rfc strict="yes"?> | ||||
<?rfc compact="yes"?> | ||||
<?rfc subcompact="no"?> | ||||
<?rfc symrefs="yes"?> | ||||
<?rfc sortrefs="no"?> | ||||
<?rfc text-list-symbols="o*+-"?> | ||||
<?rfc toc="yes"?> | ||||
<front> | ||||
<title>TCP Control Block Interdependence</title> | ||||
<author initials="J." surname="Touch" fullname="Joe Touch"> | ||||
<organization abbrev="Independent"></organization> | ||||
<address> | ||||
<postal> | ||||
<street/> | ||||
<city>Manhattan Beach</city> | ||||
<region>CA</region> | ||||
<code>90266</code> | ||||
<country>United States of America</country> | ||||
</postal> | ||||
<phone>+1 (310) 560-0334</phone> | ||||
<email>touch@strayalpha.com</email> | ||||
</address> | ||||
</author> | ||||
<author initials="M." surname="Welzl" fullname="Michael Welzl"> | ||||
<organization>University of Oslo</organization> | ||||
<address> | ||||
<postal> | ||||
<street>PO Box 1080 Blindern</street> | ||||
<city>Oslo</city> | ||||
<region/> | ||||
<code>N-0316</code> | ||||
<country>Norway</country> | ||||
</postal> | ||||
<phone>+47 22 85 24 20</phone> | ||||
<email>michawe@ifi.uio.no</email> | ||||
</address> | ||||
</author> | ||||
<author initials="S." surname="Islam" fullname="Safiqul Islam"> | ||||
<organization>University of Oslo</organization> | ||||
<address><postal><street>PO Box 1080 Blindern</street> | ||||
<street>Oslo N-0316</street> | ||||
<street>Norway</street> | ||||
</postal> | ||||
<phone>+47 22 84 08 37</phone> | ||||
<email>safiquli@ifi.uio.no</email> | ||||
</address> | ||||
</author> | ||||
<date year="2021" month="May"/> | <!DOCTYPE rfc SYSTEM "rfc2629-xhtml.ent"> | |||
<workgroup>TCPM WG</workgroup> | ||||
<!-- [rfced] Please insert any keywords (beyond those that appear in | <rfc xmlns:xi="http://www.w3.org/2001/XInclude" docName="draft-ietf-tcpm-2140bis-11" | |||
the title) for use on https://www.rfc-editor.org/search. --> | number="9040" submissionType="IETF" category="info" consensus="true" obsoletes="2140" | |||
ipr="trust200902" updates="" xml:lang="en" symRefs="true" sortRefs="true" tocInclude="true" | ||||
version="3"> | ||||
<keyword>example</keyword> | <front> | |||
<title>TCP Control Block Interdependence</title> | ||||
<seriesInfo name="RFC" value="9040"/> | ||||
<author initials="J." surname="Touch" fullname="Joe Touch"> | ||||
<organization abbrev="Independent"/> | ||||
<address> | ||||
<postal> | ||||
<street/> | ||||
<city>Manhattan Beach</city> | ||||
<region>CA</region> | ||||
<code>90266</code> | ||||
<country>United States of America</country> | ||||
</postal> | ||||
<phone>+1 (310) 560-0334</phone> | ||||
<email>touch@strayalpha.com</email> | ||||
</address> | ||||
</author> | ||||
<author initials="M." surname="Welzl" fullname="Michael Welzl"> | ||||
<organization>University of Oslo</organization> | ||||
<address> | ||||
<postal> | ||||
<street>PO Box 1080 Blindern</street> | ||||
<city>Oslo</city> | ||||
<region/> | ||||
<code>N-0316</code> | ||||
<country>Norway</country> | ||||
</postal> | ||||
<phone>+47 22 85 24 20</phone> | ||||
<email>michawe@ifi.uio.no</email> | ||||
</address> | ||||
</author> | ||||
<author initials="S." surname="Islam" fullname="Safiqul Islam"> | ||||
<organization>University of Oslo</organization> | ||||
<address> | ||||
<postal> | ||||
<street>PO Box 1080 Blindern</street> | ||||
<street>Oslo N-0316</street> | ||||
<street>Norway</street> | ||||
</postal> | ||||
<phone>+47 22 84 08 37</phone> | ||||
<email>safiquli@ifi.uio.no</email> | ||||
</address> | ||||
</author> | ||||
<date year="2021" month="July"/> | ||||
<workgroup>TCPM WG</workgroup> | ||||
<abstract><t> | <abstract> | |||
<t> | ||||
This memo provides guidance to TCP implementers that is intended to | This memo provides guidance to TCP implementers that is intended to | |||
help improve connection convergence to steady-state operation | help improve connection convergence to steady-state operation | |||
without affecting interoperability. It updates and replaces RFC | without affecting interoperability. It updates and replaces RFC | |||
2140's description of sharing TCP state, as typically represented in | 2140's description of sharing TCP state, as typically represented in | |||
TCP Control Blocks, among similar concurrent or consecutive | TCP Control Blocks, among similar concurrent or consecutive | |||
connections.</t> | connections.</t> | |||
</abstract> | ||||
</front> | ||||
<middle> | ||||
<section anchor="sect-1" numbered="true" toc="default"> | ||||
<name>Introduction</name> | ||||
</abstract> | <t> | |||
</front> | TCP is a connection-oriented reliable transport protocol layered over IP | |||
<xref target="RFC0793" format="default"/>. Each TCP connection maintains | ||||
<middle> | state, usually in a data structure called the "TCP Control Block (TCB)". The | |||
<section title="Introduction" anchor="sect-1"><t> | TCB contains information about the connection state, its associated local | |||
TCP is a connection-oriented reliable transport protocol layered | ||||
over IP <xref target="RFC0793"/>. Each TCP connection maintains state, usually in a | ||||
data structure called the TCP Control Block (TCB). The TCB contains | ||||
information about the connection state, its associated local | ||||
process, and feedback parameters about the connection's transmission | process, and feedback parameters about the connection's transmission | |||
properties. As originally specified and usually implemented, most | properties. As originally specified and usually implemented, most TCB | |||
TCB information is maintained on a per-connection basis. Some | information is maintained on a per-connection basis. Some implementations | |||
implementations share certain TCB information across connections to | share certain TCB information across connections to the same host <xref | |||
the same host <xref target="RFC2140"/>. Such sharing is intended to lead to better | target="RFC2140" format="default"/>. Such sharing is intended to lead to | |||
better overall transient performance, especially for numerous short-lived | ||||
overall transient performance, especially for numerous short-lived | and simultaneous connections, as can be used in the World Wide Web and | |||
and simultaneous connections, as can be used in the World-Wide Web | other applications <xref target="Be94" format="default"/> <xref | |||
and other applications <xref target="Be94"/><xref target="Br02"/>. This sharing of state is | target="Br02" format="default"/>. This sharing of state is intended to help | |||
TCP connections converge to long-term behavior (assuming stable application | ||||
intended to help TCP connections converge to long term behavior | load, i.e., so-called "steady-state") more quickly without affecting TCP | |||
(assuming stable application load, i.e., so-called "steady-state") | interoperability.</t> | |||
more quickly without affecting TCP interoperability.</t> | ||||
<t> | <t> | |||
This document updates RFC 2140's discussion of TCB state sharing and | This document updates RFC 2140's discussion of TCB state sharing and | |||
provides a complete replacement for that document. This state | provides a complete replacement for that document. This state sharing | |||
sharing affects only TCB initialization <xref target="RFC2140"/> and thus has | affects only TCB initialization <xref target="RFC2140" format="default"/> | |||
no | and thus has no effect on the long-term behavior of TCP after a connection | |||
effect on the long-term behavior of TCP after a connection has been | has been established or on interoperability. Path information shared | |||
established nor on interoperability. Path information shared across | across SYN destination port numbers assumes that TCP segments having the | |||
SYN destination port numbers assumes that TCP segments having the | same host-pair experience the same path properties, i.e., that traffic is | |||
same host-pair experience the same path properties, i.e., that | not routed differently based on port numbers or other connection parameters | |||
traffic is not routed differently based on port numbers or other | (also addressed further in <xref target="sect-8.1" format="default"/>). The | |||
connection parameters (also addressed further in <xref target="sect-8.1"/>). | observations about TCB sharing in this document apply similarly to any | |||
The | protocol with congestion state, including the Stream Control Transmission | |||
observations about TCB sharing in this document apply similarly to | Protocol (SCTP) <xref target="RFC4960" format="default"/> and the Datagram | |||
any protocol with congestion state, including SCTP <xref target="RFC4960"/> and | Congestion Control Protocol (DCCP) <xref target="RFC4340" | |||
format="default"/>, as well as to individual subflows in Multipath TCP | ||||
DCCP <xref target="RFC4340"/>, as well as for individual subflows in Multipath TCP | <xref target="RFC8684" format="default"/>.</t> | |||
</section> | ||||
<xref target="RFC8684"/>.</t> | ||||
</section> | ||||
<section title="Conventions Used in This Document" anchor="sect-2"><t> | ||||
The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", | ||||
"SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and | ||||
"OPTIONAL" in this document are to be interpreted as described in | ||||
BCP 14 <xref target="RFC2119"/> <xref target="RFC8174"/> when, and only when, | ||||
they appear in all | ||||
capitals, as shown here.</t> | ||||
<t> | ||||
The core of this document describes behavior that is already | ||||
permitted by TCP standards. As a result, it provides informative | ||||
guidance but does not use normative language, except when quoting | ||||
other documents. Normative language is used in Appendix C as | ||||
examples of requirements for future consideration.</t> | ||||
</section> | ||||
<section title="Terminology" anchor="sect-3"><t> | ||||
The following terminology is used frequently in this document. Items | ||||
preceded with a "+" may be part of the state maintained as TCP | ||||
connection state in the associated connections TCB and are the focus | ||||
of sharing as described in this document. Note that terms are used | ||||
as originally introduced where possible; in some cases, direction is | ||||
indicated with a suffix (_S for send, _R for receive) and in other | ||||
cases spelled out (sendcwnd). | ||||
<list style="hanging" hangIndent="6"> | ||||
<t hangText="+cwnd:">TCP congestion window size <xref target="RFC5681"/></t> | ||||
<t hangText="host:">a source or sink of TCP segments associated with a single IP | ||||
address</t> | ||||
<t hangText="host-pair:">a pair of hosts and their corresponding IP addresses</t> | ||||
<t hangText="+MMS_R:">maximum message size that can be received, the largest | ||||
received transport payload of an IP datagram <xref target="RFC1122"/></t> | ||||
<t hangText="+MMS_S:">maximum message size that can be sent, the largest | ||||
transmitted transport payload of an IP datagram <xref target="RFC1122"/></t> | ||||
<t hangText="path:">an Internet path between the IP addresses of two hosts</t> | ||||
<t hangText="PCB:">protocol control block, the data associated with | <section anchor="sect-2" numbered="true" toc="default"> | |||
a protocol as maintained by an endpoint; a TCP PCB is called a TCB | <name>Conventions Used in This Document</name> | |||
PLPMTUD - packetization-layer path MTU discovery, a mechanism that | ||||
uses transport packets to discover the PMTU <xref | ||||
target="RFC4821"/></t> | ||||
<t hangText="+PMTU:">largest IP datagram that can traverse a path | <t> | |||
<xref target="RFC1191"/><xref target="RFC8201"/></t> | The key words "<bcp14>MUST</bcp14>", "<bcp14>MUST NOT</bcp14>", | |||
"<bcp14>REQUIRED</bcp14>", "<bcp14>SHALL</bcp14>", "<bcp14>SHALL | ||||
NOT</bcp14>", "<bcp14>SHOULD</bcp14>", "<bcp14>SHOULD NOT</bcp14>", | ||||
"<bcp14>RECOMMENDED</bcp14>", "<bcp14>NOT RECOMMENDED</bcp14>", | ||||
"<bcp14>MAY</bcp14>", and "<bcp14>OPTIONAL</bcp14>" in this document are | ||||
to be interpreted as described in BCP 14 <xref target="RFC2119"/> | ||||
<xref target="RFC8174"/> when, and only when, they appear in all capitals, | ||||
as shown here. | ||||
</t> | ||||
<t hangText="PMTUD:">path-layer MTU discovery, a mechanism that | <t> | |||
relies on ICMP error messages to discover the PMTU <xref | The core of this document describes behavior that is already permitted by | |||
target="RFC1191"/><xref target="RFC8201"/></t> | TCP standards. As a result, this document provides informative guidance but does not | |||
use normative language except when quoting other documents. Normative | ||||
language is used in <xref target="sect-c"/> as examples of requirements for | ||||
future consideration.</t> | ||||
</section> | ||||
<t hangText="+RTT:">round-trip time of a TCP packet exchange <xref | <section anchor="sect-3" numbered="true" toc="default"> | |||
target="RFC0793"/></t> | <name>Terminology</name> | |||
<t hangText="+RTTVAR:">variation of round-trip times of a TCP packet | <t> | |||
exchange <xref target="RFC6298"/></t> | The following terminology is used frequently in this document. Items | |||
preceded with a "+" may be part of the state maintained as TCP connection | ||||
state in the TCB of associated connections and are the focus of sharing as | ||||
described in this document. Note that terms are used as originally | ||||
introduced where possible; in some cases, direction is indicated with a | ||||
suffix (_S for send, _R for receive) and in other cases spelled out | ||||
(sendcwnd). | ||||
<t hangText="+rwnd:">TCP receive window size <xref | </t> | |||
target="RFC5681"/></t> | ||||
<t hangText="+sendcwnd:">TCP send-side congestion window (cwnd) size | <dl newline="false" spacing="normal" indent="6"> | |||
<xref target="RFC5681"/></t> | <dt>+cwnd:</dt> | |||
<dd>TCP congestion window size <xref target="RFC5681" format="default"/> | ||||
</dd> | ||||
<dt>host:</dt> | ||||
<dd>a source or sink of TCP segments associated with a single IP | ||||
address</dd> | ||||
<dt>host-pair:</dt> | ||||
<dd>a pair of hosts and their corresponding IP addresses</dd> | ||||
<dt>ISN: | ||||
</dt> | ||||
<dd>Initial Sequence Number | ||||
</dd> | ||||
<dt>+MMS_R:</dt> | ||||
<dd>maximum message size that can be received, the largest | ||||
received transport payload of an IP datagram <xref target="RFC1122" format="default"/></dd> | ||||
<dt>+MMS_S:</dt> | ||||
<dd>maximum message size that can be sent, the largest | ||||
transmitted transport payload of an IP datagram <xref target="RFC1122" format="default"/></dd> | ||||
<dt>path:</dt> | ||||
<dd>an Internet path between the IP addresses of two hosts</dd> | ||||
<t hangText="+sendMSS:">TCP maximum segment size, a value | <dt>PCB:</dt> | |||
<dd>protocol control block, the data associated with a protocol as | ||||
maintained by an endpoint; a TCP PCB is called a "TCB"</dd> | ||||
<dt>PLPMTUD:</dt><dd>packetization-layer path MTU discovery, a mechanism that | ||||
uses transport packets to discover the Path Maximum Transmission Unit (PMTU) <xref target="RFC4821" | ||||
format="default"/></dd> | ||||
<dt>+PMTU:</dt> | ||||
<dd>largest IP datagram that can traverse a path | ||||
<xref target="RFC1191" format="default"/> <xref target="RFC8201" format="default"/></dd> | ||||
<dt>PMTUD:</dt> | ||||
<dd>path-layer MTU discovery, a mechanism that | ||||
relies on ICMP error messages to discover the PMTU <xref target="RFC1191" | ||||
format="default"/> <xref target="RFC8201" format="default"/></dd> | ||||
<dt>+RTT:</dt> | ||||
<dd>round-trip time of a TCP packet exchange <xref target="RFC0793" format="default"/></dd> | ||||
<dt>+RTTVAR:</dt> | ||||
<dd>variation of round-trip times of a TCP packet | ||||
exchange <xref target="RFC6298" format="default"/></dd> | ||||
<dt>+rwnd:</dt> | ||||
<dd>TCP receive window size <xref target="RFC5681" format="default"/></dd> | ||||
<dt>+sendcwnd:</dt> | ||||
<dd>TCP send-side congestion window (cwnd) size | ||||
<xref target="RFC5681" format="default"/></dd> | ||||
<dt>+sendMSS:</dt> | ||||
<dd>TCP maximum segment size, a value | ||||
transmitted in a TCP option that represents the largest TCP user data | transmitted in a TCP option that represents the largest TCP user data | |||
payload that can be received <xref target="RFC6691"/></t> | payload that can be received <xref target="RFC6691" format="default"/></dd> | |||
<t hangText="+ssthresh:">TCP slow-start threshold <xref | <dt>+ssthresh:</dt> | |||
target="RFC5681"/></t> | <dd>TCP slow-start threshold <xref target="RFC5681" format="default"/></dd> | |||
<t hangText="TCB:">TCP Control Block, the data associated with a TCP | <dt>TCB:</dt> | |||
connection as maintained by an endpoint</t> | <dd>TCP Control Block, the data associated with a TCP | |||
connection as maintained by an endpoint</dd> | ||||
<t hangText="TCP-AO:">TCP Authentication Option <xref | <dt>TCP-AO:</dt> | |||
target="RFC5925"/></t> | <dd>TCP Authentication Option <xref target="RFC5925" format="default"/></dd> | |||
<t hangText="TFO:">TCP Fast Open option <xref target="RFC7413"/></t> | <dt>TFO:</dt> | |||
<dd>TCP Fast Open option <xref target="RFC7413" format="default"/></dd> | ||||
<t hangText="+TFO_cookie:">TCP Fast Open cookie, state that is used | <dt>+TFO_cookie:</dt> | |||
as part of the TFO mechanism, when TFO is supported <xref | <dd>TCP Fast Open cookie, state that is used | |||
target="RFC7413"/></t> | as part of the TFO mechanism, when TFO is supported <xref target="RFC7413" format="default"/></dd> | |||
<t hangText="+TFO_failure:">an indication of when TFO option | <dt>+TFO_failure:</dt> | |||
negotiation failed, when TFO is supported</t> | <dd>an indication of when TFO option | |||
negotiation failed, when TFO is supported</dd> | ||||
<t hangText="+TFOinfo:">information cached when a TFO connection is | <dt>+TFOinfo:</dt> | |||
established, which includes the TFO_cookie <xref | <dd>information cached when a TFO connection is | |||
target="RFC7413"/></t> | established, which includes the TFO_cookie <xref target="RFC7413" format="default"/></dd> | |||
</list> | </dl> | |||
</t> | </section> | |||
<section anchor="sect-4" numbered="true" toc="default"> | ||||
</section> | <name>The TCP Control Block (TCB)</name> | |||
<t> | ||||
<section title="The TCP Control Block (TCB)" anchor="sect-4"><t> | ||||
A TCB describes the data associated with each connection, i.e., with | A TCB describes the data associated with each connection, i.e., with | |||
each association of a pair of applications across the network. The | each association of a pair of applications across the network. The | |||
TCB contains at least the following information <xref target="RFC0793"/>:</t> | TCB contains at least the following information <xref target="RFC0793" format="default"/>:</t> | |||
<figure><artwork><![CDATA[ | <ul empty="true"> | |||
Local process state | <li><t>Local process state</t> | |||
pointers to send and receive buffers | <ul empty="true" spacing="compact"> | |||
pointers to retransmission queue and current segment | <li>pointers to send and receive buffers</li> | |||
pointers to Internet Protocol (IP) PCB | <li>pointers to retransmission queue and current segment</li> | |||
Per-connection shared state | <li>pointers to Internet Protocol (IP) PCB</li> | |||
macro-state | </ul> | |||
connection state | </li> | |||
timers | <li><t>Per-connection shared state</t> | |||
flags | <ul empty="true" spacing="compact"> | |||
local and remote host numbers and ports | <li><t>macro-state</t> | |||
TCP option state | <ul empty="true" spacing="compact"> | |||
micro-state | <li>connection state</li> | |||
send and receive window state (size*, current number) | <li>timers</li> | |||
congestion window size (sendcwnd)* | <li>flags</li> | |||
congestion window size threshold (ssthresh)* | <li>local and remote host numbers and ports</li> | |||
max window size seen* | <li>TCP option state</li> | |||
sendMSS# | </ul> | |||
MMS_S# | </li> | |||
MMS_R# | <li><t>micro-state</t> | |||
PMTU# | <ul empty="true" spacing="compact"> | |||
round-trip time and its variation# | <li>send and receive window state (size*, current number)</li> | |||
]]></artwork></figure> | <li>congestion window size (sendcwnd)*</li> | |||
<li>congestion window size threshold (ssthresh)*</li> | ||||
<li>max window size seen*</li> | ||||
<li>sendMSS#</li> | ||||
<li>MMS_S#</li> | ||||
<li>MMS_R#</li> | ||||
<li>PMTU#</li> | ||||
<li>round-trip time and its variation#</li> | ||||
</ul> | ||||
</li> | ||||
</ul> | ||||
</li> | ||||
</ul> | ||||
<t> | <t> | |||
The per-connection information is shown as split into macro-state | The per-connection information is shown as split into macro-state and | |||
and micro-state, terminology borrowed from <xref target="Co91"/>. Macro-state | micro-state, terminology borrowed from <xref target="Co91" | |||
describes the protocol for establishing the initial shared state | format="default"/>. Macro-state describes the protocol for establishing the | |||
about the connection; we include the endpoint numbers and components | initial shared state about the connection; we include the endpoint numbers | |||
(timers, flags) required upon commencement that are later used to | and components (timers, flags) required upon commencement that are later | |||
help maintain that state. Micro-state describes the protocol after a | used to help maintain that state. Micro-state describes the protocol after | |||
connection has been established, to maintain the reliability and | a connection has been established, to maintain the reliability and | |||
congestion control of the data transferred in the connection.</t> | congestion control of the data transferred in the connection.</t> | |||
<t> | <t> | |||
We distinguish two other classes of shared micro-state that are | We distinguish two other classes of shared micro-state that are associated | |||
associated more with host-pairs than with application pairs. One | more with host-pairs than with application pairs. | |||
class is clearly host-pair dependent (shown above as "#", e.g., | ||||
sendMSS, MMS_R, MMS_S, PMTU, RTT), because these parameters are | ||||
defined by the endpoint or endpoint pair (sendMSS, MMS_R, MMS_S, | ||||
RTT) or are already cached and shared on that basis (PMTU | ||||
<xref target="RFC1191"/><xref target="RFC4821"/>). The other is host-pair dependent in its | ||||
aggregate (shown above as "*", e.g., congestion window information, | ||||
current window sizes, etc.) because they depend on the total | ||||
capacity between the two endpoints.</t> | ||||
<t> | One class is clearly host-pair dependent (shown above as "#", e.g., | |||
Not all of the TCB state is necessarily sharable. In particular, | sendMSS, MMS_R, MMS_S, PMTU, RTT), because these parameters are defined by | |||
the endpoint or endpoint pair (of the given example: sendMSS, MMS_R, MMS_S, | ||||
RTT) or are already cached and shared on that basis (of the given example: | ||||
PMTU <xref target="RFC1191" format="default"/> <xref target="RFC4821" | ||||
format="default"/>). | ||||
The other is host-pair dependent in its aggregate (shown above as "*", e.g., | ||||
congestion window information, current window sizes, etc.) because they depend | ||||
on the total capacity between the two endpoints.</t> | ||||
<t> | ||||
Not all of the TCB state is necessarily shareable. In particular, | ||||
some TCP options are negotiated only upon request by the application | some TCP options are negotiated only upon request by the application | |||
layer, so their use may not be correlated across connections. Other | layer, so their use may not be correlated across connections. Other | |||
options negotiate connection-specific parameters, which are | options negotiate connection-specific parameters, which are | |||
similarly not shareable. These are discussed further in Appendix B.</t> | similarly not shareable. These are discussed further in <xref target="sect-b"/>.</t> | |||
<t> | <t> | |||
Finally, we exclude rwnd from further discussion because its value | Finally, we exclude rwnd from further discussion because its value | |||
should depend on the send window size, so it is already addressed by | should depend on the send window size, so it is already addressed by | |||
send window sharing and is not independently affected by sharing.</t> | send window sharing and is not independently affected by sharing.</t> | |||
</section> | ||||
</section> | <section anchor="sect-5" numbered="true" toc="default"> | |||
<name>TCB Interdependence</name> | ||||
<section title="TCB Interdependence" anchor="sect-5"><t> | <t> | |||
There are two cases of TCB interdependence. Temporal sharing occurs | There are two cases of TCB interdependence. Temporal sharing occurs | |||
when the TCB of an earlier (now CLOSED) connection to a host is used | when the TCB of an earlier (now CLOSED) connection to a host is used | |||
to initialize some parameters of a new connection to that same host, | to initialize some parameters of a new connection to that same host, | |||
i.e., in sequence. Ensemble sharing occurs when a currently active | i.e., in sequence. Ensemble sharing occurs when a currently active | |||
connection to a host is used to initialize another (concurrent) | connection to a host is used to initialize another (concurrent) | |||
connection to that host.</t> | connection to that host.</t> | |||
</section> | ||||
<section anchor="sect-6" numbered="true" toc="default"> | ||||
<name>Temporal Sharing</name> | ||||
</section> | <t> | |||
<section title="Temporal Sharing" anchor="sect-6"><t> | ||||
The TCB data cache is accessed in two ways: it is read to initialize | The TCB data cache is accessed in two ways: it is read to initialize | |||
new TCBs and written when more current per-host state is available.</t> | new TCBs and written when more current per-host state is available.</t> | |||
<section anchor="sect-6.1" numbered="true" toc="default"> | ||||
<section title="Initialization of a new TCB" anchor="sect-6.1"><t> | <name>Initialization of a New TCB</name> | |||
<t> | ||||
TCBs for new connections can be initialized using cached context | TCBs for new connections can be initialized using cached context | |||
from past connections as follows:</t> | from past connections as follows:</t> | |||
<figure><artwork><![CDATA[ | <table anchor="TCB_initialization"> | |||
TEMPORAL SHARING - TCB Initialization | <name>Temporal Sharing - TCB Initialization</name> | |||
<thead> | ||||
Cached TCB New TCB | <tr> | |||
-------------------------------------- | <th>Cached TCB</th> | |||
old_MMS_S old_MMS_S or not cached* | <th>New TCB</th> | |||
</tr> | ||||
old_MMS_R old_MMS_R or not cached* | </thead> | |||
<tbody> | ||||
old_sendMSS old_sendMSS | <tr> | |||
<td>old_MMS_S</td> | ||||
old_PMTU old_PMTU+ | <td>old_MMS_S or not cached (2)</td> | |||
</tr> | ||||
old_RTT old_RTT | <tr> | |||
<td>old_MMS_R</td> | ||||
old_RTTVAR old_RTTVAR | <td>old_MMS_R or not cached (2)</td> | |||
</tr> | ||||
old_option (option specific) | <tr> | |||
<td>old_sendMSS</td> | ||||
old_ssthresh old_ssthresh | <td>old_sendMSS</td> | |||
</tr> | ||||
old_sendcwnd old_sendcwnd | <tr> | |||
]]></artwork></figure> | <td>old_PMTU</td> | |||
<td>old_PMTU (1)</td> | ||||
<t> | </tr> | |||
+Note that PMTU is cached at the IP layer <xref target="RFC1191"/><xref targe | <tr> | |||
t="RFC4821"/>. | <td>old_RTT</td> | |||
*Note that some values are not cached when they are computed locally | <td>old_RTT</td> | |||
(MMS_R) or indicated in the connection itself (MMS_S in the SYN).</t> | </tr> | |||
<tr> | ||||
<t> | <td>old_RTTVAR</td> | |||
The table below gives an overview of option-specific information | <td>old_RTTVAR</td> | |||
that can be shared. Additional information on some specific TCP | </tr> | |||
options and sharing is provided in Appendix B.</t> | <tr> | |||
<td>old_option</td> | ||||
<figure><artwork><![CDATA[ | <td>(option specific)</td> | |||
TEMPORAL SHARING - Option Info Initialization | </tr> | |||
<tr> | ||||
<td>old_ssthresh</td> | ||||
<td>old_ssthresh</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_sendcwnd</td> | ||||
<td>old_sendcwnd</td> | ||||
</tr> | ||||
</tbody> | ||||
</table> | ||||
Cached New | <dl> | |||
------------------------------------ | <dt>(1)</dt><dd>Note that PMTU is cached at the IP layer <xref target="RFC1191" | |||
old_TFO_cookie old_TFO_cookie | format="default"/> <xref target="RFC4821" format="default"/>. | |||
</dd> | ||||
<dt>(2)</dt><dd>Note that some values are not cached when they are computed locally | ||||
(MMS_R) or indicated in the connection itself (MMS_S in the SYN).</dd> | ||||
</dl> | ||||
<t> | ||||
old_TFO_failure old_TFO_failure | <xref target="Option_Info_Initialization"/> gives an overview of | |||
]]></artwork> | option-specific information that can be shared. Additional information on | |||
</figure> | some specific TCP options and sharing is provided in <xref | |||
target="sect-b"/>.</t> | ||||
</section> | <table anchor="Option_Info_Initialization"> | |||
<name>Temporal Sharing - Option Info Initialization</name> | ||||
<thead> | ||||
<tr> | ||||
<th>Cached</th> | ||||
<th>New</th> | ||||
</tr> | ||||
</thead> | ||||
<tbody> | ||||
<tr> | ||||
<td>old_TFO_cookie</td> | ||||
<td>old_TFO_cookie</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_TFO_failure</td> | ||||
<td>old_TFO_failure</td> | ||||
</tr> | ||||
</tbody> | ||||
</table> | ||||
<section title="Updates to the TCB cache" anchor="sect-6.2"><t> | </section> | |||
<section anchor="sect-6.2" numbered="true" toc="default"> | ||||
<name>Updates to the TCB Cache</name> | ||||
<t> | ||||
During a connection, the TCB cache can be updated based on events of | During a connection, the TCB cache can be updated based on events of | |||
current connections and their TCBs as they progress over time, as | current connections and their TCBs as they progress over time, as shown in | |||
shown below:</t> | <xref target="Cache_Updates"/>.</t> | |||
<figure><artwork><![CDATA[ | ||||
TEMPORAL SHARING - Cache Updates | ||||
Cached TCB Current TCB when? New Cached TCB | ||||
---------------------------------------------------------- | ||||
old_MMS_S curr_MMS_S OPEN curr_MMS_S | ||||
old_MMS_R curr_MMS_R OPEN curr_MMS_R | ||||
old_sendMSS curr_sendMSS MSSopt curr_sendMSS | ||||
old_PMTU curr_PMTU PMTUD+ / curr_PMTU | ||||
PLPMTUD+ | ||||
old_RTT curr_RTT CLOSE merge(curr,old) | ||||
old_RTTVAR curr_RTTVAR CLOSE merge(curr,old) | ||||
old_option curr_option ESTAB (depends on option) | ||||
old_ssthresh curr_ssthresh CLOSE merge(curr,old) | ||||
old_sendcwnd curr_sendcwnd CLOSE merge(curr,old) | ||||
]]></artwork> | ||||
</figure> | ||||
<t> | <table anchor="Cache_Updates"> | |||
+Note that PMTU is cached at the IP layer <xref target="RFC1191"/><xref targe | <name>Temporal Sharing - Cache Updates</name> | |||
t="RFC4821"/>.</t> | <thead> | |||
<tr> | ||||
<th>Cached TCB</th> | ||||
<th>Current TCB</th> | ||||
<th>When?</th> | ||||
<th>New Cached TCB</th> | ||||
</tr> | ||||
</thead> | ||||
<tbody> | ||||
<tr> | ||||
<td>old_MMS_S</td> | ||||
<td>curr_MMS_S</td> | ||||
<td>OPEN</td> | ||||
<td>curr_MMS_S</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_MMS_R</td> | ||||
<td>curr_MMS_R</td> | ||||
<td>OPEN</td> | ||||
<td>curr_MMS_R</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_sendMSS</td> | ||||
<td>curr_sendMSS</td> | ||||
<td>MSSopt</td> | ||||
<td>curr_sendMSS</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_PMTU</td> | ||||
<td>curr_PMTU</td> | ||||
<td>PMTUD (1) / PLPMTUD (1)</td> | ||||
<td>curr_PMTU</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_RTT</td> | ||||
<td>curr_RTT</td> | ||||
<td>CLOSE</td> | ||||
<td>merge(curr,old)</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_RTTVAR</td> | ||||
<td>curr_RTTVAR</td> | ||||
<td>CLOSE</td> | ||||
<td>merge(curr,old)</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_option</td> | ||||
<td>curr_option</td> | ||||
<td>ESTAB</td> | ||||
<td>(depends on option)</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_ssthresh</td> | ||||
<td>curr_ssthresh</td> | ||||
<td>CLOSE</td> | ||||
<td>merge(curr,old)</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_sendcwnd</td> | ||||
<td>curr_sendcwnd</td> | ||||
<td>CLOSE</td> | ||||
<td>merge(curr,old)</td> | ||||
</tr> | ||||
</tbody> | ||||
</table> | ||||
<t> | <dl> | |||
<dt>(1)</dt><dd>Note that PMTU is cached at the IP layer <xref target="RFC1191" | ||||
format="default"/> <xref target="RFC4821" format="default"/>.</dd> | ||||
</dl> | ||||
<t> | ||||
Merge() is the function that combines the current and previous (old) | Merge() is the function that combines the current and previous (old) | |||
values and may vary for each parameter of the TCB cache. The | values and may vary for each parameter of the TCB cache. The | |||
particular function is not specified in this document; examples | particular function is not specified in this document; examples | |||
include windowed averages (mean of the past N values, for some N) | include windowed averages (mean of the past N values, for some N) | |||
and exponential decay (new = (1-alpha)*old + alpha*new, where alpha | and exponential decay (new = (1-alpha)*old + alpha*new, where alpha | |||
is in the range [0..1]).</t> | is in the range [0..1]).</t> | |||
<t> | ||||
<xref target="Option_Info_Updates"/> gives an overview of option-specific | ||||
information that can be similarly shared. The TFO cookie is maintained | ||||
until the client explicitly requests it be updated as a separate event.</t> | ||||
<t> | <table anchor="Option_Info_Updates"> | |||
The table below gives an overview of option-specific information | <name>Temporal Sharing - Option Info Updates</name> | |||
that can be similarly shared. The TFO cookie is maintained until the | <thead> | |||
client explicitly requests it be updated as a separate event.</t> | <tr> | |||
<th>Cached</th> | ||||
<figure><artwork><![CDATA[ | <th>Current</th> | |||
TEMPORAL SHARING - Option Info Updates | <th>When?</th> | |||
<th>New Cached</th> | ||||
Cached Current when? New Cached | </tr> | |||
--------------------------------------------------------- | </thead> | |||
old_TFO_cookie old_TFO_cookie ESTAB old_TFO_cookie | <tbody> | |||
<tr> | ||||
old_TFO_failure old_TFO_failure ESTAB old_TFO_failure | <td>old_TFO_cookie</td> | |||
]]></artwork> | <td>old_TFO_cookie</td> | |||
</figure> | <td>ESTAB</td> | |||
<td>old_TFO_cookie</td> | ||||
</section> | </tr> | |||
<tr> | ||||
<section title="Discussion" anchor="sect-6.3"><t> | <td>old_TFO_failure</td> | |||
As noted, there is no particular benefit to caching MMS_S and MMS_R | <td>old_TFO_failure</td> | |||
as these are reported by the local IP stack. Caching sendMSS and | <td>ESTAB</td> | |||
PMTU is trivial; reported values are cached (PMTU at the IP layer), | <td>old_TFO_failure</td> | |||
and the most recent values are used. The cache is updated when the | </tr> | |||
MSS option is received in a SYN or after PMTUD (i.e., when an ICMPv4 | </tbody> | |||
Fragmentation Needed <xref target="RFC1191"/> or ICMPv6 Packet Too Big messag | </table> | |||
e is | ||||
received <xref target="RFC8201"/> or the equivalent is inferred, e.g., as fro | ||||
m | ||||
PLPMTUD <xref target="RFC4821"/>), respectively, so the cache always has the | ||||
most | ||||
recent values from any connection. For sendMSS, the cache is | ||||
consulted only at connection establishment and not otherwise | ||||
updated, which means that MSS options do not affect current | ||||
connections. The default sendMSS is never saved; only reported MSS | ||||
values update the cache, so an explicit override is required to | ||||
reduce the sendMSS. Cached sendMSS affects only data sent in the SYN | ||||
segment, i.e., during client connection initiation or during | ||||
simultaneous open; all other segment MSS are based on the value | ||||
updated as included in the SYN.</t> | ||||
<t> | </section> | |||
RTT values are updated by formulae that merges the old and new | <section anchor="sect-6.3" numbered="true" toc="default"> | |||
values, as noted in <xref target="sect-6.2"/>. Dynamic RTT estimation require | <name>Discussion</name> | |||
s a | <t> | |||
sequence of RTT measurements. As a result, the cached RTT (and its | As noted, there is no particular benefit to caching MMS_S and MMS_R as | |||
variation) is an average of its previous value with the contents of | these are reported by the local IP stack. Caching sendMSS and PMTU is | |||
the currently active TCB for that host, when a TCB is closed. RTT | trivial; reported values are cached (PMTU at the IP layer), and the most | |||
values are updated only when a connection is closed. The method for | recent values are used. The cache is updated when the MSS option is | |||
merging old and current values needs to attempt to reduce the | received in a SYN or after PMTUD (i.e., when an ICMPv4 Fragmentation Needed | |||
transient effects of the new connections.</t> | <xref target="RFC1191" format="default"/> or ICMPv6 Packet Too Big message | |||
is received <xref target="RFC8201" format="default"/> or the equivalent is | ||||
inferred, e.g., as from PLPMTUD <xref target="RFC4821" format="default"/>), | ||||
respectively, so the cache always has the most recent values from any | ||||
connection. For sendMSS, the cache is consulted only at connection | ||||
establishment and not otherwise updated, which means that MSS options do | ||||
not affect current connections. The default sendMSS is never saved; only | ||||
reported MSS values update the cache, so an explicit override is required | ||||
to reduce the sendMSS. Cached sendMSS affects only data sent in the SYN | ||||
segment, i.e., during client connection initiation or during simultaneous | ||||
open; the MSS of all other segments are constrained by the value updated as | ||||
included in the SYN. | ||||
</t> | ||||
<t> | <t> | |||
The updates for RTT, RTTVAR and ssthresh rely on existing | RTT values are updated by formulae that merge the old and new values, as | |||
noted in <xref target="sect-6.2" format="default"/>. Dynamic RTT estimation | ||||
requires a sequence of RTT measurements. As a result, the cached RTT (and | ||||
its variation) is an average of its previous value with the contents of the | ||||
currently active TCB for that host, when a TCB is closed. RTT values are | ||||
updated only when a connection is closed. The method for merging old and | ||||
current values needs to attempt to reduce the transient effects of the new | ||||
connections.</t> | ||||
<t> | ||||
The updates for RTT, RTTVAR, and ssthresh rely on existing | ||||
information, i.e., old values. Should no such values exist, the | information, i.e., old values. Should no such values exist, the | |||
current values are cached instead.</t> | current values are cached instead.</t> | |||
<t> | ||||
<t> | ||||
TCP options are copied or merged depending on the details of each | TCP options are copied or merged depending on the details of each | |||
option. E.g., TFO state is updated when a connection is established | option. For example, TFO state is updated when a connection is established | |||
and read before establishing a new connection.</t> | and read before establishing a new connection.</t> | |||
<t> | <t> | |||
Sections 8 and 9 discuss compatibility issues and implications of | Sections <xref target="sect-8" format="counter"/> and <xref target="sect-9" | |||
sharing the specific information listed above. <xref target="sect-10"/> gives | format="counter"/> discuss compatibility issues and implications of sharing | |||
an | the specific information listed above. <xref target="sect-10" | |||
overview of known implementations.</t> | format="default"/> gives an overview of known implementations.</t> | |||
<t> | ||||
<t> | Most cached TCB values are updated when a connection closes. The exceptions | |||
Most cached TCB values are updated when a connection closes. The | are MMS_R and MMS_S, which are reported by IP <xref target="RFC1122" | |||
exceptions are MMS_R and MMS_S, which are reported by IP <xref target="RFC112 | format="default"/>; PMTU, which is updated after Path MTU Discovery and | |||
2"/>, | also reported by IP <xref target="RFC1191" format="default"/> <xref | |||
PMTU which is updated after Path MTU Discovery and also reported by | target="RFC4821" format="default"/> <xref target="RFC8201" | |||
IP <xref target="RFC1191"/><xref target="RFC4821"/><xref target="RFC8201"/>, | format="default"/>; and sendMSS, which is updated if the MSS option is | |||
and sendMSS, which is updated if the | received in the TCP SYN header.</t> | |||
MSS option is received in the TCP SYN header.</t> | <t> | |||
<t> | ||||
Sharing sendMSS information affects only data in the SYN of the next | Sharing sendMSS information affects only data in the SYN of the next | |||
connection, because sendMSS information is typically included in | connection, because sendMSS information is typically included in | |||
most TCP SYN segments. Caching PMTU can accelerate the efficiency of | most TCP SYN segments. Caching PMTU can accelerate the efficiency of | |||
PMTUD but can also result in black-holing until corrected if in | PMTUD but can also result in black-holing until corrected if in | |||
error. Caching MMS_R and MMS_S may be of little direct value as they | error. Caching MMS_R and MMS_S may be of little direct value as they | |||
are reported by the local IP stack anyway.</t> | are reported by the local IP stack anyway.</t> | |||
<t> | <t> | |||
The way in which other TCP option state can be shared depends on the | The way in which state related to other TCP options can be shared depends on | |||
details of that option. E.g., TFO state includes the TCP Fast Open | the | |||
Cookie <xref target="RFC7413"/> or, in case TFO fails, a negative TCP Fast Op | details of that option. For example, TFO state includes the TCP Fast Open | |||
en | cookie <xref target="RFC7413" format="default"/> or, in case TFO fails, a neg | |||
response. RFC 7413 states, "The client MUST cache negative responses from the | ative TCP Fast Open | |||
server in order to avoid potential connection failures. Negative responses incl | response. RFC 7413 states, </t> | |||
ude the server not acknowledging the data in the SYN, ICMP error messages, and ( | ||||
most importantly) no response (SYN-ACK) from the server at all, i.e., connection | ||||
timeout." [RFC 7413]. TFOinfo is cached when a connection is established.</t> | ||||
<t> | ||||
Other TCP option state might not be as readily cached. E.g., TCP-AO | ||||
<xref target="RFC5925"/> success or failure between a host pair for a single | ||||
SYN | ||||
destination port might be usefully cached. TCP-AO success or failure | ||||
to other SYN destination ports on that host pair is never useful to | ||||
cache because TCP-AO security parameters can vary per service.</t> | ||||
</section> | ||||
</section> | ||||
<section title="Ensemble Sharing" anchor="sect-7"><t> | <blockquote>The client <bcp14>MUST</bcp14> cache negative responses from the | |||
server in order to avoid potential connection failures. Negative responses include | ||||
the server not acknowledging the data in the SYN, ICMP error messages, and (most | ||||
importantly) no response (SYN-ACK) from the server at all, i.e., connection | ||||
timeout. | ||||
</blockquote> | ||||
<t>TFOinfo is cached when a connection is established.</t> | ||||
<t> | ||||
State related to other TCP options might not be as readily cached. For | ||||
example, TCP-AO <xref target="RFC5925" format="default"/> success or | ||||
failure between a host-pair for a single SYN destination port might be | ||||
usefully cached. TCP-AO success or failure to other SYN destination ports | ||||
on that host-pair is never useful to cache because TCP-AO security | ||||
parameters can vary per service.</t> | ||||
</section> | ||||
</section> | ||||
<section anchor="sect-7" numbered="true" toc="default"> | ||||
<name>Ensemble Sharing</name> | ||||
<t> | ||||
Sharing cached TCB data across concurrent connections requires | Sharing cached TCB data across concurrent connections requires | |||
attention to the aggregate nature of some of the shared state. For | attention to the aggregate nature of some of the shared state. For | |||
example, although MSS and RTT values can be shared by copying, it | example, although MSS and RTT values can be shared by copying, it | |||
may not be appropriate to simply copy congestion window or ssthresh | may not be appropriate to simply copy congestion window or ssthresh | |||
information; instead, the new values can be a function (f) of the | information; instead, the new values can be a function (f) of the | |||
cumulative values and the number of connections (N).</t> | cumulative values and the number of connections (N).</t> | |||
<section anchor="sect-7.1" numbered="true" toc="default"> | ||||
<section title="Initialization of a new TCB" anchor="sect-7.1"><t> | <name>Initialization of a New TCB</name> | |||
<t> | ||||
TCBs for new connections can be initialized using cached context | TCBs for new connections can be initialized using cached context | |||
from concurrent connections as follows:</t> | from concurrent connections as follows:</t> | |||
<figure><artwork><![CDATA[ | <table anchor="TCB_Initialization"> | |||
ENSEMBLE SHARING - TCB Initialization | <name>Ensemble Sharing - TCB Initialization</name> | |||
<thead> | ||||
Cached TCB New TCB | <tr> | |||
------------------------------------------ | <th>Cached TCB</th> | |||
old_MMS_S old_MMS_S | <th>New TCB</th> | |||
</tr> | ||||
old_MMS_R old_MMS_R | </thead> | |||
<tbody> | ||||
old_sendMSS old_sendMSS | <tr> | |||
<td>old_MMS_S</td> | ||||
old_PMTU old_PMTU+ | <td>old_MMS_S</td> | |||
</tr> | ||||
old_RTT old_RTT | <tr> | |||
<td>old_MMS_R</td> | ||||
old_RTTVAR old_RTTVAR | <td>old_MMS_R</td> | |||
</tr> | ||||
sum(old_ssthresh) f(sum(old_ssthresh), N) | <tr> | |||
<td>old_sendMSS</td> | ||||
sum(old_sendcwnd) f(sum(old_sendcwnd), N) | <td>old_sendMSS</td> | |||
_ | </tr> | |||
old_option (option specific) | <tr> | |||
]]></artwork> | <td>old_PMTU</td> | |||
</figure> | <td>old_PMTU (1)</td> | |||
</tr> | ||||
<t> | <tr> | |||
+Note that PMTU is cached at the IP layer <xref target="RFC1191"/><xref targe | <td>old_RTT</td> | |||
t="RFC4821"/>.</t> | <td>old_RTT</td> | |||
</tr> | ||||
<tr> | ||||
<td>old_RTTVAR</td> | ||||
<td>old_RTTVAR</td> | ||||
</tr> | ||||
<tr> | ||||
<td>sum(old_ssthresh)</td> | ||||
<td>f(sum(old_ssthresh), N)</td> | ||||
</tr> | ||||
<tr> | ||||
<td>sum(old_sendcwnd)</td> | ||||
<td>f(sum(old_sendcwnd), N)</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_option</td> | ||||
<td>(option specific)</td> | ||||
</tr> | ||||
</tbody> | ||||
</table> | ||||
<t> | <dl> | |||
In the table, the cached sum() is a total across all active | <dt>(1)</dt> | |||
connections because these parameters act in aggregate; similarly f() | <dd>Note that PMTU is cached at the IP layer <xref target="RFC1191" format="defa | |||
ult"/> <xref target="RFC4821" format="default"/>.</dd> | ||||
</dl> | ||||
<t> | ||||
In <xref target="TCB_Initialization"/>, the cached sum() is a total across all | ||||
active | ||||
connections because these parameters act in aggregate; similarly, f() | ||||
is a function that updates that sum based on the new connection's | is a function that updates that sum based on the new connection's | |||
values, represented as "N".</t> | values, represented as "N".</t> | |||
<t> | ||||
<xref target="Ensemble_Option_Info_Initialization"/> gives an overview of | ||||
option-specific information that can be similarly shared. Again, the | ||||
TFO_cookie is updated upon explicit client request, which is a separate | ||||
event.</t> | ||||
<t> | <table anchor="Ensemble_Option_Info_Initialization"> | |||
The table below gives an overview of option-specific information | <name>Ensemble Sharing - Option Info Initialization</name> | |||
that can be similarly shared. Again, The TFO_cookie is updated upon | <thead> | |||
explicit client request, which is a separate event.</t> | <tr> | |||
<th>Cached</th> | ||||
<figure><artwork><![CDATA[ | <th>New</th> | |||
ENSEMBLE SHARING - Option Info Initialization | </tr> | |||
</thead> | ||||
Cached New | <tbody> | |||
------------------------------------ | <tr> | |||
old_TFO_cookie old_TFO_cookie | <td>old_TFO_cookie</td> | |||
<td>old_TFO_cookie</td> | ||||
old_TFO_failure old_TFO_failure | </tr> | |||
]]></artwork> | <tr> | |||
</figure> | <td>old_TFO_failure</td> | |||
<td>old_TFO_failure</td> | ||||
</section> | </tr> | |||
</tbody> | ||||
<section title="Updates to the TCB cache" anchor="sect-7.2"><t> | </table> | |||
During a connection, the TCB cache can be updated based on changes | ||||
to concurrent connections and their TCBs, as shown below:</t> | ||||
<figure><artwork><![CDATA[ | ||||
ENSEMBLE SHARING - Cache Updates | ||||
Cached TCB Current TCB when? New Cached TCB | ||||
--------------------------------------------------------------- | ||||
old_MMS_S curr_MMS_S OPEN curr_MMS_S | ||||
old_MMS_R curr_MMS_R OPEN curr_MMS_R | ||||
old_sendMSS curr_sendMSS MSSopt curr_sendMSS | ||||
old_PMTU curr_PMTU PMTUD+ / curr_PMTU | ||||
PLPMTUD+ | ||||
old_RTT curr_RTT update rtt_update(old, curr) | ||||
old_RTTVAR curr_RTTVAR update rtt_update(old, curr) | ||||
old_ssthresh curr_ssthresh update adjust sum as appropriate | ||||
old_sendcwnd curr_sendcwnd update adjust sum as appropriate | ||||
old_option curr_option (depends) (option specific) | ||||
]]></artwork> | ||||
</figure> | ||||
<t> | </section> | |||
+Note that the PMTU is cached at the IP layer <xref target="RFC1191"/><xref t | <section anchor="sect-7.2" numbered="true" toc="default"> | |||
arget="RFC4821"/>.</t> | <name>Updates to the TCB Cache</name> | |||
<t> | ||||
During a connection, the TCB cache can be updated based on changes to | ||||
concurrent connections and their TCBs, as shown below:</t> | ||||
<t> | <table anchor="Ensemble_Cache_Updates"> | |||
In the table, rtt_update() is the function used to combine old and | <name>Ensemble Sharing - Cache Updates</name> | |||
current values, e.g., as a windowed average or exponentially decayed | <thead> | |||
average.</t> | <tr> | |||
<th>Cached TCB</th> | ||||
<th>Current TCB</th> | ||||
<th>When?</th> | ||||
<th>New Cached TCB</th> | ||||
</tr> | ||||
</thead> | ||||
<tbody> | ||||
<tr> | ||||
<td>old_MMS_S</td> | ||||
<td>curr_MMS_S</td> | ||||
<td>OPEN</td> | ||||
<td>curr_MMS_S</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_MMS_R</td> | ||||
<td>curr_MMS_R</td> | ||||
<td>OPEN</td> | ||||
<td>curr_MMS_R</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_sendMSS</td> | ||||
<td>curr_sendMSS</td> | ||||
<td>MSSopt</td> | ||||
<td>curr_sendMSS</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_PMTU</td> | ||||
<td>curr_PMTU</td> | ||||
<td>PMTUD (1) / PLPMTUD (1)</td> | ||||
<td>curr_PMTU</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_RTT</td> | ||||
<td>curr_RTT</td> | ||||
<td>update</td> | ||||
<td>rtt_update(old, curr)</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_RTTVAR</td> | ||||
<td>curr_RTTVAR</td> | ||||
<td>update</td> | ||||
<td>rtt_update(old, curr)</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_ssthresh</td> | ||||
<td>curr_ssthresh</td> | ||||
<td>update</td> | ||||
<td>adjust sum as appropriate</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_sendcwnd</td> | ||||
<td>curr_sendcwnd</td> | ||||
<td>update</td> | ||||
<td>adjust sum as appropriate</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_option</td> | ||||
<td>curr_option</td> | ||||
<td>(depends)</td> | ||||
<td>(option specific)</td> | ||||
</tr> | ||||
</tbody> | ||||
</table> | ||||
<t> | <dl> | |||
The table below gives an overview of option-specific information | <dt>(1)</dt> | |||
<dd>Note that the PMTU is cached at the IP layer <xref target="RFC1191" | ||||
format="default"/> <xref target="RFC4821" format="default"/>.</dd> | ||||
</dl> | ||||
<t> | ||||
In <xref target="Ensemble_Cache_Updates"/>, rtt_update() is the function | ||||
used to combine old and current values, e.g., as a windowed average or | ||||
exponentially decayed average.</t> | ||||
<t> | ||||
<xref target="Ensemble_Option_Info_Updates"/> gives an overview of | ||||
option-specific information | ||||
that can be similarly shared.</t> | that can be similarly shared.</t> | |||
<figure><artwork><![CDATA[ | <table anchor="Ensemble_Option_Info_Updates"> | |||
ENSEMBLE SHARING - Option Info Updates | <name>Ensemble Sharing - Option Info Updates</name> | |||
<thead> | ||||
Cached Current when? New Cached | <tr> | |||
---------------------------------------------------------- | <th>Cached</th> | |||
old_TFO_cookie old_TFO_cookie ESTAB old_TFO_cookie | <th>Current</th> | |||
<th>When?</th> | ||||
old_TFO_failure old_TFO_failure ESTAB old_TFO_failure | <th>New Cached</th> | |||
]]></artwork></figure> | </tr> | |||
</thead> | ||||
<tbody> | ||||
<tr> | ||||
<td>old_TFO_cookie</td> | ||||
<td>old_TFO_cookie</td> | ||||
<td>ESTAB</td> | ||||
<td>old_TFO_cookie</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_TFO_failure</td> | ||||
<td>old_TFO_failure</td> | ||||
<td>ESTAB</td> | ||||
<td>old_TFO_failure</td> | ||||
</tr> | ||||
</tbody> | ||||
</table> | ||||
</section> | </section> | |||
<section anchor="sect-7.3" numbered="true" toc="default"> | ||||
<name>Discussion</name> | ||||
<section title="Discussion" anchor="sect-7.3"><t> | <t> | |||
For ensemble sharing, TCB information should be cached as early as | For ensemble sharing, TCB information should be cached as early as | |||
possible, sometimes before a connection is closed. Otherwise, | possible, sometimes before a connection is closed. Otherwise, | |||
opening multiple concurrent connections may not result in TCB data | opening multiple concurrent connections may not result in TCB data | |||
sharing if no connection closes before others open. The amount of | sharing if no connection closes before others open. The amount of | |||
work involved in updating the aggregate average should be minimized, | work involved in updating the aggregate average should be minimized, | |||
but the resulting value should be equivalent to having all values | but the resulting value should be equivalent to having all values | |||
measured within a single connection. The function "rtt_update" in | measured within a single connection. | |||
the ensemble sharing table indicates this operation, which occurs | ||||
whenever the RTT would have been updated in the individual TCP | ||||
connection. As a result, the cache contains the shared RTT | ||||
variables, which no longer need to reside in the TCB.</t> | ||||
<t> | The function "rtt_update" in <xref target="Ensemble_Cache_Updates" | |||
format="default"/> indicates this operation, which occurs whenever the RTT | ||||
would have been updated in the individual TCP connection. As a result, the | ||||
cache contains the shared RTT variables, which no longer need to reside in the | ||||
TCB.</t> | ||||
<t> | ||||
Congestion window size and ssthresh aggregation are more complicated | Congestion window size and ssthresh aggregation are more complicated | |||
in the concurrent case. When there is an ensemble of connections, we | in the concurrent case. When there is an ensemble of connections, we | |||
need to decide how that ensemble would have shared these variables, | need to decide how that ensemble would have shared these variables, | |||
in order to derive initial values for new TCBs.</t> | in order to derive initial values for new TCBs.</t> | |||
<t> | ||||
<t> | Sections <xref target="sect-8" format="counter"/> and <xref target="sect-9" | |||
Sections 8 and 9 discuss compatibility issues and implications of | format="counter"/> discuss compatibility issues and implications of sharing | |||
sharing the specific information listed above.</t> | the specific information listed above.</t> | |||
<t> | ||||
<t> | There are several ways to initialize the congestion window in a new TCB | |||
There are several ways to initialize the congestion window in a new | among an ensemble of current connections to a host. Current TCP | |||
TCB among an ensemble of current connections to a host. Current TCP | implementations initialize it to 4 segments as standard <xref | |||
implementations initialize it to four segments as standard <xref target="RFC3 | target="RFC3390" format="default"/> and 10 segments experimentally <xref | |||
390"/> | target="RFC6928" format="default"/>. These approaches assume that new | |||
and 10 segments experimentally <xref target="RFC6928"/>. These approaches ass | connections should behave as conservatively as possible. The algorithm | |||
ume | described in <xref target="Ba12" format="default"/> adjusts the initial | |||
that new connections should behave as conservatively as possible. | cwnd depending on the cwnd values of ongoing connections. It is also | |||
The algorithm described in <xref target="Ba12"/> adjusts the initial cwnd dep | possible to use sharing mechanisms over long timescales to adapt TCP's | |||
ending | initial window automatically, as described further in <xref | |||
on the cwnd values of ongoing connections. It is also possible to | target="sect-c"/>.</t> | |||
use sharing mechanisms over long timescales to adapt TCP's initial | </section> | |||
window automatically, as described further in Appendix C.</t> | </section> | |||
<section anchor="sect-8" numbered="true" toc="default"> | ||||
</section> | <name>Issues with TCB Information Sharing</name> | |||
<t> | ||||
</section> | ||||
<section title="Issues with TCB information sharing" anchor="sect-8"><t> | ||||
Here, we discuss various types of problems that may arise with TCB | Here, we discuss various types of problems that may arise with TCB | |||
information sharing.</t> | information sharing.</t> | |||
<t> | ||||
<t> | ||||
For the congestion and current window information, the initial | For the congestion and current window information, the initial | |||
values computed by TCB interdependence may not be consistent with | values computed by TCB interdependence may not be consistent with | |||
the long-term aggregate behavior of a set of concurrent connections | the long-term aggregate behavior of a set of concurrent connections | |||
between the same endpoints. Under conventional TCP congestion | between the same endpoints. | |||
control, if the congestion window of a single existing connection | ||||
has converged to 40 segments, two newly joining concurrent | ||||
connections assume initial windows of 10 segments <xref target="RFC6928"/>, a | ||||
nd the | ||||
current connection's window doesn't decrease to accommodate this | ||||
additional load and connections can mutually interfere. One example | ||||
of this is seen on low-bandwidth, high-delay links, where concurrent | ||||
connections supporting Web traffic can collide because their initial | ||||
windows were too large, even when set at one segment.</t> | ||||
<t> | Under conventional TCP congestion control, if the congestion window of a | |||
The authors of <xref target="Hu12"/> recommend caching ssthresh for temporal | single existing connection has converged to 40 segments, two newly joining | |||
sharing only when flows are long. Some studies suggest that sharing | concurrent connections will assume initial windows of 10 segments <xref | |||
ssthresh between short flows can deteriorate the performance of | target="RFC6928"/> and the existing connection's window will not decrease | |||
individual connections [Hu12, <xref target="Du16"/>], although this may benef | to accommodate this additional load. As a consequence, the three | |||
it | connections can mutually interfere. | |||
aggregate network performance.</t> | ||||
<section title="Traversing the same network path" anchor="sect-8.1"><t> | One example of this is seen on low-bandwidth, high-delay links, where | |||
concurrent connections supporting Web traffic can collide because their | ||||
initial windows were too large, even when set at 1 segment.</t> | ||||
<t> | ||||
The authors of <xref target="Hu12" format="default"/> recommend caching | ||||
ssthresh for temporal sharing only when flows are long. Some studies | ||||
suggest that sharing ssthresh between short flows can deteriorate the | ||||
performance of individual connections <xref target="Hu12"/> <xref | ||||
target="Du16" format="default"/>, although this may benefit aggregate | ||||
network performance.</t> | ||||
<section anchor="sect-8.1" numbered="true" toc="default"> | ||||
<name>Traversing the Same Network Path</name> | ||||
<t> | ||||
TCP is sometimes used in situations where packets of the same host-pair do | TCP is sometimes used in situations where packets of the same host-pair do | |||
not always take the same path, such as when connection- specific parameters | not always take the same path, such as when connection-specific parameters | |||
are used for routing (e.g., for load balancing). Multipath routing that | are used for routing (e.g., for load balancing). Multipath routing that | |||
relies on examining transport headers, such as ECMP and LAG <xref target="RFC | relies on examining transport headers, such as ECMP and Link Aggregation | |||
7424"/>, may | Group (LAG) <xref target="RFC7424" format="default"/>, may not result in | |||
not result in repeatable path selection when TCP segments are encapsulated, | repeatable path selection when TCP segments are encapsulated, encrypted, or | |||
encrypted, or altered - for example, in some Virtual Private Network (VPN) | altered -- for example, in some Virtual Private Network (VPN) tunnels that | |||
tunnels that rely on proprietary encapsulation. Similarly, such approaches | rely on proprietary encapsulation. Similarly, such approaches cannot | |||
cannot operate deterministically when the TCP header is encrypted, e.g., | operate deterministically when the TCP header is encrypted, e.g., when | |||
when using IPsec ESP (although TCB interdependence among the entire set | using IPsec Encapsulating Security Payload (ESP) (although TCB | |||
sharing the same endpoint IP addresses should work without problems when | interdependence among the entire set sharing the same endpoint IP addresses | |||
the TCP header is encrypted). Measures to increase the probability that | should work without problems when the TCP header is encrypted). Measures to | |||
connections use the same path could be applied: e.g., the connections could | increase the probability that connections use the same path could be | |||
be given the same IPv6 flow label <xref target="RFC6437"/>. TCB interdependen | applied; for example, the connections could be given the same IPv6 flow | |||
ce can also | label <xref target="RFC6437" format="default"/>. TCB interdependence can | |||
be extended to sets of host IP address pairs that share the same network | also be extended to sets of host IP address pairs that share the same | |||
path conditions, such as when a group of addresses is on the same LAN (see | network path conditions, such as when a group of addresses is on the same | |||
<xref target="sect-9"/>).</t> | LAN (see <xref target="sect-9" format="default"/>).</t> | |||
<t> | ||||
Traversing the same path is not important for host-specific information | ||||
(e.g., rwnd), TCP option state (e.g., TFOinfo), or for information that is | ||||
already cached per-host (e.g., path MTU). | ||||
<t> | ||||
Traversing the same path is not important for host-specific | ||||
information such as rwnd and TCP option state, such as TFOinfo, or | ||||
for information that is already cached per-host, such as path MTU. | ||||
When TCB information is shared across different SYN destination | When TCB information is shared across different SYN destination | |||
ports, path-related information can be incorrect; however, the | ports, path-related information can be incorrect; however, the | |||
impact of this error is potentially diminished if (as discussed | impact of this error is potentially diminished if (as discussed | |||
here) TCB sharing affects only the transient event of a connection | here) TCB sharing affects only the transient event of a connection | |||
start or if TCB information is shared only within connections to the | start or if TCB information is shared only within connections to the | |||
same SYN destination port.</t> | same SYN destination port.</t> | |||
<t> | ||||
In the case of temporal sharing, TCB information could also become invalid | ||||
over time, i.e., although the path remains the same, path | ||||
properties have changed. Because this is similar to the case when a | ||||
connection becomes idle, mechanisms that address idle TCP connections | ||||
(e.g., <xref target="RFC7661" format="default"/>) could also be applied to | ||||
TCB cache management, especially when TCP Fast Open is used <xref | ||||
target="RFC7413" format="default"/>.</t> | ||||
</section> | ||||
<section anchor="sect-8.2" numbered="true" toc="default"> | ||||
<name>State Dependence</name> | ||||
<t> | <t> | |||
In case of Temporal Sharing, TCB information could also become | There may be additional considerations to the way in which TCB | |||
invalid over time, i.e., indicating that although the path remains | interdependence rebalances congestion feedback among the current | |||
the same, path properties have changed. Because this is similar to | connections. For example, it may be appropriate to consider the impact of a | |||
the case when a connection becomes idle, mechanisms that address | connection being in Fast Recovery <xref target="RFC5681" format="default"/> | |||
idle TCP connections (e.g., <xref target="RFC7661"/>) could also be applied t | or some other similar unusual feedback state that could inhibit or affect the | |||
o TCB | calculations described herein. | |||
cache management, especially when TCP Fast Open is used <xref target="RFC7413 | </t> | |||
"/>.</t> | </section> | |||
<section anchor="sect-8.3" numbered="true" toc="default"> | ||||
</section> | <name>Problems with Sharing Based on IP Address</name> | |||
<t> | ||||
<section title="State dependence" anchor="sect-8.2"><t> | ||||
There may be additional considerations to the way in which TCB | ||||
interdependence rebalances congestion feedback among the current | ||||
connections, e.g., it may be appropriate to consider the impact of a | ||||
connection being in Fast Recovery <xref target="RFC5681"/> or some other simi | ||||
lar | ||||
unusual feedback state, e.g., as inhibiting or affecting the | ||||
calculations described herein.</t> | ||||
</section> | ||||
<section title="Problems with sharing based on IP address" anchor="sect-8 | It can be wrong to share TCB information between TCP connections on the | |||
.3"><t> | same host as identified by the IP address if an IP address is assigned to a | |||
It can be wrong to share TCB information between TCP connections on | new host (e.g., IP address spinning, as is used by ISPs to inhibit running | |||
the same host as identified by the IP address if an IP address is | servers). | |||
assigned to a new host (e.g., IP address spinning, as is used by | ||||
ISPs to inhibit running servers). It can be wrong if Network Address | ||||
(and Port) Translation (NA(P)T) <xref target="RFC2663"/> or any other IP shar | ||||
ing | ||||
mechanism is used. Such mechanisms are less likely to be used with | ||||
IPv6. Other methods to identify a host could also be considered to | ||||
make correct TCB sharing more likely. Moreover, some TCB information | ||||
is about dominant path properties rather than the specific host. IP | ||||
addresses may differ, yet the relevant part of the path may be the | ||||
same.</t> | ||||
</section> | It can be wrong if Network Address Translation (NAT) <xref target="RFC2663" | |||
format="default"/>, Network Address and Port Translation (NAPT) <xref | ||||
target="RFC2663" format="default"/>, or any other IP sharing mechanism is | ||||
used. | ||||
</section> | Such mechanisms are less likely to be used with IPv6. Other methods to | |||
identify a host could also be considered to make correct TCB sharing more | ||||
likely. Moreover, some TCB information is about dominant path properties | ||||
rather than the specific host. IP addresses may differ, yet the relevant | ||||
part of the path may be the same.</t> | ||||
</section> | ||||
<section title="Implications" anchor="sect-9"><t> | </section> | |||
<section anchor="sect-9" numbered="true" toc="default"> | ||||
<name>Implications</name> | ||||
<t> | ||||
There are several implications to incorporating TCB interdependence in TCP | There are several implications to incorporating TCB interdependence in TCP | |||
implementations. First, it may reduce the need for application-layer | implementations. First, it may reduce the need for application-layer | |||
multiplexing for performance enhancement <xref target="RFC7231"/>. Protocols | multiplexing for performance enhancement <xref target="RFC7231" format="defau | |||
like HTTP/2 | lt"/>. Protocols like HTTP/2 | |||
<xref target="RFC7540"/> avoid connection reestablishment costs by serializin | <xref target="RFC7540" format="default"/> avoid connection re-establishment c | |||
g or | osts by serializing or | |||
multiplexing a set of per-host connections across a single TCP | multiplexing a set of per-host connections across a single TCP | |||
connection. This avoids TCP's per-connection OPEN handshake and also avoids | connection. This avoids TCP's per-connection OPEN handshake and also avoids | |||
recomputing the MSS, RTT, and congestion window values. By avoiding the | recomputing the MSS, RTT, and congestion window values. By avoiding the | |||
so-called "slow-start restart", performance can be optimized <xref target="I- D.hughes-restart"/>. TCB | so-called "slow-start restart", performance can be optimized <xref target="I- D.hughes-restart" format="default"/>. TCB | |||
interdependence can provide the "slow-start restart avoidance" of | interdependence can provide the "slow-start restart avoidance" of | |||
multiplexing, without requiring a multiplexing mechanism at the application | multiplexing, without requiring a multiplexing mechanism at the application | |||
layer.</t> | layer.</t> | |||
<t> | ||||
<t> | Like the initial version of this document <xref target="RFC2140" | |||
Like the initial version of this document <xref target="RFC2140"/>, this upda | format="default"/>, this update's approach to TCB interdependence focuses | |||
te's | on sharing a set of TCBs by updating the TCB state to reduce the impact of | |||
approach to TCB interdependence focuses on sharing a set of TCBs by | transients when connections begin, end, or otherwise significantly change | |||
updating the TCB state to reduce the impact of transients when | state. | |||
connections begin, end, or otherwise significantly change state. | ||||
Other mechanisms have since been proposed to continuously share | ||||
information between all ongoing communication (including | ||||
connectionless protocols), updating the congestion state during any | ||||
congestion-related event (e.g., timeout, loss confirmation, etc.) | ||||
<xref target="RFC3124"/>. By dealing exclusively with transients, the approac | ||||
h in | ||||
this document is more likely to exhibit the "steady-state" behavior | ||||
as unmodified, independent TCP connections.</t> | ||||
<section title="Layering" anchor="sect-9.1"><t> | Other mechanisms have since been proposed to continuously share information | |||
TCB interdependence pushes some of the TCP implementation from the | between all ongoing communication (including connectionless protocols) and | |||
traditional transport layer (in the ISO model), to the network | update the congestion state during any congestion-related event (e.g., | |||
layer. This acknowledges that some state is in fact per-host-pair or | timeout, loss confirmation, etc.) <xref target="RFC3124" | |||
can be per-path as indicated solely by that host-pair. Transport | format="default"/>. | |||
protocols typically manage per-application-pair associations (per | ||||
stream), and network protocols manage per-host-pair and path | ||||
associations (routing). Round-trip time, MSS, and congestion | ||||
information could be more appropriately handled at the network | ||||
layer, aggregated among concurrent connections, and shared across | ||||
connection instances <xref target="RFC3124"/>.</t> | ||||
<t> | By dealing exclusively with transients, the approach in this document is | |||
An earlier version of RTT sharing suggested implementing RTT state | more likely to exhibit the same "steady-state" behavior as unmodified, | |||
at the IP layer, rather than at the TCP layer. Our observations | independent TCP connections.</t> | |||
describe sharing state among TCP connections, which avoids some of | <section anchor="sect-9.1" numbered="true" toc="default"> | |||
the difficulties in an IP-layer solution. One such problem of an IP | <name>Layering</name> | |||
layer solution is determining the correspondence between packet | ||||
exchanges using IP header information alone, where such | ||||
correspondence is needed to compute RTT. Because TCB sharing | ||||
computes RTTs inside the TCP layer using TCP header information, it | ||||
can be implemented more directly and simply than at the IP layer. | ||||
This is a case where information should be computed at the transport | ||||
layer but could be shared at the network layer.</t> | ||||
</section> | <t> | |||
<section title="Other possibilities" anchor="sect-9.2"><t> | TCB interdependence pushes some of the TCP implementation from its typical | |||
Per-host-pair associations are not the limit of these techniques. It | placement solely within the transport layer (in the ISO model) to the | |||
is possible that TCBs could be similarly shared between hosts on a | network layer. | |||
subnet or within a cluster, because the predominant path can be | ||||
subnet-subnet, rather than host-host. Additionally, TCB | This acknowledges that some components of state are, in fact, per-host-pair | |||
interdependence can be applied to any protocol with congestion | or can be per-path as indicated solely by that host-pair. | |||
state, including SCTP <xref target="RFC4960"/> and DCCP <xref target="RFC4340 | ||||
"/>, as well as for | Transport protocols typically manage per-application-pair associations (per | |||
individual subflows in Multipath TCP <xref target="RFC8684"/>.</t> | stream), and network protocols manage per-host-pair and path associations | |||
(routing). Round-trip time, MSS, and congestion information could be more | ||||
appropriately handled at the network layer, aggregated among concurrent | ||||
connections, and shared across connection instances <xref target="RFC3124" | ||||
format="default"/>.</t> | ||||
<t> | ||||
An earlier version of RTT sharing suggested implementing RTT state at the | ||||
IP layer rather than at the TCP layer. Our observations describe sharing | ||||
state among TCP connections, which avoids some of the difficulties in an | ||||
IP-layer solution. One such problem of an IP-layer solution is determining | ||||
the correspondence between packet exchanges using IP header information | ||||
alone, where such correspondence is needed to compute RTT. Because TCB | ||||
sharing computes RTTs inside the TCP layer using TCP header information, it | ||||
can be implemented more directly and simply than at the IP layer. This is | ||||
a case where information should be computed at the transport layer but | ||||
could be shared at the network layer.</t> | ||||
</section> | ||||
<section anchor="sect-9.2" numbered="true" toc="default"> | ||||
<name>Other Possibilities</name> | ||||
<t> | ||||
Per-host-pair associations are not the limit of these techniques. It is | ||||
possible that TCBs could be similarly shared between hosts on a subnet or | ||||
within a cluster, because the predominant path can be subnet-subnet rather | ||||
than host-host. Additionally, TCB interdependence can be applied to any | ||||
protocol with congestion state, including SCTP <xref target="RFC4960" | ||||
format="default"/> and DCCP <xref target="RFC4340" format="default"/>, as | ||||
well as to individual subflows in Multipath TCP <xref target="RFC8684" | ||||
format="default"/>.</t> | ||||
<t> | ||||
<t> | ||||
There may be other information that can be shared between concurrent | There may be other information that can be shared between concurrent | |||
connections. For example, knowing that another connection has just | connections. For example, knowing that another connection has just | |||
tried to expand its window size and failed, a connection may not | tried to expand its window size and failed, a connection may not | |||
attempt to do the same for some period. The idea is that existing | attempt to do the same for some period. The idea is that existing | |||
TCP implementations infer the behavior of all competing connections, | TCP implementations infer the behavior of all competing connections, | |||
including those within the same host or subnet. One possible | including those within the same host or subnet. One possible | |||
optimization is to make that implicit feedback explicit, via | optimization is to make that implicit feedback explicit, via | |||
extended information associated with the endpoint IP address and its | extended information associated with the endpoint IP address and its | |||
TCP implementation, rather than per-connection state in the TCB.</t> | TCP implementation, rather than per-connection state in the TCB.</t> | |||
<t> | ||||
<t> | ||||
This document focuses on sharing TCB information at connection | This document focuses on sharing TCB information at connection | |||
initialization. Subsequent to RFC 2140, there have been numerous approaches | initialization. Subsequent to RFC 2140, there have been numerous approaches | |||
that attempt to coordinate ongoing state across concurrent connections, | that attempt to coordinate ongoing state across concurrent connections, | |||
both within TCP and other congestion-reactive protocols, which are | both within TCP and other congestion-reactive protocols, which are | |||
summarized in <xref target="Is18"/>. These approaches are more complex to imp | summarized in <xref target="Is18" format="default"/>. These approaches are | |||
lement and | more complex to implement, and their comparison to steady-state TCP | |||
their comparison to steady-state TCP equivalence can be more difficult to | equivalence can be more difficult to establish, sometimes intentionally | |||
establish, sometimes intentionally (i.e., they sometimes intend to provide | (i.e., they sometimes intend to provide a different kind of "fairness" than | |||
a different kind of "fairness" than emerges from TCP operation).</t> | emerges from TCP operation).</t> | |||
</section> | ||||
</section> | </section> | |||
</section> | ||||
<section title="Implementation Observations" anchor="sect-10"><t> | ||||
The observation that some TCB state is host-pair specific rather | ||||
than application-pair dependent is not new and is a common | ||||
engineering decision in layered protocol implementations. Although | ||||
now deprecated, T/TCP <xref target="RFC1644"/> was the first to propose using | ||||
caches in order to maintain TCB states (see Appendix A).</t> | ||||
<t> | ||||
The table below describes the current implementation status for TCB | ||||
temporal sharing in Windows as of December 2020, Apple variants | ||||
(macOS, iOS, iPadOS, tvOS, watchOS) as of January 2021, Linux kernel | ||||
version 5.10.3, and FreeBSD 12. Ensemble sharing is not yet | ||||
implemented.</t> | ||||
<figure><artwork><![CDATA[ | ||||
KNOWN IMPLEMENTATION STATUS | ||||
TCB data Status | ||||
------------------------------------------------------------ | ||||
old_MMS_S Not shared | ||||
old_MMS_R Not shared | ||||
old_sendMSS Cached and shared in Apple, Linux (MSS) | ||||
old_PMTU Cached and shared in Apple, FreeBSD, Windows (PMTU) | ||||
old_RTT Cached and shared in Apple, FreeBSD, Linux, Windows | ||||
old_RTTVAR Cached and shared in Apple, FreeBSD, Windows | ||||
old_TFOinfo Cached and shared in Apple, Linux, Windows | ||||
old_sendcwnd Not shared | ||||
old_ssthresh Cached and shared in Apple, FreeBSD*, Linux* | <section anchor="sect-10" numbered="true" toc="default"> | |||
<name>Implementation Observations</name> | ||||
<t> | ||||
The observation that some TCB state is host-pair specific rather than | ||||
application-pair dependent is not new and is a common engineering decision | ||||
in layered protocol implementations. Although now deprecated, T/TCP <xref | ||||
target="RFC1644" format="default"/> was the first to propose using caches | ||||
in order to maintain TCB states (see <xref target="sect-a"/>).</t> | ||||
<t> | ||||
<xref target="Known_Implementation_Status"/> describes the current | ||||
implementation status for TCB temporal sharing in Windows as of | ||||
December 2020, Apple variants (macOS, iOS, iPadOS, tvOS, and watchOS) | ||||
as of January 2021, Linux kernel version 5.10.3, and FreeBSD | ||||
12. Ensemble sharing is not yet implemented.</t> | ||||
TFO failure Cached and shared in Apple | <table anchor="Known_Implementation_Status"> | |||
]]></artwork> | <name>KNOWN IMPLEMENTATION STATUS</name> | |||
</figure> | <thead> | |||
<tr> | ||||
<th>TCB data</th> | ||||
<th>Status</th> | ||||
</tr> | ||||
</thead> | ||||
<tbody> | ||||
<tr> | ||||
<td>old_MMS_S</td> | ||||
<td>Not shared</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_MMS_R</td> | ||||
<td>Not shared</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_sendMSS</td> | ||||
<td>Cached and shared in Apple, Linux (MSS)</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_PMTU</td> | ||||
<td>Cached and shared in Apple, FreeBSD, Windows (PMTU)</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_RTT</td> | ||||
<td>Cached and shared in Apple, FreeBSD, Linux, Windows</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_RTTVAR</td> | ||||
<td>Cached and shared in Apple, FreeBSD, Windows</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_TFOinfo</td> | ||||
<td>Cached and shared in Apple, Linux, Windows</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_sendcwnd</td> | ||||
<td>Not shared</td> | ||||
</tr> | ||||
<tr> | ||||
<td>old_ssthresh</td> | ||||
<td>Cached and shared in Apple, FreeBSD*, Linux*</td> | ||||
</tr> | ||||
<tr> | ||||
<td>TFO failure</td> | ||||
<td>Cached and shared in Apple</td> | ||||
</tr> | ||||
</tbody> | ||||
</table> | ||||
<t> | <dl> | |||
In the table above, "Apple" refers to all Apple OSes, i.e., | <dt>*</dt> | |||
desktop/laptop macOS, phone iOS, pad iPadOS, video player tvOS, and | <dd>Note: | |||
watch watchOS, which all share the same Internet protocol stack.</t> | ||||
<t> | In FreeBSD, new ssthresh is the mean of curr_ssthresh and its previous value | |||
*Note: In FreeBSD, new ssthresh is the mean of curr_ssthresh and | if a previous value exists; in Linux, the calculation depends on state and is | |||
previous value if a previous value exists; in Linux, the calculation | max(curr_cwnd/2, old_ssthresh) in most cases.</dd> | |||
depends on state and is max(curr_cwnd/2, old_ssthresh) in most | </dl> | |||
cases.</t> | ||||
</section> | <t>In <xref target="Known_Implementation_Status"/>, "Apple" refers to all | |||
Apple OSes, i.e., macOS (desktop/laptop), iOS (phone), iPadOS (tablet), tvOS | ||||
(video player), and watchOS (smart watch), which all share the same Internet | ||||
protocol stack. | ||||
</t> | ||||
<section title="Changes Compared to RFC 2140" anchor="sect-11"><t> | </section> | |||
This document updates the description of TCB sharing in RFC 2140 and | <section anchor="sect-11" numbered="true" toc="default"> | |||
its associated impact on existing and new connection state, | <name>Changes Compared to RFC 2140</name> | |||
providing a complete replacement for that document <xref target="RFC2140"/>. | <t> | |||
It | This document updates the description of TCB sharing in RFC 2140 and its | |||
clarifies the previous description and terminology and extends the | associated impact on existing and new connection state, providing a | |||
mechanism to its impact on new protocols and mechanisms, including | complete replacement for that document <xref target="RFC2140" | |||
multipath TCP, fast open, PLPMTUD, NAT, and the TCP Authentication | format="default"/>. It clarifies the previous description and terminology | |||
Option.</t> | and extends the mechanism to its impact on new protocols and mechanisms, | |||
including Multipath TCP, Fast Open, PLPMTUD, NAT, and the TCP | ||||
Authentication Option.</t> | ||||
<t> | <t> | |||
The detailed impact on TCB state addresses TCB parameters in greater | The detailed impact on TCB state addresses TCB parameters with greater | |||
detail, addressing MSS in both the send and receive direction, MSS | specificity. It separates the way MSS is used in both send and receive | |||
and sendMSS separately, adds path MTU and ssthresh, and addresses | directions, it separates the way both of these MSS values differ from | |||
the impact on TCP option state.</t> | sendMSS, it adds both path MTU and ssthresh, and it addresses the impact on | |||
state associated with TCP options. | ||||
</t> | ||||
<t> | <t> | |||
New sections have been added to address compatibility issues and | New sections have been added to address compatibility issues and | |||
implementation observations. The relation of this work to T/TCP has | implementation observations. | |||
been moved to Appendix A on history, partly to reflect the deprecation of | ||||
that protocol.</t> | ||||
<t> | The relation of this work to T/TCP has been moved to <xref | |||
Appendix C has been added to discuss the potential to use temporal | target="sect-a"/> (which describes the history of TCB sharing), partly to | |||
reflect the deprecation of that protocol. | ||||
</t> | ||||
<t> | ||||
<xref target="sect-c"/> has been added to discuss the potential to use tempor | ||||
al | ||||
sharing over long timescales to adapt TCP's initial window | sharing over long timescales to adapt TCP's initial window | |||
automatically, avoiding the need to periodically revise a single | automatically, avoiding the need to periodically revise a single | |||
global constant value.</t> | global constant value.</t> | |||
<t> | ||||
<t> | ||||
Finally, this document updates and significantly expands the | Finally, this document updates and significantly expands the | |||
referenced literature.</t> | referenced literature.</t> | |||
</section> | ||||
</section> | <section anchor="sect-12" numbered="true" toc="default"> | |||
<name>Security Considerations</name> | ||||
<section title="Security Considerations" anchor="sect-12"><t> | <t> | |||
The implementation methods presented here do not have additional | The implementation methods presented here do not have additional ramifications | |||
ramifications for direct (connection-aborting or information | for direct (connection-aborting or information-injecting) attacks on | |||
injecting) attacks on individual connections. Individual | individual connections. Individual connections, whether using sharing or | |||
connections, whether using sharing or not, also may be susceptible | not, also may be susceptible to denial-of-service attacks that reduce | |||
to denial-of-service attacks that reduce performance or completely | performance or completely deny connections and transfers if not otherwise | |||
deny connections and transfers if not otherwise secured.</t> | secured.</t> | |||
<t> | ||||
<t> | TCB sharing may create additional denial-of-service attacks that affect the | |||
TCB sharing may create additional denial-of-service attacks that | performance of other connections by polluting the cached information. This | |||
affect the performance of other connections by polluting the cached | can occur across any set of connections in which the TCB is shared, | |||
information. This can occur across whatever set of connections where | between connections in a single host, or between hosts if TCB sharing is | |||
the TCB is shared, between connections in a single host, or between | implemented within a subnet (see <xref target="sect-9" | |||
hosts if TCB sharing is implemented within a subnet (see | sectionFormat="bare">"Implications"</xref>). Some shared TCB parameters are | |||
Implications section). Some shared TCB parameters are used only to | used only to create new TCBs; others are shared among the TCBs of ongoing | |||
create new TCBs, others are shared among the TCBs of ongoing | connections. New connections can join the ongoing set, e.g., to optimize | |||
connections. New connections can join the ongoing set, e.g., to | send window size among a set of connections to the same host. PMTU is | |||
optimize send window size among a set of connections to the same | defined as shared at the IP layer and is already susceptible in this | |||
host. PMTU is defined as shared at the IP layer, and is already | way.</t> | |||
susceptible in this way.</t> | <t> | |||
<t> | ||||
Options in client SYNs can be easier to forge than complete, two-way | Options in client SYNs can be easier to forge than complete, two-way | |||
connections. As a result, their values may not be safely | connections. As a result, their values may not be safely | |||
incorporated in shared values until after the three-way handshake | incorporated in shared values until after the three-way handshake | |||
completes.</t> | completes.</t> | |||
<t> | ||||
<t> | ||||
Attacks on parameters used only for initialization affect only the | Attacks on parameters used only for initialization affect only the | |||
transient performance of a TCP connection. For short connections, the | transient performance of a TCP connection. For short connections, the | |||
performance ramification can approach that of a denial-of-service | performance ramification can approach that of a denial-of-service | |||
attack. E.g., if an application changes its TCB to have a false and small | attack. For example, if an application changes its TCB to have a false and small | |||
window size, subsequent connections will experience performance degradation | window size, subsequent connections will experience performance degradation | |||
until their window grew appropriately.</t> | until their window grows appropriately.</t> | |||
<t> | ||||
<t> | ||||
TCB sharing reuses and mixes information from past and current | TCB sharing reuses and mixes information from past and current | |||
connections. Although reusing information could create a potential | connections. Although reusing information could create a potential | |||
for fingerprinting to identify hosts, the mixing reduces that | for fingerprinting to identify hosts, the mixing reduces that | |||
potential. There has been no evidence of fingerprinting based on | potential. There has been no evidence of fingerprinting based on | |||
this technique and it is currently considered safe in that regard. | this technique, and it is currently considered safe in that regard. | |||
Further, information about the performance of a TCP connection has | Further, information about the performance of a TCP connection has | |||
not been considered as private.</t> | not been considered as private.</t> | |||
</section> | ||||
<section anchor="sect-13" numbered="true" toc="default"> | ||||
<name>IANA Considerations</name> | ||||
<t> | ||||
This document has no IANA actions.</t> | ||||
</section> | </section> | |||
</middle> | ||||
<section title="IANA Considerations" anchor="sect-13"><t> | <back> | |||
There are no IANA implications or requests in this document.</t> | ||||
<t> | <displayreference target="I-D.allman-tcpm-bump-initcwnd" to="Al10"/> | |||
This section should be removed upon final publication as an RFC.</t> | <displayreference target="I-D.ietf-tcpm-generalized-ecn" to="Ba20"/> | |||
<displayreference target="I-D.hughes-restart" to="Hu01"/> | ||||
</section> | <references> | |||
<name>References</name> | ||||
<references> | ||||
<name>Normative References</name> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.0793.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.1122.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.1191.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.2119.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.4821.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.5681.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.6298.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.7413.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.8174.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.8201.xml"/> | ||||
</references> | ||||
<references> | ||||
<name>Informative References</name> | ||||
</middle> | <xi:include href="https://datatracker.ietf.org/doc/bibxml3/reference.I-D.allman-tcpm-bump-initcwnd.xml"/> | |||
<back> | <reference anchor="Ba12"> | |||
<references title="Normative References"> | <front> | |||
&RFC0793; | <title>LISA: A linked slow-start algorithm for MPTCP</title> | |||
&RFC1122; | <author initials="R." surname="Barik" fullname="Runa Barik"> | |||
&RFC1191; | ||||
&RFC2119; | ||||
&RFC4821; | ||||
&RFC5681; | ||||
&RFC6298; | ||||
&RFC7413; | ||||
&RFC8174; | ||||
&RFC8201; | ||||
</references> | ||||
<references title="Informative References"> | ||||
&I-D.allman-tcpm-bump-initcwnd; | ||||
<reference anchor="Ba12"><front> | ||||
<title>LISA: A Linked Slow-Start Algorithm for MPTCP</title> | ||||
<author initials="R." surname="Barik" fullname="R. Barik"> | ||||
</author> | </author> | |||
<author initials="M." surname="Welzl" fullname="Michael Welzl"> | ||||
<author initials="M." surname="Welzl" fullname="M. Welzl"> | ||||
</author> | </author> | |||
<author initials="S." surname="Ferlin" fullname="Simone Ferlin"> | ||||
<author initials="S." surname="Ferlin" fullname="S. Ferlin"> | ||||
</author> | </author> | |||
<author initials="O." surname="Alay" fullname="Ozgu Alay"> | ||||
<author initials="O." surname="Alay" fullname="O. Alay"> | ||||
</author> | </author> | |||
<date month="May" year="2016"/> | ||||
</front> | ||||
<refcontent>IEEE ICC | ||||
</refcontent> | ||||
<seriesInfo name="DOI" value="10.1109/ICC.2016.7510786"/> | ||||
</reference> | ||||
<date month="May" year="2016"/> | <xi:include href="https://datatracker.ietf.org/doc/bibxml3/reference.I-D | |||
</front> | .ietf-tcpm-generalized-ecn.xml"/> | |||
<seriesInfo name="IEEE" value="ICC"/> | <reference anchor="Be94"> | |||
</reference> | <front> | |||
&I-D.ietf-tcpm-generalized-ecn; | <title>The World-Wide Web</title> | |||
<reference anchor="Be94"><front> | <author initials="T." surname="Berners-Lee" fullname="Tim Berners-Le | |||
<title>The World-Wide Web</title> | e"> | |||
<author initials="T." surname="Berners-Lee" fullname="T. Berners-Lee"> | ||||
</author> | </author> | |||
<author initials="C." surname="Cailliau" fullname="Robert Cailliau"/ | ||||
> | ||||
<author initials="A." surname="Luotonen" fullname="Ari Luotonen"/> | ||||
<author initials="H." surname="Nielsen" fullname="Henrik Frystyk Niel | ||||
sen"/> | ||||
<author initials="A." surname="Secret" fullname="Arthur Secret"/> | ||||
<date month="August" year="1994"/> | <date month="August" year="1994"/> | |||
</front> | </front> | |||
<seriesInfo name="DOI" value="10.1145/179606.179671"/> | ||||
<seriesInfo name="Communications" value="of the ACM"/> | <refcontent>Communications of the ACM V37, pp. 76-82</refcontent> | |||
</reference> | ||||
<reference anchor="Br94"><front> | ||||
<title>T/TCP -- Transaction TCP: Source Changes for Sun OS 4.1.3</title> | ||||
<author initials="B." surname="Braden" fullname="B. Braden"> | ||||
</author> | ||||
<date month="September" year="1994"/> | </reference> | |||
</front> | ||||
</reference> | <reference anchor="Br94"> | |||
<reference anchor="Br02"><front> | <front> | |||
<title>Understanding Internet Traffic Streams: Dragonflies and Tortoises< | <title>T/TCP -- Transaction TCP: Source Changes for Sun OS 4.1.3</ti | |||
/title> | tle> | |||
<author initials="N." surname="Brownlee" fullname="N. Brownlee"> | <author initials="B." surname="Braden" fullname="Bob Braden"> | |||
</author> | </author> | |||
<date month="September" year="1994"/> | ||||
</front> | ||||
<refcontent>USC/ISI Release 1.0</refcontent> | ||||
</reference> | ||||
<author initials="K." surname="Claffy" fullname="K. Claffy"> | <reference anchor="Br02"> | |||
<front> | ||||
<title>Understanding Internet traffic streams: dragonflies and torto | ||||
ises</title> | ||||
<author initials="N" surname="Brownlee" fullname="Nevil Brownlee"> | ||||
</author> | </author> | |||
<author initials="KC" surname="Claffy" fullname="KC Claffy"> | ||||
<date year="2002"/> | ||||
</front> | ||||
<seriesInfo name="IEEE" value="Communications Magazine p110-117"/> | ||||
</reference> | ||||
<reference anchor="Co91"><front> | ||||
<title>Internetworking with TCP/IP</title> | ||||
<author initials="D." surname="Comer" fullname="D. Comer"> | ||||
</author> | </author> | |||
<date year="2002"/> | ||||
</front> | ||||
<seriesInfo name="DOI" value="10.1109/MCOM.2002.1039865"/> | ||||
<refcontent>IEEE Communications Magazine, pp. 110-117</refcontent> | ||||
</reference> | ||||
<author initials="D." surname="Stevens" fullname="D. Stevens"> | <reference anchor="Co91"> | |||
<front> | ||||
<title>Internetworking with TCP/IP</title> | ||||
<author initials="D" surname="Comer" fullname="Douglas Comer"> | ||||
</author> | </author> | |||
<author initials="D" surname="Stevens" fullname="David Stevens"> | ||||
<date year="1991"/> | ||||
</front> | ||||
</reference> | ||||
<reference anchor="Du16"><front> | ||||
<title>Research Impacting the Practice of Congestion Control</title> | ||||
<author> | ||||
<organization>Dukkipati, N., Yuchung C. and V. Amin</organization> | ||||
</author> | </author> | |||
<date year="1991"/> | ||||
</front> | ||||
<seriesInfo name='ISBN 10:' value='0134685059' /> | ||||
<seriesInfo name='ISBN 13:' value='9780134685052' /> | ||||
</reference> | ||||
<date month="July" year="2016"/> | <reference anchor="Du16"> | |||
</front> | <front> | |||
<title>Research Impacting the Practice of Congestion Control</title> | ||||
<author initials="N" surname="Dukkipati" fullname="Nandita Dukkipati | ||||
"/> | ||||
<author initials="Y" surname="Cheng" fullname="Yuchung Cheng"/> | ||||
<author initials="A" surname="Vahdat" fullname="Amin Vahdat"/> | ||||
<date month="July" year="2016"/> | ||||
</front> | ||||
<refcontent>Computer Communication Review</refcontent> | ||||
<refcontent>The ACM SIGCOMM newsletter</refcontent> | ||||
</reference> | ||||
<seriesInfo name="ACM" value="SIGCOMM CCR editorial"/> | <reference anchor="FreeBSD" target="https://www.freebsd.org/"> | |||
</reference> | <front> | |||
<reference anchor="FreeBSD" target="http://www.freebsd.org/"><front> | <title>The FreeBSD Project</title> | |||
<title>FreeBSD source code</title> | <author> | |||
<author> | <organization>FreeBSD</organization> | |||
</author> | </author> | |||
<date/> | ||||
</front> | ||||
</reference> | ||||
<date/> | <reference anchor="I-D.hughes-restart"> | |||
</front> | <front> | |||
<title>Issues in TCP Slow-Start Restart After Idle</title> | ||||
</reference> | <author initials="A" surname="Hughes" fullname="Amy Hughes"/> | |||
&I-D.hughes-restart; | <author initials="J" surname="Touch" fullname="Joe Touch"/> | |||
<reference anchor="Hu12"><front> | <author initials="J" surname="Heidemann" fullname="John Heidemann"/> | |||
<title>Enhanced metric caching for short TCP flows</title> | ||||
<author initials="P." surname="Hurtig" fullname="P. Hurtig"> | ||||
</author> | ||||
<author initials="A." surname="Brunstrom" fullname="A. Brunstrom"> | <date month="December" year="2001" /> | |||
</author> | </front> | |||
<date year="2012"/> | <seriesInfo name="Internet-Draft" value="draft-hughes-restart-00" /> | |||
</front> | </reference> | |||
<seriesInfo name="IEEE" value="International Conference on Communications | <reference anchor="Hu12"> | |||
"/> | <front> | |||
</reference> | <title>Enhanced metric caching for short TCP flows</title> | |||
<reference anchor="IANA" target="https://www.iana.org/assignments/tcp-par | <author initials="P." surname="Hurtig" fullname="Per Hurtig"> | |||
ameters"><front> | ||||
<title>IANA TCP Parameters (options) registry</title> | ||||
<author> | ||||
</author> | </author> | |||
<author initials="A." surname="Brunstrom" fullname="Anna Brunstrom"> | ||||
<date/> | ||||
</front> | ||||
</reference> | ||||
<reference anchor="Is18"><front> | ||||
<title>ctrlTCP: Reducing Latency through Coupled Heterogeneous Multi-Flow | ||||
TCP Congestion Control</title> | ||||
<author initials="S." surname="Islam" fullname="S. Islam"> | ||||
</author> | </author> | |||
<date year="2012"/> | ||||
</front> | ||||
<seriesInfo name="DOI" value="10.1109/ICC.2012.6364516"/> | ||||
<refcontent>IEEE International Conference on Communications</refcontent> | ||||
</reference> | ||||
<author initials="M." surname="Welzl" fullname="M. Welzl"> | <reference anchor="IANA" target="https://www.iana.org/assignments/tcp-pa | |||
rameters"> | ||||
<front> | ||||
<title>Transmission Control Protocol (TCP) Parameters</title> | ||||
<author> | ||||
<organization>IANA</organization> | ||||
</author> | </author> | |||
<date/> | ||||
</front> | ||||
</reference> | ||||
<author initials="K." surname="Hiorth" fullname="K. Hiorth"> | <reference anchor="Is18"> | |||
<front> | ||||
<title>ctrlTCP: Reducing latency through coupled, heterogeneous | ||||
multi-flow TCP congestion control</title> | ||||
<author initials="S." surname="Islam" fullname="Safiqul Islam"> | ||||
</author> | </author> | |||
<author initials="M." surname="Welzl" fullname="Michael Welzl"> | ||||
<author initials="D." surname="Hayes" fullname="D. Hayes"> | ||||
</author> | </author> | |||
<author initials="K." surname="Hiorth" fullname="Kristian Hiorth"> | ||||
<author initials="G." surname="Armitage" fullname="G. Armitage"> | ||||
</author> | </author> | |||
<author initials="D." surname="Hayes" fullname="David Hayes"> | ||||
<author initials="S." surname="Gjessing" fullname="S. Gjessing"> | ||||
</author> | </author> | |||
<author initials="G." surname="Armitage" fullname="Grenville Armitag | ||||
<date month="April" year="2018"/> | e"> | |||
</front> | ||||
<seriesInfo name="Proc" value="IEEE INFOCOM Global Internet Symposium GI | ||||
workshop"/> | ||||
</reference> | ||||
<reference anchor="Ja88"><front> | ||||
<title>Congestion Avoidance and Control</title> | ||||
<author initials="V." surname="Jacobson" fullname="V. Jacobson"> | ||||
</author> | </author> | |||
<author initials="S." surname="Gjessing" fullname="Stein Gjessing"> | ||||
<author initials="M." surname="Karels" fullname="M. Karels"> | ||||
</author> | </author> | |||
<date month="April" year="2018"/> | ||||
</front> | ||||
<seriesInfo name="DOI" value="10.1109/INFCOMW.2018.8406887"/> | ||||
<refcontent>IEEE INFOCOM 2018 - IEEE Conference on Computer | ||||
Communications Workshops (INFOCOM WKSHPS)</refcontent> | ||||
</reference> | ||||
<date year="1988"/> | <reference anchor="Ja88"> | |||
</front> | <front> | |||
<title>Congestion Avoidance and Control</title> | ||||
<seriesInfo name="Proc" value="Sigcomm"/> | <author initials="V." surname="Jacobson" fullname="Van Jacobson"> | |||
</reference> | </author> | |||
&RFC1644; | <author initials="M." surname="Karels" fullname="Michael Karels"> | |||
&RFC1379; | </author> | |||
&RFC2001; | <date month="November" year="1988"/> | |||
&RFC2140; | </front> | |||
&RFC2414; | <refcontent>SIGCOMM Symposium proceedings on Communications | |||
&RFC2663; | architectures and protocols | |||
&RFC3390; | </refcontent> | |||
&RFC3124; | </reference> | |||
&RFC4340; | ||||
&RFC4960; | ||||
&RFC5925; | ||||
&RFC6437; | ||||
&RFC6691; | ||||
&RFC6928; | ||||
&RFC7231; | ||||
&RFC7323; | ||||
&RFC7424; | ||||
&RFC7540; | ||||
&RFC7661; | ||||
&RFC8684; | ||||
</references> | ||||
<section title="TCB Sharing History" anchor="sect-a"><t> | ||||
T/TCP proposed using caches to maintain TCB information across | ||||
instances (temporal sharing), e.g., smoothed RTT, RTT variation, | ||||
congestion avoidance threshold, and MSS <xref target="RFC1644"/>. These value | ||||
s were | ||||
in addition to connection counts used by T/TCP to accelerate data | ||||
delivery prior to the full three-way handshake during an OPEN. The | ||||
goal was to aggregate TCB components where they reflect one | ||||
association - that of the host-pair, rather than artificially | ||||
separating those components by connection.</t> | ||||
<t> | <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | |||
FC.1644.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.1379.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.2001.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.2140.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.2414.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.2663.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.3390.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.3124.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.4340.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.4960.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.5925.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.6437.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.6691.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.6928.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.7231.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.7323.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.7424.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.7540.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.7661.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.8684.xml"/> | ||||
</references> | ||||
</references> | ||||
<section anchor="sect-a" numbered="true" toc="default"> | ||||
<name>TCB Sharing History</name> | ||||
<t> | ||||
T/TCP proposed using caches to maintain TCB information across instances | ||||
(temporal sharing), e.g., smoothed RTT, RTT variation, congestion-avoidance | ||||
threshold, and MSS <xref target="RFC1644" format="default"/>. These values | ||||
were in addition to connection counts used by T/TCP to accelerate data | ||||
delivery prior to the full three-way handshake during an OPEN. The goal was | ||||
to aggregate TCB components where they reflect one association -- that of the | ||||
host-pair rather than artificially separating those components by | ||||
connection.</t> | ||||
<t> | ||||
At least one T/TCP implementation saved the MSS and aggregated the | At least one T/TCP implementation saved the MSS and aggregated the | |||
RTT parameters across multiple connections but omitted caching the | RTT parameters across multiple connections but omitted caching the | |||
congestion window information <xref target="Br94"/>, as originally specified | congestion window information <xref target="Br94" format="default"/>, as orig | |||
in | inally specified in | |||
<xref target="RFC1379"/>. Some T/TCP implementations immediately updated MSS | <xref target="RFC1379" format="default"/>. Some T/TCP implementations immedia | |||
when | tely updated MSS when | |||
the TCP MSS header option was received <xref target="Br94"/>, although this w | the TCP MSS header option was received <xref target="Br94" format="default"/> | |||
as not | , although this was not | |||
addressed specifically in the concepts or functional specification | addressed specifically in the concepts or functional specification | |||
<xref target="RFC1379"/><xref target="RFC1644"/>. In later T/TCP implementations, RTT values were | <xref target="RFC1379" format="default"/> <xref target="RFC1644" format="default"/>. In later T/TCP implementations, RTT values were | |||
updated only after a CLOSE, which does not benefit concurrent | updated only after a CLOSE, which does not benefit concurrent | |||
sessions.</t> | sessions.</t> | |||
<t> | ||||
Temporal sharing of cached TCB data was originally implemented in the Sun | ||||
OS 4.1.3 T/TCP extensions <xref target="Br94" format="default"/> and the | ||||
FreeBSD port of same <xref target="FreeBSD" format="default"/>. As | ||||
mentioned before, only the MSS and RTT parameters were cached, as originally | ||||
specified in <xref target="RFC1379" format="default"/>. Later discussion of | ||||
T/TCP suggested including congestion control parameters in this cache; for | ||||
example, <xref target="RFC1644" sectionFormat="of" section="3.1" | ||||
format="default"/> hints at initializing the congestion window to the old | ||||
window size.</t> | ||||
</section> | ||||
<section anchor="sect-b" numbered="true" toc="default"> | ||||
<t> | <name>TCP Option Sharing and Caching</name> | |||
Temporal sharing of cached TCB data was originally implemented in | <t> | |||
the SunOS 4.1.3 T/TCP extensions <xref target="Br94"/> and the FreeBSD port o | In addition to the options that can be cached and shared, this memo also | |||
f same | lists known TCP options <xref target="IANA" format="default"/> for which | |||
<xref target="FreeBSD"/>. As mentioned before, only the MSS and RTT parameter | state is unsafe to be kept. This list is not intended to be authoritative | |||
s were | or exhaustive.</t> | |||
cached, as originally specified in <xref target="RFC1379"/>. Later discussion | ||||
of | ||||
T/TCP suggested including congestion control parameters in this | ||||
cache; for example, <xref target="RFC1644"/> (Section 3.1) hints at initializ | ||||
ing | ||||
the congestion window to the old window size.</t> | ||||
</section> | ||||
<section title="TCP Option Sharing and Caching" anchor="sect-b"><t> | ||||
In addition to the options that can be cached and shared, this memo | ||||
also lists known TCP options <xref target="IANA"/> for which state is unsafe | ||||
to be | ||||
kept. This list is not intended to be authoritative or exhaustive.</t> | ||||
<figure><artwork><![CDATA[ | ||||
Obsolete (unsafe to keep state): | ||||
ECHO | ||||
ECHO REPLY | <t>Obsolete (unsafe to keep state): | |||
</t> | ||||
<ul empty="true"> | ||||
PO Conn permitted | <li>Echo | |||
</li> | ||||
PO service profile | <li>Echo Reply | |||
</li> | ||||
CC | <li>Partial Order Connection Permitted | |||
</li> | ||||
CC.NEW | <li>Partial Order Service Profile | |||
</li> | ||||
CC.ECHO | <li>CC | |||
</li> | ||||
Alt CS req | <li>CC.NEW | |||
</li> | ||||
Alt CS data | <li>CC.ECHO | |||
</li> | ||||
No state to keep: | <li>TCP Alternate Checksum Request | |||
</li> | ||||
EOL | <li>TCP Alternate Checksum Data | |||
</li> | ||||
NOP | </ul> | |||
WS | <t>No state to keep: | |||
</t> | ||||
SACK | <ul empty="true"> | |||
<li>End of Option List (EOL) | ||||
</li> | ||||
<li>No-Operation (NOP) | ||||
</li> | ||||
<li>Window Scale (WS) | ||||
</li> | ||||
<li>SACK | ||||
</li> | ||||
<li>Timestamps (TS) | ||||
</li> | ||||
<li>MD5 Signature Option | ||||
</li> | ||||
<li>TCP Authentication Option (TCP-AO) | ||||
</li> | ||||
<li>RFC3692-style Experiment 1 | ||||
</li> | ||||
<li>RFC3692-style Experiment 2 | ||||
</li> | ||||
</ul> | ||||
TS | <t>Unsafe to keep state: | |||
</t> | ||||
MD5 | <ul empty="true"> | |||
TCP-AO | <li>Skeeter (DH exchange, known to be vulnerable) | |||
</li> | ||||
EXP1 | <li>Bubba (DH exchange, known to be vulnerable) | |||
</li> | ||||
EXP2 | <li>Trailer Checksum Option | |||
</li> | ||||
Unsafe to keep state: | <li>SCPS capabilities | |||
</li> | ||||
Skeeter (DH exchange, known to be vulnerable) | <li>Selective Negative Acknowledgements (S-NACK) | |||
</li> | ||||
Bubba (DH exchange, known to be vulnerable) | <li>Records Boundaries | |||
</li> | ||||
Trailer CS | <li>Corruption experienced | |||
</li> | ||||
SCPS capabilities | <li>SNAP | |||
</li> | ||||
S-NACK | <li>TCP Compression Filter | |||
</li> | ||||
Records boundaries | <li>Quick-Start Response | |||
</li> | ||||
Corruption experienced | <li>User Timeout Option (UTO) | |||
</li> | ||||
SNAP | <li>Multipath TCP (MPTCP) negotiation success (see below for negotiation failure | |||
) | ||||
</li> | ||||
TCP Compression | <li>TCP Fast Open (TFO) negotiation success (see below for negotiation failure) | |||
</li> | ||||
Quickstart response | </ul> | |||
UTO | <t>Safe but optional to keep state: | |||
</t> | ||||
MPTCP negotiation success (see below for negotiation failure) | <ul empty="true"> | |||
<li>Multipath TCP (MPTCP) negotiation failure (to avoid negotiation retries) | ||||
</li> | ||||
TFO negotiation success (see below for negotiation failure) | <li>Maximum Segment Size (MSS) | |||
</li> | ||||
Safe but optional to keep state: | <li>TCP Fast Open (TFO) negotiation failure (to avoid negotiation retries) | |||
</li> | ||||
MPTCP negotiation failure (to avoid negotiation retries) | </ul> | |||
MSS | <t>Safe and necessary to keep state: | |||
</t> | ||||
TFO negotiation failure (to avoid negotiation retries) | <ul empty="true"> | |||
Safe and necessary to keep state: | <li>TCP Fast Open (TFO) Cookie (if TFO succeeded in the past) | |||
</li> | ||||
TFO cookie (if TFO succeeded in the past) | </ul> | |||
]]></artwork> | ||||
</figure> | ||||
</section> | ||||
<section title="Automating the Initial Window in TCP over Long Timescales | </section> | |||
" anchor="sect-c"><section title="Introduction" anchor="sect-c.1"><t> | <section anchor="sect-c" numbered="true" toc="default"> | |||
<name>Automating the Initial Window in TCP over Long Timescales</name> | ||||
<section anchor="sect-c.1" numbered="true" toc="default"> | ||||
<name>Introduction</name> | ||||
<t> | ||||
Temporal sharing, as described earlier in this document, builds on | Temporal sharing, as described earlier in this document, builds on | |||
the assumption that multiple consecutive connections between the | the assumption that multiple consecutive connections between the | |||
same host pair are somewhat likely to be exposed to similar | same host-pair are somewhat likely to be exposed to similar | |||
environment characteristics. The stored information can become less | environment characteristics. The stored information can become less | |||
accurate over time and suitable precautions should take this ageing | accurate over time and suitable precautions should take this aging | |||
into consideration (this is discussed further in section 8.1). | into consideration (this is discussed further in <xref target="sect-8.1"/>). | |||
However, there are also cases where it can make sense to track these | However, there are also cases where it can make sense to track these | |||
values over longer periods, observing properties of TCP connections | values over longer periods, observing properties of TCP connections | |||
to gradually influence evolving trends in TCP parameters. This | to gradually influence evolving trends in TCP parameters. This | |||
appendix describes an example of such a case.</t> | appendix describes an example of such a case.</t> | |||
<t> | ||||
<t> | ||||
TCP's congestion control algorithm uses an initial window value | TCP's congestion control algorithm uses an initial window value | |||
(IW), both as a starting point for new connections and as an upper | (IW) both as a starting point for new connections and as an upper | |||
limit for restarting after an idle period <xref target="RFC5681"/><xref targe | limit for restarting after an idle period <xref target="RFC5681" format="defa | |||
t="RFC7661"/>. This | ult"/> <xref target="RFC7661" format="default"/>. This | |||
value has evolved over time, originally one maximum segment size | value has evolved over time; it was originally 1 maximum segment size | |||
(MSS), and increased to the lesser of four MSS or 4,380 bytes | (MSS) and increased to the lesser of 4 MSSs or 4,380 bytes | |||
<xref target="RFC3390"/><xref target="RFC5681"/>. For a typical Internet conn | <xref target="RFC3390" format="default"/> <xref target="RFC5681" format="defa | |||
ection with a maximum | ult"/>. For a typical Internet connection with a maximum | |||
transmission unit (MTU) of 1500 bytes, this permits three segments | transmission unit (MTU) of 1500 bytes, this permits 3 segments | |||
of 1,460 bytes each.</t> | of 1,460 bytes each.</t> | |||
<t> | ||||
<t> | The IW value was originally implied in the original TCP congestion control | |||
The IW value was originally implied in the original TCP congestion | description and documented as a standard in 1997 <xref target="RFC2001" | |||
control description and documented as a standard in 1997 | format="default"/> <xref target="Ja88" format="default"/>. The value was | |||
<xref target="RFC2001"/><xref target="Ja88"/>. The value was updated in 1998 | updated in 1998 experimentally and moved to the Standards Track in 2002 | |||
experimentally and | <xref target="RFC2414" format="default"/> <xref target="RFC3390" | |||
moved to the standards track in 2002 <xref target="RFC2414"/><xref target="RF | format="default"/>. In 2013, it was experimentally increased to 10 <xref | |||
C3390"/>. In 2013, it | target="RFC6928" format="default"/>.</t> | |||
was experimentally increased to 10 <xref target="RFC6928"/>.</t> | <t> | |||
<t> | ||||
This appendix discusses how TCP can objectively measure when an IW | This appendix discusses how TCP can objectively measure when an IW | |||
is too large, and that such feedback should be used over long | is too large and that such feedback should be used over long | |||
timescales to adjust the IW automatically. The result should be | timescales to adjust the IW automatically. The result should be | |||
safer to deploy and might avoid the need to repeatedly revisit IW | safer to deploy and might avoid the need to repeatedly revisit IW | |||
over time.</t> | over time.</t> | |||
<t> | ||||
<t> | ||||
Note that this mechanism attempts to make the IW more adaptive over | Note that this mechanism attempts to make the IW more adaptive over | |||
time. It can increase the IW beyond that which is currently | time. It can increase the IW beyond that which is currently | |||
recommended for widescale deployment, and so its use should be | recommended for wide-scale deployment, so its use should be | |||
carefully monitored.</t> | carefully monitored.</t> | |||
</section> | ||||
</section> | <section anchor="sect-c.2" numbered="true" toc="default"> | |||
<name>Design Considerations</name> | ||||
<section title="Design Considerations" anchor="sect-c.2"><t> | <t> | |||
TCP's IW value has existed statically for over two decades, so any | TCP's IW value has existed statically for over two decades, so any | |||
solution to adjusting the IW dynamically should have similarly | solution to adjusting the IW dynamically should have similarly | |||
stable, non-invasive effects on the performance and complexity of | stable, non-invasive effects on the performance and complexity of | |||
TCP. In order to be fair, the IW should be similar for most machines | TCP. In order to be fair, the IW should be similar for most machines | |||
on the public Internet. Finally, a desirable goal is to develop a | on the public Internet. Finally, a desirable goal is to develop a | |||
self-correcting algorithm, so that IW values that cause network | self-correcting algorithm so that IW values that cause network | |||
problems can be avoided. To that end, we propose the following | problems can be avoided. To that end, we propose the following | |||
design goals:</t> | design goals:</t> | |||
<ul spacing="normal"> | ||||
<t><list style="symbols"><t>Impart little to no impact to TCP in the abse | <li>Impart little to no impact to TCP in the absence of loss, i.e., | |||
nce of loss, i.e., | ||||
it should not increase the complexity of default packet | it should not increase the complexity of default packet | |||
processing in the normal case.</t> | processing in the normal case.</li> | |||
<li>Adapt to network feedback over long timescales, avoiding values | ||||
<t>Adapt to network feedback over long timescales, avoiding values | that persistently cause network problems.</li> | |||
that persistently cause network problems.</t> | <li>Decrease the IW in the presence of sustained loss of IW segments, | |||
as determined over a number of different connections.</li> | ||||
<t>Decrease the IW in the presence of sustained loss of IW segments, | <li>Increase the IW in the absence of sustained loss of IW segments, | |||
as determined over a number of different connections.</t> | as determined over a number of different connections.</li> | |||
<li>Operate conservatively, i.e., tend towards leaving the IW the | ||||
<t>Increase the IW in the absence of sustained loss of IW segments, | ||||
as determined over a number of different connections.</t> | ||||
<t>Operate conservatively, i.e., tend towards leaving the IW the | ||||
same in the absence of sufficient information, and give greater | same in the absence of sufficient information, and give greater | |||
consideration to IW segment loss than IW segment success.</t> | consideration to IW segment loss than IW segment success.</li> | |||
</ul> | ||||
</list> | <t> | |||
</t> | ||||
<t> | ||||
We expect that, without other context, a good IW algorithm will | We expect that, without other context, a good IW algorithm will | |||
converge to a single value, but this is not required. An endpoint | converge to a single value, but this is not required. An endpoint | |||
with additional context or information, or deployed in a constrained | with additional context or information, or deployed in a constrained | |||
environment, can always use a different value. In particular, | environment, can always use a different value. In particular, | |||
information from previous connections, or sets of connections with a | information from previous connections, or sets of connections with a | |||
similar path, can already be used as context for such decisions (as | similar path, can already be used as context for such decisions (as | |||
noted in the core of this document).</t> | noted in the core of this document).</t> | |||
<t> | ||||
<t> | ||||
However, if a given IW value persistently causes packet loss during | However, if a given IW value persistently causes packet loss during | |||
the initial burst of packets, it is clearly inappropriate and could | the initial burst of packets, it is clearly inappropriate and could | |||
be inducing unnecessary loss in other competing connections. This | be inducing unnecessary loss in other competing connections. This | |||
might happen for sites behind very slow boxes with small buffers, | might happen for sites behind very slow boxes with small buffers, | |||
which may or may not be the first hop.</t> | which may or may not be the first hop.</t> | |||
</section> | ||||
</section> | <section anchor="sect-c.3" numbered="true" toc="default"> | |||
<name>Proposed IW Algorithm</name> | ||||
<section title="Proposed IW Algorithm" anchor="sect-c.3"><t> | <t> | |||
Below is a simple description of the proposed IW algorithm. It | Below is a simple description of the proposed IW algorithm. It | |||
relies on the following parameters:</t> | relies on the following parameters:</t> | |||
<ul spacing="normal"> | ||||
<t><list style="symbols"><t>MinIW = 3 MSS or 4,380 bytes (as per <xref ta | <li>MinIW = 3 MSS or 4,380 bytes (as per <xref target="RFC3390" format | |||
rget="RFC3390"/>)</t> | ="default"/>)</li> | |||
<li>MaxIW = 10 MSS (as per <xref target="RFC6928" format="default"/>)< | ||||
<t>MaxIW = 10 MSS (as per <xref target="RFC6928"/>)</t> | /li> | |||
<li>MulDecr = 0.5</li> | ||||
<t>MulDecr = 0.5</t> | <li>AddIncr = 2 MSS</li> | |||
<li>Threshold = 0.05</li> | ||||
<t>AddIncr = 2 MSS</t> | </ul> | |||
<t> | ||||
<t>Threshold = 0.05</t> | ||||
</list> | ||||
</t> | ||||
<t> | ||||
We assume that the minimum IW (MinIW) should be as currently specified as | We assume that the minimum IW (MinIW) should be as currently specified as | |||
standard <xref target="RFC3390"/>. The maximum IW can be set to a fixed | standard <xref target="RFC3390" format="default"/>. The maximum IW (MaxIW) ca | |||
value (we suggest using the experimental and now somewhat de- facto | n be | |||
standard in <xref target="RFC6928"/>) or set based on a schedule if trusted | set to a fixed value (we suggest using the experimental and now somewhat de | |||
time references are available <xref | facto standard in <xref target="RFC6928" format="default"/>) or set based | |||
target="I-D.allman-tcpm-bump-initcwnd"/>; here we prefer a fixed value. We | on a schedule if trusted time references are available <xref | |||
also propose to use an AIMD algorithm, with increases and decreases as | target="I-D.allman-tcpm-bump-initcwnd" format="default"/>; here, we prefer | |||
noted.</t> | a fixed value. We also propose to use an Additive Increase Multiplicative | |||
Decrease (AIMD) algorithm, with increases and decreases as noted.</t> | ||||
<t> | <t> | |||
Although these parameters are somewhat arbitrary, their initial | Although these parameters are somewhat arbitrary, their initial | |||
values are not important except that the algorithm is AIMD and the | values are not important except that the algorithm is AIMD and the | |||
MaxIW should not exceed that recommended for other systems on the | MaxIW should not exceed that recommended for other systems on the | |||
Internet (here we selected the current de-facto standard rather than | Internet (here, we selected the current de facto standard rather than | |||
the actual standard). Current proposals, including default current | the actual standard). Current proposals, including default current | |||
operation, are degenerate cases of the algorithm below for given | operation, are degenerate cases of the algorithm below for given | |||
parameters - notably MulDecr = 1.0 and AddIncr = 0 MSS, thus | parameters, notably MulDecr = 1.0 and AddIncr = 0 MSS, thus | |||
disabling the automatic part of the algorithm.</t> | disabling the automatic part of the algorithm.</t> | |||
<t> | ||||
<t> | ||||
The proposed algorithm is as follows:</t> | The proposed algorithm is as follows:</t> | |||
<figure><artwork><![CDATA[ | <ol> | |||
1. On boot: | ||||
IW = MaxIW; # assume this is in bytes, and indicates an integer | ||||
multiple of 2 MSS (an even number to support ACK compression) | ||||
2. Upon starting a new connection: | ||||
CWND = IW; | <li> | |||
conncount++; | <t>On boot:</t> | |||
IWnotchecked = 1; # true | <sourcecode type="pseudocode"> | |||
IW = MaxIW; # assume this is in bytes and indicates an integer | ||||
# multiple of 2 MSS (an even number to support | ||||
# ACK compression) | ||||
</sourcecode> | ||||
</li> | ||||
3. During a connection's SYN-ACK processing, if SYN-ACK includes ECN | <li><t>Upon starting a new connection:</t> | |||
(as similarly addressed in Sec 5 of ECN++ for TCP [Ba20]), treat | <sourcecode type="pseudocode"> | |||
as if the IW is too large: | CWND = IW; | |||
conncount++; | ||||
IWnotchecked = 1; # true | ||||
</sourcecode> | ||||
</li> | ||||
if (IWnotchecked && (synackecn == 1)) { | <li> | |||
losscount++; | <t>During a connection's SYN-ACK processing, if SYN-ACK includes ECN (as | |||
IWnotchecked = 0; # never check again | similarly addressed in Section 5 of ECN++ for TCP <xref | |||
} | target="I-D.ietf-tcpm-generalized-ecn"/>), treat as if the IW is too large: | |||
</t> | ||||
<sourcecode type="pseudocode"> | ||||
if (IWnotchecked && (synackecn == 1)) { | ||||
losscount++; | ||||
IWnotchecked = 0; # never check again | ||||
} | ||||
</sourcecode> | ||||
</li> | ||||
4. During a connection, if retransmission occurs, check the seqno of | <li><t>During a connection, if retransmission occurs, check the seqno of the | |||
the outgoing packet (in bytes) to see if the resent segment fixes | outgoing packet (in bytes) to see if the re-sent segment fixes an IW loss:</t> | |||
an IW loss: | <sourcecode type="pseudocode"> | |||
if (Retransmitting && IWnotchecked && ((seqno - ISN) < IW) | ||||
) { | ||||
losscount++; | ||||
IWnotchecked = 0; # never do this entire "if" again | ||||
} else { | ||||
IWnotchecked = 0; # you're beyond the IW so stop checking | ||||
} | ||||
</sourcecode> | ||||
</li> | ||||
if (Retransmitting && IWnotchecked && ((seqno - ISN) < IW)) { | <li> | |||
losscount++; | <t>Once every 1000 connections, as a separate process (i.e., not as part of | |||
IWnotchecked = 0; # never do this entire "if" again | processing a given connection): | |||
</t> | ||||
<sourcecode type="pseudocode"> | ||||
if (conncount > 1000) { | ||||
if (losscount/conncount > threshold) { | ||||
# the number of connections with errors is too high | ||||
IW = IW * MulDecr; | ||||
} else { | } else { | |||
IWnotchecked = 0; # you're beyond the IW so stop checking | IW = IW + AddIncr; | |||
} | } | |||
} | ||||
</sourcecode> | ||||
</li> | ||||
5. Once every 1000 connections, as a separate process (i.e., not as | </ol> | |||
part of processing a given connection): | ||||
if (conncount > 1000) { | ||||
if (losscount/conncount > threshold) { | ||||
# the number of connections with errors is too high | ||||
IW = IW * MulDecr; | ||||
} else { | ||||
IW = IW + AddIncr; | ||||
} | ||||
} | ||||
]]></artwork> | ||||
</figure> | ||||
<t> | ||||
As presented, this algorithm can yield a false positive when the | ||||
sequence number wraps around, e.g., the code might increment | ||||
losscount in step 4 when no loss occurred or fail to increment | ||||
losscount when a loss did occur. This can be avoided using either | ||||
PAWS <xref target="RFC7323"/> context or internal extended sequence number | ||||
representations (as in TCP-AO <xref target="RFC5925"/>). Alternately, false | ||||
positives can be tolerated because they are expected to be | ||||
infrequent and thus will not significantly impact the algorithm.</t> | ||||
<t> | <t> | |||
As presented, this algorithm can yield a false positive when the sequence | ||||
number wraps around, e.g., the code might increment losscount in step 4 | ||||
when no loss occurred or fail to increment losscount when a loss did | ||||
occur. This can be avoided using either Protection Against Wrapped | ||||
Sequences (PAWS) <xref target="RFC7323" format="default"/> context or | ||||
internal extended sequence number representations (as in TCP Authentication | ||||
Option (TCP-AO) <xref target="RFC5925" format="default"/>). Alternately, | ||||
false positives can be tolerated because they are expected to be infrequent | ||||
and thus will not significantly impact the algorithm.</t> | ||||
<t> | ||||
A number of additional constraints need to be imposed if this | A number of additional constraints need to be imposed if this | |||
mechanism is implemented to ensure that it defaults to values that | mechanism is implemented to ensure that it defaults to values that | |||
comply with current Internet standards, is conservative in how it | comply with current Internet standards, is conservative in how it | |||
extends those values, and returns to those values in the absence of | extends those values, and returns to those values in the absence of | |||
positive feedback (i.e., success). To that end, we recommend the | positive feedback (i.e., success). To that end, we recommend the | |||
following list of example constraints:</t> | following list of example constraints:</t> | |||
<t> | <ul> | |||
>> The automatic IW algorithm MUST initialize MaxIW to a value no | <li> <t> The automatic IW algorithm <bcp14>MUST</bcp14> initialize MaxIW to a | |||
larger than the currently recommended Internet default, in the | value no larger than the currently recommended Internet default in the | |||
absence of other context information.</t> | absence of other context information.</t> | |||
<t> | ||||
<t> | ||||
Thus, if there are too few connections to make a decision or if | Thus, if there are too few connections to make a decision or if | |||
there is otherwise insufficient information to increase the IW, then | there is otherwise insufficient information to increase the IW, then | |||
the MaxIW defaults to the current recommended value.</t> | the MaxIW defaults to the current recommended value.</t></li> | |||
<t> | <li> <t> | |||
>> An implementation MAY allow the MaxIW to grow beyond the | An implementation <bcp14>MAY</bcp14> allow the MaxIW to grow beyond the | |||
currently recommended Internet default, but not more than 2 segments | currently recommended Internet default but not more than 2 segments | |||
per calendar year.</t> | per calendar year.</t> | |||
<t> | ||||
<t> | Thus, if an endpoint has a persistent history of successfully transmitting | |||
Thus, if an endpoint has a persistent history of successfully | IW segments without loss, then it is allowed to probe the Internet to | |||
transmitting IW segments without loss, then it is allowed to probe | determine if larger IW values have similar success. This probing is | |||
the Internet to determine if larger IW values have similar success. | limited and requires a trusted time source; otherwise, the MaxIW remains | |||
This probing is limited and requires a trusted time source, | constant.</t></li> | |||
otherwise the MaxIW remains constant.</t> | <li> | |||
<t> | ||||
<t> | An implementation <bcp14>MUST</bcp14> adjust the IW based on loss statistics | |||
>> An implementation MUST adjust the IW based on loss statistics at | at | |||
least once every 1000 connections.</t> | least once every 1000 connections.</t> | |||
<t> | ||||
<t> | ||||
An endpoint needs to be sufficiently reactive to IW loss.</t> | An endpoint needs to be sufficiently reactive to IW loss.</t> | |||
</li> | ||||
<t> | <li> <t> | |||
>> An implementation MUST decrease the IW by at least one MSS when | An implementation <bcp14>MUST</bcp14> decrease the IW by at least 1 MSS when | |||
indicated during an evaluation interval.</t> | indicated during an evaluation interval.</t> | |||
<t> | ||||
<t> | ||||
An endpoint that detects loss needs to decrease its IW by at least | An endpoint that detects loss needs to decrease its IW by at least | |||
one MSS, otherwise it is not participating in an automatic reactive | 1 MSS; otherwise, it is not participating in an automatic reactive | |||
algorithm.</t> | algorithm.</t></li> | |||
<li> | ||||
<t> | <t> | |||
>> An implementation MUST increase by no more than 2 MSS per | An implementation <bcp14>MUST</bcp14> increase by no more than 2 MSSs per | |||
evaluation interval.</t> | evaluation interval.</t> | |||
<t> | ||||
<t> | ||||
An endpoint that does not experience IW loss needs to probe the | An endpoint that does not experience IW loss needs to probe the | |||
network incrementally.</t> | network incrementally.</t> | |||
</li> | ||||
<t> | <li> | |||
>> An implementation SHOULD use an IW that is an integer multiple of | <t> | |||
2 MSS.</t> | An implementation <bcp14>SHOULD</bcp14> use an IW that is an integer multiple | |||
of | ||||
<t> | 2 MSSs.</t> | |||
The IW should remain a multiple of 2 MSS segments, to enable | <t> | |||
The IW should remain a multiple of 2 MSS segments to enable | ||||
efficient ACK compression without incurring unnecessary timeouts.</t> | efficient ACK compression without incurring unnecessary timeouts.</t> | |||
</li> | ||||
<t> | <li> <t> | |||
>> An implementation MUST decrease the IW if more than 95% of | An implementation <bcp14>MUST</bcp14> decrease the IW if more than 95% of | |||
connections have IW losses.</t> | connections have IW losses.</t> | |||
<t> | ||||
Again, this is to ensure an implementation is sufficiently reactive.</t></li> | ||||
<t> | <li | |||
Again, this is to ensure an implementation is sufficiently reactive.</t> | > <t> | |||
An implementation <bcp14>MAY</bcp14> group IW values and statistics within | ||||
<t> | subsets of connections. Such grouping <bcp14>MAY</bcp14> use any information | |||
>> An implementation MAY group IW values and statistics within | about | |||
subsets of connections. Such grouping MAY use any information about | ||||
connections to form groups except loss statistics.</t> | connections to form groups except loss statistics.</t> | |||
</li> | ||||
<t> | </ul> | |||
There are some TCP connections which might not be counted at all, | <t> | |||
such as those to/from loopback addresses, or those within the same | There are some TCP connections that might not be counted at all, | |||
such as those to/from loopback addresses or those within the same | ||||
subnet as that of a local interface (for which congestion control is | subnet as that of a local interface (for which congestion control is | |||
sometimes disabled anyway). This may also include connections that | sometimes disabled anyway). This may also include connections that | |||
terminate before the IW is full, i.e., as a separate check at the | terminate before the IW is full, i.e., as a separate check at the | |||
time of the connection closing.</t> | time of the connection closing.</t> | |||
<t> | ||||
<t> | The period over which the IW is updated is intended to be a long timescale, | |||
The period over which the IW is updated is intended to be a long | e.g., a month or so, or 1,000 connections, whichever is longer. An | |||
timescale, e.g., a month or so, or 1,000 connections, whichever is | implementation might check the IW once a month and simply not update the IW | |||
longer. An implementation might check the IW once a month, and | or clear the connection counts in months where the number of connections is | |||
simply not update the IW or clear the connection counts in months | too small.</t> | |||
where the number of connections is too small.</t> | </section> | |||
<section anchor="sect-c.4" numbered="true" toc="default"> | ||||
</section> | <name>Discussion</name> | |||
<t> | ||||
<section title="Discussion" anchor="sect-c.4"><t> | ||||
There are numerous parameters to the above algorithm that are | There are numerous parameters to the above algorithm that are | |||
compliant with the given requirements; this is intended to allow | compliant with the given requirements; this is intended to allow | |||
variation in configuration and implementation while ensuring that | variation in configuration and implementation while ensuring that | |||
all such algorithms are reactive and safe.</t> | all such algorithms are reactive and safe.</t> | |||
<t> | ||||
<t> | ||||
This algorithm continues to assume segments because that is the | This algorithm continues to assume segments because that is the | |||
basis of most TCP implementations. It might be useful to consider | basis of most TCP implementations. It might be useful to consider | |||
revising the specifications to allow byte-based congestion given | revising the specifications to allow byte-based congestion given | |||
sufficient experience.</t> | sufficient experience.</t> | |||
<t> | ||||
<t> | ||||
The algorithm checks for IW losses only during the first IW after a | The algorithm checks for IW losses only during the first IW after a | |||
connection start; it does not check for IW losses elsewhere the IW | connection start; it does not check for IW losses elsewhere the IW | |||
is used, e.g., during slow-start restarts.</t> | is used, e.g., during slow-start restarts.</t> | |||
<t> | <ul> | |||
>> An implementation MAY detect IW losses during slow-start restarts | <li> <t> An implementation <bcp14>MAY</bcp14> detect IW losses during | |||
in addition to losses during the first IW of a connection. In this | slow-start restarts in addition to losses during the first IW of a | |||
case, the implementation MUST count each restart as a "connection" | connection. In this case, the implementation <bcp14>MUST</bcp14> count | |||
for the purposes of connection counts and periodic rechecking of the | each restart as a "connection" for the purposes of connection counts and | |||
IW value.</t> | periodic rechecking of the IW value.</t> | |||
</li> | ||||
<t> | </ul> | |||
<t> | ||||
False positives can occur during some kinds of segment reordering, | False positives can occur during some kinds of segment reordering, | |||
e.g., that might trigger spurious retransmissions even without a | e.g., that might trigger spurious retransmissions even without a | |||
true segment loss. These are not expected to be sufficiently common | true segment loss. These are not expected to be sufficiently common | |||
to dominate the algorithm and its conclusions.</t> | to dominate the algorithm and its conclusions.</t> | |||
<t> | <t> | |||
This mechanism does require additional per-connection state, which | This mechanism does require additional per-connection state, which is | |||
is currently common in some implementations, and is useful for other | currently common in some implementations and is useful for other reasons | |||
reasons (e.g., the ISN is used in TCP-AO <xref target="RFC5925"/>). The mecha | (e.g., the ISN is used in TCP-AO <xref target="RFC5925" | |||
nism | format="default"/>). | |||
also benefits from persistent state kept across reboots, as would be | ||||
other state sharing mechanisms (e.g., TCP Control Block Sharing per | ||||
the main body of this document).</t> | ||||
<t> | The mechanism in this appendix also benefits from persistent state kept across | |||
reboots, which would also be useful to other state sharing mechanisms (e.g., | ||||
TCP Control Block Sharing per the main body of this document). | ||||
</t> | ||||
<t> | ||||
The receive window (rwnd) is not involved in this calculation. The | The receive window (rwnd) is not involved in this calculation. The | |||
size of rwnd is determined by receiver resources and provides space | size of rwnd is determined by receiver resources and provides space | |||
to accommodate segment reordering. It is not involved with | to accommodate segment reordering. | |||
congestion control, which is the focus of this document and its | ||||
management of the IW.</t> | ||||
</section> | Also, rwnd is not involved with congestion control, which is the focus of the wa | |||
y | ||||
this appendix manages the IW. | ||||
<section title="Observations" anchor="sect-c.5"><t> | </t> | |||
The IW may not converge to a single, global value. It also may not | </section> | |||
converge at all, but rather may oscillate by a few MSS as it | <section anchor="sect-c.5" numbered="true" toc="default"> | |||
<name>Observations</name> | ||||
<t> | ||||
The IW may not converge to a single global value. It also may not | ||||
converge at all but rather may oscillate by a few MSSs as it | ||||
repeatedly probes the Internet for larger IWs and fails. Both | repeatedly probes the Internet for larger IWs and fails. Both | |||
properties are consistent with TCP behavior during each individual | properties are consistent with TCP behavior during each individual | |||
connection.</t> | connection.</t> | |||
<t> | ||||
<t> | ||||
This mechanism assumes that losses during the IW are due to IW size. | This mechanism assumes that losses during the IW are due to IW size. | |||
Persistent errors that drop packets for other reasons - e.g., OS | Persistent errors that drop packets for other reasons, e.g., OS | |||
bugs, can cause false positives. Again, this is consistent with | bugs, can cause false positives. Again, this is consistent with | |||
TCP's basic assumption that loss is caused by congestion and | TCP's basic assumption that loss is caused by congestion and | |||
requires backoff. This algorithm treats the IW of new connections as | requires backoff. This algorithm treats the IW of new connections as | |||
a long-timescale backoff system.</t> | a long-timescale backoff system.</t> | |||
</section> | ||||
</section> | </section> | |||
<section numbered="false" anchor="acknowledgments" toc="default"> | ||||
</section> | <name>Acknowledgments</name> | |||
<t> | ||||
<section title="Acknowledgments" numbered="no" anchor="acknowledgments">< | The authors would like to thank <contact fullname="Praveen | |||
t> | Balasubramanian"/> for information regarding TCB sharing in Windows; | |||
The authors would like to thank for Praveen Balasubramanian for | <contact fullname="Christoph Paasch"/> for information regarding TCB | |||
information regarding TCB sharing in Windows, Christoph Paasch for | sharing in Apple OSs; <contact fullname="Yuchung Cheng"/>, <contact | |||
information regarding TCB sharing in Apple OSes, and Yuchung Cheng, | fullname="Lars Eggert"/>, <contact fullname="Ilpo Jarvinen"/>, and <contact | |||
Lars Eggert, Ilpo Jarvinen and Michael Scharf for comments on | fullname="Michael Scharf"/> for comments on earlier draft versions of this | |||
earlier versions of the draft, as well as members of the TCPM WG. | document; as well as members of the TCPM WG. Earlier revisions of this | |||
Earlier revisions of this work received funding from a collaborative | work received funding from a collaborative research project between the | |||
research project between the University of Oslo and Huawei | University of Oslo and Huawei Technologies Co., Ltd. and were partly | |||
Technologies Co., Ltd. and were partly supported by USC/ISI's Postel | supported by USC/ISI's Postel Center.</t> | |||
Center.</t> | <t> | |||
<t> | ||||
This document was prepared using 2-Word-v2.0.template.dot.</t> | This document was prepared using 2-Word-v2.0.template.dot.</t> | |||
</section> | ||||
</back> | ||||
</section> | </rfc> | |||
</back> | ||||
</rfc> | ||||
End of changes. 247 change blocks. | ||||
1365 lines changed or deleted | 1653 lines changed or added | |||
This html diff was produced by rfcdiff 1.48. The latest version is available from http://tools.ietf.org/tools/rfcdiff/ |