rfc8684xml2.original.xml | rfc8684.xml | |||
---|---|---|---|---|
<?xml version="1.0" encoding="US-ASCII"?> | <?xml version='1.0' encoding='utf-8'?> | |||
<!-- Convert to HTML and Text with xml2rfc: http://xml2rfc.ietf.org. --> | ||||
<!DOCTYPE rfc SYSTEM "rfc2629.dtd" [ | <!DOCTYPE rfc SYSTEM "rfc2629-xhtml.ent"> | |||
<!ENTITY RFC5533 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | <rfc xmlns:xi="http://www.w3.org/2001/XInclude" submissionType="IETF" | |||
RFC.5533.xml"> | category="std" consensus="true" docName="draft-ietf-mptcp-rfc6824bis-18" nu | |||
<!ENTITY RFC5062 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | mber="8684" ipr="trust200902" obsoletes="6824" updates="" xml:lang="en" tocInclu | |||
RFC.5062.xml"> | de="true" symRefs="true" sortRefs="true" version="3"> | |||
<!ENTITY RFC5061 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.5061.xml"> | <!-- xml2rfc v2v3 conversion 2.27.0 --> | |||
<!ENTITY RFC4960 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.4960.xml"> | ||||
<!ENTITY RFC4987 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.4987.xml"> | ||||
<!ENTITY RFC6234 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.6234.xml"> | ||||
<!ENTITY RFC4086 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.4086.xml"> | ||||
<!ENTITY RFC5681 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.5681.xml"> | ||||
<!ENTITY RFC2119 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.2119.xml"> | ||||
<!ENTITY RFC2992 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.2992.xml"> | ||||
<!ENTITY RFC2979 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.2979.xml"> | ||||
<!ENTITY RFC2104 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.2104.xml"> | ||||
<!ENTITY RFC2018 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.2018.xml"> | ||||
<!ENTITY RFC1918 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.1918.xml"> | ||||
<!ENTITY RFC0793 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.0793.xml"> | ||||
<!ENTITY RFC7323 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.7323.xml"> | ||||
<!ENTITY RFC1122 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.1122.xml"> | ||||
<!ENTITY RFC3135 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.3135.xml"> | ||||
<!ENTITY RFC3022 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.3022.xml"> | ||||
<!ENTITY RFC6181 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.6181.xml"> | ||||
<!ENTITY RFC6182 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.6182.xml"> | ||||
<!ENTITY RFC6356 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.6356.xml"> | ||||
<!ENTITY RFC6555 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.6555.xml"> | ||||
<!ENTITY RFC8126 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.8126.xml"> | ||||
<!ENTITY RFC6897 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.6897.xml"> | ||||
<!ENTITY RFC6528 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.6528.xml"> | ||||
<!ENTITY RFC5961 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.5961.xml"> | ||||
<!ENTITY RFC7413 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.7413.xml"> | ||||
<!ENTITY RFC7430 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.7430.xml"> | ||||
<!ENTITY RFC8174 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.8174.xml"> | ||||
<!ENTITY RFC8041 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
RFC.8041.xml"> | ||||
]> | ||||
<?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?> | ||||
<?rfc strict="no" ?> | ||||
<?rfc toc="yes"?> | ||||
<?rfc tocdepth="4"?> | ||||
<?rfc symrefs="yes"?> | ||||
<?rfc sortrefs="yes" ?> | ||||
<?rfc compact="yes" ?> | ||||
<?rfc subcompact="no" ?> | ||||
<?rfc rfcedstyle="yes"?> | ||||
<rfc category="std" docName="draft-ietf-mptcp-rfc6824bis-18" ipr="trust200902" obsoletes="6824"> | ||||
<front> | <front> | |||
<title abbrev="Multipath TCP">TCP Extensions for Multipath Operation with Multiple Addresses</title> | <title abbrev="Multipath TCP">TCP Extensions for Multipath Operation with Multiple Addresses</title> | |||
<seriesInfo name="RFC" value="8684"/> | ||||
<author fullname="Alan Ford" initials="A." surname="Ford"> | <author fullname="Alan Ford" initials="A." surname="Ford"> | |||
<organization>Pexip</organization> | <organization>Pexip</organization> | |||
<address> | <address> | |||
<!-- <postal> | ||||
<street>Beech Court</street> | ||||
<city>Hurst</city> | ||||
<region>Berkshire</region> | ||||
<code>RG10 0RQ</code> | ||||
<country>UK</country> | ||||
</postal> --> | ||||
<email>alan.ford@gmail.com</email> | <email>alan.ford@gmail.com</email> | |||
</address> | </address> | |||
</author> | </author> | |||
<author fullname="Costin Raiciu" initials="C." surname="Raiciu"> | <author fullname="Costin Raiciu" initials="C." surname="Raiciu"> | |||
<organization abbrev="U. Politechnica of Bucharest">University Politehnica of Bucharest</organization> | <organization abbrev="U. Politehnica of Bucharest">University Politehnica of Bucharest</organization> | |||
<address> | <address> | |||
<postal> | <postal> | |||
<street>Splaiul Independentei 313</street> | <street>Splaiul Independentei 313</street> | |||
<city>Bucharest</city> | <city>Bucharest</city> | |||
<country>Romania</country> | <country>Romania</country> | |||
</postal> | </postal> | |||
<email>costin.raiciu@cs.pub.ro</email> | <email>costin.raiciu@cs.pub.ro</email> | |||
</address> | </address> | |||
</author> | </author> | |||
<author fullname="Mark Handley" initials="M." surname="Handley"> | <author fullname="Mark Handley" initials="M." surname="Handley"> | |||
<organization abbrev="U. College London">University College London</organization> | <organization abbrev="U. College London">University College London</organization> | |||
<address> | <address> | |||
<postal> | <postal> | |||
<street>Gower Street</street> | <street>Gower Street</street> | |||
<city>London</city> | <city>London</city> | |||
<code>WC1E 6BT</code> | <code>WC1E 6BT</code> | |||
<country>UK</country> | <country>United Kingdom</country> | |||
</postal> | </postal> | |||
<email>m.handley@cs.ucl.ac.uk</email> | <email>m.handley@cs.ucl.ac.uk</email> | |||
</address> | </address> | |||
</author> | </author> | |||
<author fullname="Olivier Bonaventure" initials="O." surname="Bonaventure"> | <author fullname="Olivier Bonaventure" initials="O." surname="Bonaventure"> | |||
<organization abbrev="U. catholique de Louvain">Université catholiq | <organization abbrev="U. catholique de Louvain" ascii="Universite catholique | |||
ue de Louvain</organization> | de Louvain">Université catholique de Louvain</organization> | |||
<address> | <address> | |||
<postal> | <postal> | |||
<street>Pl. Ste Barbe, 2</street> | <street>Pl. Ste Barbe, 2</street> | |||
<code>1348</code> | <code>1348</code> | |||
<city>Louvain-la-Neuve</city> | <city>Louvain-la-Neuve</city> | |||
<country>Belgium</country> | <country>Belgium</country> | |||
</postal> | </postal> | |||
<email>olivier.bonaventure@uclouvain.be</email> | <email>olivier.bonaventure@uclouvain.be</email> | |||
</address> | </address> | |||
</author> | </author> | |||
skipping to change at line 101 ¶ | skipping to change at line 55 ¶ | |||
<address> | <address> | |||
<postal> | <postal> | |||
<street>Pl. Ste Barbe, 2</street> | <street>Pl. Ste Barbe, 2</street> | |||
<code>1348</code> | <code>1348</code> | |||
<city>Louvain-la-Neuve</city> | <city>Louvain-la-Neuve</city> | |||
<country>Belgium</country> | <country>Belgium</country> | |||
</postal> | </postal> | |||
<email>olivier.bonaventure@uclouvain.be</email> | <email>olivier.bonaventure@uclouvain.be</email> | |||
</address> | </address> | |||
</author> | </author> | |||
<author fullname="Christoph Paasch" initials="C." surname="Paasch"> | <author fullname="Christoph Paasch" initials="C." surname="Paasch"> | |||
<organization abbrev="Apple, Inc.">Apple, Inc.</organization> | <organization abbrev="Apple, Inc.">Apple, Inc.</organization> | |||
<address> | <address> | |||
<postal> | <postal> | |||
<street></street> | <street/> | |||
<city>Cupertino</city> | <city>Cupertino</city> | |||
<country>US</country> | <region>CA</region> | |||
<country>United States of America</country> | ||||
</postal> | </postal> | |||
<email>cpaasch@apple.com</email> | <email>cpaasch@apple.com</email> | |||
</address> | </address> | |||
</author> | </author> | |||
<date year="2020" month="March"/> | ||||
<date year="2019" /> | <keyword>tcp</keyword> | |||
<keyword>extensions</keyword> | ||||
<area>General</area> | <keyword>multipath</keyword> | |||
<workgroup>Internet Engineering Task Force</workgroup> | <keyword>multihomed</keyword> | |||
<keyword>tcp extensions multipath multihomed subflow</keyword> | <keyword>subflow</keyword> | |||
<abstract> | <abstract> | |||
<t>TCP/IP communication is currently restricted to a single path per conne | <t>TCP/IP communication is currently restricted to a single path per conne | |||
ction, yet multiple paths often exist between peers. The simultaneous use of the | ction, yet multiple paths often exist between peers. The simultaneous use of the | |||
se multiple paths for a TCP/IP session would improve resource usage within the n | se multiple paths for a TCP/IP session would improve resource usage within the n | |||
etwork and, thus, improve user experience through higher throughput and improved | etwork and thus improve user experience through higher throughput and improved r | |||
resilience to network failure.</t> | esilience to network failure.</t> | |||
<t>Multipath TCP provides the ability to simultaneously use multiple | ||||
<t>Multipath TCP provides the ability to simultaneously use multiple paths | paths between peers. This document presents a set of extensions to | |||
between peers. This document presents a set of extensions to traditional TCP to | traditional TCP to support multipath operation. The protocol offers the | |||
support multipath operation. The protocol offers the same type of service to ap | same type of service to applications as TCP (i.e., a reliable bytestream), | |||
plications as TCP (i.e., reliable bytestream), and it provides the components ne | and it provides the components necessary to establish and use multiple TCP flow | |||
cessary to establish and use multiple TCP flows across potentially disjoint path | s across potentially disjoint paths.</t> | |||
s.</t> | <t>This document specifies v1 of Multipath TCP, obsoleting v0 as | |||
specified in RFC 6824, through clarifications and modifications primarily | ||||
<t>This document specifies v1 of Multipath TCP, obsoleting v0 as specified | driven by deployment experience.</t> | |||
in RFC6824, through clarifications and modifications primarily driven by deploy | ||||
ment experience.</t> | ||||
</abstract> | </abstract> | |||
</front> | </front> | |||
<middle> | <middle> | |||
<section title="Introduction" anchor="sec_intro"> | <section anchor="sec_intro" numbered="true" toc="default"> | |||
<t>Multipath TCP (MPTCP) is a set of extensions to regular TCP <xref targe | <name>Introduction</name> | |||
t="RFC0793"/> to provide a Multipath TCP <xref target="RFC6182"/> service, which | <t>Multipath TCP (MPTCP) is a set of extensions to regular TCP <xref | |||
enables a transport connection to operate across multiple paths | target="RFC0793" format="default"/> to provide a Multipath TCP service <xr | |||
simultaneously. This document presents the protocol changes required to add mult | ef target="RFC6182" format="default"/>, which enables a transport connection to | |||
ipath capability to TCP; specifically, those for signaling and setting up multip | operate across multiple paths | |||
le paths ("subflows"), managing these subflows, reassembly of data, and terminat | simultaneously. This document presents the protocol changes required to add | |||
ion of sessions. | multipath capability to TCP -- specifically, those for signaling and setting | |||
This is not the only information required to create a Multipath TCP implem | up multiple paths ("subflows"), managing these subflows, reassembly of data, | |||
entation, however. This document is complemented by three others: | and termination of sessions. This is not the only information required to create | |||
<list style="symbols"> | a Multipath TCP implementation, however. This document is complemented by three | |||
<t>Architecture <xref target="RFC6182"/>, which explains the motivatio | others: | |||
ns behind Multipath TCP, contains a discussion of high-level design decisions on | ||||
which this design is based, and an explanation of a functional separation throu | ||||
gh which an extensible MPTCP implementation can be developed.</t> | ||||
<t>Congestion control <xref target="RFC6356"/> presents a safe congest | ||||
ion control algorithm for coupling the behavior of the multiple paths in order t | ||||
o "do no harm" to other network users.</t> | ||||
<t>Application considerations <xref target="RFC6897"/> discusses what | ||||
impact MPTCP will have on applications, what applications will want to do with M | ||||
PTCP, and as a consequence of these factors, what API extensions an MPTCP implem | ||||
entation should present.</t> | ||||
</list> | ||||
This document is an update to, and obsoletes, the v0 specification of Mult | ||||
ipath TCP (RFC6824). This document specifies MPTCP v1, which is not backward com | ||||
patible with MPTCP v0. This document additionally defines version negotiation pr | ||||
ocedures for implementations that support both versions. | ||||
</t> | </t> | |||
<ul spacing="normal"> | ||||
<section title="Design Assumptions" anchor="sec_assum"> | <li><xref target="RFC6182" format="default"/> (MPTCP architecture), whic | |||
<t>In order to limit the potentially huge design space, the mptcp workin | h | |||
g group imposed two key constraints on the Multipath TCP design presented in thi | explains the motivations behind Multipath TCP, contains a discussion | |||
s document: | of high-level design decisions on which this design is based, and provid | |||
<list style="symbols"> | es an explanation of a functional separation through which an extensible MPTCP i | |||
<t>It must be backwards-compatible with current, regular TCP, to inc | mplementation can be developed.</li> | |||
rease its chances of deployment.</t> | <li><xref target="RFC6356" format="default"/> (congestion control), whic | |||
<t>It can be assumed that one or both hosts are multihomed and multi | h presents a safe congestion control algorithm for coupling the behavior of the | |||
addressed.</t> | multiple paths in order to "do no harm" to other network users.</li> | |||
</list> | <li><xref target="RFC6897" | |||
format="default"/> (application considerations), which discusses what im | ||||
pact MPTCP will have on applications, what applications will want to do with MPT | ||||
CP, and as a consequence of these factors, what API extensions an MPTCP implemen | ||||
tation should present.</li> | ||||
</ul> | ||||
<t> | ||||
This document obsoletes the v0 specification of | ||||
Multipath TCP <xref target="RFC6824"/>. This document specifies MPTCP v1, | ||||
which is not backward compatible with MPTCP v0. This document additionally defin | ||||
es version negotiation procedures for implementations that support both versions | ||||
. | ||||
</t> | ||||
<section anchor="sec_assum" numbered="true" toc="default"> | ||||
<name>Design Assumptions</name> | ||||
<t>In order to limit the potentially huge design space, the | ||||
MPTCP Working Group imposed two key constraints on the Multipath TCP des | ||||
ign presented in this document: | ||||
</t> | </t> | |||
<t>To simplify the design, we assume that the presence of multiple addre | <ul spacing="normal"> | |||
sses at a host is sufficient to indicate the existence of multiple paths. These | <li>It must be backward compatible with current, regular TCP, to incre | |||
paths need not be entirely disjoint: they may share one or many routers between | ase its chances of deployment.</li> | |||
them. Even in such a situation, making use of multiple paths is beneficial, impr | <li>It can be assumed that one or both hosts are multihomed and multia | |||
oving resource utilization and resilience to a subset of node failures. The cong | ddressed.</li> | |||
estion control algorithms defined in <xref target="RFC6356"/> ensure this does n | </ul> | |||
ot act detrimentally. Furthermore, there may be some scenarios where different T | <t>To simplify the design, we assume that the presence of multiple | |||
CP ports on a single host can provide disjoint paths (such as through certain Eq | addresses at a host is sufficient to indicate the existence of | |||
ual-Cost Multipath (ECMP) implementations <xref target="RFC2992"/>), and so the | multiple paths. These paths need not be entirely disjoint: they may | |||
MPTCP design also supports the use of ports in path identifiers.</t> | share one or many routers between them. Even in such a situation, | |||
<t>There are three aspects to the backwards-compatibility listed above ( | making use of multiple paths is beneficial, improving resource | |||
discussed in more detail in <xref target="RFC6182"/>): | utilization and resilience to a subset of node failures. The | |||
<list style="hanging"> | congestion control algorithm defined in <xref target="RFC6356" | |||
<t hangText="External Constraints:"> The protocol must function thro | format="default"/> ensures that the use of multiple paths does not act d | |||
ugh the vast majority of existing | etrimentally. | |||
middleboxes such as NATs, firewalls, and proxies, and as such must resemble exis | Furthermore, there may be some scenarios where different TCP ports on a | |||
ting TCP as far as possible on the | single host can provide disjoint paths (such as through certain | |||
wire. Furthermore, the protocol must not assume the segments it sends on the wir | Equal-Cost Multipath (ECMP) implementations <xref target="RFC2992" | |||
e arrive unmodified at the destination: | format="default"/>), and so the MPTCP design also supports the use of | |||
they may be split or coalesced; TCP options may be removed or duplicated. </t> | ports in path identifiers.</t> | |||
<t hangText="Application Constraints:"> The protocol must be usable | <t>There are three aspects to the backward compatibility listed above (d | |||
with no change to existing applications that use the common TCP API (although it | iscussed in more detail in <xref target="RFC6182" format="default"/>): | |||
is reasonable that not all features would be available to such legacy applicati | ||||
ons). Furthermore, the protocol must provide the same service model as regular T | ||||
CP to the application.</t> | ||||
<t hangText="Fallback:"> The protocol should be able to fall back to | ||||
standard TCP with no interference from the user, to be able to communicate with | ||||
legacy hosts.</t> | ||||
</list> | ||||
</t> | </t> | |||
<t>The complementary application considerations document <xref target="R | <dl newline="false" spacing="normal" indent="3"> | |||
FC6897"/> discusses the necessary features of an API to provide backwards-compat | <dt>External Constraints:</dt> | |||
ibility, as well as API extensions to convey the behavior of MPTCP at a level of | <dd> The protocol must function through the vast majority of existing | |||
control and information equivalent to that available with regular, single-path | middleboxes such as NATs, firewalls, and proxies, and as such must resemble exis | |||
TCP.</t> | ting TCP as far as possible on the | |||
<t>Further discussion of the design constraints and associated design de | wire. Furthermore, the protocol must not assume that the segments it sends on th | |||
cisions are given in the MPTCP Architecture document <xref target="RFC6182"/> an | e wire arrive unmodified at the destination: | |||
d in <xref target="howhard"/>.</t> | they may be split or coalesced; TCP options may be removed or duplicated. </dd> | |||
<dt>Application Constraints:</dt> | ||||
<dd> The protocol must be usable with no change to existing applicatio | ||||
ns that use the common TCP API (although it is reasonable that not all features | ||||
would be available to such legacy applications). Furthermore, the protocol must | ||||
provide the same service model as regular TCP to the application.</dd> | ||||
<dt>Fallback:</dt> | ||||
<dd> The protocol should be able to fall back to standard TCP with no | ||||
interference from the user, to be able to communicate with legacy hosts.</dd> | ||||
</dl> | ||||
<t>The complementary application considerations document <xref | ||||
target="RFC6897" format="default"/> discusses the necessary features | ||||
of an API to provide backward compatibility, as well as API extensions t | ||||
o convey the behavior of MPTCP at a level of control and information equivalent | ||||
to that available with regular, single-path TCP.</t> | ||||
<t>Further discussion of the design constraints and associated design de | ||||
cisions is given in the MPTCP architecture document <xref target="RFC6182" forma | ||||
t="default"/> and in <xref target="howhard" format="default"/>.</t> | ||||
</section> | </section> | |||
<section anchor="sec_layers" numbered="true" toc="default"> | ||||
<section title="Multipath TCP in the Networking Stack" anchor="sec_layers" | <name>Multipath TCP in the Networking Stack</name> | |||
> | ||||
<t>MPTCP operates at the transport layer and aims to be transparent to both higher and lower | <t>MPTCP operates at the transport layer and aims to be transparent to both higher and lower | |||
layers. It is a set of additional features on top of standard TCP; <xref target="fig_arch" /> illustrates | layers. It is a set of additional features on top of standard TCP; <xref target="fig_arch" format="default"/> illustrates | |||
this layering. MPTCP is designed to be usable by legacy applications with no changes; detailed discussion | this layering. MPTCP is designed to be usable by legacy applications with no changes; detailed discussion | |||
of its interactions with applications is given in <xref target="RFC6897"/>.</t> | of its interactions with applications is given in <xref target="RFC6897" format= | |||
"default"/>.</t> | ||||
<figure align="center" anchor="fig_arch" title="Comparison of Standard T | <figure anchor="fig_arch"> | |||
CP and MPTCP Protocol Stacks"> | <name>Comparison of Standard TCP and MPTCP Protocol Stacks</name> | |||
<artwork align="left"><![CDATA[ | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
+-------------------------------+ | +-------------------------------+ | |||
| Application | | | Application | | |||
+---------------+ +-------------------------------+ | +---------------+ +-------------------------------+ | |||
| Application | | MPTCP | | | Application | | MPTCP | | |||
+---------------+ + - - - - - - - + - - - - - - - + | +---------------+ + - - - - - - - + - - - - - - - + | |||
| TCP | | Subflow (TCP) | Subflow (TCP) | | | TCP | | Subflow (TCP) | Subflow (TCP) | | |||
+---------------+ +-------------------------------+ | +---------------+ +-------------------------------+ | |||
| IP | | IP | IP | | | IP | | IP | IP | | |||
+---------------+ +-------------------------------+ | +---------------+ +-------------------------------+ ]]></artwork> | |||
]]></artwork> | ||||
</figure> | </figure> | |||
</section> | </section> | |||
<section numbered="true" toc="default"> | ||||
<section title="Terminology"> | <name>Terminology</name> | |||
<t>This document makes use of a number of terms that are either MPTCP-sp | <t>This document makes use of a number of terms that are either MPTCP sp | |||
ecific or have defined meaning in the context of MPTCP, as follows: | ecific or have defined meaning in the context of MPTCP, as follows: | |||
<list style="hanging"> | </t> | |||
<t hangText="Path:"> A sequence of links between a sender and a receiv | <dl newline="false" spacing="normal" indent="3"> | |||
er, defined in this context by a 4-tuple of source and destination address/port | <dt>Path:</dt> | |||
pairs.</t> | <dd> A sequence of links between a sender and a receiver, defined in t | |||
<t hangText="Subflow:"> A flow of TCP segments operating over an indiv | his context by a 4-tuple of source and destination address&wj;/port pairs.</dd> | |||
idual path, which forms part of a larger MPTCP connection. A subflow is started | <dt>Subflow:</dt> | |||
and terminated similar to a regular TCP connection.</t> | <dd> A flow of TCP segments operating over an individual path, which f | |||
<t hangText="(MPTCP) Connection:"> A set of one or more subflows, over | orms part of a larger MPTCP connection. A subflow is started and terminated simi | |||
which an application can communicate between two hosts. There is a one-to-one m | larly to a regular TCP connection.</dd> | |||
apping between a connection and an application socket.</t> | <dt>(MPTCP) Connection:</dt> | |||
<t hangText="Data-level:"> The payload data is nominally transferred o | <dd> A set of one or more subflows, over which an application can comm | |||
ver a connection, which in turn is transported over subflows. Thus, the term "d | unicate between two hosts. There is a one‑to‑one mapping between a c | |||
ata-level" is synonymous with "connection level", in contrast to "subflow-level" | onnection and an application socket.</dd> | |||
, which refers to properties of an individual subflow.</t> | <dt>Data-level:</dt> | |||
<t hangText="Token:"> A locally unique identifier given to a multipath | <dd> The payload data is nominally transferred over a connection, whic | |||
connection by a host. May also be referred to as a "Connection ID".</t> | h in turn is transported over subflows. Thus, the term "data-level" is synonymo | |||
<t hangText="Host:"> An end host operating an MPTCP implementation, an | us with "connection-level", in contrast to "subflow-level", which refers to prop | |||
d either initiating or accepting an MPTCP connection.</t> | erties of an individual subflow.</dd> | |||
</list> | <dt>Token:</dt> | |||
In addition to these terms, note that MPTCP's interpretation of, and eff | <dd> A locally unique identifier given to a multipath connection by a | |||
ect on, regular single-path TCP semantics are discussed in <xref target="sec_sem | host. May also be referred to as a "Connection ID".</dd> | |||
antics"/>.</t> | <dt>Host:</dt> | |||
<dd> An end host operating an MPTCP implementation, and either initiat | ||||
ing or accepting an MPTCP connection.</dd> | ||||
</dl> | ||||
<t> | ||||
In addition to these terms, note that MPTCP's interpretation of, and eff | ||||
ect on, regular single-path TCP semantics are discussed in <xref target="sec_sem | ||||
antics" format="default"/>.</t> | ||||
</section> | </section> | |||
<section anchor="sec_operation" numbered="true" toc="default"> | ||||
<section title="MPTCP Concept" anchor="sec_operation"> | <name>MPTCP Concept</name> | |||
<t>This section provides a high-level summary of normal | <t>This section provides a high-level summary of normal | |||
operation of MPTCP, and is illustrated by the scenario shown in | operation of MPTCP; this type of scenario is illustrated in | |||
<xref target="fig_scenario"/>. A detailed description of operation is given in < | <xref target="fig_scenario" format="default"/>. A detailed description of how | |||
xref target="sec_protocol"/>. | MPTCP operates is given in <xref target="sec_protocol" format="default"/>. | |||
<list style="symbols"> | ||||
<t>To a non-MPTCP-aware application, MPTCP will behave the same as n | ||||
ormal TCP. Extended APIs could provide | ||||
additional control to MPTCP-aware applications <xref target="RFC6897"/>. | ||||
An application begins by opening a TCP socket in the normal way. | ||||
MPTCP signaling and operation are handled by the MPTCP implementation. | ||||
</t> | ||||
<t>An MPTCP connection begins similarly to a regular TCP connection. | ||||
This is | ||||
illustrated in <xref target="fig_scenario"/> where an MPTCP connection is establ | ||||
ished between | ||||
addresses A1 and B1 on Hosts A and B, respectively.</t> | ||||
<t>If extra paths are available, additional TCP sessions (termed MPT | ||||
CP "subflows") | ||||
are created on these paths, and are combined with the existing session, which co | ||||
ntinues | ||||
to appear as a single connection to the applications at both ends. The creation | ||||
of the | ||||
additional TCP session is illustrated between Address A2 on Host A and Address B | ||||
1 on | ||||
Host B.</t> | ||||
<t>MPTCP identifies multiple paths by the presence of multiple addre | ||||
sses | ||||
at hosts. Combinations of these multiple addresses equate to the additional path | ||||
s. | ||||
In the example, other potential paths that could be set up are A1<->B2 and | ||||
A2<->B2. | ||||
Although this additional session is shown as being initiated from A2, it could e | ||||
qually have | ||||
been initiated from B1 or B2.</t> | ||||
<t>The discovery and setup of additional subflows | ||||
will be achieved through a path management method; this document describes a mec | ||||
hanism | ||||
by which a host can initiate new subflows by using its own additional addresses, | ||||
or by | ||||
signaling its available addresses to the other host.</t> | ||||
<t>MPTCP adds connection-level sequence numbers to allow the reassem | ||||
bly of | ||||
segments arriving on multiple subflows with differing network delays. </t> | ||||
<t>Subflows are terminated as regular TCP connections, with a four-w | ||||
ay FIN | ||||
handshake. The MPTCP connection is terminated by a connection-level FIN.</t> | ||||
</list> | ||||
</t> | </t> | |||
<?rfc needLines='17'?> | <figure anchor="fig_scenario"> | |||
<figure align="center" anchor="fig_scenario" title="Example MPTCP Usag | <name>Example MPTCP Usage Scenario</name> | |||
e Scenario"> | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
<artwork align="left"><![CDATA[ | ||||
Host A Host B | Host A Host B | |||
------------------------ ------------------------ | ------------------------ ------------------------ | |||
Address A1 Address A2 Address B1 Address B2 | Address A1 Address A2 Address B1 Address B2 | |||
---------- ---------- ---------- ---------- | ---------- ---------- ---------- ---------- | |||
| | | | | | | | | | |||
| (initial connection setup) | | | | (initial connection setup) | | | |||
|----------------------------------->| | | |----------------------------------->| | | |||
|<-----------------------------------| | | |<-----------------------------------| | | |||
| | | | | | | | | | |||
| (additional subflow setup) | | | (additional subflow setup) | | |||
| |--------------------->| | | | |--------------------->| | | |||
| |<---------------------| | | | |<---------------------| | | |||
| | | | | | | | | | |||
| | | | | | | | | ]]></artwork> | |||
]]></artwork> | </figure> | |||
</figure> | <ul spacing="normal"> | |||
<li>To a non-MPTCP-aware application, MPTCP will behave the same as no | ||||
rmal TCP. Extended APIs could provide | ||||
additional control to MPTCP-aware applications <xref target="RFC6897" format="de | ||||
fault"/>. | ||||
An application begins by opening a TCP socket in the normal way. | ||||
MPTCP signaling and operation are handled by the MPTCP implementation. | ||||
</li> | ||||
<li>An MPTCP connection begins similarly to a regular TCP connection. | ||||
This is | ||||
illustrated in <xref target="fig_scenario" format="default"/>, where an MPTCP co | ||||
nnection is established between | ||||
addresses A1 and B1 on Hosts A and B, respectively.</li> | ||||
<li>If extra paths are available, additional TCP sessions (termed MPTC | ||||
P "subflows") | ||||
are created on these paths and are combined with the existing session, which con | ||||
tinues | ||||
to appear as a single connection to the applications at both ends. The creation | ||||
of the | ||||
additional TCP session is illustrated between Address A2 on Host A and Address B | ||||
1 on | ||||
Host B.</li> | ||||
<li>MPTCP identifies multiple paths by the presence of multiple addres | ||||
ses | ||||
at hosts. Combinations of these multiple addresses equate to the additional path | ||||
s. | ||||
In the example, other potential paths that could be set up are A1&lt;-&gt;B2 and | ||||
A2&lt;-&gt;B2. | ||||
Although this additional session is shown as being initiated from A2, it could e | ||||
qually have | ||||
been initiated from B1 or B2.</li> | ||||
<li>The discovery and setup of additional subflows | ||||
will be achieved through a path management method; this document describes a mec | ||||
hanism | ||||
by which a host can initiate new subflows by using its own additional addresses | ||||
or by | ||||
signaling its available addresses to the other host.</li> | ||||
<li>MPTCP adds connection-level sequence numbers to allow the reassemb | ||||
ly of | ||||
segments arriving on multiple subflows with differing network delays. </li> | ||||
<li>Subflows are terminated as regular TCP connections, with a four&#8209;way FIN | ||||
handshake. The MPTCP connection is terminated by a connection-level FIN.</li> | ||||
</ul> | ||||
</section> | </section> | |||
<section numbered="true" toc="default"> | ||||
<name>Requirements Language</name> | ||||
<section title="Requirements Language"> | <t> | |||
<t>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL | The key words "<bcp14>MUST</bcp14>", "<bcp14>MUST NOT</bcp14>", | |||
NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", | "<bcp14>REQUIRED</bcp14>", "<bcp14>SHALL</bcp14>", "<bcp14>SHALL NOT</bcp14> | |||
"MAY", and "OPTIONAL" in this document are to be interpreted as | ", | |||
described in BCP 14 <xref target="RFC2119"/> <xref target="RFC8174" | "<bcp14>SHOULD</bcp14>", "<bcp14>SHOULD NOT</bcp14>", | |||
/> | "<bcp14>RECOMMENDED</bcp14>", "<bcp14>NOT RECOMMENDED</bcp14>", | |||
when, and only when, they appear in all capitals, as shown here.</t> | "<bcp14>MAY</bcp14>", and "<bcp14>OPTIONAL</bcp14>" in this document are to | |||
be | ||||
interpreted as described in BCP 14 <xref target="RFC2119"/> <xref | ||||
target="RFC8174"/> when, and only when, they appear in all capitals, as | ||||
shown here. | ||||
</t> | ||||
</section> | </section> | |||
</section> | </section> | |||
<section anchor="sec_overview" numbered="true" toc="default"> | ||||
<section title="Operation Overview" anchor="sec_overview"> | <name>Operation Overview</name> | |||
<t>This section presents a single description of common MPTCP operation, w | <t>This section presents a single description of common MPTCP operation, w | |||
ith reference to the protocol operation. This is a high-level overview of the ke | ith reference to the protocol operation. This is a high-level overview of the ke | |||
y functions; the full specification follows in <xref target="sec_protocol"/>. Ex | y functions; the full specification follows in <xref target="sec_protocol" forma | |||
tensibility and negotiated features are not discussed here. Considerable referen | t="default"/>. Extensibility and negotiated features are not discussed here. Con | |||
ce is made to symbolic names of MPTCP options throughout this section -- these a | siderable reference is made to symbolic names of MPTCP options throughout this s | |||
re subtypes of the IANA-assigned MPTCP option (see <xref target="IANA"/>), and t | ection -- these are subtypes of the IANA‑assigned MPTCP option (see <xref | |||
heir formats are defined in the detailed protocol specification that follows in | target="IANA" format="default"/>), and their formats are defined in the detailed | |||
<xref target="sec_protocol"/>.</t> | protocol specification provided in <xref target="sec_protocol" format="default" | |||
/>.</t> | ||||
<t>A Multipath TCP connection provides a bidirectional bytestream between two ho | <t>A Multipath TCP connection provides a bidirectional bytestream between | |||
sts communicating like normal TCP and, thus, does not require any change to the | two hosts communicating like normal TCP and thus does not require any change to | |||
applications. However, Multipath TCP enables the hosts to use different paths wi | the applications. However, Multipath TCP enables the hosts to use different path | |||
th different IP addresses to exchange packets belonging to the MPTCP connection. | s with different IP addresses to exchange packets belonging to the MPTCP connect | |||
A Multipath TCP connection appears like a normal TCP connection to an applicati | ion. A Multipath TCP connection appears like a normal TCP connection to an appli | |||
on. However, to the network layer, each MPTCP subflow looks like a regular TCP f | cation. However, to the network layer, each MPTCP subflow looks like a regular T | |||
low whose segments carry a new TCP option type. Multipath TCP manages the creati | CP flow whose segments carry a new TCP option type. Multipath TCP manages the cr | |||
on, removal, and utilization of these subflows to send data. The number of subfl | eation, removal, and utilization of these subflows to send data. The number of s | |||
ows that are managed within a Multipath TCP connection is not fixed and it can f | ubflows that are managed within a Multipath TCP connection is not fixed, and it | |||
luctuate during the lifetime of the Multipath TCP connection.</t> | can fluctuate during the lifetime of the Multipath TCP connection.</t> | |||
<t>All MPTCP operations are signaled with a TCP option -- a single numeric | ||||
<t>All MPTCP operations are signaled with a TCP option -- a single numerical typ | al type for MPTCP, with "subtypes" for each MPTCP message. What follows is a sum | |||
e for MPTCP, with "sub-types" for each MPTCP message. What follows is a summary | mary of the purpose and rationale of these messages.</t> | |||
of the purpose and rationale of these messages.</t> | <section numbered="true" toc="default"> | |||
<section title="Initiating an MPTCP Connection"> | <name>Initiating an MPTCP Connection</name> | |||
<t>This is the same signaling as for initiating a normal TCP connection, but the | <t>This is the same signaling as for initiating a normal TCP connection, | |||
SYN, SYN/ACK, and initial ACK (and data) packets also carry the MP_CAPABLE opti | but the SYN, SYN/ACK, and initial ACK (and data) packets also carry the MP_CAPA | |||
on. This option has a variable length and serves multiple purposes. Firstly, it | BLE option. This option has a variable length and serves multiple purposes. Firs | |||
verifies whether the remote host supports Multipath TCP; secondly, this option a | tly, it verifies whether the remote host supports Multipath TCP; secondly, this | |||
llows the hosts to exchange some information to authenticate the establishment o | option allows the hosts to exchange some information to authenticate the establi | |||
f additional subflows. Further details are given in <xref target="sec_init"/>.</ | shment of additional subflows. Further details are given in <xref target="sec_in | |||
t> | it" format="default"/>.</t> | |||
<artwork align="left" name="" type="" alt=""><![CDATA[ | ||||
<figure><artwork align="left"><![CDATA[ | ||||
Host A Host B | Host A Host B | |||
------ ------ | ------ ------ | |||
MP_CAPABLE -> | MP_CAPABLE -> | |||
[flags] | [flags] | |||
<- MP_CAPABLE | <- MP_CAPABLE | |||
[B's key, flags] | [B's key, flags] | |||
ACK + MP_CAPABLE (+ data) -> | ACK + MP_CAPABLE (+ data) -> | |||
[A's key, B's key, flags, (data-level details)] | [A's key, B's key, flags, (data-level details)] ]]></artwork> | |||
]]></artwork></figure> | <t>Retransmission of the ACK + MP_CAPABLE can occur if it is not known i | |||
f it has been received. The following diagrams show all possible exchanges for t | ||||
<t>Retransmission of the ACK + MP_CAPABLE can occur if it is not known if it has | he initial subflow setup to ensure this reliability.</t> | |||
been received. The following diagrams show all possible exchanges for the initi | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
al subflow setup to ensure this reliability.</t> | ||||
<figure><artwork align="left"><![CDATA[ | ||||
Host A (with data to send immediately) Host B | Host A (with data to send immediately) Host B | |||
------ ------ | ------ ------ | |||
MP_CAPABLE -> | MP_CAPABLE -> | |||
[flags] | [flags] | |||
<- MP_CAPABLE | <- MP_CAPABLE | |||
[B's key, flags] | [B's key, flags] | |||
ACK + MP_CAPABLE + data -> | ACK + MP_CAPABLE + data -> | |||
[A's key, B's key, flags, data-level details] | [A's key, B's key, flags, data-level details] | |||
Host A (with data to send later) Host B | Host A (with data to send later) Host B | |||
skipping to change at line 316 ¶ | skipping to change at line 308 ¶ | |||
Host A Host B (sending first) | Host A Host B (sending first) | |||
------ ------ | ------ ------ | |||
MP_CAPABLE -> | MP_CAPABLE -> | |||
[flags] | [flags] | |||
<- MP_CAPABLE | <- MP_CAPABLE | |||
[B's key, flags] | [B's key, flags] | |||
ACK + MP_CAPABLE -> | ACK + MP_CAPABLE -> | |||
[A's key, B's key, flags] | [A's key, B's key, flags] | |||
<- ACK + DSS + data | <- ACK + DSS + data | |||
[data-level details] | [data-level details] ]]></artwork> | |||
]]></artwork></figure> | </section> | |||
</section> | <section numbered="true" toc="default"> | |||
<name>Associating a New Subflow with an Existing MPTCP Connection</name> | ||||
<section title="Associating a New Subflow with an Existing MPTCP Connection"> | <t>The exchange of keys in the MP_CAPABLE handshake provides material th | |||
<t>The exchange of keys in the MP_CAPABLE handshake provides material that can b | at can be used to authenticate the endpoints when new subflows will be set up. | |||
e used to authenticate the endpoints when new subflows will be set up. | ||||
Additional subflows begin in the same way as initiating a normal TCP connection, but the SYN, SYN/ACK, and ACK packets also carry the MP_JOIN option. </t> | Additional subflows begin in the same way as initiating a normal TCP connection, but the SYN, SYN/ACK, and ACK packets also carry the MP_JOIN option. </t> | |||
<t>Host A initiates a new subflow between one of its addresses and one | ||||
<t>Host A initiates a new subflow between one of its addresses and one of Host B | of Host B's addresses. The token -- generated from the key -- is used | |||
's addresses. The token -- generated from the key -- is used to identify which M | to identify which MPTCP connection it is joining, and the Hash‑bas | |||
PTCP connection it is joining, and the HMAC is used for authentication. The Hash | ed | |||
-based Message Authentication Code (HMAC) uses the keys exchanged in the MP_CAPA | Message Authentication Code (HMAC) is used for authentication. The HMAC | |||
BLE handshake, and the random numbers (nonces) exchanged in these MP_JOIN option | uses the keys exchanged in the MP_CAPABLE handshake and the random numbers (nonc | |||
s. MP_JOIN also contains flags and an Address ID that can be used to refer to th | es) exchanged in these MP_JOIN options. MP_JOIN also contains flags and an Addre | |||
e source address without the sender needing to know if it has been changed by a | ss ID that can be used to refer to the source address without the sender needing | |||
NAT. Further details are in <xref target="sec_join"/>.</t> | to know if it has been changed by a NAT. Further details are given in <xref tar | |||
get="sec_join" format="default"/>.</t> | ||||
<figure><artwork align="left"><![CDATA[ | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
Host A Host B | Host A Host B | |||
------ ------ | ------ ------ | |||
MP_JOIN -> | MP_JOIN -> | |||
[B's token, A's nonce, | [B's token, A's nonce, | |||
A's Address ID, flags] | A's Address ID, flags] | |||
<- MP_JOIN | <- MP_JOIN | |||
[B's HMAC, B's nonce, | [B's HMAC, B's nonce, | |||
B's Address ID, flags] | B's Address ID, flags] | |||
ACK + MP_JOIN -> | ACK + MP_JOIN -> | |||
[A's HMAC] | [A's HMAC] | |||
<- ACK | <- ACK ]]></artwork> | |||
]]></artwork></figure> | </section> | |||
</section> | <section numbered="true" toc="default"> | |||
<name>Informing the Other Host about Another Potential Address</name> | ||||
<section title="Informing the Other Host about Another Potential Address"> | <t>The set of IP addresses associated to a multihomed host may change du | |||
<t>The set of IP addresses associated to a multihomed host may change during the | ring the lifetime of an MPTCP connection. MPTCP supports the addition and remova | |||
lifetime of an MPTCP connection. MPTCP supports the addition and removal of add | l of addresses on a host both implicitly and explicitly. If Host A has establish | |||
resses on a host both implicitly and explicitly. If Host A has established a sub | ed a subflow starting at address&wj;/port pair IP#-A1 and wants to open a second | |||
flow starting at address/port pair IP#-A1 and wants to open a second subflow sta | subflow starting at address&wj;/port pair IP#-A2, it simply initiates the estab | |||
rting at address/port pair IP#-A2, it simply initiates the establishment of the | lishment of the subflow as explained above. The remote host will then be implici | |||
subflow as explained above. The remote host will then be implicitly informed abo | tly informed about the new address.</t> | |||
ut the new address.</t> | <t>In some circumstances, a host may want to advertise to the remote | |||
host the availability of an address without establishing a new subflow | ||||
<t>In some circumstances, a host may want to advertise to the remote host the av | -- for example, when a NAT prevents setup in one direction. In the exampl | |||
ailability of an address without establishing a new subflow, for example, when a | e below, Host A informs Host B about its alternative IP address&wj;/port pa | |||
NAT prevents setup in one direction. In the example below, Host A informs Host | ir (IP#-A2). Host B may later send an MP_JOIN to this new address. The ADD_ADDR | |||
B about its alternative IP address/port pair (IP#-A2). Host B may later send an | option contains an HMAC to authenticate the address as having been sent from the | |||
MP_JOIN to this new address. The ADD_ADDR option contains a HMAC to authenticat | originator of the connection. The receiver of this option echoes it back to the | |||
e the address as having been sent from the originator of the connection. The rec | client to indicate successful receipt. Further details are given in <xref targe | |||
eiver of this option echoes it back to the client to indicate successful receipt | t="sec_add_address" format="default"/>.</t> | |||
. Further details are in <xref target="sec_add_address"/>.</t> | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
<figure><artwork align="left"><![CDATA[ | ||||
Host A Host B | Host A Host B | |||
------ ------ | ------ ------ | |||
ADD_ADDR -> | ADD_ADDR -> | |||
[Echo-flag=0, | [Echo-flag=0, | |||
IP#-A2, | IP#-A2, | |||
IP#-A2's Address ID, | IP#-A2's Address ID, | |||
HMAC of IP#-A2] | HMAC of IP#-A2] | |||
<- ADD_ADDR | <- ADD_ADDR | |||
[Echo-flag=1, | [Echo-flag=1, | |||
IP#-A2, | IP#-A2, | |||
IP#-A2's Address ID, | IP#-A2's Address ID, | |||
HMAC of IP#-A2] | HMAC of IP#-A2] ]]></artwork> | |||
]]></artwork></figure> | <t>There is a corresponding signal for address removal, making use of | |||
the Address ID that is signaled in the ADD_ADDR handshake. | ||||
<t>There is a corresponding signal for address removal, making use of the Addres | ||||
s ID that is signaled in the add address handshake. Further details in <xref tar | ||||
get="sec_remove_addr"/>.</t> | ||||
<figure><artwork align="left"><![CDATA[ | Further details are given in <xref target="sec_remove_addr" format="default"/>. | |||
</t> | ||||
<artwork align="left" name="" type="" alt=""><![CDATA[ | ||||
Host A Host B | Host A Host B | |||
------ ------ | ------ ------ | |||
REMOVE_ADDR -> | REMOVE_ADDR -> | |||
[IP#-A2's Address ID] | [IP#-A2's Address ID] ]]></artwork> | |||
]]></artwork></figure> | </section> | |||
</section> | <section numbered="true" toc="default"> | |||
<name>Data Transfer Using MPTCP</name> | ||||
<section title="Data Transfer Using MPTCP"> | <t>To ensure reliable, in-order delivery of data over subflows that may | |||
<t>To ensure reliable, in-order delivery of data over subflows that may appear a | appear and disappear at any time, MPTCP uses a 64-bit Data Sequence Number (DSN) | |||
nd disappear at any time, MPTCP uses a 64-bit data sequence number (DSN) to numb | to number all data sent over the MPTCP connection. Each subflow has its own 32- | |||
er all data sent over the MPTCP connection. Each subflow has its own 32-bit sequ | bit sequence number space, utilizing the regular TCP sequence number header, and | |||
ence number space, utilising the regular TCP sequence number header, and an MPTC | an MPTCP option maps the subflow sequence space to the data sequence space. In | |||
P option maps the subflow sequence space to the data sequence space. In this way | this way, data can be retransmitted on different subflows (mapped to the same DS | |||
, data can be retransmitted on different subflows (mapped to the same DSN) in th | N) in the event of failure.</t> | |||
e event of failure.</t> | <t>The Data Sequence Signal (DSS) carries the Data Sequence Mapping. The | |||
Data Sequence Mapping consists of the subflow sequence number, data sequence nu | ||||
<t>The Data Sequence Signal (DSS) carries the Data Sequence Mapping. The Data Se | mber, and length for which this mapping is valid. This option can also carry a c | |||
quence Mapping consists of the subflow sequence number, data sequence number, an | onnection-level acknowledgment (the "Data ACK") for the received DSN.</t> | |||
d length for which this mapping is valid. This option can also carry a connectio | <t>With MPTCP, all subflows share the same receive buffer and advertise | |||
n-level acknowledgment (the "Data ACK") for the received DSN.</t> | the same receive window. There are two levels of acknowledgment in MPTCP. Regula | |||
r TCP acknowledgments are used on each subflow to acknowledge the reception of t | ||||
<t>With MPTCP, all subflows share the same receive buffer and advertise the same | he segments sent over the subflow independently of their DSN. In addition, there | |||
receive window. There are two levels of acknowledgment in MPTCP. Regular TCP ac | are connection-level acknowledgments for the data sequence space. These acknowl | |||
knowledgments are used on each subflow to acknowledge the reception of the segme | edgments track the advancement of the bytestream and slide the receive window.</ | |||
nts sent over the subflow independently of their DSN. In addition, there are con | t> | |||
nection-level acknowledgments for the data sequence space. These acknowledgments | <t>Further details are given in <xref target="sec_generalop" format="def | |||
track the advancement of the bytestream and slide the receiving window.</t> | ault"/>.</t> | |||
<artwork align="left" name="" type="" alt=""><![CDATA[ | ||||
<t>Further details are in <xref target="sec_generalop"/>.</t> | ||||
<figure><artwork align="left"><![CDATA[ | ||||
Host A Host B | Host A Host B | |||
------ ------ | ------ ------ | |||
DSS -> | DSS -> | |||
[Data Sequence Mapping] | [Data Sequence Mapping] | |||
[Data ACK] | [Data ACK] | |||
[Checksum] | [Checksum] ]]></artwork> | |||
]]></artwork></figure> | </section> | |||
</section> | <section numbered="true" toc="default"> | |||
<name>Requesting a Change in a Path's Priority</name> | ||||
<section title="Requesting a Change in a Path's Priority"> | <t>Hosts can indicate at initial subflow setup whether they wish the sub | |||
<t>Hosts can indicate at initial subflow setup whether they wish the subflow to | flow to be used as a regular or backup path -- a backup path only being used if | |||
be used as a regular or backup path -- a backup path only being used if there ar | there are no regular paths available. During a connection, Host A can request a | |||
e no regular paths available. During a connection, Host A can request a change i | change in the priority of a subflow through the MP_PRIO signal to Host B. Furthe | |||
n the priority of a subflow through the MP_PRIO signal to Host B. Further detail | r details are given in <xref target="sec_policy" format="default"/>.</t> | |||
s are in <xref target="sec_policy"/>.</t> | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
<figure><artwork align="left"><![CDATA[ | ||||
Host A Host B | Host A Host B | |||
------ ------ | ------ ------ | |||
MP_PRIO -> | MP_PRIO -> ]]></artwork> | |||
]]></artwork></figure> | </section> | |||
</section> | <section numbered="true" toc="default"> | |||
<name>Closing an MPTCP Connection</name> | ||||
<section title="Closing an MPTCP Connection"> | <t>When a host wants to close an existing subflow but not the whole conn | |||
<t>When a host wants to close an existing subflow, but not the whole connection, | ection, it can initiate a regular TCP FIN/ACK exchange.</t> | |||
it can initiate a regular TCP FIN/ACK exchange.</t> | <t>When Host A wants to inform Host B that it has no more data to send, | |||
it signals this "Data FIN" as part of the DSS (see above). It has the same seman | ||||
<t>When Host A wants to inform Host B that it has no more data to send, it signa | tics and behavior as a regular TCP FIN, but at the connection level. Once all th | |||
ls this "Data FIN" as part of the Data Sequence Signal (see above). It has the s | e data on the MPTCP connection has been successfully received, this message is a | |||
ame semantics and behavior as a regular TCP FIN, but at the connection level. On | cknowledged at the connection level with a Data ACK. Further details are given i | |||
ce all the data on the MPTCP connection has been successfully received, then thi | n <xref target="sec_close" format="default"/>.</t> | |||
s message is acknowledged at the connection level with a Data ACK. Further detai | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
ls are in <xref target="sec_close"/>.</t> | ||||
<figure><artwork align="left"><![CDATA[ | ||||
Host A Host B | Host A Host B | |||
------ ------ | ------ ------ | |||
DSS -> | DSS -> | |||
[Data FIN] | [Data FIN] | |||
<- DSS | <- DSS | |||
[Data ACK] | [Data ACK] ]]></artwork> | |||
]]></artwork></figure> | <t>There is an additional method of connection closure, referred to as | |||
"Fast Close", which is analogous to closing a single-path TCP | ||||
<t>There is an additional method of connection closure, referred to as "Fast Clo | connection with a RST signal. The MP_FASTCLOSE signal is used to | |||
se", which is analogous to closing a single-path TCP connection with a RST signa | indicate to the peer that the connection will be abruptly closed and | |||
l. The MP_FASTCLOSE signal is used to indicate to the peer that the connection w | no data will be accepted anymore. This can be used on an ACK (which | |||
ill be abruptly closed and no data will be accepted anymore. This can be used on | ensures reliability of the signal) or a RST (which does not). | |||
an ACK (ensuring reliability of the signal), or a RST (which is not). Both exam | Both examples are shown in the following diagrams. Further details are given in | |||
ples are shown in the following diagrams. Further details are in <xref target="s | <xref target="sec_fastclose" format="default"/>.</t> | |||
ec_fastclose"/>.</t> | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
<figure><artwork align="left"><![CDATA[ | ||||
Host A Host B | Host A Host B | |||
------ ------ | ------ ------ | |||
ACK + MP_FASTCLOSE -> | ACK + MP_FASTCLOSE -> | |||
[B's key] | [B's key] | |||
[RST on all other subflows] -> | [RST on all other subflows] -> | |||
<- [RST on all subflows] | <- [RST on all subflows] | |||
Host A Host B | Host A Host B | |||
------ ------ | ------ ------ | |||
RST + MP_FASTCLOSE -> | RST + MP_FASTCLOSE -> | |||
[B's key] [on all subflows] | [B's key] [on all subflows] | |||
<- [RST on all subflows] | <- [RST on all subflows] ]]></artwork> | |||
]]></artwork></figure> | </section> | |||
</section> | <section numbered="true" toc="default"> | |||
<name>Notable Features</name> | ||||
<section title="Notable Features"> | <t>It is worth highlighting that MPTCP's signaling has been designed wit | |||
<t>It is worth highlighting that MPTCP's signaling has been designed with severa | h several key requirements in mind: | |||
l key requirements in mind: | ||||
<list style="symbols"> | </t> | |||
<t>To cope with NATs on the path, addresses are referred to by Address IDs, in c | <ul spacing="normal"> | |||
ase the IP packet's source | <li>To cope with NATs on the path, addresses are referred to by Addres | |||
s IDs, in case the IP packet's source | ||||
address gets changed by a NAT. Setting up a new TCP flow is not possible if the receiver of the SYN is behind a NAT; | address gets changed by a NAT. Setting up a new TCP flow is not possible if the receiver of the SYN is behind a NAT; | |||
to allow subflows to be created when either end is behind a NAT, MPTCP uses the | to allow subflows to be created when either end is behind a NAT, MPTCP uses the | |||
ADD_ADDR message. </t> | ADD_ADDR message. </li> | |||
<li>MPTCP falls back to ordinary TCP if MPTCP operation is not | ||||
<t>MPTCP falls back to ordinary TCP if MPTCP operation is not possible, for exam | possible -- for example, if one host is not MPTCP capable or if a middlebox alt | |||
ple, if one host is not MPTCP capable or if a middlebox alters the payload. This | ers the payload. This is discussed in <xref target="sec_fallback" format="defaul | |||
is discussed in <xref target="sec_fallback"/>.</t> | t"/>.</li> | |||
<li>To address the threats identified in <xref target="RFC6181" | ||||
<t>To address the threats identified in <xref target="RFC6181"/>, the following | format="default"/>, the following steps are taken: keys are sent in | |||
steps are taken: keys are sent in the clear in the MP_CAPABLE messages; MP_JOIN | the clear in the MP_CAPABLE messages; MP_JOIN messages are secured | |||
messages are secured with HMAC-SHA256 (<xref target="RFC2104"/>, <xref target="R | with HMAC-SHA256 (<xref target="RFC2104" format="default"/> using | |||
FC6234"/>) using those keys; and standard TCP validity checks are made on the ot | the algorithm in <xref target="RFC6234" format="default"/>) using thos | |||
her messages (ensuring sequence numbers are in-window <xref target="RFC5961"/>). | e keys; and standard | |||
Residual threats to MPTCP v0 were identified in <xref target="RFC7430"/>, and t | TCP validity checks are made on the other messages (ensuring that | |||
hose affecting the protocol (i.e. modification to ADD_ADDR) have been incorporat | sequence numbers are in‑window <xref target="RFC5961" | |||
ed in this document. Further discussion of security can be found in <xref target | format="default"/>). | |||
="sec_security"/>.</t> | Residual threats to MPTCP v0 were identified in <xref target="RFC7430" | |||
</list></t> | format="default"/>, and those affecting the protocol (i.e., modifications to | |||
</section> | ADD_ADDR) have been incorporated in this document. | |||
Further discussion of security can be found in <xref target="sec_security" form | ||||
at="default"/>.</li> | ||||
</ul> | ||||
</section> | ||||
</section> | </section> | |||
<section anchor="sec_protocol" numbered="true" toc="default"> | ||||
<section title="MPTCP Protocol" anchor="sec_protocol"> | <name>MPTCP Operations: An Overview</name> | |||
<t>This section describes the operation of the MPTCP protocol, and is subd | <t>This section describes the operation of MPTCP. The | |||
ivided into sections for each key part of the protocol operation.</t> | subsections below discuss each key part of the protocol operation.</t> | |||
<t>All MPTCP operations are signaled using optional TCP header fields. A s | <t>All MPTCP operations are signaled using optional TCP header fields. A s | |||
ingle TCP option number ("Kind") has been assigned by IANA for MPTCP (see <xref | ingle TCP option number ("Kind") has been assigned by IANA for MPTCP (see <xref | |||
target="IANA"/>), and then individual messages will be determined by a "subtype" | target="IANA" format="default"/>), and then individual messages will be determin | |||
, the values of which are also stored in an IANA registry (and are also listed i | ed by a "subtype", the values of which are also stored in an IANA registry (and | |||
n <xref target="IANA"/>). As with all TCP options, the Length field is specified | are also listed in <xref target="IANA" format="default"/>). As with all TCP opti | |||
in bytes, and includes the 2 bytes of Kind and Length.</t> | ons, the Length field is specified in bytes and includes the 2 bytes of Kin | |||
<t>Throughout this document, when reference is made to an MPTCP option by | d and Length.</t> | |||
symbolic name, such as "MP_CAPABLE", this refers to a TCP option with the single | <t>Throughout this document, when reference is made to an MPTCP option by | |||
MPTCP option type, and with the subtype value of the symbolic name as defined i | symbolic name, such as "MP_CAPABLE", this refers to a TCP option with the single | |||
n <xref target="IANA"/>. This subtype is a 4-bit field -- the first 4 bits of th | MPTCP option type, and with the subtype value of the symbolic name as defined i | |||
e option payload, as shown in <xref target="fig_option"/>. The MPTCP messages ar | n <xref target="IANA" format="default"/>. This subtype is a 4-bit field -- the f | |||
e defined in the following sections.</t> | irst 4 bits of the option payload, as shown in <xref target="fig_option" format= | |||
"default"/>. The MPTCP messages are defined in the following sections.</t> | ||||
<?rfc needLines='8'?> | <figure anchor="fig_option"> | |||
<figure align="center" anchor="fig_option" title="MPTCP Option Format"> | <name>MPTCP Option Format</name> | |||
<artwork align="left"><![CDATA[ | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
1 2 3 | 1 2 3 | |||
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
+---------------+---------------+-------+-----------------------+ | +---------------+---------------+-------+-----------------------+ | |||
| Kind | Length |Subtype| | | | Kind | Length |Subtype| | | |||
+---------------+---------------+-------+ | | +---------------+---------------+-------+ | | |||
| Subtype-specific data | | | Subtype-specific data | | |||
| (variable length) | | | (variable length) | | |||
+---------------------------------------------------------------+ | +---------------------------------------------------------------+ ]]></artwork | |||
]]></artwork> | > | |||
</figure> | </figure> | |||
<t>Those MPTCP options associated with subflow initiation are used on | ||||
<t>Those MPTCP options associated with subflow initiation are used on pack | packets with the SYN flag set. Additionally, there is one MPTCP option | |||
ets with the SYN flag set. Additionally, there is one MPTCP option for signaling | for signaling metadata to ensure that segmented data can be recombined for | |||
metadata to ensure segmented data can be recombined for delivery to the applica | delivery to the application.</t> | |||
tion.</t> | <t>The remaining options, however, are signals that do not need to be on | |||
<t>The remaining options, however, are signals that do not need to be on a | a specific packet, such as those for signaling additional | |||
specific packet, such as those for signaling additional addresses. Whilst an im | addresses. While an implementation may desire to send MPTCP options as | |||
plementation may desire to send MPTCP options as soon as possible, it may not be | soon as possible, it may not be possible to combine all desired options | |||
possible to combine all desired options (both those for MPTCP and for regular T | (both those for MPTCP and for regular TCP, such as SACK (selective | |||
CP, such as SACK (selective acknowledgment) <xref target="RFC2018"/>) on a singl | acknowledgment) <xref target="RFC2018" format="default"/>) on a single | |||
e packet. Therefore, an implementation may choose to send duplicate ACKs contain | packet. Therefore, an implementation may choose to send duplicate ACKs | |||
ing the additional signaling information. This changes the semantics of a duplic | containing the additional signaling information. This changes the | |||
ate ACK; these are usually only sent as a signal of a lost segment <xref target= | semantics of a duplicate ACK; these are usually only sent as a signal of | |||
"RFC5681"/> in regular TCP. Therefore, an MPTCP implementation receiving a dupli | a lost segment <xref target="RFC5681" format="default"/> in regular | |||
cate ACK that contains an MPTCP option MUST NOT treat it as a signal of congesti | TCP. Therefore, an MPTCP implementation receiving a duplicate ACK that | |||
on. Additionally, an MPTCP implementation SHOULD NOT send more than two duplicat | contains an MPTCP option <bcp14>MUST NOT</bcp14> treat it as a signal of | |||
e ACKs in a row for the purposes of sending MPTCP options alone, in order to ens | congestion. Additionally, an MPTCP implementation <bcp14>SHOULD | |||
ure no middleboxes misinterpret this as a sign of congestion.</t> | NOT</bcp14> send more than two duplicate ACKs in a row for the purposes | |||
<t>Furthermore, standard TCP validity checks (such as ensuring the sequenc | of sending MPTCP options alone, in order to ensure that no middleboxes mis | |||
e number and acknowledgment number are within window) MUST be undertaken before | interpret this as a sign of congestion.</t> | |||
processing any MPTCP signals, as described in <xref target="RFC5961"/>, and init | <t>Furthermore, standard TCP validity checks (such as ensuring that the | |||
ial subflow sequence numbers SHOULD be generated according to the recommendation | sequence number and acknowledgment number are within the window) <bcp14>MU | |||
s in <xref target="RFC6528"/>.</t> | ST</bcp14> be undertaken before processing any MPTCP signals, as described in <x | |||
ref target="RFC5961" format="default"/>, and initial subflow sequence numbers <b | ||||
<section title="Connection Initiation" anchor="sec_init"> | cp14>SHOULD</bcp14> be generated according to the recommendations in <xref targe | |||
t="RFC6528" format="default"/>.</t> | ||||
<section anchor="sec_init" numbered="true" toc="default"> | ||||
<name>Connection Initiation</name> | ||||
<t>Connection initiation begins with a SYN, SYN/ACK, ACK exchange | <t>Connection initiation begins with a SYN, SYN/ACK, ACK exchange | |||
on a single path. Each packet | on a single path. Each packet | |||
contains the Multipath Capable (MP_CAPABLE) MPTCP option | contains the Multipath Capable (MP_CAPABLE) MPTCP option | |||
(<xref target="tcpm_capable"/>). This option declares its | (<xref target="tcpm_capable" format="default"/>). This option declares i | |||
sender is capable of performing Multipath TCP and wishes to do | ts | |||
sender capable of performing Multipath TCP and wishes to do | ||||
so on this particular connection.</t> | so on this particular connection.</t> | |||
<figure anchor="tcpm_capable"> | ||||
<t>The MP_CAPABLE exchange in this specification (v1) is different to | <name>Multipath Capable (MP_CAPABLE) Option</name> | |||
<artwork align="left" name="" type="" alt=""><![CDATA[ | ||||
1 2 3 | ||||
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||||
+---------------+---------------+-------+-------+---------------+ | ||||
| Kind | Length |Subtype|Version|A|B|C|D|E|F|G|H| | ||||
+---------------+---------------+-------+-------+---------------+ | ||||
| Option Sender's Key (64 bits) | | ||||
| (if option Length > 4) | | ||||
| | | ||||
+---------------------------------------------------------------+ | ||||
| Option Receiver's Key (64 bits) | | ||||
| (if option Length > 12) | | ||||
| | | ||||
+-------------------------------+-------------------------------+ | ||||
| Data-Level Length (16 bits) | Checksum (16 bits, optional) | | ||||
+-------------------------------+-------------------------------+ ]]></artwork | ||||
> | ||||
</figure> | ||||
<t>The MP_CAPABLE exchange in this specification (v1) is different than | ||||
that specified in v0. If a host supports multiple versions | that specified in v0. If a host supports multiple versions | |||
of MPTCP, the sender of the MP_CAPABLE option SHOULD signal the | of MPTCP, the sender of the MP_CAPABLE option <bcp14>SHOULD</bcp14> signal the | |||
highest version number it supports. In return, in its MP_CAPABLE option, | highest version number it supports. In return, in its MP_CAPABLE option, | |||
the receiver will signal the version number it wishes to use, which MUST | the receiver will signal the version number it wishes to use, which <bcp14>MUST</bcp14> | |||
be equal to or lower than the version number indicated in the initial | be equal to or lower than the version number indicated in the initial | |||
MP_CAPABLE. | MP_CAPABLE. | |||
There is a caveat though with respect to this version negotiation with | There is a caveat, though, with respect to this version negotiation with | |||
old listeners that only support v0. A listener that supports v0 expects that | old listeners that only support v0. A listener that supports v0 expects that | |||
the MP_CAPABLE option in the SYN-segment includes the initiator's key. I | the MP_CAPABLE option in the SYN segment will include the initiator's | |||
f | key. If, however, | |||
the initiator however already upgraded to v1, it won't include the key i | the initiator already upgraded to v1, it won't include the key in the | |||
n the | SYN segment. Thus, the listener will ignore the MP_CAPABLE of this SYN s | |||
SYN-segment. Thus, the listener will ignore the MP_CAPABLE of this SYN-s | egment | |||
egment | and reply with a SYN/ACK that does not include an MP_CAPABLE. The initia | |||
and reply with a SYN/ACK that does not include an MP_CAPABLE. The initia | tor <bcp14>MAY</bcp14> | |||
tor MAY | choose to immediately fall back to TCP or <bcp14>MAY</bcp14> choose to a | |||
choose to immediately fall back to TCP or MAY choose to attempt a connec | ttempt a connection | |||
tion | ||||
using MPTCP v0 (if the initiator supports v0), in order to discover whether the | using MPTCP v0 (if the initiator supports v0), in order to discover whether the | |||
listener supports the earlier version of MPTCP. In general a MPTCP v0 co | listener supports the earlier version of MPTCP. In general, an MPTCP v0 | |||
nnection | connection | |||
is likely to be preferred to a TCP one, however in a particular deployme | will likely be preferred over a TCP connection; however, in a particular | |||
nt scenario | deployment scenario, | |||
it may be known that the listener is unlikely to support MPTCPv0 and so | it may be known that the listener is unlikely to support MPTCP v0 and so | |||
the | the | |||
initiator may prefer not to attempt a v0 connection. An initiator MAY ca | initiator may prefer not to attempt a v0 connection. An initiator <bcp14 | |||
che | >MAY</bcp14> cache | |||
information for a peer about what version of MPTCP it supports if any, a | information for a peer about what version of MPTCP it supports, if any, | |||
nd use | and use | |||
this information for future connection attempts.</t> | this information for future connection attempts.</t> | |||
<t>The MP_CAPABLE option is of variable length, with different fields | ||||
<t>The MP_CAPABLE option is variable-length, with different fields | included, depending on which packet the option is used on. The full | |||
included depending on which packet the option is used on. The full | MP_CAPABLE option is shown in <xref target="tcpm_capable" format="defaul | |||
MP_CAPABLE option is shown in <xref target="tcpm_capable"/>.</t> | t"/>.</t> | |||
<t>The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK packets | ||||
<?rfc needLines='10'?> | that start the first subflow of an MPTCP connection, as well as the first packe | |||
<figure align="center" anchor="tcpm_capable" title="Multipath Capable (M | t that carries data, if the initiator wishes to send first. The data carried by | |||
P_CAPABLE) Option"> | each option is as follows, where A = initiator and B = listener. | |||
<artwork align="left"><![CDATA[ | </t> | |||
1 2 3 | <ul spacing="normal"> | |||
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | <li>SYN (A->B): only the first 4 octets (Length = 4).</li> | |||
+---------------+---------------+-------+-------+---------------+ | <li>SYN/ACK (B->A): B's key for this connection (Length = 12).</li> | |||
| Kind | Length |Subtype|Version|A|B|C|D|E|F|G|H| | <li>ACK (no data) (A->B): A's key followed by B's key (Length = 20) | |||
+---------------+---------------+-------+-------+---------------+ | .</li> | |||
| Option Sender's Key (64 bits) | | <li>ACK (with first data) (A->B): A's key followed by B's key follo | |||
| (if option Length > 4) | | wed by Data-Level Length, and optional Checksum (Length = 22 or 24).</li> | |||
| | | </ul> | |||
+---------------------------------------------------------------+ | <t> | |||
| Option Receiver's Key (64 bits) | | The contents of the option are determined by the SYN and ACK flags of th | |||
| (if option Length > 12) | | e packet, along with the option's Length field. In <xref target="tcpm_capable" f | |||
| | | ormat="default"/>, "Sender" and "Receiver" refer to the sender or receiver of th | |||
+-------------------------------+-------------------------------+ | e TCP packet (which can be either host).</t> | |||
| Data-Level Length (16 bits) | Checksum (16 bits, optional) | | ||||
+-------------------------------+-------------------------------+ | ||||
]]></artwork> | ||||
</figure> | ||||
<t>The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK packets | ||||
that start the first subflow of an MPTCP connection, as well as the first packe | ||||
t that carries data, if the initiator wishes to send first. The data carried by | ||||
each option is as follows, where A = initiator and B = listener. | ||||
<list style="symbols"> | ||||
<t>SYN (A->B): only the first four octets (Length = 4).</t> | ||||
<t>SYN/ACK (B->A): B's Key for this connection (Length = 12).</t> | ||||
<t>ACK (no data) (A->B): A's Key followed by B's Key (Length = 20 | ||||
).</t> | ||||
<t>ACK (with first data) (A->B): A's Key followed by B's Key foll | ||||
owed by Data-Level Length, and optional Checksum (Length = 22 or 24).</t> | ||||
</list> | ||||
The contents of the option is determined by the SYN and ACK flags of the | ||||
packet, along with the option's length field. For the diagram shown in <xref ta | ||||
rget="tcpm_capable"/>, "sender" and "receiver" refer to the sender or receiver o | ||||
f the TCP packet (which can be either host).</t> | ||||
<t>The initial SYN, containing just the MP_CAPABLE header, is used | <t>The initial SYN, containing just the MP_CAPABLE header, is used | |||
to define the version of MPTCP being requested, as well as exchanging | to define the version of MPTCP being requested and also to exchange | |||
flags to negotiate connection features, described later.</t> | flags to negotiate connection features, as described later.</t> | |||
<t>This option is used to declare the 64-bit keys that the end hosts | ||||
<t>This option is used to declare the 64-bit keys that the end hosts hav | have generated for this MPTCP connection. These keys are used to | |||
e generated for this MPTCP connection. These keys are used to authenticate the a | authenticate the addition of future subflows to this connection. This | |||
ddition of future subflows to this connection. This is the only time the key wil | is the only time the key will be sent in the clear on the wire (unless " | |||
l be sent in clear on the wire (unless "fast close", <xref target="sec_fastclose | Fast Close" (<xref target="sec_fastclose" format="default"/>) is used); all futu | |||
"/>, is used); all future subflows will identify the connection using a 32-bit " | re subflows will identify the connection using a 32-bit "token". This token is a | |||
token". This token is a cryptographic hash of this key. The algorithm for this p | cryptographic hash of this key. The algorithm for this process is dependent on | |||
rocess is dependent on the authentication algorithm selected; the method of sele | the authentication algorithm selected; the method of selection is defined later | |||
ction is defined later in this section.</t> | in this section.</t> | |||
<t>Upon reception of the initial SYN segment, a stateful server generate | ||||
<t>Upon reception of the initial SYN-segment, a stateful server generate | s a random key and replies with a SYN/ACK. The key's method of generation is imp | |||
s a random key and replies with a SYN/ACK. The key's method of generation is imp | lementation specific. The key <bcp14>MUST</bcp14> be hard to guess, and it <bcp1 | |||
lementation specific. The key MUST be hard to guess, and it MUST be unique for t | 4>MUST</bcp14> be unique for the sending host across all its current MPTCP conne | |||
he sending host across all its current MPTCP connections. Recommendations for ge | ctions. Recommendations for generating random numbers for use in keys are given | |||
nerating random numbers for use in keys are given in <xref target="RFC4086"/>. C | in <xref target="RFC4086" format="default"/>. Connections will be indexed at eac | |||
onnections will be indexed at each host by the token (a one-way hash of the key) | h host by the token (a one-way hash of the key). Therefore, an implementation wi | |||
. Therefore, an implementation will require a mapping from each token to the cor | ll require a mapping from each token to the corresponding connection, and in tur | |||
responding connection, and in turn to the keys for the connection.</t> | n to the keys for the connection.</t> | |||
<t>There is a risk that two different keys will hash to the same | ||||
<t>There is a risk that two different keys will hash to the same token. | token. The risk of hash collisions is usually small, unless the host | |||
The risk of hash collisions is usually small, unless the host is handling many t | is handling many tens of thousands of connections. Therefore, an | |||
ens of thousands of connections. Therefore, an implementation SHOULD check its l | implementation <bcp14>SHOULD</bcp14> check its list of connection | |||
ist of connection tokens to ensure there is no collision before sending its key, | tokens to ensure that there is no collision before sending its key, | |||
and if there is, then it should generate a new key. This would, however, be cos | and if there is, then it should generate a new key. This would, | |||
tly for a server with thousands of connections. The subflow handshake mechanism | however, be costly for a server with thousands of connections. The | |||
(<xref target="sec_join"/>) will ensure that new subflows only join the correct | subflow handshake mechanism (<xref target="sec_join" | |||
connection, however, through the cryptographic handshake, as well as checking th | format="default"/>) will ensure that new subflows only join the | |||
e connection tokens in both directions, and ensuring sequence numbers are in-win | correct connection, however, through the cryptographic handshake, as | |||
dow. So in the worst case if there was a token collision, the new subflow would | well as checking the connection tokens in both directions, and | |||
not succeed, but the MPTCP connection would continue to provide a regular TCP se | ensuring that sequence numbers are in-window. So, in the worst case, if | |||
rvice.</t> | there was a token collision, the new subflow would not succeed, but the MPTCP co | |||
nnection would continue to provide a regular TCP service.</t> | ||||
<t>Since key generation is implementation-specific, there is no r | <t>Since key generation is implementation specific, there is no | |||
equirement that they be simply random numbers. An implementation is free to exch | requirement that they simply be random numbers. An implementation is | |||
ange cryptographic material out-of-band and generate these keys from this, in or | free to exchange cryptographic material out of band and generate these | |||
der to provide additional mechanisms by which to verify the identity of the comm | keys from this material, in order to provide additional mechanisms by wh | |||
unicating entities. For example, an implementation could choose to link its MPTC | ich to verify the identity of the communicating entities. For example, an implem | |||
P keys to those used in higher-layer TLS or SSH connections.</t> | entation could choose to link its MPTCP keys to those used in higher-layer TLS o | |||
r SSH connections.</t> | ||||
<t>If the server behaves in a | <t>If the server behaves in a | |||
stateless manner, it has to generate its own key in a verifiable | stateless manner, it has to generate its own key in a verifiable | |||
fashion. This verifiable way of generating the key can be done by | fashion. This verifiable way of generating the key can be done by | |||
using a hash of the 4-tuple, sequence number and a local secret | using a hash of the 4-tuple, sequence number, and a local secret | |||
(similar to what is done for the TCP-sequence number <xref target="RFC49 | (similar to what is done for the TCP sequence number <xref target="RFC49 | |||
87"/>). | 87" format="default"/>). | |||
It will thus be able to verify whether it is indeed the originator of | It will thus be able to verify whether it is indeed the originator of | |||
the key echoed back in the later MP_CAPABLE option. | the key echoed back in the subsequent MP_CAPABLE option. | |||
As for a stateful server, the tokens SHOULD be checked for uniqueness, h | As for a stateful server, the tokens <bcp14>SHOULD</bcp14> be checked fo | |||
owever | r uniqueness; however, | |||
if uniqueness is not met, and there is no way to generate an alternative | if uniqueness is not met and there is no way to generate an alternative | |||
verifiable | verifiable | |||
key, then the connection MUST fall back to using regular TCP by not send | key, then the connection <bcp14>MUST</bcp14> fall back to using regular | |||
ing a | TCP by not sending an | |||
MP_CAPABLE in the SYN/ACK.</t> | MP_CAPABLE in the SYN&wj;/ACK.</t> | |||
<t>The ACK carries both A's key and B's key. This is the first time that A's key is seen on the wire, although it is expected that A will have generated a key locally before the initial SYN. The echoing of B's key allows B to operate statelessly, as described above. Therefore, A's key must be delivered reliably to B, and in order to do this, the transmission of this packet must be made reliable.</t> | <t>The ACK carries both A's key and B's key. This is the first time that A's key is seen on the wire, although it is expected that A will have generated a key locally before the initial SYN. The echoing of B's key allows B to operate statelessly, as described above. Therefore, A's key must be delivered reliably to B, and in order to do this, the transmission of this packet must be made reliable.</t> | |||
<t>If B has data to send first, then the reliable delivery of the | ||||
<t>If B has data to send first, then the reliable delivery of the ACK+MP | ACK + MP_CAPABLE is ensured by the receipt of this data with a | |||
_CAPABLE can be inferred by the receipt of this data with a MPTCP Data Sequence | n | |||
Signal (DSS) option (<xref target="sec_generalop"/>). If, however, A wishes to s | MPTCP Data Sequence Signal (DSS) option (<xref target="sec_generalop" | |||
end data first, it has two options to ensure the reliable delivery of the ACK+MP | format="default"/>) containing a DATA_ACK for the MP_CAPABLE (which is | |||
_CAPABLE. If it immediately has data to send, then the third ACK (with data) wou | the first octet of the data sequence space). If, however, A wishes to sen | |||
ld also contain an MP_CAPABLE option with additional data parameters (the Data-L | d data first, it has | |||
evel Length and optional Checksum as shown in <xref target="tcpm_capable"/>). If | two options to ensure the reliable delivery of the ACK + MP_CAPABLE. If | |||
A does not immediately have data to send, it MUST include the MP_CAPABLE on the | it immediately has data to send, then the first ACK (with data) would | |||
third ACK, but without the additional data parameters. When A does have data to | also contain an MP_CAPABLE option with additional data parameters (the | |||
send, it must repeat the sending of the MP_CAPABLE option from the third ACK, w | Data-Level Length and optional Checksum as shown in <xref | |||
ith additional data parameters. This MP_CAPABLE option is in place of the DSS, a | target="tcpm_capable" format="default"/>). If A does not immediately | |||
nd simply specifies the data-level length of the payload, and the checksum (if t | have data to send, it <bcp14>MUST</bcp14> include the MP_CAPABLE on | |||
he use of checksums is negotiated). This is the minimal data required to establi | the first ACK, but without the additional data parameters. When A does | |||
sh a MPTCP connection - it allows validation of the payload, and given it is the | have data to send, it must repeat the sending of the MP_CAPABLE option | |||
first data, the Initial Data Sequence Number (IDSN) is also known (as it is gen | from the first ACK, with additional data parameters. This MP_CAPABLE | |||
erated from the key, as described below). Conveying the keys on the first data p | option is used in place of the DSS and simply specifies (1) the Dat | |||
acket allows the TCP reliability mechanisms to ensure the packet is successfully | a-Level | |||
delivered. The receiver will acknowledge this data at the connection level with | Length of the payload and (2) the checksum (if the use of checksums | |||
a Data ACK, as if a DSS option has been received.</t> | is | |||
negotiated). This is the minimal data required to establish an MPTCP | ||||
<t>There could be situations where both A and B attempt to transmit init | connection -- it allows validation of the payload, and given that it is | |||
ial data at the same time. For example, if A did not initially have data to send | the | |||
, but then needed to transmit data before it had received anything from B, it wo | first data, the Initial Data Sequence Number (IDSN) is also known (as | |||
uld use a MP_CAPABLE option with data parameters (since it would not know if the | it is generated from the key, as described below). Conveying the keys | |||
MP_CAPABLE on the ACK was received). In such a situation, B may also have trans | on the first data packet allows the TCP reliability mechanisms to | |||
mitted data with a DSS option, but it had not yet been received at A. Therefore, | ensure that the packet is successfully delivered. The receiver will ackn | |||
B has received data with a MP_CAPABLE mapping after it has sent data with a DSS | owledge this data at the connection level with a Data ACK, as if a DSS option ha | |||
option. To ensure these situations can be handled, it follows that the data par | s been received.</t> | |||
ameters in a MP_CAPABLE are semantically equivalent to those in a DSS option and | <t>There could be situations where both A and B attempt to transmit | |||
can be used interchangeably. Similar situations could occur when the MP_CAPABLE | initial data at the same time. For example, if A did not initially | |||
with data is lost and retransmitted. Furthermore, in the case of TCP Segmentati | have data to send but then needed to transmit data before it had | |||
on Offloading, the MP_CAPABLE with data parameters may be duplicated across mult | received anything from B, it would use an MP_CAPABLE option with data | |||
iple packets, and implementations must also be able to cope with duplicate MP_CA | parameters (since it would not know if the MP_CAPABLE on the ACK was | |||
PABLE mappings as well as duplicate DSS mappings.</t> | received). In such a situation, B may also have transmitted data with | |||
a DSS option, but it had not yet been received at A. Therefore, B has | ||||
<t>Additionally, the MP_CAPABLE exchange allows the safe passage of MPTC | received data with an MP_CAPABLE mapping after it has sent data with a | |||
P options on SYN packets to be determined. If any of these options are dropped, | DSS option. To ensure that these situations can be handled, it follows t | |||
MPTCP will gracefully fall back to regular single-path TCP, as documented in <xr | hat the data parameters in an MP_CAPABLE are semantically equivalent to those in | |||
ef target="sec_fallback"/>. If at any point in the handshake either party think | a DSS option and can be used interchangeably. Similar situations could occur wh | |||
s the MPTCP negotiation is compromised, for example by a middlebox corrupting th | en the MP_CAPABLE with data is lost and retransmitted. Furthermore, in the case | |||
e TCP options, or unexpected ACK numbers being present, the host MUST stop using | of TCP segmentation offloading, the MP_CAPABLE with data parameters may be dupli | |||
MPTCP and no longer include MPTCP options in future TCP packets. The other host | cated across multiple packets, and implementations must also be able to cope wit | |||
will then also fall back to regular TCP using the fall back mechanism. Note th | h duplicate MP_CAPABLE mappings as well as duplicate DSS mappings.</t> | |||
at new subflows MUST NOT be established (using the process documented in <xref t | <t>Additionally, the MP_CAPABLE exchange allows the safe passage of | |||
arget="sec_join"/>) until a Data Sequence Signal (DSS) option has been successfu | MPTCP options on SYN packets to be determined. If any of these options | |||
lly received across the path (as documented in <xref target="sec_generalop"/>).< | are dropped, MPTCP will gracefully fall back to regular single-path | |||
/t> | TCP, as documented in <xref target="sec_fallback" format="default"/>. | |||
If at any point in the handshake either party thinks the MPTCP | ||||
<t>Like all MPTCP options, the MP_CAPABLE option starts with the Kind an | negotiation is compromised -- for example, by a middlebox corrupting | |||
d Length to specify the TCP-option kind and its length. Followed by that is the | the TCP options or by unexpected ACK numbers being present -- the host < | |||
MP_CAPABLE option. The first 4 bits of the first octet in the MP_CAPABLE option | bcp14>MUST</bcp14> stop using MPTCP and no longer include MPTCP options in futur | |||
(<xref target="tcpm_capable"/>) define the MPTCP option subtype (see <xref targe | e TCP packets. The other host will then also fall back to regular TCP using the | |||
t="IANA"/>; for MP_CAPABLE, this is 0x0), and the remaining 4 bits of this octet | fallback mechanism. Note that new subflows <bcp14>MUST NOT</bcp14> be establish | |||
specify the MPTCP version in use (for this specification, this is 1).</t> | ed (using the process documented in <xref target="sec_join" format="default"/>) | |||
until a DSS option has been successfully received across the path (as documented | ||||
in <xref target="sec_generalop" format="default"/>).</t> | ||||
<t>Like all MPTCP options, the MP_CAPABLE option starts with the Kind | ||||
and Length to specify the TCP option's kind and length. This | ||||
information is followed by the MP_CAPABLE option. The first 4 bits of | ||||
the first octet in the MP_CAPABLE option (<xref target="tcpm_capable" | ||||
format="default"/>) define the MPTCP Option Subtype (see <xref | ||||
target="IANA" format="default"/>; for MP_CAPABLE, this value is | ||||
0x0), and the remaining 4 bits of this octet specify the MPTCP | ||||
version in use (for this specification, this value is 1).</t> | ||||
<t>The second octet is reserved for flags, allocated as follows: | <t>The second octet is reserved for flags, allocated as follows: | |||
<list style="hanging"> | </t> | |||
<t hangText="A:"> The leftmost bit, labeled "A", SHOULD be set to 1 to | <dl newline="false" spacing="normal" indent="14"> | |||
indicate "Checksum Required", unless the system administrator has decided that | <dt>A:</dt> | |||
checksums are not required (for example, if the environment is controlled and no | <dd> The leftmost bit, labeled "A", <bcp14>SHOULD</bcp14> be set to 1 | |||
middleboxes exist that might adjust the payload).</t> | to indicate "Checksum required", unless the system administrator has decided tha | |||
<t hangText="B:"> The second bit, labeled "B", is an extensibility fla | t checksums are not required (for example, if the environment is controlled and | |||
g, and MUST be set to 0 for current implementations. This will be used for an ex | no middleboxes exist that might adjust the payload).</dd> | |||
tensibility mechanism in a future specification, and the impact of this flag wil | <dt>B:</dt> | |||
l be defined at a later date. It is expected, but not mandated, that this flag w | <dd> The second bit, labeled "B", is an extensibility flag. It | |||
ould be used as part of an alternative security mechanism that does not require | <bcp14>MUST</bcp14> be set to 0 for current implementations. This | |||
a full version upgrade of the protocol, but does require redefining some element | flag will be used for an extensibility mechanism in a future specifica | |||
s of the handshake. If receiving a message with the 'B' flag set to 1, and this | tion, and the impact of this flag will be defined at a later date. It is expecte | |||
is not understood, then the MP_CAPABLE in this SYN MUST be silently ignored, whi | d, but not mandated, that this flag would be used as part of an alternative secu | |||
ch triggers a fallback to regular TCP; the sender is expected to retry with a fo | rity mechanism that does not require a full version upgrade of the protocol but | |||
rmat compatible with this legacy specification. Note that the length of the MP_C | does require redefining some elements of the handshake. If receiving a message w | |||
APABLE option, and the meanings of bits "D" through "H", may be altered by setti | ith the "B" flag set to 1 and this is not understood, then the MP_CAPABLE in thi | |||
ng B=1.</t> | s SYN <bcp14>MUST</bcp14> be silently ignored, which triggers a fallback to regu | |||
<t hangText="C:"> The third bit, labeled "C", is set to "1" to indicat | lar TCP; the sender is expected to retry with a format compatible with this lega | |||
e that the sender of this option will not accept additional MPTCP subflows to th | cy specification. Note that the length of the MP_CAPABLE option, and the meaning | |||
e source address and port, and therefore the receiver MUST NOT try to open any a | s of bits "D" through "H", may be altered by setting B=1.</dd> | |||
dditional subflows towards this address and port. This is an efficiency improvem | <dt>C:</dt> | |||
ent for situations where the sender knows a restriction is in place, for example | <dd> The third bit, labeled "C", is set to 1 to indicate that the | |||
if the sender is behind a strict NAT, or operating behind a legacy Layer 4 load | sender of this option will not accept additional MPTCP subflows to | |||
balancer.</t> | the source address and port, and therefore the receiver <bcp14>MUST | |||
<t hangText="D through H:"> The remaining bits, labeled "D" through "H | NOT</bcp14> try to open any additional subflows toward this address | |||
", are used for crypto algorithm negotiation. In this specification only the ri | and port. This improves efficiency in situations where the | |||
ghtmost bit, labeled "H", is assigned. Bit "H" indicates the use of HMAC-SHA256 | sender knows a restriction is in place -- for example, if the sender i | |||
(as defined in <xref target="sec_join"/>). An implementation that only support | s behind a strict NAT or operating behind a legacy Layer 4 load balancer.</dd> | |||
s this method MUST set bit "H" to 1, and bits "D" through "G" to 0.</t> | <dt>D through H:</dt> | |||
</list> | <dd> The remaining bits, labeled "D" through "H", are used for | |||
crypto algorithm negotiation. In this specification, only the | ||||
A crypto algorithm MUST be specified. If flag bits D through H are all | rightmost bit, labeled "H", is assigned. Bit "H" indicates the use | |||
0, the MP_CAPABLE option MUST be treated as invalid and ignored (that is, it mus | of HMAC-SHA256 (as defined in <xref target="sec_join" | |||
t be treated as a regular TCP handshake).</t> | format="default"/>). An implementation that only supports this | |||
method <bcp14>MUST</bcp14> set bit "H" to 1 and bits "D" | ||||
<t>The selection of the authentication algorithm also impacts the algori | through "G" to 0.</dd> | |||
thm used to generate the token and the Initial Data Sequence Number (IDSN). In t | ||||
his specification, with only the SHA-256 algorithm (bit "H") specified and selec | ||||
ted, the token MUST be a truncated (most significant 32 bits) SHA-256 hash (<xre | ||||
f target="RFC6234"/>) of the key. A different, 64-bit truncation (the least sign | ||||
ificant 64 bits) of the SHA-256 hash of the key MUST be used as the IDSN. Note t | ||||
hat the key MUST be hashed in network byte order. Also note that the "least sign | ||||
ificant" bits MUST be the rightmost bits of the SHA-256 digest, as per <xref tar | ||||
get="RFC6234"/>. Future specifications of the use of the crypto bits may choose | ||||
to specify different algorithms for token and IDSN generation.</t> | ||||
<t>Both the crypto and checksum bits negotiate capabilities in similar w | ||||
ays. For the Checksum Required bit (labeled "A"), if either host requires the us | ||||
e of checksums, checksums MUST be used. In other words, the only way for checksu | ||||
ms not to be used is if both hosts in their SYNs set A=0. This decision is confi | ||||
rmed by the setting of the "A" bit in the third packet (the ACK) of the handshak | ||||
e. For example, if the initiator sets A=0 in the SYN, but the responder sets A=1 | ||||
in the SYN/ACK, checksums MUST be used in both directions, and the initiator wi | ||||
ll set A=1 in the ACK. The decision whether to use checksums will be stored by a | ||||
n implementation in a per-connection binary state variable. If A=1 is received b | ||||
y a host that does not want to use checksums, it MUST fall back to regular TCP b | ||||
y ignoring the MP_CAPABLE option as if it was invalid.</t> | ||||
<t>For crypto negotiation, the responder has the choice. The initiator c | ||||
reates a proposal setting a bit for each algorithm it supports to 1 (in this ver | ||||
sion of the specification, there is only one proposal, so bit "H" will be always | ||||
set to 1). The responder responds with only 1 bit set -- this is the chosen alg | ||||
orithm. The rationale for this behavior is that the responder will typically be | ||||
a server with potentially many thousands of connections, so it may wish to choos | ||||
e an algorithm with minimal computational complexity, depending on the load. If | ||||
a responder does not support (or does not want to support) any of the initiator' | ||||
s proposals, it MUST respond without an MP_CAPABLE option, thus forcing a fallba | ||||
ck to regular TCP.</t> | ||||
<t>The MP_CAPABLE option is only used in the first subflow of a connecti | </dl> | |||
on, in order to identify the connection; all following subflows will use the "Jo | <t>A crypto algorithm <bcp14>MUST</bcp14> be specified. If flag bits "D | |||
in" option (see <xref target="sec_join"/>) to join the existing connection.</t> | " through "H" are all 0, the MP_CAPABLE option <bcp14>MUST</bcp14> be treated as | |||
invalid and ignored (that is, it must be treated as a regular TCP handshake).</ | ||||
t> | ||||
<t>The selection of the authentication algorithm also impacts the algori | ||||
thm used to generate the token and the IDSN. In this specification, with only th | ||||
e SHA-256 algorithm (bit "H") specified and selected, the token <bcp14>MUST</bcp | ||||
14> be a truncated (most significant 32 bits) SHA-256 hash <xref target="RF | ||||
C6234" format="default"/> of the key. A different, 64-bit truncation (the least | ||||
significant 64 bits) of the SHA-256 hash of the key <bcp14>MUST</bcp14> be used | ||||
as the IDSN. Note that the key <bcp14>MUST</bcp14> be hashed in network byte ord | ||||
er. Also note that the "least significant" bits <bcp14>MUST</bcp14> be the right | ||||
most bits of the SHA-256 digest, as per <xref target="RFC6234" format="default"/ | ||||
>. Future specifications of the use of the crypto bits may choose to specify dif | ||||
ferent algorithms for token and IDSN generation.</t> | ||||
<t>Both the crypto and checksum bits negotiate capabilities in similar | ||||
ways. For the "Checksum required" bit (labeled "A"), if either host | ||||
requires the use of checksums, checksums <bcp14>MUST</bcp14> be | ||||
used. In other words, the only way for checksums not to be used is if | ||||
both hosts in their SYNs set A=0. This decision is confirmed by the | ||||
setting of the "A" bit in the third packet (the ACK) of the | ||||
handshake. For example, if the initiator sets A=0 in the SYN but the | ||||
responder sets A=1 in the SYN/ACK, checksums <bcp14>MUST</bcp14> be | ||||
used in both directions, and the initiator will set A=1 in the | ||||
ACK. The decision regarding whether to use checksums will be stored by a | ||||
n implementation in a per-connection binary state variable. If A=1 is received b | ||||
y a host that does not want to use checksums, it <bcp14>MUST</bcp14> fall back t | ||||
o regular TCP by ignoring the MP_CAPABLE option as if it was invalid.</t> | ||||
<t>For crypto negotiation, the responder has the choice. The initiator | ||||
creates a proposal setting a bit for each algorithm it supports to 1 | ||||
(in this version of the specification, there is only one proposal, so | ||||
bit "H" will always be set to 1). The responder responds with only 1&nbs | ||||
p;bit set -- this is the chosen algorithm. The rationale for this behavior is th | ||||
at the responder will typically be a server with potentially many thousands of c | ||||
onnections, so it may wish to choose an algorithm with minimal computational com | ||||
plexity, depending on the load. If a responder does not support (or does not wan | ||||
t to support) any of the initiator's proposals, it <bcp14>MUST</bcp14> respond w | ||||
ithout an MP_CAPABLE option, thus forcing a fallback to regular TCP.</t> | ||||
<t>The MP_CAPABLE option is only used in the first subflow of a | ||||
connection, in order to identify the connection; all subsequent | ||||
subflows will use the MP_JOIN option (see <xref target="sec_join" | ||||
format="default"/>) to join the existing connection.</t> | ||||
<t>If a SYN contains an MP_CAPABLE option but the | <t>If a SYN contains an MP_CAPABLE option but the | |||
SYN/ACK does not, it is assumed that sender of the SYN/ACK is not | SYN/ACK does not, it is assumed that the sender of the SYN/ACK is not | |||
multipath capable; thus, the MPTCP session MUST operate as | multipath capable; thus, the MPTCP session <bcp14>MUST</bcp14> operate a | |||
a regular, single-path TCP. If a SYN does not contain a | s | |||
MP_CAPABLE option, the SYN/ACK MUST NOT contain one | a regular, single-path TCP session. If a SYN does not contain an | |||
MP_CAPABLE option, the SYN/ACK <bcp14>MUST NOT</bcp14> contain one | ||||
in response. If the third packet (the ACK) does not contain | in response. If the third packet (the ACK) does not contain | |||
the MP_CAPABLE option, then the session MUST fall back to | the MP_CAPABLE option, then the session <bcp14>MUST</bcp14> fall back to | |||
operating as a regular, single-path TCP. This is to maintain | operating as a regular, single-path TCP session. This is done to maintai | |||
n | ||||
compatibility with middleboxes on the path that drop some | compatibility with middleboxes on the path that drop some | |||
or all TCP options. Note that an implementation MAY choose | or all TCP options. Note that an implementation <bcp14>MAY</bcp14> choose | |||
to attempt sending MPTCP options more than one time before | to attempt sending MPTCP options more than one time before | |||
making this decision to operate as regular TCP (see | making this decision to operate as regular TCP (see | |||
<xref target="heuristics"/>).</t> | <xref target="heuristics" format="default"/>).</t> | |||
<t>If the SYN packets are unacknowledged, it is up to local | <t>If the SYN packets are unacknowledged, it is up to local | |||
policy to decide how to respond. It is expected that a sender | policy to decide how to respond. It is expected that a sender | |||
will eventually fall back to single-path TCP (i.e., without the | will eventually fall back to single-path TCP (i.e., without the | |||
MP_CAPABLE option) in order to work around middleboxes that | MP_CAPABLE option) in order to work around middleboxes that | |||
may drop packets with unknown options; however, the number of | may drop packets with unknown options; however, the number of | |||
multipath-capable attempts that are made first will be up to | multipath-capable attempts that are made first will be up to | |||
local policy. | local policy. | |||
It is possible that MPTCP and non-MPTCP SYNs could get reordered | It is possible that MPTCP and non-MPTCP SYNs could get reordered | |||
in the network. Therefore, the final state is inferred from the | in the network. Therefore, the final state is inferred from the | |||
presence or absence of the MP_CAPABLE option in the third packet | presence or absence of the MP_CAPABLE option in the third packet | |||
of the TCP handshake. If this option is not present, the | of the TCP handshake. If this option is not present, the | |||
connection SHOULD fall back to regular TCP, as documented in | connection <bcp14>SHOULD</bcp14> fall back to regular TCP, as documented | |||
<xref target="sec_fallback"/>.</t> | in | |||
<xref target="sec_fallback" format="default"/>.</t> | ||||
<t>The initial data sequence number on an MPTCP connection | <t>The IDSN on an MPTCP connection | |||
is generated from the key. The algorithm for IDSN generation is | is generated from the key. The algorithm for IDSN generation is | |||
also determined from the negotiated authentication algorithm. | also determined from the negotiated authentication algorithm. | |||
In this specification, with only the SHA-256 algorithm specified and | In this specification, with only the SHA-256 algorithm specified and | |||
selected, the IDSN of a host MUST be the least significant 64 bits of the | selected, the IDSN of a host <bcp14>MUST</bcp14> be the least significant 64 bits of the | |||
SHA-256 hash of its key, i.e., IDSN-A = Hash(Key-A) and IDSN-B = Hash(Key-B). | SHA-256 hash of its key, i.e., IDSN-A = Hash(Key-A) and IDSN-B = Hash(Key-B). | |||
This deterministic generation of the IDSN allows a receiver to ensure | This deterministic generation of the IDSN allows a receiver to ensure | |||
that there are no gaps in sequence space at the start of the connection. | that there are no gaps in sequence space at the start of the connection. | |||
The SYN with MP_CAPABLE occupies the first octet of data sequence space, | The SYN with MP_CAPABLE occupies the first octet of data sequence space, | |||
although this does not need to be acknowledged at the connection level | although this does not need to be acknowledged at the connection level | |||
until the first data is sent (see <xref target="sec_generalop"/>).</t> | until the first data is sent (see <xref target="sec_generalop" format="default"/>).</t> | |||
</section> | </section> | |||
<section anchor="sec_join" numbered="true" toc="default"> | ||||
<section title="Starting a New Subflow" anchor="sec_join"> | <name>Starting a New Subflow</name> | |||
<t>Once an MPTCP connection has begun with the MP_CAPABLE | <t>Once an MPTCP connection has begun with the MP_CAPABLE | |||
exchange, further subflows can be added to the connection. | exchange, further subflows can be added to the connection. | |||
Hosts have knowledge of their own address(es), and can | Hosts have knowledge of their own address(es) and can | |||
become aware of the other host's addresses through | become aware of the other host's addresses through | |||
signaling exchanges as described in | signaling exchanges as described in | |||
<xref target="sec_pm"/>. Using this knowledge, a host | <xref target="sec_pm" format="default"/>. Using this knowledge, a host | |||
can initiate a new subflow over a currently unused pair of | can initiate a new subflow over a currently unused pair of | |||
addresses. It is permitted for either host in a connection | addresses. It is permissible for either host in a connection | |||
to initiate the creation of a new subflow, but it is expected | to initiate the creation of a new subflow, but it is expected | |||
that this will normally be the original connection initiator | that this will normally be the original connection initiator | |||
(see <xref target="heuristics"/> for heuristics).</t> | (see <xref target="heuristics" format="default"/> for heuristics).</t> | |||
<t>A new subflow is started as a normal TCP SYN/ACK | <t>A new subflow is started as a normal TCP SYN/ACK | |||
exchange. The Join Connection (MP_JOIN) MPTCP option | exchange. The Join Connection (MP_JOIN) MPTCP option | |||
is used to identify the connection to be joined by the new subflow. | is used to identify the connection to be joined by the new subflow. | |||
It uses keying material that was exchanged in the initial MP_CAPABLE | It uses keying material that was exchanged in the initial MP_CAPABLE | |||
handshake (<xref target="sec_init"/>), and that handshake also | handshake (<xref target="sec_init" format="default"/>), and that handshake also | |||
negotiates the crypto algorithm in use for the MP_JOIN handshake.</t> | negotiates the crypto algorithm in use for the MP_JOIN handshake.</t> | |||
<t>This section specifies the behavior of MP_JOIN using the HMAC-SHA256 | <t>This section specifies the behavior of MP_JOIN using the HMAC-SHA256 | |||
algorithm. An MP_JOIN option is present in the SYN, SYN/ACK, | algorithm. An MP_JOIN option is present in the SYN, SYN/ACK, | |||
and ACK of the three-way handshake, although in each case with a | and ACK of the three-way handshake, although in each case with a | |||
different format.</t> | different format.</t> | |||
<t>In the first MP_JOIN on the SYN packet, illustrated in | <t>In the first MP_JOIN on the SYN packet, illustrated in | |||
<xref target="tcpm_join"/>, the initiator sends a token, random | <xref target="tcpm_join" format="default"/>, the initiator sends a token | |||
number, and address ID.</t> | , random | |||
number, and Address ID.</t> | ||||
<figure anchor="tcpm_join"> | ||||
<name>Join Connection (MP_JOIN) Option (for Initial SYN)</name> | ||||
<artwork align="left" name="" type="" alt=""><![CDATA[ | ||||
1 2 3 | ||||
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||||
+---------------+---------------+-------+-----+-+---------------+ | ||||
| Kind | Length = 12 |Subtype|(rsv)|B| Address ID | | ||||
+---------------+---------------+-------+-----+-+---------------+ | ||||
| Receiver's Token (32 bits) | | ||||
+---------------------------------------------------------------+ | ||||
| Sender's Random Number (32 bits) | | ||||
+---------------------------------------------------------------+ ]]></artwork | ||||
> | ||||
</figure> | ||||
<t>The token is used to identify the MPTCP connection and is a | <t>The token is used to identify the MPTCP connection and is a | |||
cryptographic hash of the receiver's key, as exchanged | cryptographic hash of the receiver's key, as exchanged | |||
in the initial MP_CAPABLE handshake (<xref target="sec_init"/>). | in the initial MP_CAPABLE handshake (<xref target="sec_init" format="default"/>). | |||
In this specification, the tokens presented in this | In this specification, the tokens presented in this | |||
option are generated by the SHA-256 <xref target="RFC6234"/> | option are generated by the SHA-256 algorithm <xref target="RFC6234" for | |||
algorithm, truncated to the most significant 32 bits. The token | mat="default"/>, truncated to the most significant 32 bits. The token | |||
included in the MP_JOIN option is the token that the receiver | included in the MP_JOIN option is the token that the receiver | |||
of the packet uses to identify this connection; i.e., Host A | of the packet uses to identify this connection; i.e., Host A | |||
will send Token-B (which is generated from Key-B). Note that the | will send Token-B (which is generated from Key-B). Note that the | |||
hash generation algorithm can be overridden by the choice of | hash generation algorithm can be overridden by the choice of | |||
cryptographic handshake algorithm, as defined in <xref target="sec_init" | cryptographic handshake algorithm, as defined in <xref target="sec_init" | |||
/>.</t> | format="default"/>.</t> | |||
<t>The MP_JOIN SYN sends not only the token (which is static for a | <t>The MP_JOIN SYN sends not only the token (which is static for a | |||
connection) but also random numbers (nonces) that are used to prevent | connection) but also random numbers (nonces) that are used to prevent | |||
replay attacks on the authentication method. Recommendations for the | replay attacks on the authentication method. Recommendations for the | |||
generation of random numbers for this purpose are given in <xref target= | generation of random numbers for this purpose are given in <xref target= | |||
"RFC4086"/>.</t> | "RFC4086" format="default"/>.</t> | |||
<t>The MP_JOIN option includes an "Address ID". This is an identifier | <t>The MP_JOIN option includes an "Address ID". This is an identifier | |||
generated by the sender of the option, used to identify the source address | generated by the sender of the option, used to identify the source address | |||
of this packet, even if the IP header has been changed in transit by a middlebox. | of this packet, even if the IP header has been changed in transit by a middlebox. | |||
The numeric value of this field is generated by the sender and must map uniquely | The numeric value of this field is generated by the sender and must map uniquely | |||
to a source IP address for the sending host. | to a source IP address for the sending host. | |||
The Address ID allows address removal (<xref target="sec_remove_addr"/>) | The Address ID allows address removal (<xref target="sec_remove_addr" format="default"/>) | |||
without needing to know what the source address at the | without needing to know what the source address at the | |||
receiver is, thus allowing address removal through NATs. | receiver is, thus allowing address removal through NATs. | |||
The Address ID also allows correlation between new subflow setup attempts | The Address ID also allows correlation between new subflow setup attempts | |||
and address signaling (<xref target="sec_add_address"/>), | and address signaling (<xref target="sec_add_address" format="default"/> ), | |||
to prevent setting up duplicate subflows on the same path, if an MP_JOIN | to prevent setting up duplicate subflows on the same path, if an MP_JOIN | |||
and ADD_ADDR are sent at the same time.</t> | and ADD_ADDR are sent at the same time.</t> | |||
<t>The Address IDs of the subflow used in the initial SYN | <t>The Address IDs of the subflow used in the initial SYN | |||
exchange of the first subflow in the connection are implicit, | exchange of the first subflow in the connection are implicit | |||
and have the value zero. A host MUST store the mappings between | and have the value zero. A host <bcp14>MUST</bcp14> store the mappings b | |||
etween | ||||
Address IDs and addresses both for itself and the remote host. | Address IDs and addresses both for itself and the remote host. | |||
An implementation will also need to know which local and remote | An implementation will also need to know which local and remote | |||
Address IDs are associated with which established subflows, for | Address IDs are associated with which established subflows, for | |||
when addresses are removed from a local or remote host.</t> | when addresses are removed from a local or remote host.</t> | |||
<t>The MP_JOIN option on packets with the SYN flag set also includes | ||||
<t>The MP_JOIN option on packets with the SYN flag set also includes 4 b | 4 bits of flags, 3 of which are currently reserved and | |||
its of flags, 3 of which are currently reserved and MUST be set to zero by the s | <bcp14>MUST</bcp14> be set to 0 by the sender. The final bit, labeled | |||
ender. The final bit, labeled "B", indicates whether the sender of this option w | "B", indicates whether the sender of this option (1) wishes this | |||
ishes this subflow to be used as a backup path (B=1) in the event of failure of | subflow to be used as a backup path (B=1) in the event of failure of | |||
other paths, or whether it wants it to be used as part of the connection immedia | other paths or (2) wants the subflow to be used as part of the | |||
tely. By setting B=1, the sender of the option is requesting the other host to o | connection immediately. By setting B=1, the sender of the option is | |||
nly send data on this subflow if there are no available subflows where B=0. Subf | requesting that the other host only send data on this subflow if there | |||
low policy is discussed in more detail in <xref target="sec_policy"/>.</t> | are no available subflows where B=0. Subflow policy is discussed in more | |||
detail in <xref target="sec_policy" format="default"/>.</t> | ||||
<?rfc needLines='10'?> | ||||
<figure align="center" anchor="tcpm_join" title="Join Connection (MP_JOI | ||||
N) Option (for Initial SYN)"> | ||||
<artwork align="left"><![CDATA[ | ||||
1 2 3 | ||||
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||||
+---------------+---------------+-------+-----+-+---------------+ | ||||
| Kind | Length = 12 |Subtype|(rsv)|B| Address ID | | ||||
+---------------+---------------+-------+-----+-+---------------+ | ||||
| Receiver's Token (32 bits) | | ||||
+---------------------------------------------------------------+ | ||||
| Sender's Random Number (32 bits) | | ||||
+---------------------------------------------------------------+ | ||||
]]></artwork> | ||||
</figure> | ||||
<t>When receiving a SYN with an MP_JOIN option that contains | <t>When receiving a SYN with an MP_JOIN option that contains | |||
a valid token for an existing MPTCP connection, the recipient | a valid token for an existing MPTCP connection, the recipient | |||
SHOULD respond with a SYN/ACK also containing an MP_JOIN | <bcp14>SHOULD</bcp14> respond with a SYN/ACK also containing an MP_JOIN | |||
option containing a random number and a truncated (leftmost 64 | option containing a random number and a truncated (leftmost 64 bits | |||
bits) Hash-based Message Authentication Code (HMAC). This | ) HMAC. This | |||
version of the option is shown in <xref target="tcpm_join2"/>. | version of the option is shown in <xref target="tcpm_join2" format="defa | |||
If the token is unknown, or the host wants to refuse subflow | ult"/>. If the token is unknown or the host wants to refuse subflow | |||
establishment (for example, due to a limit on the number of | establishment (for example, due to a limit on the number of | |||
subflows it will permit), the receiver will send back a reset | subflows it will permit), the receiver will send back a reset | |||
(RST) signal, analogous to an unknown port in TCP, containing a | (RST) signal, analogous to an unknown port in TCP, containing an | |||
MP_TCPRST option (<xref target="sec_reset"/>) with a "MPTCP | MP_TCPRST option (<xref target="sec_reset" format="default"/>) with an " | |||
MPTCP | ||||
specific error" reason code. Although calculating an HMAC | specific error" reason code. Although calculating an HMAC | |||
requires cryptographic operations, it is believed that the | requires cryptographic operations, it is believed that the | |||
32-bit token in the MP_JOIN SYN gives sufficient protection against blind state | 32-bit token in the MP_JOIN SYN gives sufficient protection against blind state | |||
exhaustion attacks; therefore, there is no need to provide | exhaustion attacks; therefore, there is no need to provide | |||
mechanisms to allow a responder to operate statelessly at the | mechanisms to allow a responder to operate statelessly at the | |||
MP_JOIN stage.</t> | MP_JOIN stage.</t> | |||
<figure anchor="tcpm_join2"> | ||||
<name>Join Connection (MP_JOIN) Option (for Responding SYN/ACK)</name> | ||||
<artwork align="left" name="" type="" alt=""><![CDATA[ | ||||
1 2 3 | ||||
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||||
+---------------+---------------+-------+-----+-+---------------+ | ||||
| Kind | Length = 16 |Subtype|(rsv)|B| Address ID | | ||||
+---------------+---------------+-------+-----+-+---------------+ | ||||
| | | ||||
| Sender's Truncated HMAC (64 bits) | | ||||
| | | ||||
+---------------------------------------------------------------+ | ||||
| Sender's Random Number (32 bits) | | ||||
+---------------------------------------------------------------+ ]]></artwork | ||||
> | ||||
</figure> | ||||
<t>An HMAC is sent by both hosts -- by the initiator (Host A) | <t>An HMAC is sent by both hosts -- by the initiator (Host A) | |||
in the third packet (the ACK) and by the responder (Host B) in | in the third packet (the ACK) and by the responder (Host B) in | |||
the second packet (the SYN/ACK). Doing the HMAC exchange at this | the second packet (the SYN/ACK). Doing the HMAC exchange at this | |||
stage allows both hosts to have first exchanged random data (in the | stage allows both hosts to have first exchanged random data (in the | |||
first two SYN packets) that is used as the "message". This | first two SYN packets) that is used as the "message". This | |||
specification defines that HMAC as defined in <xref target="RFC2104"/> | specification defines that HMAC as defined in <xref target="RFC2104" for | |||
is used, along with the SHA-256 hash algorithm <xref target="RFC6234"/>, | mat="default"/> | |||
is used, along with the SHA-256 hash algorithm <xref target="RFC6234" fo | ||||
rmat="default"/>, | ||||
and that the output is truncated to the leftmost 160 bits (20 octets). | and that the output is truncated to the leftmost 160 bits (20 octets). | |||
Due to option space limitations, the HMAC included in | Due to option space limitations, the HMAC included in | |||
the SYN/ACK is truncated to the leftmost 64 bits, but this is | the SYN/ACK is truncated to the leftmost 64 bits, but this is | |||
acceptable since random numbers are used; thus, an attacker | acceptable, since random numbers are used; thus, an attacker | |||
only has one chance to correctly guess the HMAC that matches the random | only has one chance to correctly guess the HMAC that matches the random | |||
number previously sent by the peer (if the HMAC is | number previously sent by the peer (if the HMAC is | |||
incorrect, the TCP connection is closed, so a new MP_JOIN negotiation | incorrect, the TCP connection is closed, so a new MP_JOIN negotiation | |||
with a new random number is required).</t> | with a new random number is required).</t> | |||
<t>The initiator's authentication information is sent in its | <t>The initiator's authentication information is sent in its | |||
first ACK (the third packet of the handshake), as shown in | first ACK (the third packet of the handshake), as shown in | |||
<xref target="tcpm_join3"/>. This data needs to be sent reliably, | <xref target="tcpm_join3" format="default"/>. This data needs to be sent reliably, | |||
since it is the only time this HMAC is sent; | since it is the only time this HMAC is sent; | |||
therefore, receipt of this packet MUST trigger a regular TCP ACK | therefore, receipt of this packet <bcp14>MUST</bcp14> trigger a regular | |||
in response, and the packet MUST be retransmitted if this | TCP ACK | |||
in response, and the packet <bcp14>MUST</bcp14> be retransmitted if this | ||||
ACK is not received. In other words, sending the ACK/MP_JOIN | ACK is not received. In other words, sending the ACK/MP_JOIN | |||
packet places the subflow in the PRE_ESTABLISHED state, and it | packet places the subflow in the PRE_ESTABLISHED state, and it | |||
moves to the ESTABLISHED state only on receipt of an ACK from | moves to the ESTABLISHED state only on receipt of an ACK from | |||
the receiver. It is not permitted to send data while in the | the receiver. It is not permissible to send data while in the | |||
PRE_ESTABLISHED state. The reserved bits in this option MUST be set | PRE_ESTABLISHED state. The reserved bits in this option <bcp14>MUST</bcp | |||
to zero by the sender.</t> | 14> be set | |||
to 0 by the sender.</t> | ||||
<t>The key for the HMAC algorithm, in the case of the message transmitte | <figure anchor="tcpm_join3"> | |||
d by Host A, will be Key-A followed by Key-B, and in the case of Host B, Key-B f | <name>Join Connection (MP_JOIN) Option (for Initiator's Firs | |||
ollowed by Key-A. These are the keys that were exchanged in the original MP_CAPA | t ACK)</name> | |||
BLE handshake. The "message" for the HMAC algorithm in each case is the concaten | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
ations of random number for each host (denoted by R): for Host A, R-A followed b | 1 2 3 | |||
y R-B; and for Host B, R-B followed by R-A.</t> | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
+---------------+---------------+-------+-----------------------+ | ||||
<?rfc needLines='10'?> | | Kind | Length = 24 |Subtype| (reserved) | | |||
<figure align="center" anchor="tcpm_join2" title="Join Connection (MP_JO | +---------------+---------------+-------+-----------------------+ | |||
IN) Option (for Responding SYN/ACK)"> | | | | |||
<artwork align="left"><![CDATA[ | | | | |||
1 2 3 | | Sender's Truncated HMAC (160 bits) | | |||
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | | | | |||
+---------------+---------------+-------+-----+-+---------------+ | | | | |||
| Kind | Length = 16 |Subtype|(rsv)|B| Address ID | | +---------------------------------------------------------------+ ]]></artwork | |||
+---------------+---------------+-------+-----+-+---------------+ | > | |||
| | | ||||
| Sender's Truncated HMAC (64 bits) | | ||||
| | | ||||
+---------------------------------------------------------------+ | ||||
| Sender's Random Number (32 bits) | | ||||
+---------------------------------------------------------------+ | ||||
]]></artwork> | ||||
</figure> | ||||
<?rfc needLines='12'?> | ||||
<figure align="center" anchor="tcpm_join3" title="Join Connection (MP_JO | ||||
IN) Option (for Third ACK)"> | ||||
<artwork align="left"><![CDATA[ | ||||
1 2 3 | ||||
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||||
+---------------+---------------+-------+-----------------------+ | ||||
| Kind | Length = 24 |Subtype| (reserved) | | ||||
+---------------+---------------+-------+-----------------------+ | ||||
| | | ||||
| | | ||||
| Sender's Truncated HMAC (160 bits) | | ||||
| | | ||||
| | | ||||
+---------------------------------------------------------------+ | ||||
]]></artwork> | ||||
</figure> | </figure> | |||
<t>The key for the HMAC algorithm, in the case of the message | ||||
transmitted by Host A, will be Key-A followed by Key-B; and in the | ||||
case of Host B, Key-B followed by Key-A. These are the keys that were | ||||
exchanged in the original MP_CAPABLE handshake. The "message" for the | ||||
HMAC algorithm in each case is the concatenations of random numbers for | ||||
each host (denoted by R): for Host A, R-A followed by R-B; and for | ||||
Host B, R-B followed by R-A.</t> | ||||
<t>These various MPTCP options fit together to enable authenticated subf | ||||
low setup as illustrated in <xref target="fig_tokens" format="default"/>.</t> | ||||
<figure anchor="fig_tokens"> | ||||
<name>Example Use of MPTCP Authentication</name> | ||||
<artwork align="left" name="" type="" alt=""><![CDATA[ | ||||
Host A Host B | ||||
------------------------ ---------- | ||||
Address A1 Address A2 Address B1 | ||||
---------- ---------- ---------- | ||||
| | | | ||||
| | SYN + MP_CAPABLE | | ||||
|--------------------------------------------->| | ||||
|<---------------------------------------------| | ||||
| SYN/ACK + MP_CAPABLE(Key-B) | | ||||
| | | | ||||
| ACK + MP_CAPABLE(Key-A, Key-B) | | ||||
|--------------------------------------------->| | ||||
| | | | ||||
| | SYN + MP_JOIN(Token-B, R-A) | | ||||
| |------------------------------->| | ||||
| |<-------------------------------| | ||||
| | SYN/ACK + MP_JOIN(HMAC-B, R-B) | | ||||
| | | | ||||
| | ACK + MP_JOIN(HMAC-A) | | ||||
| |------------------------------->| | ||||
| |<-------------------------------| | ||||
| | ACK | | ||||
<t>These various MPTCP options fit together to enable authenticated subf | HMAC-A = HMAC(Key=(Key-A + Key-B), Msg=(R-A + R-B)) | |||
low setup as illustrated in <xref target="fig_tokens"/>.</t> | HMAC-B = HMAC(Key=(Key-B + Key-A), Msg=(R-B + R-A)) ]]></artwork> | |||
<?rfc needLines='24'?> | ||||
<figure align="center" anchor="fig_tokens" title="Example Use of MPTCP A | ||||
uthentication"> | ||||
<artwork align="left"><![CDATA[ | ||||
Host A Host B | ||||
------------------------ ---------- | ||||
Address A1 Address A2 Address B1 | ||||
---------- ---------- ---------- | ||||
| | | | ||||
| | SYN + MP_CAPABLE | | ||||
|--------------------------------------------->| | ||||
|<---------------------------------------------| | ||||
| SYN/ACK + MP_CAPABLE(Key-B) | | ||||
| | | | ||||
| ACK + MP_CAPABLE(Key-A, Key-B) | | ||||
|--------------------------------------------->| | ||||
| | | | ||||
| | SYN + MP_JOIN(Token-B, R-A) | | ||||
| |------------------------------->| | ||||
| |<-------------------------------| | ||||
| | SYN/ACK + MP_JOIN(HMAC-B, R-B) | | ||||
| | | | ||||
| | ACK + MP_JOIN(HMAC-A) | | ||||
| |------------------------------->| | ||||
| |<-------------------------------| | ||||
| | ACK | | ||||
HMAC-A = HMAC(Key=(Key-A+Key-B), Msg=(R-A+R-B)) | ||||
HMAC-B = HMAC(Key=(Key-B+Key-A), Msg=(R-B+R-A)) | ||||
]]></artwork> | ||||
</figure> | </figure> | |||
<t>If the token received at Host B is unknown or local policy | <t>If the token received at Host B is unknown or local policy | |||
prohibits the acceptance of the new subflow, the recipient MUST | prohibits the acceptance of the new subflow, the recipient <bcp14>MUST</ | |||
respond with a TCP RST for the subflow. If appropriate, a MP_TCPRST | bcp14> | |||
option with a "Administratively prohibited" reason code | respond with a TCP RST for the subflow. If appropriate, an MP_TCPRST | |||
(<xref target="sec_reset"/>) should be included.</t> | option with an "Administratively prohibited" reason code | |||
(<xref target="sec_reset" format="default"/>) should be included.</t> | ||||
<t>If the token is accepted at Host B, but the HMAC returned to | <t>If the token is accepted at Host B but the HMAC returned to | |||
Host A does not match the one expected, Host A MUST close the | Host A does not match the one expected, Host A <bcp14>MUST</bcp14> close | |||
subflow with a TCP RST. In this, and all following cases of sending | the | |||
a RST in this section, the sender SHOULD send a MP_TCPRST option | subflow with a TCP RST. In this and all subsequent cases of sending | |||
(<xref target="sec_reset"/>) on this RST packet with the reason | a RST as described in this section, the sender <bcp14>SHOULD</bcp14> sen | |||
code for a "MPTCP specific error".</t> | d an MP_TCPRST option | |||
(<xref target="sec_reset" format="default"/>) on this RST packet with th | ||||
<t>If Host B does not receive the expected HMAC, or the MP_JOIN | e reason | |||
option is missing from the ACK, it MUST close the subflow with a | code for an "MPTCP-specific error".</t> | |||
<t>If Host B does not receive the expected HMAC or the MP_JOIN | ||||
option is missing from the ACK, it <bcp14>MUST</bcp14> close the subflow | ||||
with a | ||||
TCP RST.</t> | TCP RST.</t> | |||
<t>If the HMACs are verified as correct, then both hosts have | <t>If the HMACs are verified as correct, then both hosts have | |||
verified each other as being the same peers as existed at | verified each other as being the same peers as those that existed at | |||
the start of the connection, and they have agreed of which | the start of the connection, and they have agreed of which | |||
connection this subflow will become a part.</t> | connection this subflow will become a part.</t> | |||
<t>If the SYN/ACK as received at Host A does not have an MP_JOIN | <t>If the SYN/ACK as received at Host A does not have an MP_JOIN | |||
option, Host A MUST close the subflow with a TCP RST.</t> | option, Host A <bcp14>MUST</bcp14> close the subflow with a TCP RST.</t> | |||
<t>This covers all cases of the loss of an MP_JOIN. In more detail, | <t>This covers all cases of the loss of an MP_JOIN. In more detail, | |||
if MP_JOIN is stripped from the SYN on the path from A to | if an MP_JOIN is stripped from the SYN on the path from A to | |||
B, and Host B does not have a listener on the relevant | B and Host B does not have a listener on the relevant | |||
port, it will respond with a RST in the normal way. If in | port, it will respond with a RST in the normal way. If in | |||
response to a SYN with an MP_JOIN option, a SYN/ACK is | response to a SYN with an MP_JOIN option a SYN/ACK is | |||
received without the MP_JOIN option (either since it was | received without the MP_JOIN option (because it was either | |||
stripped on the return path, or it was stripped on the | stripped on the return path, or stripped on the | |||
outgoing path but Host B responded as if | outgoing path leading to Host B responding as if | |||
it were a new regular TCP session), then the subflow is | it was a new regular TCP session), then the subflow is | |||
unusable and Host A MUST close it with a RST.</t> | unusable and Host A <bcp14>MUST</bcp14> close it with a RST.</t> | |||
<t>Note that additional subflows can be created | <t>Note that additional subflows can be created | |||
between any pair of ports (but see <xref target="heuristics"/> for | between any pair of ports (but see <xref target="heuristics" format="default"/> for | |||
heuristics); no explicit application-level accept calls or | heuristics); no explicit application-level accept calls or | |||
bind calls are required to open additional subflows. To | bind calls are required to open additional subflows. To | |||
associate a new subflow with an existing connection, the token | associate a new subflow with an existing connection, the token | |||
supplied in the subflow's SYN exchange is used for | supplied in the subflow's SYN exchange is used for | |||
demultiplexing. This then binds the 5-tuple of the TCP | demultiplexing. This then binds the 5-tuple of the TCP | |||
subflow to the local token of the connection. A consequence is | subflow to the local token of the connection. One consequence is | |||
that it is possible to allow any port pairs to be used for a | that it is possible to allow any port pairs to be used for a | |||
connection. </t> | connection. </t> | |||
<t>Demultiplexing subflow SYNs <bcp14>MUST</bcp14> be done using the tok | ||||
<t>Demultiplexing subflow SYNs MUST be done using the token; | en; | |||
this is unlike traditional TCP, where the destination port is | this is unlike traditional TCP, where the destination port is | |||
used for demultiplexing SYN packets. Once a subflow is set up, | used for demultiplexing SYN packets. Once a subflow is set up, | |||
demultiplexing packets is done using the 5-tuple, as in | demultiplexing packets is done using the 5-tuple, as in | |||
traditional TCP. The 5-tuples will be mapped to the local | traditional TCP. The 5-tuples will be mapped to the local | |||
connection identifier (token). Note that Host A will know its | connection identifier (token). Note that Host A will know its | |||
local token for the subflow even though it is not sent on the | local token for the subflow even though it is not sent on the | |||
wire -- only the responder's token is sent.</t> | wire -- only the responder's token is sent.</t> | |||
</section> | </section> | |||
<section anchor="sec_generalop" numbered="true" toc="default"> | ||||
<section title="General MPTCP Operation" anchor="sec_generalop"> | <name>MPTCP Operation and Data Transfer</name> | |||
<t>This section discusses operation of MPTCP for data transfer. At a hig | <t>This section discusses the operation of MPTCP for data transfer. At a | |||
h level, an MPTCP implementation will take one input data stream from an applica | high level, an MPTCP implementation will take one input data stream from an app | |||
tion, and split it into one or more subflows, with sufficient control informatio | lication and split it into one or more subflows, with sufficient control informa | |||
n to allow it to be reassembled and delivered reliably and in order to the recip | tion to allow it to be reassembled and delivered reliably and in order to the re | |||
ient application. The following subsections define this behavior in detail.</t> | cipient application. The following subsections define this behavior in detail.</ | |||
t> | ||||
<t>The data sequence mapping and the Data ACK are signaled in the Data S | <t>The Data Sequence Mapping and the Data ACK are signaled in the DSS op | |||
equence Signal (DSS) option (<xref target="tcpm_dsn"/>). Either or both can be s | tion (<xref target="tcpm_dsn" format="default"/>). Either or both can be signale | |||
ignaled in one DSS, depending on the flags set. The data sequence mapping define | d in one DSS, depending on the flags set. The Data Sequence Mapping defines how | |||
s how the sequence space on the subflow maps to the connection level, and the Da | the sequence space on the subflow maps to the connection level, and the Data ACK | |||
ta ACK acknowledges receipt of data at the connection level. These functions are | acknowledges receipt of data at the connection level. These functions are descr | |||
described in more detail in the following two subsections.</t> | ibed in more detail in the following two subsections.</t> | |||
<figure anchor="tcpm_dsn"> | ||||
<?rfc needLines='18'?> | <name>Data Sequence Signal (DSS) Option</name> | |||
<figure align="center" anchor="tcpm_dsn" title="Data Sequence Signal (DS | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
S) Option"> | ||||
<artwork align="left"><![CDATA[ | ||||
1 2 3 | 1 2 3 | |||
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
+---------------+---------------+-------+----------------------+ | +---------------+---------------+-------+----------------------+ | |||
| Kind | Length |Subtype| (reserved) |F|m|M|a|A| | | Kind | Length |Subtype| (reserved) |F|m|M|a|A| | |||
+---------------+---------------+-------+----------------------+ | +---------------+---------------+-------+----------------------+ | |||
| Data ACK (4 or 8 octets, depending on flags) | | | Data ACK (4 or 8 octets, depending on flags) | | |||
+--------------------------------------------------------------+ | +--------------------------------------------------------------+ | |||
| Data sequence number (4 or 8 octets, depending on flags) | | | Data Sequence Number (4 or 8 octets, depending on flags) | | |||
+--------------------------------------------------------------+ | +--------------------------------------------------------------+ | |||
| Subflow Sequence Number (4 octets) | | | Subflow Sequence Number (4 octets) | | |||
+-------------------------------+------------------------------+ | +-------------------------------+------------------------------+ | |||
| Data-Level Length (2 octets) | Checksum (2 octets) | | | Data-Level Length (2 octets) | Checksum (2 octets) | | |||
+-------------------------------+------------------------------+ | +-------------------------------+------------------------------+ ]]></artwork> | |||
]]></artwork> | ||||
</figure> | </figure> | |||
<t>The flags, when set, define the contents of this option, as follows: | <t>The flags, when set, define the contents of this option, as follows: | |||
<list style="symbols"> | </t> | |||
<t>A = Data ACK present</t> | <ul spacing="normal"> | |||
<t>a = Data ACK is 8 octets (if not set, Data ACK is 4 octets)</t> | <li>A = Data ACK present</li> | |||
<t>M = Data Sequence Number (DSN), Subflow Sequence Number (SSN), Da | <li>a = Data ACK is 8 octets (if not set, Data ACK is 4 octets)</li> | |||
ta-Level Length, and Checksum (if negotiated) present</t> | <li>M = Data Sequence Number (DSN), Subflow Sequence Number (SSN), Dat | |||
<t>m = Data sequence number is 8 octets (if not set, DSN is 4 octets | a-Level Length, and Checksum (if negotiated) present</li> | |||
)</t> | <li>m = Data Sequence Number is 8 octets (if not set, DSN is 4 octets) | |||
</list> | </li> | |||
</ul> | ||||
The flags 'a' and 'm' only have meaning if the corresponding 'A' or 'M' | <t> | |||
flags are set; otherwise, they will be ignored. The maximum length of this optio | ||||
n, with all flags set, is 28 octets.</t> | ||||
<t>The 'F' flag indicates "Data FIN". If present, this means that this m | ||||
apping covers the final data from the sender. This is the connection-level equiv | ||||
alent to the FIN flag in single-path TCP. A connection is not closed unless ther | ||||
e has been a Data FIN exchange, a MP_FASTCLOSE (<xref target="sec_fastclose"/>) | ||||
message, or an implementation-specific, connection-level send timeout. The purpo | ||||
se of the Data FIN and the interactions between this flag, the subflow-level FIN | ||||
flag, and the data sequence mapping are described in <xref target="sec_close"/> | ||||
. | ||||
The remaining reserved bits MUST be set to zero by an implementation of | ||||
this specification.</t> | ||||
<t>Note that the checksum is only present in this option if the use of M | ||||
PTCP checksumming has been negotiated at the MP_CAPABLE handshake (see <xref tar | ||||
get="sec_init"/>). The presence of the checksum can be inferred from the length | ||||
of the option. If a checksum is present, but its use had not been negotiated in | ||||
the MP_CAPABLE handshake, the receiver MUST close the subflow with a RST as it n | ||||
ot behaving as negotiated. If a checksum is not present when its use has been ne | ||||
gotiated, the receiver MUST close the subflow with a RST as it is considered bro | ||||
ken. In both cases, this RST SHOULD be accompanied with a MP_TCPRST option (<xre | ||||
f target="sec_reset"/>) with the reason code for a "MPTCP specific error".</t> | ||||
<section title="Data Sequence Mapping" anchor="sec_dsn"> | ||||
<t>The data stream as a whole can be reassembled through the use of th | ||||
e data sequence mapping components of the DSS option (<xref target="tcpm_dsn"/>) | ||||
, which define the | ||||
mapping from the subflow sequence number to the data sequence number. This is us | ||||
ed by the receiver to ensure in-order delivery to the application layer. Meanwhi | ||||
le, the subflow-level sequence numbers (i.e., the regular sequence numbers in th | ||||
e TCP header) have subflow-only relevance. It is expected (but not mandated) tha | ||||
t SACK <xref target='RFC2018'/> is used at the subflow level to improve efficien | ||||
cy.</t> | ||||
<t>The data sequence mapping specifies a mapping from subflow sequence s | ||||
pace to data sequence space. This is expressed in terms of starting sequence num | ||||
bers for the subflow and the data level, and a length of bytes for which this ma | ||||
pping is valid. | ||||
This explicit mapping for a range of data was chosen rather than per-packet sign | ||||
aling to assist with compatibility with situations where TCP/IP segmentation or | ||||
coalescing is undertaken separately from the stack that is generating the data f | ||||
low (e.g., through the use of TCP segmentation offloading on network interface c | ||||
ards, or by middleboxes such as performance enhancing proxies). It also allows a | ||||
single mapping to cover many packets, which may be useful in bulk transfer situ | ||||
ations.</t> | ||||
<t>A mapping is fixed, in that the subflow sequence number is bound to t | ||||
he data sequence number after the mapping has been processed. A sender MUST NOT | ||||
change this mapping | ||||
after it has been declared; however, the same data sequence number can be mapped | ||||
to by different subflows for retransmission purposes (see <xref target="sec_ret | ||||
ransmit"/>). This would also permit the same data to be sent simultaneously on m | ||||
ultiple subflows for resilience or efficiency purposes, especially in the case o | ||||
f lossy links. Although the detailed specification of such operation is outside | ||||
the scope of this document, an implementation SHOULD treat the first data that i | ||||
s received at a subflow for the data sequence space as that which should be deli | ||||
vered to the application, and any later data for that sequence space SHOULD be i | ||||
gnored.</t> | ||||
<t>The data sequence number is specified as an absolute value, whereas t | ||||
he subflow sequence numbering is relative (the SYN at the start of the subflow h | ||||
as relative subflow sequence number 0). This is to allow middleboxes to change t | ||||
he initial sequence number of a subflow, such as firewalls that undertake Initia | ||||
l Sequence Number (ISN) randomization.</t> | ||||
<t>The data sequence mapping also contains a checksum of the data that t | ||||
his mapping covers, if use of checksums has been negotiated at the MP_CAPABLE ex | ||||
change. Checksums are used to detect if the payload has been adjusted in any way | ||||
by a non-MPTCP-aware middlebox. If this checksum fails, it will trigger a failu | ||||
re of the subflow, or a fallback to regular TCP, as documented in <xref target=" | ||||
sec_fallback"/>, since MPTCP can no longer reliably know the subflow sequence sp | ||||
ace at the receiver to build data sequence mappings. Without checksumming enable | ||||
d, corrupt data may be delivered to the application if a middlebox alters segmen | ||||
t boundaries, alters content, or does not deliver all segments covered by a data | ||||
sequence mapping. It is therefore RECOMMENDED to use checksumming unless it is | ||||
known the network path contains no such devices.</t> | ||||
<t>The checksum algorithm used is the standard TCP checksum <xref target | ||||
="RFC0793"/>, operating over the data covered by this mapping, along with a pseu | ||||
do-header as shown in <xref target="fig_pseudo"/>.</t> | ||||
<?rfc needLines='18'?> | The flags "a" and "m" only have meaning if the corresponding "A" or "M" | |||
<figure align="center" anchor="fig_pseudo" title="Pseudo-Header for DSS | flags are set; otherwise, they will be ignored. The maximum length of this optio | |||
Checksum"> | n, with all flags set, is 28 octets.</t> | |||
<artwork align="left"><![CDATA[ | <t>The "F" flag indicates "Data FIN". If present, this means that this | |||
mapping covers the final data from the sender. This is the | ||||
connection-level equivalent of the FIN flag in single-path TCP. A connec | ||||
tion is not closed unless there has been a Data FIN exchange, an MP_FASTCLOSE (< | ||||
xref target="sec_fastclose" format="default"/>) message, or an implementation-sp | ||||
ecific connection-level send timeout. The purpose of the Data FIN and the intera | ||||
ctions between this flag, the subflow-level FIN flag, and the Data Sequence Mapp | ||||
ing are described in <xref target="sec_close" format="default"/>. | ||||
The remaining reserved bits <bcp14>MUST</bcp14> be set to 0 by an implem | ||||
entation of this specification.</t> | ||||
<t>Note that the checksum is only present in this option if the use of | ||||
MPTCP checksumming has been negotiated at the MP_CAPABLE handshake | ||||
(see <xref target="sec_init" format="default"/>). The presence of the | ||||
checksum can be inferred from the length of the option. If a checksum | ||||
is present but its use had not been negotiated in the MP_CAPABLE | ||||
handshake, the receiver <bcp14>MUST</bcp14> close the subflow with a | ||||
RST, as it is not behaving as negotiated. If a checksum is not present w | ||||
hen its use has been negotiated, the receiver <bcp14>MUST</bcp14> close the subf | ||||
low with a RST, as it is considered broken. In both cases, this RST <bcp14>SHOUL | ||||
D</bcp14> be accompanied by an MP_TCPRST option (<xref target="sec_reset" format | ||||
="default"/>) with the reason code for an "MPTCP-specific error".</t> | ||||
<section anchor="sec_dsn" numbered="true" toc="default"> | ||||
<name>Data Sequence Mapping</name> | ||||
<t>The data stream as a whole can be reassembled through the use of th | ||||
e Data Sequence Mapping components of the DSS option (<xref target="tcpm_dsn" fo | ||||
rmat="default"/>), which define the | ||||
mapping from the subflow sequence number to the data sequence number. This is | ||||
used by the receiver to ensure in-order delivery to the application | ||||
layer. Meanwhile, the subflow-level sequence numbers (i.e., the | ||||
regular sequence numbers in the TCP header) are only relevant to the s | ||||
ubflow. It is expected (but not mandated) that SACK <xref | ||||
target="RFC2018" format="default"/> will be used at the subflow level | ||||
to improve efficiency.</t> | ||||
<t>The Data Sequence Mapping specifies a mapping from the subflow | ||||
sequence space to the data sequence space. This is expressed in terms | ||||
of starting sequence numbers for the subflow and the data level, and a length of | ||||
bytes for which this mapping is valid. | ||||
This explicit mapping for a range of data, rather than per‑packet signalin | ||||
g, was chosen to assist with compatibility with | ||||
situations where TCP/IP segmentation or coalescing is undertaken | ||||
separately from the stack that is generating the data flow (e.g., | ||||
through the use of TCP segmentation offloading on network interface | ||||
cards, or by middleboxes such as Performance Enhancing Proxies | ||||
(PEPs) <xref target="RFC3135" format="default"/>). It | ||||
also allows a single mapping to cover many packets; this may be useful | ||||
in bulk‑transfer situations.</t> | ||||
<t>A mapping is fixed, in that the subflow sequence number is bound to | ||||
the data sequence number after the mapping has been processed. A sender <bcp14> | ||||
MUST NOT</bcp14> change this mapping | ||||
after it has been declared; however, the same data sequence number can be | ||||
mapped to by different subflows for retransmission purposes (see | ||||
<xref target="sec_retransmit" format="default"/>). This would also | ||||
permit the same data to be sent simultaneously on multiple subflows | ||||
for resilience or efficiency purposes, especially in the case of | ||||
lossy links. Although the detailed specification of such operation | ||||
is outside the scope of this document, an implementation | ||||
<bcp14>SHOULD</bcp14> treat the first data that is received at a | ||||
subflow for the data sequence space as the data that should be deliver | ||||
ed to the application, and any subsequent data for that sequence space <bcp14>SH | ||||
OULD</bcp14> be ignored.</t> | ||||
<t>The data sequence number is specified as an absolute value, | ||||
whereas the subflow sequence numbering is relative (the SYN at the | ||||
start of the subflow has a relative subflow sequence number of | ||||
0). This is done to allow middleboxes to change the Initial Sequence | ||||
Number (ISN) of a subflow, such as firewalls that undertake ISN random | ||||
ization.</t> | ||||
<t>The Data Sequence Mapping also contains a checksum of the data | ||||
that this mapping covers, if the use of checksums has been negotiated | ||||
at | ||||
the MP_CAPABLE exchange. Checksums are used to detect if the payload | ||||
has been adjusted in any way by a non-MPTCP-aware middlebox. If this | ||||
checksum fails, it will trigger a failure of the subflow, or a | ||||
fallback to regular TCP, as documented in <xref | ||||
target="sec_fallback" format="default"/>, since MPTCP can no longer | ||||
reliably know the subflow sequence space at the receiver to build | ||||
Data Sequence Mappings. Without checksumming enabled, corrupt data | ||||
may be delivered to the application if a middlebox alters segment | ||||
boundaries, alters content, or does not deliver all segments covered | ||||
by a Data Sequence Mapping. It is therefore | ||||
<bcp14>RECOMMENDED</bcp14> that checksumming be used, unless it is kno | ||||
wn | ||||
that the network path contains no such devices.</t> | ||||
<t>The checksum algorithm used is the standard TCP checksum <xref targ | ||||
et="RFC0793" format="default"/>, operating over the data covered by this mapping | ||||
, along with a pseudo‑header as shown in <xref target="fig_pseudo" format= | ||||
"default"/>.</t> | ||||
<figure anchor="fig_pseudo"> | ||||
<name>Pseudo-Header for DSS Checksum</name> | ||||
<artwork align="left" name="" type="" alt=""><![CDATA[ | ||||
1 2 3 | 1 2 3 | |||
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
+--------------------------------------------------------------+ | +--------------------------------------------------------------+ | |||
| | | | | | |||
| Data Sequence Number (8 octets) | | | Data Sequence Number (8 octets) | | |||
| | | | | | |||
+--------------------------------------------------------------+ | +--------------------------------------------------------------+ | |||
| Subflow Sequence Number (4 octets) | | | Subflow Sequence Number (4 octets) | | |||
+-------------------------------+------------------------------+ | +-------------------------------+------------------------------+ | |||
| Data-Level Length (2 octets) | Zeros (2 octets) | | | Data-Level Length (2 octets) | Zeros (2 octets) | | |||
+-------------------------------+------------------------------+ | +-------------------------------+------------------------------+ ]]></artwork> | |||
]]></artwork> | </figure> | |||
</figure> | <t>Note that the data sequence number used in the pseudo-header is alw | |||
ays the 64-bit value, irrespective of what length is used in the DSS option itse | ||||
<t>Note that the data sequence number used in the pseudo-header is alway | lf. The standard TCP checksum algorithm has been chosen, since it will be calcul | |||
s the 64-bit value, irrespective of what length is used in the DSS option itself | ated anyway for the TCP subflow, and if calculated first over the data before ad | |||
. The standard TCP checksum algorithm has been chosen since it will be calculate | ding the pseudo-headers, it only needs to be calculated once. Furthermore, since | |||
d anyway for the TCP subflow, and if calculated first over the data before addin | the TCP checksum is additive, the checksum for a DSN_MAP can be constructed by | |||
g the pseudo-headers, it only needs to be calculated once. Furthermore, since th | simply adding together the checksums for the data of each constituent TCP segmen | |||
e TCP checksum is additive, the checksum for a DSN_MAP can be constructed by sim | t and adding the checksum for the DSS pseudo‑header.</t> | |||
ply adding together the checksums for the data of each constituent TCP segment, | <t>Note that checksumming relies on the TCP subflow containing contigu | |||
and adding the checksum for the DSS pseudo-header.</t> | ous data; therefore, a TCP subflow <bcp14>MUST NOT</bcp14> use the Urgent Pointe | |||
r to interrupt an existing mapping. Further note, however, that if Urgent data i | ||||
<t>Note that checksumming relies on the TCP subflow containing contiguou | s received on a subflow, it <bcp14>SHOULD</bcp14> be mapped to the data sequence | |||
s data; therefore, a TCP subflow MUST NOT use the Urgent Pointer to interrupt an | space and delivered to the application, analogous to Urgent data in regular TCP | |||
existing mapping. Further note, however, that if Urgent data is received on a s | .</t> | |||
ubflow, it SHOULD be mapped to the data sequence space and delivered to the appl | <t>To avoid possible deadlock scenarios, subflow-level | |||
ication analogous to Urgent data in regular TCP.</t> | processing should be undertaken separately from processing at the | |||
<t>To avoid possible deadlock scenarios, subflow-level | ||||
processing should be undertaken separately from that at | ||||
connection level. Therefore, even if a mapping does not exist | connection level. Therefore, even if a mapping does not exist | |||
from the subflow space to the data-level space, the data | from the subflow space to the data‑level space, the data | |||
SHOULD still be ACKed at the subflow (if it is in-window). | <bcp14>SHOULD</bcp14> still be ACKed at the subflow (if it is in-window) | |||
. | ||||
This data cannot, however, be acknowledged at the data level | This data cannot, however, be acknowledged at the data level | |||
(<xref target="sec_dataack"/>) because its data sequence | (<xref target="sec_dataack" format="default"/>) because its data sequenc | |||
numbers are unknown. Implementations MAY hold onto such | e | |||
unmapped data for a short while in the expectation that a | numbers are unknown. Implementations <bcp14>MAY</bcp14> hold onto such | |||
unmapped data for a short while, in the expectation that a | ||||
mapping will arrive shortly. Such unmapped data cannot be | mapping will arrive shortly. Such unmapped data cannot be | |||
counted as being within the connection level receive window because this is | counted as being within the connection-level receive window because this is | |||
relative to the data sequence numbers, so if the receiver runs | relative to the data sequence numbers, so if the receiver runs | |||
out of memory to hold this data, it will have to be discarded. | out of memory to hold this data, it will have to be discarded. | |||
If a mapping for that subflow-level sequence space does not | If a mapping for that subflow-level sequence space does not | |||
arrive within a receive window of data, that subflow SHOULD be | arrive within a receive window of data, that subflow <bcp14>SHOULD</bcp14> be | |||
treated as broken, closed with a RST, and any unmapped data | treated as broken, closed with a RST, and any unmapped data | |||
silently discarded.</t> | silently discarded.</t> | |||
<t>Data sequence numbers are always 64-bit quantities and | ||||
<t>Data sequence numbers are always 64-bit quantities, and | <bcp14>MUST</bcp14> be maintained as such in implementations. If a | |||
MUST be maintained as such in implementations. If a | ||||
connection is progressing at a slow rate, so protection | connection is progressing at a slow rate, so protection | |||
against wrapped sequence numbers is not required, | against wrapped sequence numbers is not required, | |||
then an implementation MAY include just the lower 32 | then an implementation <bcp14>MAY</bcp14> include just the lower 32 | |||
bits of the data sequence number in the data sequence mapping and/or | bits of the data sequence number in the Data Sequence Mapping and&wj;/or | |||
Data ACK as an optimization, and an implementation can make this choice | Data ACK as an optimization, and an implementation can make this choice | |||
independently for each packet. An implementation MUST be able to receive | independently for each packet. An implementation <bcp14>MUST</bcp14> be | |||
and process both 64-bit or 32-bit sequence number values, but it is not | able to receive | |||
required that an implementation is able to send both.</t> | and process both 64-bit and 32-bit sequence number values, but it is not | |||
required that an implementation be able to send both.</t> | ||||
<t>An implementation MUST send the full 64-bit data sequence number | <t>An implementation <bcp14>MUST</bcp14> send the full 64-bit data seq | |||
uence number | ||||
if it is transmitting at a sufficiently high rate that the 32-bit value | if it is transmitting at a sufficiently high rate that the 32-bit value | |||
could wrap within the Maximum Segment Lifetime | could wrap within the Maximum Segment Lifetime | |||
(MSL) <xref target="RFC7323"/>. The lengths of the DSNs used in these | (MSL) <xref target="RFC7323" format="default"/>. The lengths of the DSNs used in these | |||
values (which may be different) are declared with flags in the | values (which may be different) are declared with flags in the | |||
DSS option. Implementations MUST accept a 32-bit DSN and implicitly | DSS option. Implementations <bcp14>MUST</bcp14> accept a 32-bit DSN and implicitly | |||
promote it to a 64-bit quantity by incrementing the upper 32 | promote it to a 64-bit quantity by incrementing the upper 32 | |||
bits of sequence number each time the lower 32 | bits of the sequence number each time the lower 32 | |||
bits wrap. A sanity check MUST be implemented to ensure that | bits wrap. A sanity check <bcp14>MUST</bcp14> be implemented to ensure t | |||
hat | ||||
a wrap occurs at an expected time (e.g., the sequence number jumps | a wrap occurs at an expected time (e.g., the sequence number jumps | |||
from a very high number to a very low number) and is not triggered | from a very high number to a very low number) and is not triggered | |||
by out-of-order packets.</t> | by out‑of-order packets.</t> | |||
<t>As with the standard TCP sequence number, the data sequence | ||||
<t>As with the standard TCP sequence number, the data sequence | ||||
number should not start at zero, but at a random value to make | number should not start at zero, but at a random value to make | |||
blind session hijacking harder. This specification requires | blind session hijacking harder. This specification requires | |||
setting the initial data sequence number (IDSN) of each host to the | setting the IDSN of each host to the | |||
least significant 64 bits of the SHA-256 hash of the host's key, as | least significant 64 bits of the SHA-256 hash of the host's key, as | |||
described in <xref target="sec_init"/>. This is required also in | described in <xref target="sec_init" format="default"/>. This is also re | |||
order for the receiver to know what the expected IDSN is, and thus | quired in | |||
order for the receiver to know what the expected IDSN is and thus | ||||
determine if any initial connection-level packets are missing; this | determine if any initial connection-level packets are missing; this | |||
is particularly relevant if two subflows start transmitting simultaneously.</t> | is particularly relevant if two subflows start transmitting simultaneously.</t> | |||
<t>The mapping provided by a Data Sequence Mapping MUST apply to | ||||
<t>A data sequence mapping does not need to be included in | some or all of the subflow sequence space in the TCP segment that | |||
carries the option. It does not need to be included in | ||||
every MPTCP packet, as long as the subflow sequence space in | every MPTCP packet, as long as the subflow sequence space in | |||
that packet is covered by a mapping known at the receiver. This | that packet is covered by a mapping known at the receiver. This | |||
can be used to reduce overhead in cases where the mapping is | can be used to reduce overhead in cases where the mapping is | |||
known in advance; one such case is when there is a single | known in advance. One such case is when there is a single | |||
subflow between the hosts, another is when segments of | subflow between the hosts, and another is when segments of | |||
data are scheduled in larger than packet-sized chunks.</t> | data are scheduled in larger-than-packet-sized chunks.</t> | |||
<t>An "infinite" mapping can be used to fall back to regular TCP by | ||||
<t>An "infinite" mapping can be used to fall back to regular TCP by | ||||
mapping the subflow-level data to the connection-level data | mapping the subflow-level data to the connection-level data | |||
for the remainder of the connection (see | for the remainder of the connection (see | |||
<xref target="sec_fallback"/>). This is achieved by setting | <xref target="sec_fallback" format="default"/>). This is achieved by setting | |||
the Data-Level Length field of the DSS option to the reserved value of 0. The | the Data-Level Length field of the DSS option to the reserved value of 0. The | |||
checksum, in such a case, will also be set to zero.</t> | checksum, in such a case, will also be set to 0.</t> | |||
</section> | </section> | |||
<section anchor="sec_dataack" numbered="true" toc="default"> | ||||
<section title="Data Acknowledgments" anchor="sec_dataack"> | <name>Data Acknowledgments</name> | |||
<t>To provide full end-to-end resilience, MPTCP provides a | <t>To provide full end-to-end resilience, MPTCP provides a | |||
connection-level acknowledgment, to act as a cumulative ACK for | connection-level acknowledgment, to act as a cumulative ACK for | |||
the connection as a whole. This is the "Data ACK" field of | the connection as a whole. This is done via the "Data ACK" field of | |||
the DSS option (<xref target="tcpm_dsn"/>). The Data ACK | the DSS option (<xref target="tcpm_dsn" format="default"/>). The Data AC | |||
K | ||||
is analogous to the behavior | is analogous to the behavior | |||
of the standard TCP cumulative ACK -- indicating | of the standard TCP cumulative ACK -- indicating | |||
how much data has been successfully received (with no | how much data has been successfully received (with no | |||
holes). This is in comparison to the subflow-level ACK, which | holes). This can be compared to the subflow-level ACK, which | |||
acts analogous to TCP SACK, given that there may still be | acts in a fashion analogous to TCP SACK, given that there may still be | |||
holes in the data stream at the connection level. | holes in the data stream at the connection level. | |||
The Data ACK specifies the next data sequence number | The Data ACK specifies the next data sequence number | |||
it expects to receive.</t> | it expects to receive.</t> | |||
<t>The Data ACK, as for the DSN, can be sent as the full 64-bit | ||||
<t>The Data ACK, as for the DSN, can be sent as the full 64-bit | value or as the lower 32 bits. If data is received with a 64-bit DSN, | |||
value, or as the lower 32 bits. If data is received with a 64-bit DSN, | it <bcp14>MUST</bcp14> be acknowledged with a 64-bit Data ACK. If the D | |||
it MUST be acknowledged with a 64-bit Data ACK. If the DSN received | SN received | |||
is 32 bits, an implementation can choose whether to send a 32-bit or | is 32 bits, an implementation can choose whether to send a 32-bit o | |||
64-bit Data ACK, and an implementation MUST accept either in this situat | r | |||
ion.</t> | 64-bit Data ACK, and an implementation <bcp14>MUST</bcp14> accept either | |||
in this situation.</t> | ||||
<t>The Data ACK proves that the data, and all required MPTCP | <t>The Data ACK proves that the data, and all required MPTCP | |||
signaling, has been received and accepted by the remote end. | signaling, have been received and accepted by the remote end. | |||
One key use of the Data ACK signal is that it is used to indicate | One key use of the Data ACK signal is that it is used to indicate | |||
the left edge of the advertised receive window. As explained in | the left edge of the advertised receive window. As explained in | |||
<xref target="sec_rwin"/>, the receive window is shared by all | <xref target="sec_rwin" format="default"/>, the receive window is shared by all | |||
subflows and is relative to the Data ACK. Because of this, an | subflows and is relative to the Data ACK. Because of this, an | |||
implementation MUST NOT use the RCV.WND field of a TCP segment | implementation <bcp14>MUST NOT</bcp14> use the RCV.WND field of a TCP segment | |||
at the connection level if it does not also carry a DSS option with | at the connection level if it does not also carry a DSS option with | |||
a Data ACK field. Furthermore, | a Data ACK field. Furthermore, | |||
separating the connection-level acknowledgments from the | separating the connection-level acknowledgments from the | |||
subflow level allows processing to be done separately, and | subflow level allows processing to be done separately, and | |||
a receiver has the freedom to drop segments after acknowledgment | a receiver has the freedom to drop segments after acknowledgment | |||
at the subflow level, for example, due to memory constraints | at the subflow level -- for example, due to memory constraints | |||
when many segments arrive out of order.</t> | when many segments arrive out of order.</t> | |||
<t>An MPTCP sender <bcp14>MUST NOT</bcp14> free data from the send buf | ||||
<t>An MPTCP sender MUST NOT free data from the send buffer until | fer until | |||
it has been acknowledged by both a Data ACK received on any subflow | it has been acknowledged by both a Data ACK received on any subflow | |||
and at the subflow level by all subflows on which the data was sent. | and at the subflow level by all subflows on which the data was sent. | |||
The former condition ensures liveness of the | The former condition ensures liveness of the | |||
connection and the latter condition ensures liveness and | connection, and the latter condition ensures liveness and | |||
self-consistence of a subflow when data needs to be | self-consistence of a subflow when data needs to be | |||
retransmitted. | retransmitted. | |||
Note, however, that if some data needs to be retransmitted multiple | Note, however, that if some data needs to be retransmitted multiple | |||
times over a subflow, there is a risk of blocking the sending | times over a subflow, there is a risk of blocking the send | |||
window. In this case, the MPTCP sender can decide to terminate the | window. In this case, the MPTCP sender can decide to terminate the | |||
subflow that is behaving badly by sending a RST, using an appropriate | subflow that is behaving badly by sending a RST, using an appropriate | |||
MP_TCPRST (<xref target="sec_reset"/>) error code.</t> | MP_TCPRST (<xref target="sec_reset" format="default"/>) error code.</t> | |||
<t>The Data ACK <bcp14>MAY</bcp14> be included in all segments; howeve | ||||
<t>The Data ACK MAY be included in all segments; however, optimizations | r, optimizations | |||
SHOULD be considered in more advanced implementations, where the | <bcp14>SHOULD</bcp14> be considered in more advanced implementations, wh | |||
ere the | ||||
Data ACK is present in segments | Data ACK is present in segments | |||
only when the Data ACK value advances, and this behavior MUST | only when the Data ACK value advances, and this behavior <bcp14>MUST</bc | |||
be treated as valid. This behavior ensures the sender buffer | p14> | |||
be treated as valid. This behavior ensures that the send buffer | ||||
is freed, while reducing overhead when the data transfer is | is freed, while reducing overhead when the data transfer is | |||
unidirectional.</t> | unidirectional.</t> | |||
</section> | </section> | |||
<section anchor="sec_close" numbered="true" toc="default"> | ||||
<section title="Closing a Connection" anchor="sec_close"> | <name>Closing a Connection</name> | |||
<t>In regular TCP, a FIN announces the receiver that the sender has no m | <t>In regular TCP, a FIN announces to the receiver that the sender has | |||
ore data to send. | no more data to send. | |||
In order to allow subflows to operate independently and to keep the appearance of TCP over the wire, | In order to allow subflows to operate independently and to keep the appearance of TCP over the wire, | |||
a FIN in MPTCP only affects the subflow on which it is sent. This | a FIN in MPTCP only affects the subflow on which it is sent. This | |||
allows nodes to exercise considerable freedom over which paths are in use at any one time. | allows nodes to exercise considerable freedom over which paths are in use at any one time. | |||
The semantics of a FIN remain as for regular TCP; i.e., it is not until both sides have ACKed | The semantics of a FIN remain as for regular TCP; i.e., it is not until both sides have ACKed | |||
each other's FINs that the subflow is fully closed.</t> | each other's FINs that the subflow is fully closed.</t> | |||
<t>When an application calls close() on a socket, this indicates that it has no more | <t>When an application calls close() on a socket, this indicates that it has no more | |||
data to send; for regular TCP, this would result in a FIN on the connection. For MPTCP, an | data to send; for regular TCP, this would result in a FIN on the connection. For MPTCP, an | |||
equivalent mechanism is needed, and this is referred to as the DATA_FIN.</t> | equivalent mechanism is needed; this is referred to as the DATA_FIN.</t> | |||
<t>A DATA_FIN is an indication that the sender has no more data to sen | ||||
<t>A DATA_FIN is an indication that the sender has no more data to send, | d, and | |||
and | as such it can be used to verify that all data has been successfully rec | |||
as such can be used to verify that all data has been successfully receiv | eived. A DATA_FIN, | |||
ed. A DATA_FIN, | ||||
as with the FIN on a regular TCP connection, is a unidirectional signal. </t> | as with the FIN on a regular TCP connection, is a unidirectional signal. </t> | |||
<t>The DATA_FIN is signaled by setting the "F" flag in the DSS | ||||
<t>The DATA_FIN is signaled by setting the 'F' flag in the Data Sequence | option (<xref target="tcpm_dsn" format="default"/>) | |||
Signal option (<xref target="tcpm_dsn"/>) to 1. A DATA_FIN occupies 1 octet (th | to 1. A DATA_FIN occupies 1 octet (the final octet) of the | |||
e final octet) of the connection-level sequence space. Note that the DATA_FIN is | connection-level sequence space. Note that the | |||
included in the Data-Level Length, but not at the subflow level: for example, a | DATA_FIN is included in the Data-Level Length but not at the subflow | |||
segment with DSN 80, and Data-Level Length 11, with DATA_FIN set, would map 10 | level: for example, a segment with a DSN value of 80 and a | |||
octets from the subflow into data sequence space 80-89, the DATA_FIN is DSN 90; | Data-Level Length of 11, with DATA_FIN set, would map 10 octets from | |||
therefore, this segment including DATA_FIN would be acknowledged with a DATA_ACK | the subflow into data sequence space 80-89, and the DATA_FIN would | |||
of 91.</t> | be DSN 90; therefore, this segment, including DATA_FIN, would be | |||
acknowledged with a DATA_ACK of 91.</t> | ||||
<t>Note that when the DATA_FIN is not attached to a TCP segment containi | <t>Note that when the DATA_FIN is not attached to a TCP segment contai | |||
ng data, the Data Sequence Signal MUST have a subflow sequence number of 0, a Da | ning data, the DSS <bcp14>MUST</bcp14> have a subflow sequence number of 0, a Da | |||
ta-Level Length of 1, and the data sequence number that corresponds with the DAT | ta-Level Length of 1, and the data sequence number that corresponds with the DAT | |||
A_FIN itself. The checksum in this case will only cover the pseudo-header.</t> | A_FIN itself. The checksum in this case will only cover the pseudo-header.</t> | |||
<t>A DATA_FIN has the same semantics and behavior as a regular TCP FIN | ||||
<t>A DATA_FIN has the semantics and behavior as a regular TCP FIN, but a | , but at the connection level. Notably, it is only DATA_ACKed once all data has | |||
t the connection level. Notably, it is only DATA_ACKed once all data has been su | been successfully received at the connection level. Note, therefore, that a DATA | |||
ccessfully received at the connection level. Note, therefore, that a DATA_FIN is | _FIN is decoupled from a subflow FIN. It is only permissible to combine these si | |||
decoupled from a subflow FIN. It is only permissible to combine these signals o | gnals on one subflow if there is no data outstanding on other subflows. Otherwis | |||
n one subflow if there is no data outstanding on other subflows. Otherwise, it m | e, it may be necessary to retransmit data on different subflows. Essentially, a | |||
ay be necessary to retransmit data on different subflows. Essentially, a host MU | host <bcp14>MUST NOT</bcp14> close all functioning subflows unless it is safe to | |||
ST NOT close all functioning subflows unless it is safe to do so, i.e., until al | do so, i.e., until all outstanding data has been DATA_ACKed or until the segmen | |||
l outstanding data has been DATA_ACKed, or until the segment with the DATA_FIN f | t with the DATA_FIN flag set is the only outstanding segment.</t> | |||
lag set is the only outstanding segment.</t> | <t>Once a DATA_FIN has been acknowledged, all remaining subflows | |||
<bcp14>MUST</bcp14> be closed with standard FIN exchanges. Both | ||||
<t>Once a DATA_FIN has been acknowledged, all remaining subflows MUST be | hosts <bcp14>SHOULD</bcp14> send FINs on all subflows, as a courtesy, | |||
closed with standard FIN exchanges. Both hosts SHOULD send FINs on all subflows | to allow middleboxes to clean up state even if an individual subflow | |||
, as a courtesy to allow middleboxes to clean up state even if an individual sub | has failed. Reducing the timeouts (MSL) on subflows at end hosts after | |||
flow has failed. It is also encouraged to reduce the timeouts (Maximum Segment L | receiving a | |||
ifetime) on subflows at end hosts after receiving a DATA_FIN. In particular, any | DATA_FIN is also encouraged. In particular, any subflows where there i | |||
subflows where there is still outstanding data queued (which has been retransmi | s still | |||
tted on other subflows in order to get the DATA_FIN acknowledged) MAY be closed | outstanding data queued (which has been retransmitted on other | |||
with a RST with MP_TCPRST (<xref target="sec_reset"/>) error code for "too much | subflows in order to get the DATA_FIN acknowledged) | |||
outstanding data".</t> | <bcp14>MAY</bcp14> be closed with a RST with an MP_TCPRST (<xref targe | |||
t="sec_reset" format="default"/>) error code for "too much outstanding data".</t | ||||
<t>A connection is considered closed once both hosts' DATA_FINs have bee | > | |||
n acknowledged by DATA_ACKs.</t> | <t>A connection is considered closed once both hosts' DATA_FINs have b | |||
een acknowledged by DATA_ACKs.</t> | ||||
<t>As specified above, a standard TCP FIN on an individual subflow only | <t>As specified above, a standard TCP FIN on an individual subflow | |||
shuts down the subflow on which it was sent. If all subflows have been closed wi | only shuts down the subflow on which it was sent. If all subflows | |||
th a FIN exchange, but no DATA_FIN has been received and acknowledged, the MPTCP | have been closed with a FIN exchange but no DATA_FIN has been | |||
connection is treated as closed only after a timeout. This implies that an impl | received and acknowledged, the MPTCP connection is treated as closed | |||
ementation will have TIME_WAIT states at both the subflow and connection levels | only after a timeout. This implies that an implementation will have | |||
(see <xref target="app_fsm"/>). This permits "break-before-make" scenarios where | TIME_WAIT states at both the subflow level and the connection level (s | |||
connectivity is lost on all subflows before a new one can be re-established.</t | ee <xref target="app_fsm" format="default"/>). This permits "break-before-make" | |||
> | scenarios where connectivity is lost on all subflows before a new one can be re& | |||
</section> | #8209;established.</t> | |||
</section> | ||||
<section title="Receiver Considerations" anchor="sec_rwin"> | <section anchor="sec_rwin" numbered="true" toc="default"> | |||
<name>Receiver Considerations</name> | ||||
<t>Regular TCP advertises a receive window in each packet, telling the sender how much data the receiver | <t>Regular TCP advertises a receive window in each packet, telling the sender how much data the receiver | |||
is willing to accept past the cumulative ack. The receive window is used to implement flow control, throttling | is willing to accept past the cumulative ACK. The receive window is used to implement flow control, throttling | |||
down fast senders when receivers cannot keep up. </t> | down fast senders when receivers cannot keep up. </t> | |||
<t>MPTCP also uses a unique receive window, shared between the subflows. The idea is to allow any | <t>MPTCP also uses a unique receive window, shared between the subflows. The idea is to allow any | |||
subflow to send data as long as the receiver is willing to accept it. The altern | subflow to send data as long as the receiver is willing to accept it. The | |||
ative, maintaining per subflow | alternative -- maintaining per-subflow | |||
receive windows, could end up stalling some subflows while others would not use | receive windows -- could end up stalling some subflows while others would not us | |||
up their window.</t> | e up their window.</t> | |||
<t>The receive window is relative to the DATA_ACK. As in TCP, a receiv | ||||
<t>The receive window is relative to the DATA_ACK. As in TCP, a receiv | er <bcp14>MUST NOT</bcp14> shrink the right edge of the receive window (i.e., DA | |||
er MUST NOT shrink the right edge of the receive window (i.e., DATA_ACK + receiv | TA_ACK + receive window). The receiver will | |||
e window). The receiver will | ||||
use the data sequence number to tell if a packet should be accepted at the connection level.</t> | use the data sequence number to tell if a packet should be accepted at the connection level.</t> | |||
<t>When deciding to accept packets at the subflow level, regular TCP c | ||||
<t>When deciding to accept packets at subflow level, regular TCP check | hecks | |||
s | ||||
the sequence number in the packet against the allowed receive window. | the sequence number in the packet against the allowed receive window. | |||
With multipath, such a check is done using only the connection-level window. A s | With MPTCP, such a check is done using only the connection-level window. A sanit | |||
anity | y | |||
check SHOULD be performed at subflow level to ensure that the subflow and mapped | check <bcp14>SHOULD</bcp14> be performed at the subflow level to ensure that the | |||
sequence | subflow and mapped sequence | |||
numbers meet the following test: SSN - SUBFLOW_ACK &lt;= DSN - DATA_ACK, where SSN is the subflow sequence number of the received packet and SUBFLOW_ACK is the RCV.NXT (next expected sequence number) of the subflow (with the equivalent connection-level definitions for DSN and DATA_ACK).</t> | numbers meet the following test: SSN - SUBFLOW_ACK &lt;= DSN - DATA_ACK, where SSN is the subflow sequence number of the received packet and SUBFLOW_ACK is the RCV.NXT (next expected sequence number) of the subflow (with the equivalent connection-level definitions for DSN and DATA_ACK).</t> | |||
<t>In regular TCP, once a segment is deemed in-window, it is put in ei | ||||
<t>In regular TCP, once a segment is deemed in-window, it is put either | ther | |||
in the in-order receive queue or in the out-of-order queue. | the in-order receive queue or the out-of-order queue. | |||
In Multipath TCP, the same happens but at the connection level: a segment | In Multipath TCP, the same thing happens, but at the connection level: a segment | |||
is placed in the connection level in-order or out-of-order queue if | is placed in the connection-level in-order or out-of-order queue if | |||
it is in-window at both connection and subflow levels. | it is in-window at both the connection level and the subflow level. | |||
The stack still has to remember, for each subflow, which segments were | The stack still has to remember, for each subflow, which segments were | |||
received successfully so that it can ACK them at subflow level appropriately. | received successfully so that it can ACK them at the subflow level appropriately | |||
Typically, this will be implemented by keeping per subflow out-of-order | . | |||
queues (containing only message headers, not the payloads) and remembering | Typically, this will be implemented by keeping per-subflow out-of-order | |||
queues (containing only message headers -- not the payloads) and remembering | ||||
the value of the cumulative ACK. | the value of the cumulative ACK. | |||
</t> | </t> | |||
<t>It is important for implementers to understand how large | <t>It is important for implementers to understand how large | |||
a receiver buffer is appropriate. The lower bound for full | a receive buffer is appropriate. The lower bound for full | |||
network utilization is the maximum bandwidth-delay product | network utilization is the maximum bandwidth-delay product | |||
of any one of the paths. However, this might be insufficient | of any one of the paths. However, this might be insufficient | |||
when a packet is lost on a slower subflow and needs to be | when a packet is lost on a slower subflow and needs to be | |||
retransmitted (see <xref target="sec_retransmit"/>). A tight | retransmitted (see <xref target="sec_retransmit" format="default"/>). A tight | |||
upper bound would be the maximum round-trip time (RTT) of any path multiplied | upper bound would be the maximum round-trip time (RTT) of any path multiplied | |||
by the total bandwidth available across all paths. This | by the total bandwidth available across all paths. This | |||
permits all subflows to continue at full speed while a | permits all subflows to continue at full speed while a | |||
packet is fast-retransmitted on the maximum RTT path. Even | packet is fast-retransmitted on the maximum RTT path. Even | |||
this might be insufficient to maintain full performance in | this might be insufficient to maintain full performance in | |||
the event of a retransmit timeout on the maximum RTT path. | the event of a retransmit timeout on the maximum RTT path. | |||
It is for future study to determine the relationship between | Determining the relationship between | |||
retransmission strategies and receive buffer sizing.</t> | retransmission strategies and receive buffer sizing is left for future | |||
study.</t> | ||||
</section> | </section> | |||
<section anchor="sec_sender" numbered="true" toc="default"> | ||||
<section title="Sender Considerations" anchor="sec_sender"> | <name>Sender Considerations</name> | |||
<t>The sender remembers receiver window advertisements from the receiv | <t>The sender remembers receive window advertisements from the | |||
er. It should only update its local receive window values when the largest seque | receiver. It should only update its local receive window values when | |||
nce number allowed (i.e., DATA_ACK + receive window) increases, on the receipt o | the largest sequence number allowed (i.e., DATA_ACK + receive | |||
f a DATA_ACK. This is important to allow using paths with different RTTs, and th | window) increases on the receipt of a DATA_ACK. This is important | |||
us different feedback loops. </t> | for allowing the use of paths with different RTTs and thus different f | |||
eedback loops. </t> | ||||
<t>MPTCP uses a single receive window across all subflows, and if the | <t>MPTCP uses a single receive window across all subflows, and if | |||
receive window was guaranteed to be unchanged end-to-end, a host could always re | the receive window was guaranteed to be unchanged end to end, a host c | |||
ad the most recent receive window value. However, some classes of middleboxes ma | ould always read the most recent receive window value. However, some classes of | |||
y alter the TCP-level receive window. Typically, these will | middleboxes may alter the TCP-level receive window. Typically, these will | |||
shrink the offered window, although for short periods of time it may be possible for the window to be larger (however, | shrink the offered window, although for short periods of time it may be possible for the window to be larger (however, | |||
note that this would not continue for long periods since ultimately the middlebox must keep up with | note that this would not continue for long periods, since ultimately the middlebox must keep up with | |||
delivering data to the receiver). Therefore, if receive window sizes differ on multiple subflows, | delivering data to the receiver). Therefore, if receive window sizes differ on multiple subflows, | |||
when sending data MPTCP SHOULD take the largest of the most recent window sizes as the one to use in calculations. | when sending data MPTCP <bcp14>SHOULD</bcp14> take the largest of the most recent window sizes as the one to use in calculations. | |||
This rule is implicit in the requirement not to reduce the right edge of the window.</t> | This rule is implicit in the requirement not to reduce the right edge of the window.</t> | |||
<t>The sender <bcp14>MUST</bcp14> also remember the receive windows ad | ||||
<t>The sender MUST also remember the receive windows advertised by eac | vertised by each subflow. | |||
h subflow. | ||||
The allowed window for subflow i is (ack_i, ack_i + rcv_wnd_i), where ack_i is the | The allowed window for subflow i is (ack_i, ack_i + rcv_wnd_i), where ack_i is the | |||
subflow-level cumulative ACK of subflow i. This ensures data will not be sent to a middlebox | subflow-level cumulative ACK of subflow i. This ensures that data will not be sent to a middlebox | |||
unless there is enough buffering for the data. </t> | unless there is enough buffering for the data. </t> | |||
<t>Putting the two rules together, we get the following: a sender is allowed to send | <t>Putting the two rules together, we get the following: a sender is allowed to send | |||
data segments with data-level sequence numbers between (DATA_ACK, DATA_ACK + receive_window). | data segments with data-level sequence numbers between (DATA_ACK, DATA_ACK + receive_window). | |||
Each of these segments will be mapped onto subflows, as long as subflow sequence numbers | Each of these segments will be mapped onto subflows, as long as subflow sequence numbers | |||
are in the allowed windows for those subflows. Note that subflow sequence numbers do not | are in the allowed windows for those subflows. Note that subflow sequence numbers do not | |||
generally affect flow control if the same receive window is advertised across all subflows. | generally affect flow control if the same receive window is advertised across all subflows. | |||
They will perform flow control for those subflows with a smaller advertised receive window. | They will perform flow control for those subflows with a smaller advertised receive window. | |||
</t> | </t> | |||
<t>The send buffer <bcp14>MUST</bcp14>, at a minimum, be as big as the | ||||
<t>The send buffer MUST, at a minimum, be as big as the receive buffer | receive buffer, to enable the sender to reach maximum throughput.</t> | |||
, to enable the sender to reach maximum throughput.</t> | ||||
</section> | </section> | |||
<section anchor="sec_retransmit" numbered="true" toc="default"> | ||||
<section title="Reliability and Retransmissions" anchor="sec_retransmit" | <name>Reliability and Retransmissions</name> | |||
> | <t>The Data Sequence Mapping allows senders to resend data with the | |||
same data sequence number on a different subflow. When doing this, a | ||||
<t>The data sequence mapping allows senders to resend data with the sa | host <bcp14>MUST</bcp14> still retransmit the original data on the | |||
me data sequence number on a different subflow. When doing this, a host MUST sti | original subflow, in order to preserve the subflow's integrity | |||
ll retransmit the original data on the original subflow, in order to preserve th | (middleboxes could replay old data and&wj;/or could reject holes in | |||
e subflow integrity (middleboxes could replay old data, and/or could reject hole | subflows), and a receiver will ignore these retransmissions. While | |||
s in subflows), and a receiver will ignore these retransmissions. While this is | this is clearly suboptimal, for compatibility reasons this is | |||
clearly suboptimal, for compatibility reasons this is sensible behavior. Optimiz | sensible behavior. Optimizations could be negotiated in future | |||
ations could be negotiated in future versions of this protocol. Note also that t | versions of this protocol. Note also that this property would also per | |||
his property would also permit a sender to always send the same data, with the s | mit a sender to always send the same data, with the same data sequence number, o | |||
ame data sequence number, on multiple subflows, if desired for reliability reaso | n multiple subflows, if desired for reliability reasons.</t> | |||
ns.</t> | ||||
<t>This protocol specification does not mandate any mechanisms for handling retransmissions, and much will be dependent upon local policy | <t>This protocol specification does not mandate any mechanisms for handling retransmissions, and much will be dependent upon local policy | |||
(as discussed in <xref target="sec_policy"/>). One can imagine aggressive connec | (as discussed in <xref target="sec_policy" format="default"/>). One can imagine | |||
tion-level retransmissions policies where every packet lost at subflow level is | aggressive connection-level retransmission policies where every packet lost at t | |||
retransmitted on | he subflow level is retransmitted on | |||
a different subflow (hence, wasting bandwidth but possibly reducing application- | a different subflow (hence wasting bandwidth but possibly reducing application-t | |||
to-application delays), or conservative retransmission policies where connection | o-application delays) or conservative retransmission policies where connection-l | |||
-level retransmits | evel retransmissions | |||
are only used after a few subflow-level retransmission timeouts occur.</t> | are only used after a few subflow-level retransmission timeouts occur.</t> | |||
<t>It is envisaged that a standard connection-level retransmission mechanism | <t>It is envisaged that a standard connection-level retransmission mechanism | |||
would be implemented around a connection-level data queue: all segments that haven't | would be implemented around a connection-level data queue: all segments that haven't | |||
been DATA_ACKed are stored. A timer is set when | been DATA_ACKed are stored. A timer is set when | |||
the head of the connection-level is ACKed at subflow level but its corresponding | the head of the connection level is ACKed at the subflow level but is not DATA_A | |||
data | CKed at the data level. This timer will guard against retransmission failures | |||
is not ACKed at data level. This timer will guard against failures in retransmis | ||||
sion | ||||
by middleboxes that proactively ACK data.</t> | by middleboxes that proactively ACK data.</t> | |||
<t>The sender <bcp14>MUST</bcp14> keep data in its send buffer as | ||||
<t>The sender MUST keep data in its send buffer as long as the data ha | long as the data has not been acknowledged both (1) at the | |||
s not been acknowledged at both connection level and on all subflows on which it | connection level and (2) on all subflows on which it | |||
has been sent. In this way, the sender can always retransmit the data if needed, on the same subflow or on a different one. A special case is when a subflow fails: the sender | has been sent. In this way, the sender can always retransmit the data if needed, on the same subflow or on a different one. A special case is when a subflow fails: the sender | |||
will typically resend the data on other working subflows after a timeout, and wi | will typically resend the data on other working subflows after a timeout and wil | |||
ll keep trying to retransmit the data | l keep trying to retransmit the data | |||
on the failed subflow too. The sender will declare the subflow failed after a pr | on the failed subflow too. The sender will declare the subflow failed after a pr | |||
edefined upper bound on retransmissions is reached (which MAY be lower than the | edefined upper bound on retransmissions is reached (which <bcp14>MAY</bcp14> be | |||
usual TCP limits of the Maximum Segment Life), or on the receipt of an ICMP erro | lower than the usual TCP limits of the MSL) or on the receipt of an ICMP error, | |||
r, and only then delete the outstanding data segments. </t> | and only then delete the outstanding data segments. </t> | |||
<t>If multiple retransmissions that indicate that a | ||||
<t>If multiple retransmissions are triggered that indicate that a subf | subflow is performing badly are triggered, this <bcp14>MAY</bcp14> lea | |||
low performs badly, this MAY lead to a host resetting the subflow with a RST. Ho | d to a host resetting the subflow with a RST. However, additional research is re | |||
wever, additional research is required to understand the heuristics of how and w | quired to understand the heuristics of how and when to reset underperforming sub | |||
hen to reset underperforming subflows. For example, a highly asymmetric path may | flows. For example, a highly asymmetric path may be misdiagnosed as underperform | |||
be misdiagnosed as underperforming. A RST for this purpose SHOULD be accompanie | ing. A RST for this purpose <bcp14>SHOULD</bcp14> be accompanied by an "Unaccept | |||
d with an "Unacceptable performance" MP_TCPRST option (<xref target="sec_reset"/ | able performance" MP_TCPRST option (<xref target="sec_reset" format="default"/>) | |||
>).</t> | .</t> | |||
</section> | </section> | |||
<section anchor="sec_cc" numbered="true" toc="default"> | ||||
<section title="Congestion Control Considerations" anchor="sec_cc"> | <name>Congestion Control Considerations</name> | |||
<t>Different subflows in an MPTCP connection have different congestion windows. | <t>Different subflows in an MPTCP connection have different congestion windows. | |||
To achieve fairness at bottlenecks and resource pooling, it is necessary to couple the | To achieve fairness at bottlenecks and resource pooling, it is necessary to couple the | |||
congestion windows in use on each subflow, in order to push most traffic to uncongested links. | congestion windows in use on each subflow, in order to push most traffic to uncongested links. | |||
One algorithm for achieving this is presented in <xref target="RFC6356"/>; | One algorithm for achieving this is presented in <xref target="RFC6356" format="default"/>; | |||
the algorithm does not achieve perfect resource pooling but is "safe" in that it is readily | the algorithm does not achieve perfect resource pooling but is "safe" in that it is readily | |||
deployable in the current Internet. By this, we mean that it does not take up more capacity | deployable in the current Internet. By this we mean that it does not take up more capacity | |||
on any one path than if it was a single path flow using only that route, so this ensures | on any one path than if it was a single path flow using only that route, so this ensures | |||
fair coexistence with single-path TCP at shared bottlenecks.</t> | fair coexistence with single-path TCP at shared bottlenecks.</t> | |||
<t>It is foreseeable that different congestion controllers will be | ||||
<t>It is foreseeable that different congestion controllers will be imp | implemented for MPTCP, each aiming to achieve different properties | |||
lemented for MPTCP, each aiming to achieve different properties in the resource | in the resource pooling / fairness / stability design space, as well a | |||
pooling/fairness/stability design space, as well as those for achieving differen | s those for achieving different properties in quality of service, reliability, a | |||
t properties in quality of service, reliability, and resilience.</t> | nd resilience.</t> | |||
<t>Regardless of the algorithm used, | <t>Regardless of the algorithm used, | |||
the design of the MPTCP protocol aims to provide the congestion control implemen | the design of MPTCP aims to provide the congestion control | |||
tations sufficient information | implementations with sufficient information | |||
to take the right decisions; this information includes, for each subflow, which | to make the right decisions; this information includes, for each subflow, which | |||
packets were lost and when. </t> | packets were lost and when. </t> | |||
</section> | </section> | |||
<section anchor="sec_policy" numbered="true" toc="default"> | ||||
<section title="Subflow Policy" anchor="sec_policy"> | <name>Subflow Policy</name> | |||
<t>Within a local MPTCP implementation, a host may use any local policy it wishes to decide how to share the traffic to be sent over the available paths.</t> | <t>Within a local MPTCP implementation, a host may use any local policy it wishes to decide how to share the traffic to be sent over the available paths.</t> | |||
<t>In the typical use case, where the goal is to maximize throughput, | <t>In the typical use case, where the goal is to maximize throughput, | |||
all available paths will be used simultaneously for data transfer, using coupled | all available paths will be used simultaneously for data transfer, using coupled | |||
congestion control as described in <xref target="RFC6356"/>. It is expected, ho | congestion control as described in <xref target="RFC6356" format="default"/>. I | |||
wever, that other use cases will appear.</t> | t is expected, however, that other use cases will appear.</t> | |||
<t>For instance, a possibility is an 'all-or-nothing' approach, i.e., | <t>For instance, one possibility is an "all-or-nothing" approach, i.e. | |||
have a second path ready for use in the event of | , have a second path ready for use in the event of | |||
failure of the first path, but alternatives could include entirely saturating one path before using an additional | failure of the first path, but alternatives could include entirely saturating one path before using an additional | |||
path (the 'overflow' case). Such choices would be most likely based on the monetary cost of links, but may also be | path (the "overflow" case). Such choices would be most likely based on the monetary cost of links but may also be | |||
based on properties such as the delay or jitter of links, where stability (of delay or bandwidth) is more important than throughput. Application | based on properties such as the delay or jitter of links, where stability (of delay or bandwidth) is more important than throughput. Application | |||
requirements such as these are discussed in detail in <xref target="RFC6897"/>.</t> | requirements such as these are discussed in detail in <xref target="RFC6897" format="default"/>.</t> | |||
<t>The ability to make effective choices at the sender requires full knowledge of the path "cost", which | <t>The ability to make effective choices at the sender requires full knowledge of the path "cost", which | |||
is unlikely to be the case. It would be desirable for a receiver to be able to signal their own preferences for paths, | is unlikely to be the case. It would be desirable for a receiver to be able to signal their own preferences for paths, | |||
since they will often be the multihomed party, and may have to pay for metered i | since they will often be the multihomed party and may have to pay for metered in | |||
ncoming bandwidth.</t> | coming bandwidth.</t> | |||
<t>To enable this, the MP_JOIN option (see <xref target="sec_join"/>) | <t>To enable this behavior, the MP_JOIN option (see <xref | |||
contains the 'B' bit, which allows a host to indicate to its peer that this path | target="sec_join" format="default"/>) contains the "B" bit, | |||
should be treated as a backup path to use only in the event of failure of other | which allows a host to indicate to its peer that this path should be | |||
working subflows (i.e., a subflow where the receiver has indicated B=1 SHOULD N | treated as a backup path to use only in the event of failure of | |||
OT be used to send data unless there are no usable subflows where B=0).</t> | other working subflows (i.e., a subflow where the receiver has | |||
<t>In the event that the available set of paths changes, a host may wi | indicated that B=1 <bcp14>SHOULD NOT</bcp14> be used to send data unle | |||
sh to signal a change in priority of subflows to the peer (e.g., a subflow that | ss there are no usable subflows where B=0).</t> | |||
was previously set as backup should now take priority over all remaining subflow | <t>In the event that the available set of paths changes, a host may | |||
s). Therefore, the MP_PRIO option, shown in <xref target="tcpm_prio"/>, can be u | wish to signal a change in priority of subflows to the peer (e.g., a | |||
sed to change the 'B' flag of the subflow on which it is sent.</t> | subflow that was previously set as a backup should now take priority | |||
<t>Another use of the MP_PRIO option is to set the 'B' flag on a subfl | over all remaining subflows). Therefore, the MP_PRIO option, shown | |||
ow to cleanly retire its use before closing it and removing it with REMOVE_ADDR | in <xref target="tcpm_prio" format="default"/>, can be used to | |||
<xref target="sec_remove_addr"/>, for example to support make-before-break sessi | change the "B" flag of the subflow on which it is sent.</t> | |||
on continuity, where new subflows are added before the previously used ones are | <figure anchor="tcpm_prio"> | |||
closed.</t> | <name>Change Subflow Priority (MP_PRIO) Option</name> | |||
<?rfc needLines='8'?> | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
<figure align="center" anchor="tcpm_prio" title="Change Subflow Priori | 1 2 3 | |||
ty (MP_PRIO) Option"> | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
<artwork align="left"><![CDATA[ | +---------------+---------------+-------+-----+-+ | |||
1 2 3 | | Kind | Length |Subtype|(rsv)|B| | |||
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | +---------------+---------------+-------+-----+-+ ]]></artwork> | |||
+---------------+---------------+-------+-----+-+ | ||||
| Kind | Length |Subtype|(rsv)|B| | ||||
+---------------+---------------+-------+-----+-+ | ||||
]]></artwork> | ||||
</figure> | </figure> | |||
<t>Another use of the MP_PRIO option is to set the "B" flag on a | ||||
<t>It should be noted that the backup flag is a request from a data receiver to | subflow to cleanly "retire" its use before closing it and removing it | |||
a data sender only, and the data sender SHOULD adhere to these requests. A host | with REMOVE_ADDR (<xref target="sec_remove_addr" format="default"/>) - | |||
cannot assume that the data sender will do so, however, since local policies -- | - for example, to support make-before-break session continuity, where new subflo | |||
or technical difficulties -- may override MP_PRIO requests. Note also that this | ws are added before the previously used subflows are closed.</t> | |||
signal applies to a single direction, and so the sender of this option could cho | <t>It should be noted that the backup flag is a request from a data re | |||
ose to continue using the subflow to send data even if it has signaled B=1 to th | ceiver to a data sender only, and the data sender <bcp14>SHOULD</bcp14> adhere t | |||
e other host.</t> | o these requests. A host cannot assume that the data sender will do so, however, | |||
since local policies -- or technical difficulties -- may override MP_PRIO reque | ||||
sts. Note also that this signal applies to a single direction, and so the sender | ||||
of this option could choose to continue using the subflow to send data even if | ||||
it has signaled B=1 to the other host.</t> | ||||
</section> | </section> | |||
</section> | </section> | |||
<section anchor="sec_pm" numbered="true" toc="default"> | ||||
<section title="Address Knowledge Exchange (Path Management)" anchor="sec_ | <name>Address Knowledge Exchange (Path Management)</name> | |||
pm"> | <t>We use the term "path management" to refer to the exchange of informa | |||
<t>We use the term "path management" to refer to the exchange of informa | tion about additional paths between hosts, which in this design is managed by mu | |||
tion about additional paths between hosts, which in this design is managed by mu | ltiple addresses at hosts. For more details regarding the architectural thinking | |||
ltiple addresses at hosts. For more detail of the architectural thinking behind | behind this design, see the MPTCP architecture document <xref target="RFC6182" | |||
this design, see the MPTCP Architecture document <xref target="RFC6182"/>.</t> | format="default"/>.</t> | |||
<t>This design makes use of two methods of sharing such | <t>This design makes use of two methods of sharing such | |||
information, and both can be used on a connection. | information, and both can be used on a connection. | |||
The first is the direct | The first is the direct | |||
setup of new subflows, already described in | setup of new subflows (described in | |||
<xref target="sec_join"/>, where the initiator has an | <xref target="sec_join" format="default"/>), where the initiator has an | |||
additional address. The second method, described in the | additional address. The second method (described in the | |||
following subsections, signals addresses explicitly to the | following subsections) signals addresses explicitly to the | |||
other host to allow it to initiate new subflows. The | other host to allow it to initiate new subflows. The | |||
two mechanisms are complementary: the first is implicit and | two mechanisms are complementary: the first is implicit and | |||
simple, while the explicit is more complex but is more | simple, while the second (explicit) is more complex but is more | |||
robust. Together, the mechanisms allow addresses to change in | robust. Together, these mechanisms allow addresses to change in | |||
flight (and thus support operation through NATs, since the | flight (and thus support operation through NATs, since the | |||
source address need not be known), and also allow the | source address need not be known); they also allow the | |||
signaling of previously unknown addresses, and of addresses | signaling of previously unknown addresses and of addresses | |||
belonging to other address families (e.g., both IPv4 and IPv6).</t> | belonging to other address families (e.g., both IPv4 and IPv6).</t> | |||
<t>Here is an example of typical operation of the protocol: | <t>Here is an example of typical operation of the protocol: | |||
<list style="symbols"> | </t> | |||
<t>An MPTCP connection is initially set up between address/port A1 o | <ul spacing="normal"> | |||
f Host A | <li>An MPTCP connection is initially set up between address&wj;/port A | |||
and address/port B1 of Host B. If Host A is multihomed and | 1 of Host A | |||
and address&wj;/port B1 of Host B. If Host A is multihomed and | ||||
multiaddressed, it can start an additional subflow from | multiaddressed, it can start an additional subflow from | |||
its address A2 to B1, by sending a SYN with a Join | its address A2 to B1, by sending a SYN with an MP_JOIN | |||
option from A2 to B1, using B's previously declared | option from A2 to B1, using B's previously declared | |||
token for this connection. Alternatively, if B is | token for this connection. Alternatively, if B is | |||
multihomed, it can try to set up a new subflow from B2 to | multihomed, it can try to set up a new subflow from B2 to | |||
A1, using A's previously declared token. In either | A1, using A's previously declared token. In either | |||
case, the SYN will be sent to the port already in use | case, the SYN will be sent to the port already in use | |||
for the original subflow on the receiving host.</t> | for the original subflow on the receiving host.</li> | |||
<li>Simultaneously (or after a timeout), an ADD_ADDR option | ||||
<t>Simultaneously (or after a timeout), an ADD_ADDR option | (<xref target="sec_add_address" format="default"/>) is sent on an existing subfl | |||
(<xref target="sec_add_address"/>) is sent on an existing subflow, informing | ow, informing | |||
the receiver of the sender's alternative address(es). The recipient can use | the receiver of the sender's alternative address(es). The recipient can use | |||
this information to open a new subflow to the sender's additional address. | this information to open a new subflow to the sender's additional address(es). | |||
In our example, A will send ADD_ADDR option informing B of address/port A2. | In our example, A will send the ADD_ADDR option informing B of address&wj;/port | |||
The mix of using the SYN-based option and the ADD_ADDR option, including | A2. | |||
timeouts, is implementation specific and can be tailored to agree with local pol | The mix of using the SYN‑based option and the ADD_ADDR option, including | |||
icy.</t> | timeouts, is implementation specific and can be tailored to agree with local pol | |||
icy.</li> | ||||
<t>If subflow A2-B1 is successfully set up, Host B can use the Addre | <li>If subflow A2-B1 is successfully set up, Host B can use the Addres | |||
ss ID in | s ID in | |||
the Join option to correlate this with the ADD_ADDR option that will also arrive | the MP_JOIN option to correlate this source address with the ADD_ADDR option tha | |||
on | t will also arrive on | |||
an existing subflow; now B knows not to open A2-B1, ignoring the ADD_ADDR. | an existing subflow; now B knows not to open A2-B1, ignoring the ADD_ADDR. | |||
Otherwise, if B has not received the A2-B1 MP_JOIN SYN but received the ADD_ADDR, | Otherwise, if B has not received the A2-B1 MP_JOIN SYN but received the ADD_ADDR, | |||
it can try to initiate a new subflow from one or more of its addresses to address | it can try to initiate a new subflow from one or more of its addresses to address | |||
A2. This permits new sessions to be opened if one host is behind a NAT.</t> | A2. This permits new sessions to be opened if one host is behind a NAT.</li> | |||
</list> | </ul> | |||
<t> | ||||
Other ways of using the two signaling mechanisms are possible; for instance, | Other ways of using the two signaling mechanisms are possible; for instance, | |||
signaling addresses in other address families can only be done explicitly using | signaling addresses in other address families can only be done explicitly | |||
the Add Address option. | using the Add Address (ADD_ADDR) option. | |||
</t> | </t> | |||
<section anchor="sec_add_address" numbered="true" toc="default"> | ||||
<section title="Address Advertisement" anchor="sec_add_address"> | <name>Address Advertisement</name> | |||
<t>The Add Address (ADD_ADDR) MPTCP option announces additional addresse | <t>The ADD_ADDR MPTCP option announces additional addresses (and, opti | |||
s (and optionally, ports) on which a | onally, ports) on which a | |||
host can be reached (<xref target="tcpm_address"/>). | host can be reached (<xref target="tcpm_address" format="default"/>). | |||
This option can be used at any time during a connection, depending on when the | This option can be used at any time during a connection, depending on when the | |||
sender wishes to enable multiple paths and/or when paths become available. As wi | sender wishes to enable multiple paths and&wj;/or when paths become available. A | |||
th all MPTCP | s with all MPTCP | |||
signals, the receiver MUST undertake standard TCP validity checks, e.g. <xref ta | signals, the receiver <bcp14>MUST</bcp14> undertake standard TCP validity | |||
rget="RFC5961"/>, before acting upon it.</t> | checks, e.g., per <xref target="RFC5961" format="default"/>, before | |||
acting upon it.</t> | ||||
<t>Every address has an Address ID that can be used for uniquely identif | <figure anchor="tcpm_address"> | |||
ying the address within a connection for address removal. The Address ID is also | <name>Add Address (ADD_ADDR) Option</name> | |||
used to identify MP_JOIN options (see <xref target="sec_join"/>) relating to | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
the same address, even when address translators are in use. The Address ID MUST | 1 2 3 | |||
uniquely | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
identify the address for the sender of the option (within the scope of the conne | +---------------+---------------+-------+-------+---------------+ | |||
ction), but the mechanism for | | Kind | Length |Subtype|(rsv)|E| Address ID | | |||
allocating such IDs is implementation specific.</t> | +---------------+---------------+-------+-------+---------------+ | |||
| Address (IPv4: 4 octets / IPv6: 16 octets) | | ||||
+-------------------------------+-------------------------------+ | ||||
| Port (2 octets, optional) | | | ||||
+-------------------------------+ | | ||||
| Truncated HMAC (8 octets, if E=0) | | ||||
| +-------------------------------+ | ||||
| | | ||||
+-------------------------------+ ]]></artwork> | ||||
</figure> | ||||
<t>All address IDs learned via either MP_JOIN or ADD_ADDR | <t>Every address has an Address ID that can be used for uniquely ident | |||
SHOULD be stored by the receiver in a data structure that gathers all th | ifying the address within a connection for address removal. The Address ID is al | |||
e Address ID | so | |||
to address mappings for a connection (identified by a token pair). In th | used to identify MP_JOIN options (see <xref target="sec_join" format="default"/> | |||
is way, there is | ) relating to | |||
a stored mapping between Address ID, observed source address, and token | the same address, even when address translators are in use. The Address ID <bcp1 | |||
pair for | 4>MUST</bcp14> uniquely | |||
identify the address for the sender of the option (within the scope of the conne | ||||
ction); the mechanism for | ||||
allocating such IDs is implementation specific.</t> | ||||
<t>All Address IDs learned via either MP_JOIN or ADD_ADDR | ||||
<bcp14>SHOULD</bcp14> be stored by the receiver in a data structure | ||||
that gathers all the Address-ID-to-address mappings for a connection | ||||
(identified by a token pair). In this way, there is | ||||
a stored mapping between the Address ID, observed source address, and to | ||||
ken pair for | ||||
future processing of control information for a connection. Note that an implementation | future processing of control information for a connection. Note that an implementation | |||
MAY discard incoming address advertisements at will, for example, for av | <bcp14>MAY</bcp14> discard incoming address advertisements at will -- fo | |||
oiding updating | r example, to avoid updating | |||
mapping state, or because advertised addresses are of no use to it (for | mapping state or because advertised addresses are of no use to it (for | |||
example, IPv6 addresses when it has IPv4 only). Therefore, a host MUST t | example, IPv6 addresses when it has IPv4 only). Therefore, a host <bcp14 | |||
reat address | >MUST</bcp14> treat address | |||
advertisements as soft state, and it MAY choose to refresh advertisement | advertisements as soft state, and it <bcp14>MAY</bcp14> choose to refres | |||
s periodically. | h advertisements periodically. | |||
Note also that an implementation MAY choose to cache these address adver | Note also that an implementation <bcp14>MAY</bcp14> choose to cache thes | |||
tisements even | e address advertisements even | |||
if they are not currently relevant but may be relevant in the future, such as IPv4 | if they are not currently relevant but may be relevant in the future, such as IPv4 | |||
addresses when IPv6 connectivity is available but IPv4 is awaiting DHCP. </t> | addresses when IPv6 connectivity is available but IPv4 is awaiting DHCP. </t> | |||
<t>This option is shown in <xref target="tcpm_address" format="default | ||||
<t>This option is shown in <xref target="tcpm_address"/>. The illustrati | "/>. The illustration is sized for | |||
on is sized for | IPv4 addresses. For IPv6, the length of the address will be 16 octe | |||
IPv4 addresses. For IPv6, the length of the address will be 16 octets (i | ts (instead of 4).</t> | |||
nstead of 4).</t> | <t>The 2 octets that specify the TCP port number to use are optional, | |||
and their presence | ||||
<t>The 2 octets that specify the TCP port number to use are optional and | ||||
their presence | ||||
can be inferred from the length of the option. Although it is expected that the majority of | can be inferred from the length of the option. Although it is expected that the majority of | |||
use cases will use the same port pairs as used for the initial subflow (e.g., port | use cases will use the same port pairs as those used for the initial subflow (e.g., port | |||
80 remains port 80 on all subflows, as does the ephemeral port at the client), there | 80 remains port 80 on all subflows, as does the ephemeral port at the client), there | |||
may be cases (such as port-based load balancing) where the explicit specification of | may be cases (such as port-based load balancing) where the explicit specification of | |||
a different port is required. If no port is specified, MPTCP SHOULD atte | a different port is required. If no port is specified, MPTCP <bcp14>SHOU | |||
mpt to | LD</bcp14> attempt to | |||
connect to the specified address on the same port as is already in use b | connect to the specified address on the same port as the port that is al | |||
y the subflow | ready in use by the subflow | |||
on which the ADD_ADDR signal was sent; this is discussed in more detail | on which the ADD_ADDR signal was sent; this is discussed in more detail | |||
in <xref target="heuristics"/>.</t> | in <xref target="heuristics" format="default"/>.</t> | |||
<t>The Truncated HMAC parameter present in this option is the rightmos | ||||
<t>The Truncated HMAC present in this Option is the rightmost 64 bits of | t 64 bits of an HMAC, negotiated and | |||
an HMAC, negotiated and | calculated in the same way as for MP_JOIN as described in <xref target=" | |||
calculated in the same way as for MP_JOIN as described in <xref target=" | sec_join" format="default"/>. For this | |||
sec_join"/>. For this | ||||
specification of MPTCP, as there is only one hash algorithm option specified, this will be HMAC as | specification of MPTCP, as there is only one hash algorithm option specified, this will be HMAC as | |||
defined in <xref target="RFC2104"/>, using the SHA-256 hash algorithm <xref target="RFC6234"/>. | defined in <xref target="RFC2104" format="default"/>, using the SHA-256 hash algorithm <xref target="RFC6234" format="default"/>. | |||
In the same way as for MP_JOIN, the key for the HMAC | In the same way as for MP_JOIN, the key for the HMAC | |||
algorithm, in the case of the message transmitted by Host A, will be Key-A followed by Key-B, and in | algorithm, in the case of the message transmitted by Host A, will be Key-A followed by Key-B, and in | |||
the case of Host B, Key-B followed by Key-A. These are the keys that were exchanged in the original | the case of Host B, Key-B followed by Key-A. These are the keys that were exchanged in the original | |||
MP_CAPABLE handshake. The message for the HMAC is the Address ID, IP Address, and Port which precede | MP_CAPABLE handshake. The message for the HMAC is the Address ID, IP address, and port that precede | |||
the HMAC in the ADD_ADDR option. If the port is not present in the ADD_ADDR option, the HMAC message | the HMAC in the ADD_ADDR option. If the port is not present in the ADD_ADDR option, the HMAC message | |||
will nevertheless include two octets of value zero. The rationale for the HMAC is to | will nevertheless include 2 octets of value zero. The rationale for the HMAC is to | |||
prevent unauthorized entities from injecting ADD_ADDR signals in an attempt to hijack a connection. | prevent unauthorized entities from injecting ADD_ADDR signals in an attempt to hijack a connection. | |||
Note that additionally the presence of this HMAC prevents the address be | Note that, additionally, the presence of this HMAC prevents the | |||
ing changed in flight unless | address from being changed in flight unless | |||
the key is known by an intermediary. If a host receives an ADD_ADDR option for which it cannot | the key is known by an intermediary. If a host receives an ADD_ADDR option for which it cannot | |||
validate the HMAC, it SHOULD silently ignore the option.</t> | validate the HMAC, it <bcp14>SHOULD</bcp14> silently ignore the option.< | |||
/t> | ||||
<t>A set of four flags are present after the subtype and before the Addr | <t>A set of four flags is present after the subtype and before the Add | |||
ess ID. Only the rightmost | ress ID. Only the rightmost | |||
bit - labelled 'E' - is assigned in this specification. The other bits a | bit -- labeled "E" -- is assigned in this specification. The other | |||
re currently unassigned and MUST | bits are currently unassigned; they <bcp14>MUST</bcp14> | |||
be set to zero by a sender and MUST be ignored by the receiver.</t> | be set to 0 by a sender and <bcp14>MUST</bcp14> be ignored by the receiv | |||
er.</t> | ||||
<t>The 'E' flag exists to provide reliability for this option. Because t | <t>The "E" flag exists to provide reliability for this option. Because | |||
his option will often be sent | this option will often be sent | |||
on pure ACKs, there is no guarantee of reliability. Therefore, a receiver receiving a fresh ADD_ADDR | on pure ACKs, there is no guarantee of reliability. Therefore, a receiver receiving a fresh ADD_ADDR | |||
option (where E=0), will send the same option back to the sender, but no | option (where E=0) will send the same option back to the sender, but not | |||
t including the HMAC, and | including the HMAC and | |||
with E=1, to indicate receipt. The lack of this echo can be used by the | with E=1, to indicate receipt. According to local policy, the lack of | |||
initial ADD_ADDR sender to | this type of "echo" can indicate to the initial ADD_ADDR sender that the | |||
retransmit the ADD_ADDR according to local policy.</t> | ADD_ADDR needs to be retransmitted.</t> | |||
<?rfc needLines='11'?> | ||||
<figure align="center" anchor="tcpm_address" title="Add Address (ADD_ADD | ||||
R) Option"> | ||||
<artwork align="left"><![CDATA[ | ||||
1 2 3 | ||||
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||||
+---------------+---------------+-------+-------+---------------+ | ||||
| Kind | Length |Subtype|(rsv)|E| Address ID | | ||||
+---------------+---------------+-------+-------+---------------+ | ||||
| Address (IPv4 - 4 octets / IPv6 - 16 octets) | | ||||
+-------------------------------+-------------------------------+ | ||||
| Port (2 octets, optional) | | | ||||
+-------------------------------+ | | ||||
| Truncated HMAC (8 octets, if E=0) | | ||||
| +-------------------------------+ | ||||
| | | ||||
+-------------------------------+ | ||||
]]></artwork> | ||||
</figure> | ||||
<t>Due to the proliferation of NATs, it is reasonably likely that one ho | ||||
st may attempt to advertise private addresses <xref target="RFC1918"/>. It is no | ||||
t desirable to prohibit this, since there may be cases where both hosts have add | ||||
itional interfaces on the same private network, and a host MAY advertise such ad | ||||
dresses. The MP_JOIN handshake to create a new subflow (<xref target="sec_join"/ | ||||
>) provides mechanisms to minimize security risks. The MP_JOIN message contains | ||||
a 32-bit token that uniquely identifies the connection to the receiving host. If | ||||
the token is unknown, the host will return with a RST. In the unlikely event th | ||||
at the token is valid at the receiving host, subflow setup will continue, but th | ||||
e HMAC exchange must occur for authentication. This will fail, and will provide | ||||
sufficient protection against two unconnected hosts accidentally setting up a ne | ||||
w subflow upon the signal of a private address. Further security considerations | ||||
around the issue of ADD_ADDR messages that accidentally misdirect, or maliciousl | ||||
y direct, new MP_JOIN attempts are discussed in <xref target="sec_security"/>.</ | ||||
t> | ||||
<t>A host that receives an ADD_ADDR but finds a connection set up to tha | ||||
t IP address and port number is unsuccessful SHOULD NOT perform further connecti | ||||
on attempts to this address/port combination for this connection. A sender that | ||||
wants to trigger a new incoming connection attempt on a previously advertised ad | ||||
dress/port combination can therefore refresh ADD_ADDR information by sending the | ||||
option again.</t> | ||||
<t>A host can therefore send an ADD_ADDR message with an already assigne | ||||
d Address ID, but the Address MUST be the same as previously assigned to this Ad | ||||
dress ID. A new ADD_ADDR may have the same, or different, port number. If the po | ||||
rt number is different, the receiving host SHOULD try to set up a new subflow to | ||||
this new address/port combination.</t> | ||||
<t>A host wishing to replace an existing Address ID MUST first remove th | ||||
e existing one (<xref target="sec_remove_addr"/>).</t> | ||||
<t>During normal MPTCP operation, it is unlikely that there will be suff | <t>Due to the proliferation of NATs, it is reasonably likely that | |||
icient TCP option space for ADD_ADDR to be included along with those for data se | one host may attempt to advertise private addresses <xref | |||
quence numbering (<xref target="sec_dsn"/>). Therefore, it is expected that an M | target="RFC1918" format="default"/>. It is not desirable to prohibit | |||
PTCP implementation will send the ADD_ADDR option on separate ACKs. As discussed | this behavior, since there may be cases where both hosts have additional | |||
earlier, however, an MPTCP implementation MUST NOT treat duplicate ACKs with an | interfaces on the same private network, and a host | |||
y MPTCP option, with the exception of the DSS option, as indications of congesti | <bcp14>MAY</bcp14> advertise such addresses. The MP_JOIN handshake | |||
on <xref target="RFC5681"/>, and an MPTCP implementation SHOULD NOT send more th | to create a new subflow (<xref target="sec_join" format="default"/>) | |||
an two duplicate ACKs in a row for signaling purposes.</t> | provides mechanisms to minimize security risks. The MP_JOIN message | |||
contains a 32-bit token that uniquely identifies the connection to | ||||
the receiving host. If the token is unknown, the host will respond | ||||
with a RST. In the unlikely event that the token is valid at the | ||||
receiving host, subflow setup will continue, but the HMAC exchange | ||||
must occur for authentication. The HMAC exchange | ||||
will fail and will provide | ||||
sufficient protection against two unconnected hosts accidentally | ||||
setting up a new subflow upon the signal of a private address. | ||||
Further security considerations around the issue of ADD_ADDR messages that acci | ||||
dentally misdirect, or maliciously direct, new MP_JOIN attempts are discussed in | ||||
<xref target="sec_security" format="default"/>.</t> | ||||
<t>A host that receives an ADD_ADDR but finds that a connection set up | ||||
to that IP address and port number is unsuccessful <bcp14>SHOULD NOT</bcp14> pe | ||||
rform further connection attempts to this address&wj;/port combination for this | ||||
connection. A sender that wants to trigger a new incoming connection attempt on | ||||
a previously advertised address&wj;/port combination can therefore refresh ADD_A | ||||
DDR information by sending the option again.</t> | ||||
<t>A host can therefore send an ADD_ADDR message with an | ||||
already-assigned Address ID, but the address <bcp14>MUST</bcp14> be | ||||
the same as the address previously assigned to this Address ID. A | ||||
new ADD_ADDR may have the same port number or a different port number. | ||||
If the port number is different, the receiving host <bcp14>SHOULD</bcp14> try t | ||||
o set up a new subflow to this new address&wj;/port combination.</t> | ||||
<t>A host wishing to replace an existing Address ID <bcp14>MUST</bcp14 | ||||
> first remove the existing one (<xref target="sec_remove_addr" format="default" | ||||
/>).</t> | ||||
<t>During normal MPTCP operation, it is unlikely that there will be su | ||||
fficient TCP option space for ADD_ADDR to be included along with those for data | ||||
sequence numbering (<xref target="sec_dsn" format="default"/>). Therefore, it is | ||||
expected that an MPTCP implementation will send the ADD_ADDR option on separate | ||||
ACKs. As discussed earlier, however, an MPTCP implementation <bcp14>MUST NOT</b | ||||
cp14> treat duplicate ACKs with any MPTCP option, with the exception of the DSS | ||||
option, as indications of congestion <xref target="RFC5681" format="default"/>, | ||||
and an MPTCP implementation <bcp14>SHOULD NOT</bcp14> send more than two duplica | ||||
te ACKs in a row for signaling purposes.</t> | ||||
</section> | ||||
<section anchor="sec_remove_addr" numbered="true" toc="default"> | ||||
<name>Remove Address</name> | ||||
<t>If, during the lifetime of an MPTCP connection, a previously | ||||
announced address becomes invalid (e.g., if the interface | ||||
disappears or an IPv6 address is no longer preferred), the affected | ||||
host <bcp14>SHOULD</bcp14> announce this situation so that the peer ca | ||||
n remove | ||||
subflows related to this address. Even if an address is not in use | ||||
by an MPTCP connection, if it has been previously announced, an | ||||
implementation <bcp14>SHOULD</bcp14> announce its removal. A host | ||||
<bcp14>MAY</bcp14> also choose to announce that a valid IP address | ||||
should not be used any longer -- for example, for make‑before-br | ||||
eak session continuity.</t> | ||||
<t>This is achieved through the Remove Address (REMOVE_ADDR) option | ||||
(<xref target="tcpm_remove" format="default"/>), which will remove a | ||||
previously added address (or list of addresses) from a connection | ||||
and terminate any subflows currently using that address.</t> | ||||
</section> | <figure anchor="tcpm_remove"> | |||
<section title="Remove Address" anchor="sec_remove_addr"> | <name>Remove Address (REMOVE_ADDR) Option</name> | |||
<t>If, during the lifetime of an MPTCP connection, a previously announce | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
d address becomes invalid (e.g., if the interface disappears, or an IPv6 address | 1 2 3 | |||
is no longer preferred), the affected host SHOULD announce this so that the pee | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
r can remove subflows related to this address. Even if an address is not in use | +---------------+---------------+-------+-------+---------------+ | |||
by a MPTCP connection, if it has been previously announced, an implementation SH | | Kind |Length = 3 + n |Subtype|(resvd)| Address ID | ... | |||
OULD announce its removal. A host MAY also choose to announce that a valid IP ad | +---------------+---------------+-------+-------+---------------+ | |||
dress should not be used any longer, for example for make-before-break session c | (followed by n-1 Address IDs, if required) ]]></artwo | |||
ontinuity.</t> | rk> | |||
<t>This is achieved through the Remove Address (REMOVE_ADDR) option (<xr | </figure> | |||
ef target="tcpm_remove"/>), which will remove a previously added address (or lis | ||||
t of addresses) from a connection and terminate any subflows currently using tha | ||||
t address.</t> | ||||
<t>For security purposes, if a host receives a REMOVE_ADDR option, it mu | ||||
st ensure the affected path(s) are no longer in use before it instigates closure | ||||
. The receipt of REMOVE_ADDR SHOULD first trigger the sending of a TCP keepalive | ||||
<xref target="RFC1122"/> on the path, and if a response is received the path SH | ||||
OULD NOT be removed. If the path is found to still be alive, the receiving host | ||||
SHOULD no longer use the specified address for future connections, but it is the | ||||
responsibility of the host which sent the REMOVE_ADDR to shut down the subflow. | ||||
The requesting host MAY also use MP_PRIO (<xref target="sec_policy"/>) to reque | ||||
st a path is no longer used, before removal. Typical TCP validity tests on the s | ||||
ubflow (e.g., ensuring sequence and ACK numbers are correct) MUST also be undert | ||||
aken. An implementation can use indications of these test failures as part of in | ||||
trusion detection or error logging.</t> | ||||
<t>The sending and receipt (if no keepalive response was received) of th | ||||
is message SHOULD trigger the sending of RSTs by both hosts on the affected subf | ||||
low(s) (if possible), as a courtesy to cleaning up middlebox state, before clean | ||||
ing up any local state.</t> | ||||
<t>Address removal is undertaken by ID, so as to permit the use of NATs | ||||
and other middleboxes that rewrite source addresses. If there is no address at t | ||||
he requested ID, the receiver will silently ignore the request.</t> | ||||
<t>A subflow that is still functioning MUST be closed with a FIN exchang | ||||
e as in regular TCP, rather than using this option. For more information, see <x | ||||
ref target="sec_close"/>.</t> | ||||
<?rfc needLines='8'?> | ||||
<figure align="center" anchor="tcpm_remove" title="Remove Address (REMOV | ||||
E_ADDR) Option"> | ||||
<artwork align="left"><![CDATA[ | ||||
1 2 3 | ||||
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||||
+---------------+---------------+-------+-------+---------------+ | ||||
| Kind | Length = 3+n |Subtype|(resvd)| Address ID | ... | ||||
+---------------+---------------+-------+-------+---------------+ | ||||
(followed by n-1 Address IDs, if required) | ||||
]]></artwork> | ||||
</figure> | ||||
</section> | ||||
<t>For security purposes, if a host receives a REMOVE_ADDR option, | ||||
it must ensure that the affected path or paths are no longer in use | ||||
before it instigates closure. The receipt of REMOVE_ADDR | ||||
<bcp14>SHOULD</bcp14> first trigger the sending of a TCP keepalive | ||||
<xref target="RFC1122" format="default"/> on the path, and if a | ||||
response is received, the path <bcp14>SHOULD NOT</bcp14> be | ||||
removed. If the path is found to still be alive, the receiving host | ||||
<bcp14>SHOULD</bcp14> no longer use the specified address for future | ||||
connections, but it is the responsibility of the host that sent the | ||||
REMOVE_ADDR to shut down the subflow. Before the address is removed, | ||||
the requesting host | ||||
<bcp14>MAY</bcp14> also use MP_PRIO (<xref target="sec_policy" | ||||
format="default"/>) to request that a path no longer be used. Typical | ||||
TCP validity tests on the subflow (e.g., ensuring | ||||
that sequence and ACK numbers are correct) <bcp14>MUST</bcp14> also be | ||||
undertaken. An implementation can use indications of these test failures as par | ||||
t of intrusion detection or error logging.</t> | ||||
<t>The sending and receipt (if no keepalive response was received) | ||||
of this message <bcp14>SHOULD</bcp14> trigger the sending of RSTs by | ||||
both hosts on the affected subflow(s) (if possible), as a courtesy, | ||||
to allow the cleanup of middlebox state before cleaning up any local s | ||||
tate.</t> | ||||
<t>Address removal is undertaken according to the Address ID, so as to | ||||
permit the use of NATs and other middleboxes that rewrite source | ||||
addresses. If an Address ID is not known, the receiver will | ||||
silently ignore the request.</t> | ||||
<t>A subflow that is still functioning <bcp14>MUST</bcp14> be closed w | ||||
ith a FIN exchange as in regular TCP, rather than using this option. For more in | ||||
formation, see <xref target="sec_close" format="default"/>.</t> | ||||
</section> | ||||
</section> | </section> | |||
<section anchor="sec_fastclose" numbered="true" toc="default"> | ||||
<section title="Fast Close" anchor="sec_fastclose"> | <name>Fast Close</name> | |||
<t>Regular TCP has the means of sending a reset (RST) signal to abruptly | <t>Regular TCP has the means of sending a RST signal to abruptly | |||
close a connection. With MPTCP, a regular RST only has the scope of the | close a connection. With MPTCP, a regular RST only has the scope of | |||
subflow | the subflow; it | |||
and will only close the concerned subflow but not affect the remaining | will only close the applicable subflow and will not affect the remaining | |||
subflows. MPTCP's connection will stay alive at the data level, in order | subflows. MPTCP's connection will stay alive at the data level, in order | |||
to permit break-before-make handover between subflows. It is therefore | to permit break-before-make handover between subflows. It is therefore | |||
necessary to provide an MPTCP-level "reset" to allow the abrupt closure | necessary to provide an MPTCP-level "reset" to allow the abrupt closure | |||
of the whole MPTCP connection, and this is the MP_FASTCLOSE option.</t> | of the whole MPTCP connection; this is done via the MP_FASTCLOSE option. | |||
</t> | ||||
<t>MP_FASTCLOSE is used to indicate to the peer that the connection will be | <t>MP_FASTCLOSE is used to indicate to the peer that the connection will be | |||
abruptly closed and no data will be accepted anymore. The reasons for | abruptly closed and no data will be accepted anymore. The reasons for | |||
triggering an MP_FASTCLOSE are implementation specific. Regular TCP does | triggering an MP_FASTCLOSE are implementation specific. Regular TCP does | |||
not allow sending a RST while the connection is in a synchronized | not allow the sending of a RST while the connection is in a synchronized | |||
state <xref target="RFC0793"/>. Nevertheless, implementations allow | state <xref target="RFC0793" format="default"/>. Nevertheless, implement | |||
the sending of a RST in this state, if, for example, the operating | ations allow | |||
the sending of a RST in this state if, for example, the operating | ||||
system is running out of resources. In these cases, MPTCP should send | system is running out of resources. In these cases, MPTCP should send | |||
the MP_FASTCLOSE. This option is illustrated in <xref target="tcpm_fastc | the MP_FASTCLOSE. This option is illustrated in <xref target="tcpm_fastc | |||
lose"/>.</t> | lose" format="default"/>.</t> | |||
<figure anchor="tcpm_fastclose"> | ||||
<?rfc needLines='12'?> | <name>Fast Close (MP_FASTCLOSE) Option</name> | |||
<figure align="center" anchor="tcpm_fastclose" title="Fast Close (MP_FAS | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
TCLOSE) Option"> | 1 2 3 | |||
<artwork align="left"><![CDATA[ | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
1 2 3 | +---------------+---------------+-------+-----------------------+ | |||
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | | Kind | Length |Subtype| (reserved) | | |||
+---------------+---------------+-------+-----------------------+ | +---------------+---------------+-------+-----------------------+ | |||
| Kind | Length |Subtype| (reserved) | | | Option Receiver's Key | | |||
+---------------+---------------+-------+-----------------------+ | | (64 bits) | | |||
| Option Receiver's Key | | | | | |||
| (64 bits) | | +---------------------------------------------------------------+ ]]></artwork | |||
| | | > | |||
+---------------------------------------------------------------+ | ||||
]]></artwork> | ||||
</figure> | </figure> | |||
<t>If Host A wants to force the closure of an MPTCP connection, it can | ||||
<t>If Host A wants to force the closure of an MPTCP connection, it has t | do so via two | |||
wo | options: | |||
different options: | </t> | |||
<list style="symbols"> | <ul spacing="normal"> | |||
<t>Option A (ACK) : Host A sends an ACK containing the MP_FASTCLOSE | <li>Option A (ACK): Host A sends an ACK containing the MP_FASTCLOSE | |||
option on one subflow, containing the key of Host B as declared in | option on one subflow, containing the key of Host B as declared in | |||
the initial connection handshake. On all the other subflows, Host A | the initial connection handshake. On all the other subflows, Host&n | |||
sends a regular TCP RST to close these subflows, and tears them down. | bsp;A | |||
Host A now enters FASTCLOSE_WAIT state.</t> | sends a regular TCP RST to close these subflows and tears them down. | |||
Host A now enters FASTCLOSE_WAIT state.</li> | ||||
<t>Option R (RST) : Host A sends a RST containing the MP_FASTCLOSE | <li>Option R (RST): Host A sends a RST containing the MP_FASTCLOSE | |||
option on all subflows, containing the key of Host B as declared in | option on all subflows, containing the key of Host B as declared in | |||
the initial connection handshake. Host A can tear the subflows and | the initial connection handshake. Host A can tear down the subflows | |||
the connection down immediately.</t> | and | |||
</list> | the connection immediately.</li> | |||
</t> | </ul> | |||
<t>If Host A decides to force the closure by using Option A and sending | ||||
<t>If host A decides to force the closure by using Option A and sending | an ACK with the MP_FASTCLOSE option, the connection shall proceed as fol | |||
an ACK with the MP_FASTCLOSE option, the connection shall proceed as foll | lows: | |||
ows: | </t> | |||
<list style="symbols"> | <ul spacing="normal"> | |||
<t>Upon receipt of an ACK with MP_FASTCLOSE by Host B, containing th | <li>Upon receipt of an ACK with MP_FASTCLOSE by Host B, containing the | |||
e valid key, Host B answers | valid key, Host B answers | |||
on the same subflow with a TCP RST and tears down all subflows also | on the same subflow with a TCP RST and tears down all subflows | |||
through sending TCP RST signals. Host B can | also through sending TCP RST signals. Host B can | |||
now close the whole MPTCP connection (it transitions directly to CLO | now close the whole MPTCP connection (it transitions directly to CLO | |||
SED state).</t> | SED state).</li> | |||
<li>As soon as Host A has received the TCP RST on the remaining subflo | ||||
<t>As soon as Host A has received the TCP RST on the remaining subfl | w, it | |||
ow, it | ||||
can close this subflow and tear down the whole connection (transition from | can close this subflow and tear down the whole connection (transition from | |||
FASTCLOSE_WAIT to CLOSED states). If Host A receives an MP_FASTCLOSE instead | FASTCLOSE_WAIT state to CLOSED state). If Host A receives an MP_FASTCLOSE instead | |||
of a TCP RST, both hosts attempted fast closure simultaneously. Host A should | of a TCP RST, both hosts attempted fast closure simultaneously. Host A should | |||
reply with a TCP RST and tear down the connection.</t> | reply with a TCP RST and tear down the connection.</li> | |||
<li>If Host A does not receive a TCP RST in reply to its MP_FASTCLOSE | ||||
<t>If Host A does not receive a TCP RST in reply to its MP_FASTCLOSE | after one | |||
after one | retransmission timeout (RTO) (the RTO of the subflow where the MP_FA | |||
retransmission timeout (RTO) (the RTO of the subflow where the MP_FA | STCLOSE has been sent), it <bcp14>SHOULD</bcp14> | |||
STCLOSE has been sent), it SHOULD | retransmit the MP_FASTCLOSE. To keep this connection from being | |||
retransmit the MP_FASTCLOSE. The number of retransmissions SHOULD be | retained for a long time, the number of retransmissions <bcp14>SHOUL | |||
limited to avoid this connection from being retained for a long time | D</bcp14> be | |||
, but | limited; | |||
this limit is implementation specific. A RECOMMENDED number is 3. If | this limit is implementation specific. A <bcp14>RECOMMENDED</bcp14> | |||
no TCP RST | number is 3. If no TCP RST | |||
is received in response, Host A SHOULD send a TCP RST with the MP_FA | is received in response, Host A <bcp14>SHOULD</bcp14> send a TCP RST | |||
STCLOSE option | with the MP_FASTCLOSE option | |||
itself when it releases state in order to clear any remaining state a | itself when it releases state in order to clear any remaining state | |||
t middleboxes.</t> | at middleboxes.</li> | |||
</list> | </ul> | |||
</t> | <t>If, however, Host A decides to force the closure by using Option R an | |||
d | ||||
<t>If however host A decides to force the closure by using Option R and | sending a RST with the MP_FASTCLOSE option, Host B will act as follows: | |||
sending a RST with the MP_FASTCLOSE option, Host B will act as follows: | upon receipt of a RST with MP_FASTCLOSE, containing the valid key, | |||
Upon receipt of a RST with MP_FASTCLOSE, containing the valid key, | Host B tears down all subflows by sending a TCP RST. Host B can now | |||
Host B tears down all subflows by sending a TCP RST. Host B can now close | close the whole MPTCP | |||
the whole MPTCP | connection (it transitions directly to CLOSED state).</t> | |||
connection (it transitions directly to CLOSED state).</t> | ||||
</section> | </section> | |||
<section anchor="sec_reset" numbered="true" toc="default"> | ||||
<section title="Subflow Reset" anchor="sec_reset"> | <name>Subflow Reset</name> | |||
<t>An implementation of MPTCP may also need to send a regular TCP RST to | <t>An implementation of MPTCP may also need to send a regular TCP RST to | |||
force | force | |||
the closure of a subflow. A host sends a TCP RST in order to close a subf | the closure of a subflow. A host sends a TCP RST in order to close a sub | |||
low | flow | |||
or reject an attempt to open a subflow (MP_JOIN). In order to inform the | or reject an attempt to open a subflow (MP_JOIN). In order to let the | |||
receiving host why a subflow is being closed or rejected, the TCP RST pac | receiving host know why a subflow is being closed or rejected, the TCP R | |||
ket | ST packet | |||
MAY include the MP_TCPRST Option. The host MAY use this information to | <bcp14>MAY</bcp14> include the MP_TCPRST option (<xref target="tcpm_rese | |||
decide, for example, whether it tries to re-establish the subflow | t"/>). The host <bcp14>MAY</bcp14> use this information to | |||
immediately, later, or never.</t> | decide, for example, whether it tries to re-establish the subflow | |||
immediately, later, or never.</t> | ||||
<?rfc needLines='8'?> | <figure anchor="tcpm_reset"> | |||
<figure align="center" anchor="tcpm_reset" title="TCP RST Reason (MP_TCP | <name>TCP RST Reason (MP_TCPRST) Option</name> | |||
RST) Option"> | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
<artwork align="left"><![CDATA[ | 1 2 3 | |||
1 2 3 | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | +---------------+---------------+-------+-----------------------+ | |||
+---------------+---------------+-------+-----------------------+ | | Kind | Length |Subtype|U|V|W|T| Reason | | |||
| Kind | Length |Subtype|U|V|W|T| Reason | | +---------------+---------------+-------+-----------------------+ ]]></artwork | |||
+---------------+---------------+-------+-----------------------+ | > | |||
]]></artwork> | ||||
</figure> | </figure> | |||
<t>The MP_TCPRST option contains a reason code that allows the | <t>The MP_TCPRST option contains a reason code that allows the | |||
sender of the option to provide more information about the reason for | sender of the option to provide more information about the reason for | |||
the termination of the subflow. Using 12 bits of option space, the | the termination of the subflow. Using 12 bits of option space, the | |||
first four bits are reserved for flags (only one of which is currently | first 4 bits are reserved for flags (only one of which is currently | |||
defined), and the remaining octet is used to express a reason code for | defined), and the remaining octet is used to express a reason code for | |||
this subflow termination, from which a receiver MAY infer information | this subflow termination, from which a receiver <bcp14>MAY</bcp14> infer information | |||
about the usability of this path.</t> | about the usability of this path.</t> | |||
<t>The "T" flag is used by the sender to indicate whether the error | <t>The "T" flag is used by the sender to indicate whether the error | |||
condition that is reported is Transient (T bit set to 1) or Permanent | condition that is reported is Transient ("T" bit set to 1) or Permanent | |||
(T bit set to 0). If the error condition is considered to be | ("T" bit set to 0). If the error condition is considered to be | |||
Transient by the sender of the RST segment, the recipient of this | Transient by the sender of the RST segment, the recipient of this | |||
segment MAY try to reestablish a subflow for this connection over the | segment <bcp14>MAY</bcp14> try to re-establish a subflow for this connec | |||
failed path. The time at which a receiver may try to re-establish this | tion over the | |||
is implementation-specific, but SHOULD take into account the properties | failed path. The time at which a receiver may try to | |||
of the failure defined by the following reason code. If the error condi | re‑establish this subflow | |||
tion | is implementation specific but <bcp14>SHOULD</bcp14> take into account t | |||
is considered to be permanent, the receiver of the RST segment SHOULD NO | he properties | |||
T try | of the failure as defined by the provided reason code. If the error con | |||
to reestablish a subflow for this connection over this path. The "U", " | dition | |||
V" | is considered to be Permanent, the receiver of the RST segment <bcp14>SH | |||
OULD NOT</bcp14> try | ||||
to re‑establish a subflow for this connection over this path. The | ||||
"U", "V", | ||||
and "W" flags are not defined by this specification and are reserved for | and "W" flags are not defined by this specification and are reserved for | |||
future use. An implementation of this specification MUST set these flags | future use. An implementation of this specification <bcp14>MUST</bcp14> | |||
to 0, and a receiver MUST ignore them.</t> | set these flags | |||
to 0, and a receiver <bcp14>MUST</bcp14> ignore them.</t> | ||||
<t>The "Reason" code is an 8-bit field that indicates the reason for | <t>"Reason" is an 8-bit field that indicates the reason code for | |||
the termination of the subflow. The following codes are defined in | the termination of the subflow. The following codes are defined in | |||
this document: | this document: | |||
<list style="symbols"> | </t> | |||
<t>Unspecified error (code 0x0). This is the default error implying | <ul spacing="normal"> | |||
the | <li>Unspecified error (code 0x00). This is the default error; | |||
it implies that the | ||||
subflow is no longer available. The presence of this option shows | subflow is no longer available. The presence of this option shows | |||
that the RST was generated by a MPTCP-aware device.</t> | that the RST was generated by an MPTCP-aware device.</li> | |||
<li>MPTCP-specific error (code 0x01). An error has been detected in t | ||||
<t>MPTCP specific error (code 0x01). An error has been detected in | he | |||
the | ||||
processing of MPTCP options. This is the usual reason code to return | processing of MPTCP options. This is the usual reason code to return | |||
in the cases where a RST is being sent to close a subflow for reason | in the cases where a RST is being sent to close a subflow because | |||
s | of an invalid response.</li> | |||
of an invalid response.</t> | <li>Lack of resources (code 0x02). This code indicates that the | |||
<t>Lack of resources (code 0x02). This code indicates that the | ||||
sending host does not have enough resources to support the | sending host does not have enough resources to support the | |||
terminated subflow.</t> | terminated subflow.</li> | |||
<li>Administratively prohibited (code 0x03). This code indicates that | ||||
<t>Administratively prohibited (code 0x03). This code indicates tha | ||||
t | ||||
the requested subflow is prohibited by the policies of the sending | the requested subflow is prohibited by the policies of the sending | |||
host.</t> | host.</li> | |||
<li>Too much outstanding data (code 0x04). This code indicates that | ||||
<t>Too much outstanding data (code 0x04). This code indicates that | there is an excessive amount of data that needs to be transmitted | |||
there is an excessive amount of data that need to be transmitted | ||||
over the terminated subflow while having already been acknowledged | over the terminated subflow while having already been acknowledged | |||
over one or more other subflows. This may occur if a path has been | over one or more other subflows. This may occur if a path has been | |||
unavailable for a short period and it is more efficient to reset and | unavailable for a short period and it is more efficient to reset and | |||
start again than it is to retransmit the queued data.</t> | start again than it is to retransmit the queued data.</li> | |||
<li>Unacceptable performance (code 0x05). This code indicates that | ||||
<t>Unacceptable performance (code 0x05). This code indicates that | ||||
the performance of this subflow was too low compared to the other | the performance of this subflow was too low compared to the other | |||
subflows of this Multipath TCP connection.</t> | subflows of this Multipath TCP connection.</li> | |||
<li>Middlebox interference (code 0x06). Middlebox interference has | ||||
<t>Middlebox interference (code 0x06). Middlebox interference has | been detected over this subflow, making MPTCP signaling invalid. Fo | |||
been detected over this subflow making MPTCP signaling invalid. For | r | |||
example, this may be sent if the checksum does not validate.</t> | example, this may be sent if the checksum does not validate.</li> | |||
</list> | </ul> | |||
</t> | ||||
</section> | </section> | |||
<section anchor="sec_fallback" numbered="true" toc="default"> | ||||
<section title="Fallback" anchor="sec_fallback"> | <name>Fallback</name> | |||
<t>Sometimes, middleboxes will exist on a path that could prevent the op | <t>Sometimes, middleboxes will exist on a path that could prevent the | |||
eration of MPTCP. MPTCP has been designed in order to cope with many middlebox m | operation of MPTCP. MPTCP has been designed to cope with many | |||
odifications (see <xref target="sec_middleboxes"/>), but there are still some ca | middlebox modifications (see <xref target="sec_middleboxes" | |||
ses where a subflow could fail to operate within the MPTCP requirements. These c | format="default"/>), but there are still some cases where a subflow | |||
ases are notably the following: the loss of MPTCP options on a path, and the mod | could fail to operate within the MPTCP requirements. Notably, these case | |||
ification of payload data. If such an event occurs, it is necessary to "fall bac | s are the following: the loss of MPTCP options on a path and the modification of | |||
k" to the previous, safe operation. This may be either falling back to regular T | payload data. If such an event occurs, it is necessary to "fall back" to the pr | |||
CP or removing a problematic subflow.</t> | evious, safe operation. This may be either falling back to regular TCP or removi | |||
ng a problematic subflow.</t> | ||||
<t>At the start of an MPTCP connection (i.e., the first subflow), it is | <t>At the start of an MPTCP connection (i.e., the first subflow), it is | |||
important to ensure that the path is fully MPTCP capable and the necessary MPTCP | important to ensure that the path is fully MPTCP capable and the necessary MPTCP | |||
options can reach each host. The handshake as described in <xref target="sec_in | options can reach each host. The handshake as described in <xref target="sec_in | |||
it"/> SHOULD fall back to regular TCP if either of the SYN messages do not have | it" format="default"/> <bcp14>SHOULD</bcp14> fall back to regular TCP if either | |||
the MPTCP options: this is the same, and desired, behavior in the case where a h | of the SYN messages does not have the MPTCP options: this is the same, and desir | |||
ost is not MPTCP capable, or the path does not support the MPTCP options. When a | ed, behavior in the case where a host is not MPTCP capable or the path does not | |||
ttempting to join an existing MPTCP connection (<xref target="sec_join"/>), if a | support the MPTCP options. When attempting to join an existing MPTCP connection | |||
path is not MPTCP capable and the MPTCP options do not get through on the SYNs, | (<xref target="sec_join" format="default"/>), if a path is not MPTCP capable and | |||
the subflow will be closed according to the MP_JOIN logic.</t> | the MPTCP options do not get through on the SYNs, the subflow will be closed ac | |||
cording to the MP_JOIN logic.</t> | ||||
<t>There is, however, another corner case that should be addressed. That | <t>There is, however, another corner case that should be addressed: | |||
is one of MPTCP options getting through on the SYN, but not on regular packets. | the case where MPTCP options get through on the SYN but not on regular | |||
This can be resolved if the subflow is the first subflow, and thus all data in | packets. If the subflow is the first subflow and thus all data in | |||
flight is contiguous, using the following rules.</t> | flight is contiguous, this situation can be resolved by using the follow | |||
ing rules:</t> | ||||
<t>A sender MUST include a DSS option with data sequence mapping in ever | <ul spacing="normal"> | |||
y segment until one of the sent segments has been acknowledged with a DSS option | <li>A sender <bcp14>MUST</bcp14> include a DSS option with Data Sequence Mapping | |||
containing a Data ACK. Upon reception of the acknowledgment, the sender has the | in every segment until one of the sent segments has been acknowledged with a DS | |||
confirmation that the DSS option passes in both directions and may choose to se | S option containing a Data ACK. Upon reception of the acknowledgment, the sender | |||
nd fewer DSS options than once per segment.</t> | has the confirmation that the DSS option passes in both directions and may choo | |||
se to send fewer DSS options than once per segment.</li> | ||||
<t>If, however, an ACK is received for data (not just for the SYN) witho | <li>If, however, an ACK is received for data (not just for the SYN) | |||
ut a DSS option containing a Data ACK, the sender determines the path is not MPT | without a DSS option containing a Data ACK, the sender determines that t | |||
CP capable. In the case of this occurring on an additional subflow (i.e., one st | he path is not MPTCP capable. In the case of this occurring on an additional sub | |||
arted with MP_JOIN), the host MUST close the subflow with a RST, which SHOULD co | flow (i.e., one started with MP_JOIN), the host <bcp14>MUST</bcp14> close the su | |||
ntain a MP_TCPRST option (<xref target="sec_reset"/>) with a "Middlebox interfer | bflow with a RST, which <bcp14>SHOULD</bcp14> contain an MP_TCPRST option (<xref | |||
ence" reason code.</t> | target="sec_reset" format="default"/>) with a "Middlebox interference" reason c | |||
ode.</li> | ||||
<t>In the case of such an ACK being received on the first subflow (i.e., | <li>In the case of such an ACK being received on the first subflow | |||
that started with MP_CAPABLE), before any additional subflows are added, the im | (i.e., that started with MP_CAPABLE), before any additional subflows | |||
plementation MUST drop out of an MPTCP mode, back to regular TCP. The sender wil | are added, the implementation <bcp14>MUST</bcp14> drop out of MPTCP | |||
l send one final data sequence mapping, with the Data-Level Length value of 0 in | mode and fall back to regular TCP. The sender will send one final Data S | |||
dicating an infinite mapping (to inform the other end in case the path drops opt | equence Mapping, with the Data-Level Length value of 0 indicating an infinite ma | |||
ions in one direction only), and then revert to sending data on the single subfl | pping (to inform the other end in case the path drops options in one direction o | |||
ow without any MPTCP options.</t> | nly), and then revert to sending data on the single subflow without any MPTCP op | |||
tions.</li> | ||||
<t>If a subflow breaks during operation, e.g. if it is re-routed and MPT | <li>If a subflow breaks during operation, e.g., if it is rerouted and | |||
CP options are no longer permitted, then once this is detected (by the subflow-l | MPTCP options are no longer permitted, then once this is detected (by | |||
evel receive buffer filling up, since there is no mapping available in order to | the subflow-level receive buffer filling up, since there is no mapping | |||
DATA_ACK this data), the subflow SHOULD be treated as broken and closed with a R | available in order to DATA_ACK this data), the subflow | |||
ST, since no data can be delivered to the application layer, and no fallback sig | <bcp14>SHOULD</bcp14> be treated as broken and closed with a RST, | |||
nal can be reliably sent. This RST SHOULD include the MP_TCPRST option (<xref ta | since no data can be delivered to the application layer and no | |||
rget="sec_reset"/>) with a "Middlebox interference" reason code.</t> | fallback signal can be reliably sent. This RST <bcp14>SHOULD</bcp14> | |||
include the MP_TCPRST option (<xref target="sec_reset" | ||||
<t>These rules should cover all cases where such a failure could happen: | format="default"/>) with a "Middlebox interference" reason code.</li> | |||
whether it's on the forward or reverse path and whether the server or the clien | </ul> | |||
t first sends data.</t> | <t>These rules should cover all cases where such a failure could | |||
happen -- whether it's on the forward or reverse path and whether the se | ||||
<t>So far this section has discussed the loss of MPTCP options, either i | rver or the client first sends data.</t> | |||
nitially, or during the course of the connection. As described in <xref target=" | <t>So far, this section has discussed the loss of MPTCP options, | |||
sec_generalop"/>, each portion of data for which there is a mapping is protected | either initially or during the course of the connection. As described | |||
by a checksum, if checksums have been negotiated. This mechanism is used to det | in <xref target="sec_generalop" format="default"/>, each portion of | |||
ect if middleboxes have made any adjustments to the payload (added, removed, or | data for which there is a mapping is protected by a checksum, if | |||
changed data). A checksum will fail if the data has been changed in any way. Thi | checksums have been negotiated. This mechanism is used to detect if | |||
s will also detect if the length of data on the subflow is increased or decrease | middleboxes have made any adjustments to the payload (added, removed, | |||
d, and this means the data sequence mapping is no longer valid. The sender no lo | or changed data). A checksum will fail if the data has been changed in | |||
nger knows what subflow-level sequence number the receiver is genuinely operatin | any way. The use of a checksum will also detect whether the length of da | |||
g at (the middlebox will be faking ACKs in return), and it cannot signal any fur | ta on the subflow is | |||
ther mappings. Furthermore, in addition to the possibility of payload modificati | increased or decreased, and this means the Data Sequence Mapping is no | |||
ons that are valid at the application layer, there is the possibility that such | longer valid. The sender no longer knows what subflow-level sequence | |||
modifications could be triggered across MPTCP segment boundaries, corrupting the | number the receiver is genuinely operating at (the middlebox will be | |||
data. Therefore, all data from the start of the segment that failed the checksu | faking ACKs in return), and it cannot signal any further | |||
m onwards is not trustworthy.</t> | mappings. Furthermore, in addition to the possibility of payload | |||
modifications that are valid at the application layer, it is possible th | ||||
<t>Note that if checksum usage has not been negotiated, this fallback me | at such modifications could be triggered across MPTCP segment boundaries, corrup | |||
chanism cannot be used unless there is some higher or lower layer signal to info | ting the data. Therefore, all data from the start of the segment that failed the | |||
rm the MPTCP implementation that the payload has been tampered with.</t> | checksum onward is not trustworthy.</t> | |||
<t>Note that if checksum usage has not been negotiated, this fallback me | ||||
<t>When multiple subflows are in use, the data in flight on a subflow wi | chanism cannot be used unless there is some higher-layer or lower‑layer si | |||
ll likely involve data that is not contiguously part of the connection-level str | gnal to inform the MPTCP implementation that the payload has been tampered with. | |||
eam, since segments will be spread across the multiple subflows. Due to the prob | </t> | |||
lems identified above, it is not possible to determine what adjustment has done | <t>When multiple subflows are in use, the data in flight on a subflow | |||
to the data (notably, any changes to the subflow sequence numbering). Therefore, | will likely involve data that is not contiguously part of the | |||
it is not possible to recover the subflow, and the affected subflow must be imm | connection-level stream, since segments will be spread across the | |||
ediately closed with a RST, featuring an MP_FAIL option (<xref target="tcpm_fall | multiple subflows. Due to the problems identified above, it is not | |||
back"/>), which defines the data sequence number at the start of the segment (de | possible to determine what adjustments have been done to the data (notab | |||
fined by the data sequence mapping) that had the checksum failure. Note that the | ly, | |||
MP_FAIL option requires the use of the full 64-bit sequence number, even if 32- | any changes to the subflow sequence numbering). Therefore, it is not | |||
bit sequence numbers are normally in use in the DSS signals on the path.</t> | possible to recover the subflow, and the affected subflow must be | |||
immediately closed with a RST that includes an MP_FAIL option (<xref tar | ||||
<?rfc needLines='8'?> | get="tcpm_fallback" format="default"/>), which defines the data sequence number | |||
<figure align="center" anchor="tcpm_fallback" title="Fallback (MP_FAIL) | at the start of the segment (defined by the Data Sequence Mapping) that had the | |||
Option"> | checksum failure. Note that the MP_FAIL option requires the use of the full 64-b | |||
<artwork align="left"><![CDATA[ | it sequence number, even if 32-bit sequence numbers are normally in use in the D | |||
1 2 3 | SS signals on the path.</t> | |||
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | <figure anchor="tcpm_fallback"> | |||
+---------------+---------------+-------+----------------------+ | <name>Fallback (MP_FAIL) Option</name> | |||
| Kind | Length=12 |Subtype| (reserved) | | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
+---------------+---------------+-------+----------------------+ | 1 2 3 | |||
| | | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
| Data Sequence Number (8 octets) | | +---------------+---------------+-------+----------------------+ | |||
| | | | Kind | Length=12 |Subtype| (reserved) | | |||
+--------------------------------------------------------------+ | +---------------+---------------+-------+----------------------+ | |||
| | | ||||
]]></artwork> | | Data Sequence Number (8 octets) | | |||
| | | ||||
+--------------------------------------------------------------+ ]]></artwork> | ||||
</figure> | </figure> | |||
<t>The receiver of this option <bcp14>MUST</bcp14> discard all data foll | ||||
owing the data sequence number specified. | ||||
Failed data <bcp14>MUST NOT</bcp14> be DATA_ACKed and so will be retrans | ||||
mitted on other subflows (<xref target="sec_retransmit" format="default"/>). </t | ||||
> | ||||
<t>A special case is when there is a single subflow and it fails with a | ||||
checksum error. If it is known that all unacknowledged data in | ||||
flight is contiguous (which will usually be the case with a single | ||||
subflow), an infinite mapping can be applied to the subflow without | ||||
the need to close it first, essentially turning off all further | ||||
MPTCP signaling. | ||||
<t>The receiver of this option MUST discard all data following the data | In this case, if a receiver identifies a checksum failure | |||
sequence number specified. | ||||
Failed data MUST NOT be DATA_ACKed and so will be retransmitted on other | ||||
subflows (<xref target="sec_retransmit"/>). </t> | ||||
<t>A special case is when there is a single subflow and it fails with a | ||||
checksum error. | ||||
If it is known that all unacknowledged data in flight is | ||||
contiguous (which will usually be the case with a single subflow), an infinite m | ||||
apping can be applied to the subflow without the need to close it first, and | ||||
essentially turn off all further MPTCP signaling. In this case, if a receiver id | ||||
entifies a checksum failure | ||||
when there is only one path, it will send back an MP_FAIL option on the subflow-level ACK, referring to the data-level sequence number of the start of the | when there is only one path, it will send back an MP_FAIL option on the subflow-level ACK, referring to the data-level sequence number of the start of the | |||
segment on which the checksum error was detected. The sender will receive | segment on which the checksum error was detected. The sender will receive | |||
this, and if all unacknowledged data in flight is contiguous, will signal an inf | this information and, if all unacknowledged data in flight is contiguous, will s | |||
inite mapping. | ignal an infinite mapping. | |||
This infinite mapping will be a DSS option (<xref target="sec_generalop"/>) | This infinite mapping will be a DSS option (<xref target="sec_generalop" format= | |||
on the first new packet, containing a data sequence mapping that acts retroactiv | "default"/>) | |||
ely, referring to the start of the subflow sequence | on the first new packet, containing a Data Sequence Mapping that acts retroactiv | |||
number of the most recent segment that was known to be delivered intact (i.e. wa | ely, referring to the start of the subflow sequence | |||
s successfully DATA_ACKed). From that point onwards, data can be altered | number of the most recent segment that was known to be delivered intact (i.e., w | |||
as successfully DATA_ACKed). From that point onward, data can be altered | ||||
by a middlebox without affecting MPTCP, as the data stream is equivalent to a regular, legacy TCP session. | by a middlebox without affecting MPTCP, as the data stream is equivalent to a regular, legacy TCP session. | |||
Whilst in theory paths may only be damaged in one direction, and the MP_FAIL sig | While in theory paths may only be damaged in one direction -- and the MP_FAIL | |||
nal affects only one direction of traffic, | signal affects only one direction of traffic -- | |||
for implementation simplicity, the receiver of an MP_FAIL MUST also respond with | for simplicity of implementation, the receiver of an MP_FAIL <bcp14>MUST</bcp14> | |||
an MP_FAIL in the reverse direction and entirely revert to a regular TCP sessio | also respond with an MP_FAIL in the reverse direction and entirely revert to a | |||
n.</t> | regular TCP session.</t> | |||
<t>In the rare case that the data is not contiguous (which could happen when there is only one subflow but it is retransmitting data from a subflow | <t>In the rare case that the data is not contiguous (which could happen when there is only one subflow but it is retransmitting data from a subflow | |||
that has recently been uncleanly closed), the receiver MUST close the subflow wi | that has recently been uncleanly closed), the receiver <bcp14>MUST</bcp14> close | |||
th a RST with MP_FAIL. The receiver MUST discard all data that follows the | the subflow with a RST with MP_FAIL. The receiver <bcp14>MUST</bcp14> discard a | |||
data sequence number specified. The sender MAY attempt to create a new subflow b | ll data that follows the | |||
elonging to the same connection, and, if it chooses to do so, SHOULD place | data sequence number specified. The sender <bcp14>MAY</bcp14> attempt to | |||
the single subflow immediately in single-path mode by setting an infinite data s | create a new subflow belonging to the same connection and, if it chooses to do | |||
equence mapping. This mapping will begin from the data-level sequence number | so, <bcp14>SHOULD</bcp14> immediately place | |||
the single subflow in single-path mode by setting an infinite Data Sequence Mapp | ||||
ing. This mapping will begin from the data-level sequence number | ||||
that was declared in the MP_FAIL.</t> | that was declared in the MP_FAIL.</t> | |||
<t>After a sender signals an infinite mapping, it <bcp14>MUST</bcp14> on | ||||
<t>After a sender signals an infinite mapping, it MUST only use subflow | ly use subflow ACKs to clear its send buffer. | |||
ACKs to clear its send buffer. | ||||
This is because Data ACKs may become misaligned with the subflow ACKs when middleboxes insert or delete data. | This is because Data ACKs may become misaligned with the subflow ACKs when middleboxes insert or delete data. | |||
The receive SHOULD stop generating Data ACKs after it receives an infinite mappi | The receiver <bcp14>SHOULD</bcp14> stop generating Data ACKs after it receives | |||
ng. </t> | an infinite mapping.</t> | |||
<t>When a connection has fallen back with an infinite mapping, only one | ||||
<t>When a connection has fallen back with an infinite mapping, only one | subflow can send data; otherwise, the receiver would not know how to reorder the | |||
subflow can send data; otherwise, the receiver would not know how to reorder the | data. In practice, this means that all MPTCP subflows will have to be terminate | |||
data. In practice, this means that all MPTCP subflows will have to be terminate | d except one. Once MPTCP falls back to regular TCP, it <bcp14>MUST NOT</bcp14> r | |||
d except one. Once MPTCP falls back to regular TCP, it MUST NOT revert to MPTCP | evert to MPTCP later in the connection.</t> | |||
later in the connection.</t> | ||||
<t>It should be emphasized that MPTCP is not attempting to prevent the use of middleboxes that want to adjust the payload. An MPTCP-aware middlebox could provide such functionality by also rewriting checksums.</t> | <t>It should be emphasized that MPTCP is not attempting to prevent the use of middleboxes that want to adjust the payload. An MPTCP-aware middlebox could provide such functionality by also rewriting checksums.</t> | |||
</section> | </section> | |||
<section anchor="sec_errors" numbered="true" toc="default"> | ||||
<section title="Error Handling" anchor="sec_errors"> | <name>Error Handling</name> | |||
<t>In addition to the fallback mechanism as described above, the standar | <t>In addition to the fallback mechanism described above, the standard c | |||
d classes of TCP errors may need to be handled in an MPTCP-specific way. Note th | lasses of TCP errors may need to be handled in an MPTCP‑specific way. Note | |||
at changing semantics -- such as the relevance of a RST -- are covered in <xref | that changing semantics -- such as the relevance of a RST -- are covered in <xr | |||
target="sec_semantics"/>. Where possible, we do not want to deviate from regular | ef target="sec_semantics" format="default"/>. Where possible, we do not want to | |||
TCP behavior.</t> | deviate from regular TCP behavior.</t> | |||
<t>The following list covers possible errors and the appropriate MPTCP behavior: | <t>The following list covers possible errors and the appropriate MPTCP behavior: | |||
<list style="symbols"> | ||||
<t>Unknown token in MP_JOIN (or HMAC failure in MP_JOIN ACK, or miss | ||||
ing MP_JOIN in SYN/ACK response): send RST (analogous to TCP's behavior on an un | ||||
known port)</t> | ||||
<t>DSN out of window (during normal operation): drop the data, do no | ||||
t send Data ACKs</t> | ||||
<t>Remove request for unknown address ID: silently ignore</t> | ||||
</list> | ||||
</t> | </t> | |||
<ul spacing="normal"> | ||||
<li>Unknown token in MP_JOIN (or HMAC failure in MP_JOIN ACK, or missi | ||||
ng MP_JOIN in SYN/ACK response): send RST (analogous to TCP's behavior on an unk | ||||
nown port)</li> | ||||
<li>DSN out of window (during normal operation): drop the data; do not | ||||
send Data ACKs</li> | ||||
<li>Remove request for unknown Address ID: silently ignore</li> | ||||
</ul> | ||||
</section> | </section> | |||
<section anchor="heuristics" numbered="true" toc="default"> | ||||
<section title="Heuristics" anchor="heuristics"> | <name>Heuristics</name> | |||
<t>There are a number of heuristics that are needed for | <t>There are a number of heuristics that are needed for | |||
performance or deployment but that are not required for | performance or deployment but that are not required for | |||
protocol correctness. In this section, we detail such | protocol correctness. In this section, we detail such | |||
heuristics. Note that discussion of buffering and certain | heuristics. Note that discussions of buffering and certain | |||
sender and receiver window behaviors are presented in Sections | sender and receiver window behaviors are presented in Sections | |||
<xref target="sec_rwin" format="counter"/> and <xref target="sec_sender" | <xref target="sec_rwin" format="counter"/> and <xref | |||
format="counter"/>, | target="sec_sender" format="counter"/>, | |||
as well as retransmission in <xref target="sec_retransmit"/>.</t> | and retransmission is discussed in <xref target="sec_retransmit" format= | |||
"default"/>.</t> | ||||
<section title="Port Usage"> | <section numbered="true" toc="default"> | |||
<t>Under typical operation, an MPTCP implementation SHOULD use | <name>Port Usage</name> | |||
the same ports as already in use. In other words, the | <t>Under typical operation, an MPTCP implementation <bcp14>SHOULD</bcp | |||
destination port of a SYN containing an MP_JOIN option SHOULD | 14> use | |||
the same ports as the ports that are already in use. In other words, t | ||||
he | ||||
destination port of a SYN containing an MP_JOIN option <bcp14>SHOULD</ | ||||
bcp14> | ||||
be the same as the remote port of the first subflow in the | be the same as the remote port of the first subflow in the | |||
connection. The local port for such SYNs SHOULD also be the | connection. The local port for such SYNs <bcp14>SHOULD</bcp14> also b | |||
same as for the first subflow (and as such, an | e the | |||
implementation SHOULD reserve ephemeral ports across all | same as the port for the first subflow (and as such, an | |||
implementation <bcp14>SHOULD</bcp14> reserve ephemeral ports across al | ||||
l | ||||
local IP addresses), although there may be cases where this | local IP addresses), although there may be cases where this | |||
is infeasible. This strategy is intended to maximize the | is infeasible. This strategy is intended to maximize the | |||
probability of the SYN being permitted by a firewall or NAT | probability of the SYN being permitted by a firewall or NAT | |||
at the recipient and to avoid confusing any network | at the recipient and to avoid confusing any network-monitoring softwar | |||
monitoring software.</t> | e.</t> | |||
<t>There may also be cases, however, where a host wishes to | <t>There may also be cases, however, where a host wishes to | |||
signal that a specific port should be used, and this facility | signal that a specific port should be used; this facility | |||
is provided in the ADD_ADDR option as documented in | is provided in the ADD_ADDR option as documented in | |||
<xref target="sec_add_address"/>. It is therefore feasible | <xref target="sec_add_address" format="default"/>. It is therefore feasible | |||
to allow multiple subflows between the same two addresses | to allow multiple subflows between the same two addresses | |||
but using different port pairs, and | but using different port pairs, and | |||
such a facility could be used to allow load balancing within | such a facility could be used to allow load balancing within | |||
the network based on 5-tuples (e.g., some ECMP implementations <xref target="RFC2992"/>).</t> | the network based on 5-tuples (e.g., some ECMP implementations <xref target="RFC2992" format="default"/>).</t> | |||
</section> | </section> | |||
<section numbered="true" toc="default"> | ||||
<section title="Delayed Subflow Start and Subflow Symmetry"> | <name>Delayed Subflow Start and Subflow Symmetry</name> | |||
<t>Many TCP connections are short-lived and consist only of a few | <t>Many TCP connections are short-lived and consist only of a few | |||
segments, and so the overheads | segments, and so the overhead | |||
of using MPTCP outweigh any benefits. A heuristic is required, | of using MPTCP outweighs any benefits. A heuristic is required, | |||
therefore, to decide when to start using additional subflows in | therefore, to decide when to start using additional subflows in | |||
an MPTCP connection. Experimental deployments have shown that | an MPTCP connection. Experimental deployments have shown that | |||
MPTCP can be applied in a range of scenarios so an implementation | MPTCP can be applied in a range of scenarios, so an implementation | |||
is likely to need to take into account factors including the type of | will likely need to take into account such factors as the type of | |||
traffic being sent and duration of session, and this information | traffic being sent and the duration of the session; this information | |||
MAY be signalled by the application layer.</t> | <bcp14>MAY</bcp14> be signaled by the application layer.</t> | |||
<t>However, for standard TCP traffic, a suggested general-purpose | <t>However, for standard TCP traffic, a suggested general-purpose | |||
heuristic that an implementation MAY choose to employ is as follows.</ | heuristic that an implementation <bcp14>MAY</bcp14> choose to employ i | |||
t> | s as follows.</t> | |||
<t>If a host has data buffered for its peer (which implies that the | <t>If a host has data buffered for its peer (which implies that the | |||
application has received a request for data), the host opens one | application has received a request for data), the host opens one | |||
subflow for each initial window's worth of data that is buffered.</t> | subflow for each initial window's worth of data that is buffered.</t> | |||
<t>Consideration should also be given to limiting the rate of adding | <t>Consideration should also be given to limiting the rate of adding | |||
new subflows, as well as limiting the total number of subflows open | new subflows, as well as limiting the total number of subflows open | |||
for a particular connection. A host may choose to vary these values | for a particular connection. A host may choose to vary these values | |||
based on its load or knowledge of traffic and path characteristics.</t> | based on its load or knowledge of traffic and path characteristics.</t> | |||
<t>Note that this heuristic alone is probably insufficient. Traffic | <t>Note that this heuristic alone is probably insufficient. Traffic | |||
for many common applications, such as downloads, is highly asymmetric and | for many common applications, such as downloads, is highly asymmetric, and | |||
the host that is multihomed may well be the client that will never fill | the host that is multihomed may well be the client that will never fill | |||
its buffers, and thus never use MPTCP according to this heuristic. Advanced APIs that allow an | its buffers and thus never use MPTCP according to this heuristic. Advanced APIs that allow an | |||
application to signal its traffic requirements would aid in these decisions.</t> | application to signal its traffic requirements would aid in these decisions.</t> | |||
<t>An additional time-based heuristic could be applied, opening additional | <t>An additional time-based heuristic could be applied, opening additional | |||
subflows after a given period of time has passed. This would alleviate the | subflows after a given period of time has passed. This would alleviate the | |||
above issue, and also provide resilience for low-bandwidth but long-lived | above issue and also provide resilience for low‑bandwidth but long-lived | |||
applications.</t> | applications.</t> | |||
<t>Another issue is that both communicating hosts may simultaneously try to | <t>Another issue is that both communicating hosts may simultaneously try to | |||
set up a subflow between the same pair of addresses. This leads to an | set up a subflow between the same pair of addresses. This leads to an | |||
inefficient use of resources.</t> | inefficient use of resources.</t> | |||
<t>If the same ports are used on all subflows, as recommended above, | <t>If the same ports are used on all subflows, as recommended above, | |||
then standard TCP simultaneous open logic should take care of this situation | then standard TCP simultaneous-open logic should take care of this situation | |||
and only one subflow will be established between the address pairs. However, | and only one subflow will be established between the address pairs. However, | |||
this relies on the same ports being used at both end hosts. If a host does | this relies on the same ports being used at both end hosts. If a host does | |||
not support TCP simultaneous open, it is RECOMMENDED that some element | not support TCP simultaneous open, it is <bcp14>RECOMMENDED</bcp14> th | |||
of randomization is applied to the time to wait before opening new sub | at some element | |||
flows, | of randomization be applied to the time to wait before opening new sub | |||
flows, | ||||
so that only one subflow is created between a given address pair. If, however, | so that only one subflow is created between a given address pair. If, however, | |||
hosts signal additional ports to use (for example, for leveraging ECMP on-path), | hosts signal additional ports to use (for example, for leveraging ECMP on-path), | |||
this heuristic is not appropriate.</t> | this heuristic is not appropriate.</t> | |||
<t>This section has shown some of the factors that an implementer | ||||
<t>This section has shown some of the considerations that an implement | should consider when developing MPTCP heuristics, but it is not intend | |||
er | ed to be | |||
should give when developing MPTCP heuristics, but is not intended to b | ||||
e | ||||
prescriptive.</t> | prescriptive.</t> | |||
</section> | </section> | |||
<section numbered="true" toc="default"> | ||||
<section title="Failure Handling"> | <name>Failure Handling</name> | |||
<t>Requirements for MPTCP's handling of unexpected signals have been | <t>Requirements for MPTCP's handling of unexpected signals are | |||
given in <xref target="sec_errors"/>. There are other failure cases, | given in <xref target="sec_errors" format="default"/>. There are other | |||
however, where a hosts can choose appropriate behavior.</t> | failure cases, | |||
however, where hosts can choose appropriate behavior.</t> | ||||
<t>For example, <xref target="sec_init"/> suggests that a host SHOULD | <t>For example, <xref target="sec_init" format="default"/> suggests th | |||
at a host <bcp14>SHOULD</bcp14> | ||||
fall back to trying regular TCP SYNs after one or more failures of MPTCP | fall back to trying regular TCP SYNs after one or more failures of MPTCP | |||
SYNs for a connection. A host may keep a system-wide cache of such | SYNs for a connection. A host may keep a system-wide cache of such | |||
information, so that it can back off from using MPTCP, firstly for that | information, so that it can back off from using MPTCP, firstly for that | |||
particular destination host, and eventually on a whole interface, if | particular destination host and, eventually, on a whole interface, if | |||
MPTCP connections continue failing. The duration of such a cache would | MPTCP connections continue to fail. The duration of such a cache would | |||
be implementation-specific.</t> | be implementation specific.</t> | |||
<t>Another failure could occur when the MP_JOIN handshake fails. | <t>Another failure could occur when the MP_JOIN handshake fails. | |||
<xref target="sec_errors"/> specifies that an incorrect handshake MUST | <xref target="sec_errors" format="default"/> specifies that an incorrect handshake <bcp14>MUST</bcp14> | |||
lead to the subflow being closed with a RST. A host operating an active | lead to the subflow being closed with a RST. A host operating an active | |||
intrusion detection system may choose to start blocking MP_JOIN packets | intrusion-detection system may choose to start blocking MP_JOIN packets | |||
from the source host if multiple failed MP_JOIN attempts are seen. From | from the source host if multiple failed MP_JOIN attempts are seen. From | |||
the connection initiator's point of view, if an MP_JOIN fails, it SHOU | the connection initiator's point of view, if an MP_JOIN fails, it | |||
LD | <bcp14>SHOULD NOT</bcp14> | |||
NOT attempt to connect to the same IP address and port during the life | attempt to connect to the same IP address and port during the lifetime | |||
time | ||||
of the connection, unless the other host refreshes the information with | of the connection, unless the other host refreshes the information with | |||
another ADD_ADDR option. Note that the ADD_ADDR option is informational | another ADD_ADDR option. Note that the ADD_ADDR option is informational | |||
only, and does not guarantee the other host will attempt a connection. | only and does not guarantee that the other host will attempt a connect | |||
</t> | ion.</t> | |||
<t>In addition, an implementation may learn, over a number of connections, | <t>In addition, an implementation may learn, over a number of connections, | |||
that certain interfaces or destination addresses consistently fail and | that certain interfaces or destination addresses consistently fail and | |||
may default to not trying to use MPTCP for these. Behavior could also | may default to not trying to use MPTCP for such interfaces or | |||
be learned for particularly badly performing subflows or subflows that | addresses. The behavior of subflows that perform particularly badly | |||
regularly fail during use, in order to temporarily choose not to use | or subflows that regularly fail during use could also | |||
be learned, so that an implementation can temporarily choose not to us | ||||
e | ||||
these paths.</t> | these paths.</t> | |||
</section> | </section> | |||
</section> | </section> | |||
</section> | </section> | |||
<section anchor="sec_semantics" numbered="true" toc="default"> | ||||
<section title="Semantic Issues" anchor="sec_semantics"> | <name>Semantic Issues</name> | |||
<t>In order to support multipath operation, the semantics of some TCP comp | <t>In order to support multipath operation, the semantics of some TCP | |||
onents have changed. To aid clarity, this section collects these semantic change | components have changed. To help clarify, this section lists these | |||
s as a reference. | semantic changes as a point of reference. | |||
<list style="hanging"> | </t> | |||
<t hangText="Sequence number:"> The (in-header) TCP sequence | <dl newline="false" spacing="normal" indent="3"> | |||
<dt>Sequence number:</dt> | ||||
<dd> The (in-header) TCP sequence | ||||
number is specific to the subflow. To allow the receiver to | number is specific to the subflow. To allow the receiver to | |||
reorder application data, an additional data-level | reorder application data, an additional data-level | |||
sequence space is used. In this data-level sequence space, the initi | sequence space is used. In this data‑level sequence space, the | |||
al SYN and | initial SYN and | |||
the final DATA_FIN occupy 1 octet of sequence space. This is to ensu | the final DATA_FIN occupy 1 octet of sequence space. This is done to | |||
re these | ensure that these | |||
signals are acknowledged at the connection level. There is an explicit | signals are acknowledged at the connection level. There is an explicit | |||
mapping of data sequence space to subflow sequence space, | mapping of data sequence space to subflow sequence space, | |||
which is signaled through TCP options in data | which is signaled through TCP options in data | |||
packets.</t> | packets.</dd> | |||
<dt>ACK:</dt> | ||||
<t hangText="ACK:"> The ACK field in the TCP header | <dd> The ACK field in the TCP header | |||
acknowledges only the subflow sequence number, not the | acknowledges only the subflow sequence number -- not the | |||
data-level sequence space. Implementations SHOULD NOT | data-level sequence space. Implementations <bcp14>SHOULD NOT</bcp14> | |||
attempt to infer a data-level acknowledgment from the | attempt to infer a data-level acknowledgment from the | |||
subflow ACKs. | subflow ACKs. | |||
This separates subflow- and connection-level processing | This separates subflow-level and connection-level processing | |||
at an end host.</t> | at an end host.</dd> | |||
<dt>Duplicate ACK:</dt> | ||||
<t hangText="Duplicate ACK:"> A duplicate ACK that includes any MPTCP | <dd> A duplicate ACK that includes any MPTCP signaling | |||
signaling | (with the exception of the DSS option) <bcp14>MUST NOT</bcp14> be tr | |||
(with the exception of the DSS option) MUST NOT be treated as a sign | eated as a signal of congestion. | |||
al of congestion. | ||||
To limit the chances of non-MPTCP-aware entities mistakenly interpreting duplicate | To limit the chances of non-MPTCP-aware entities mistakenly interpreting duplicate | |||
ACKs as a signal of congestion, MPTCP SHOULD NOT send more than two | ACKs as a signal of congestion, MPTCP <bcp14>SHOULD NOT</bcp14> send | |||
duplicate ACKs | more than two duplicate ACKs | |||
containing (non-DSS) MPTCP signals in a row.</t> | containing (non-DSS) MPTCP signals in a row.</dd> | |||
<dt>Receive Window:</dt> | ||||
<t hangText="Receive Window:">The receive window in the TCP | <dd>The receive window in the TCP | |||
header indicates the amount of free buffer space for the | header indicates the amount of free buffer space for the | |||
whole data-level connection (as opposed to for this | whole data-level connection (as opposed to the amount of space for t | |||
subflow) that is available at the receiver. This is the | his | |||
same semantics as regular TCP, but to maintain these | subflow) that is available at the receiver. The | |||
semantics are the same as for regular TCP, but to maintain these | ||||
semantics the receive window must be interpreted at the | semantics the receive window must be interpreted at the | |||
sender as relative to the sequence number given in the | sender as relative to the sequence number given in the | |||
DATA_ACK rather than the subflow ACK in the TCP header. | DATA_ACK rather than the subflow ACK in the TCP header. | |||
In this way, the original flow control role is preserved. | In this way, the original role of flow control is preserved. | |||
Note that some middleboxes may change the receive window, | Note that some middleboxes may change the receive window, | |||
and so a host SHOULD use the maximum value of those recently | and so a host <bcp14>SHOULD</bcp14> use the maximum value of those recently | |||
seen on the constituent subflows for the connection-level | seen on the constituent subflows for the connection-level | |||
receive window, and also needs to maintain a subflow-level | receive window and also needs to maintain a subflow-level | |||
window for subflow-level processing.</t> | window for subflow-level processing.</dd> | |||
<dt>FIN:</dt> | ||||
<t hangText="FIN:"> The FIN flag in the TCP header applies | <dd> The FIN flag in the TCP header applies | |||
only to the subflow it is sent on, not to the whole | only to the subflow it is sent on -- not to the whole | |||
connection. For connection-level FIN semantics, the | connection. For connection-level FIN semantics, the | |||
DATA_FIN option is used.</t> | DATA_FIN option is used.</dd> | |||
<dt>RST:</dt> | ||||
<t hangText="RST:"> The RST flag in the TCP header applies | <dd> The RST flag in the TCP header applies | |||
only to the subflow it is sent on, not to the whole | only to the subflow it is sent on -- not to the whole | |||
connection. The MP_FASTCLOSE option provides the fast close | connection. The MP_FASTCLOSE option provides the Fast Close | |||
functionality of a RST at the MPTCP connection level.</t> | functionality of a RST at the MPTCP connection level.</dd> | |||
<dt>Address List:</dt> | ||||
<t hangText="Address List:"> Address list management (i.e., | <dd> Address list management (i.e., | |||
knowledge of the local and remote hosts' lists of | knowledge of the local and remote hosts' lists of | |||
available IP addresses) is handled | available IP addresses) is handled | |||
on a per-connection basis (as opposed to per subflow, per | on a per-connection basis (as opposed to per subflow, per | |||
host, or per pair of communicating hosts). This permits | host, or per pair of communicating hosts). This permits | |||
the application of per-connection local policy. Adding an | the application of per-connection local policy. Adding an | |||
address to one connection (either explicitly through an Add | address to one connection (either explicitly through an | |||
Address message, or implicitly through a Join) has no implication | ADD_ADDR message or implicitly through an MP_JOIN) has no implicatio | |||
for other connections between the same pair of hosts.</t> | ns | |||
for other connections between the same pair of hosts.</dd> | ||||
<t hangText="5-tuple:"> The 5-tuple (protocol, local | <dt>5-tuple:</dt> | |||
<dd> The 5-tuple (protocol, local | ||||
address, local port, remote address, remote port) | address, local port, remote address, remote port) | |||
presented by kernel APIs to the application layer in a | presented by kernel APIs to the application layer in a | |||
non-multipath-aware application is that of the first | non-multipath-aware application is that of the first | |||
subflow, even if the subflow has since been closed and | subflow, even if the subflow has since been closed and | |||
removed from the connection. This decision, and other | removed from the connection. This decision, and other | |||
related API issues, are discussed in more detail in | related API issues, are discussed in more detail in | |||
<xref target="RFC6897"/>.</t> | <xref target="RFC6897" format="default"/>.</dd> | |||
</list> | </dl> | |||
</t> | ||||
</section> | </section> | |||
<section anchor="sec_security" numbered="true" toc="default"> | ||||
<section title="Security Considerations" anchor="sec_security"> | <name>Security Considerations</name> | |||
<t>As identified in <xref target="RFC6181"/>, the addition of multipath ca | <t>As identified in <xref target="RFC6181" format="default"/>, the | |||
pability to TCP will bring with it a number of new classes of threat. In order t | addition of multipath capability to TCP will bring with it a number of | |||
o prevent these, <xref target="RFC6182"/> presents a set of requirements for a s | new classes of threats. In order to prevent these threats, <xref target="R | |||
ecurity solution for MPTCP. The fundamental goal is for the security of MPTCP to | FC6182" | |||
be "no worse" than regular TCP today, and the key security requirements are: | format="default"/> presents a set of requirements for a security | |||
<list style="symbols"> | solution for MPTCP. The fundamental goal is for the security of MPTCP to | |||
<t>Provide a mechanism to confirm that the parties in a subflow handsh | be "no worse" than regular TCP today. The key security requirements | |||
ake are the same as in the original connection setup.</t> | are as follows: | |||
<t>Provide verification that the peer can receive traffic at a new add | </t> | |||
ress before using it as part of a connection.</t> | <ul spacing="normal"> | |||
<t>Provide replay protection, i.e., ensure that a request to add/remov | <li>Provide a mechanism to confirm that the parties in a subflow | |||
e a subflow is 'fresh'.</t> | handshake are the same as the parties in the original connection setup.< | |||
</list> | /li> | |||
<li>Provide verification that the peer can receive traffic at a new addr | ||||
In order to achieve these goals, MPTCP includes a hash-based handshake a | ess before using it as part of a connection.</li> | |||
lgorithm documented in Sections <xref target="sec_init" format="counter"/> and < | <li>Provide replay protection, i.e., ensure that a request to add&wj;/re | |||
xref target="sec_join" format="counter"/>.</t> | move a subflow is "fresh".</li> | |||
</ul> | ||||
<t>The security of the MPTCP connection hangs on the use of keys that are | <t> | |||
shared once at the start of the first subflow, and are never sent again over the | In order to achieve these goals, MPTCP includes a hash-based handshake | |||
network (unless used in the fast close mechanism, <xref target="sec_fastclose"/ | algorithm, as documented in Sections <xref target="sec_init" format="count | |||
>). To ease demultiplexing while not giving away any cryptographic material, fu | er"/> and <xref target="sec_join" format="counter"/>.</t> | |||
ture subflows use a truncated cryptographic hash of this key as the connection i | <t>The security of the MPTCP connection hangs on the use of keys that | |||
dentification "token". The keys are concatenated and used as keys for creating | are shared once at the start of the first subflow and are never sent | |||
Hash-based Message Authentication Codes (HMACs) used on subflow setup, in order | again over the network (unless used in the Fast Close mechanism (<xref | |||
to verify that the parties in the handshake are the same as in the original conn | target="sec_fastclose" format="default"/>)). To ease demultiplexing | |||
ection setup. It also provides verification that the peer can receive traffic a | while not giving away any cryptographic material, future subflows use a | |||
t this new address. Replay attacks would still be possible when only keys are u | truncated cryptographic hash of this key as the connection | |||
sed; therefore, the handshakes use single-use random numbers (nonces) at both en | identification "token". The keys are concatenated and used as keys for | |||
ds -- this ensures the HMAC will never be the same on two handshakes. Guidance o | creating Hash-based Message Authentication Codes (HMACs) used on subflow | |||
n generating random numbers suitable for use as keys is given in <xref target="R | setup, in order to verify that the parties in the handshake are the same | |||
FC4086"/> and discussed in <xref target="sec_init"/>. The nonces are valid for t | as the parties in the original connection setup. It also provides verific | |||
he lifetime of the TCP connection attempt. HMAC is also used to secure the ADD_A | ation that | |||
DDR option, due to the threats identified in <xref target="RFC7430"/>.</t> | the peer can receive traffic at this new address. Replay attacks would | |||
<t>The use of crypto capability bits in the initial connection handshake t | still be possible when only keys are used; therefore, the handshakes use | |||
o negotiate use of a particular algorithm allows the deployment of additional cr | single-use random numbers (nonces) at both ends -- this ensures that the H | |||
ypto mechanisms in the future. This negotiation would nevertheless be susceptib | MAC will never be the same on two handshakes. Guidance on generating random numb | |||
le to a bid-down attack by an on-path active attacker who could modify the crypt | ers suitable for use as keys is given in <xref target="RFC4086" format="default" | |||
o capability bits in the response from the receiver to use a less secure crypto | /> and discussed in <xref target="sec_init" format="default"/>. The nonces are v | |||
mechanism. The security mechanism presented in this document should therefore pr | alid for the lifetime of the TCP connection attempt. HMAC is also used to secure | |||
otect against all forms of flooding and hijacking attacks discussed in <xref tar | the ADD_ADDR option, due to the threats identified in <xref target="RFC7430" fo | |||
get="RFC6181"/>.</t> | rmat="default"/>.</t> | |||
<t>The use of crypto capability bits in the initial connection handshake | ||||
<t>The version negotiation specified in <xref target="sec_init"/>, if diff | to negotiate the use of a particular algorithm allows the deployment of ad | |||
ering MPTCP versions shared a common negotiation format, would allow an on-path | ditional crypto mechanisms in the future. This negotiation would nevertheless b | |||
attacker to apply a theoretical bid-down attack. Since the v1 and v0 protocols h | e susceptible to a bid-down attack by an on-path active attacker who could modif | |||
ave a different handshake, such an attack would require the client to re-establi | y the crypto capability bits in the response from the receiver to use a less sec | |||
sh the connection using v0, and this being supported by the server. Note that an | ure crypto mechanism. The security mechanism presented in this document should t | |||
on-path attacker would have access to the raw data, negating any other TCP-leve | herefore protect against all forms of flooding and hijacking attacks discussed i | |||
l security mechanisms. | n <xref target="RFC6181" format="default"/>.</t> | |||
Also a change from RFC6824 has removed the subflow identifier from the MP_ | <t>The version negotiation specified in <xref target="sec_init" | |||
PRIO option (<xref target="sec_policy"/>), to remove the theoretical attack wher | format="default"/>, if differing MPTCP versions shared a common | |||
e a subflow could be placed in "backup" mode by an attacker.</t> | negotiation format, would allow an on-path attacker to apply a | |||
theoretical bid-down attack. Since the v1 and v0 protocols have a | ||||
<t>During normal operation, regular TCP protection mechanisms (such as ens | different handshake, such an attack would require that the client | |||
uring sequence numbers are in-window) will provide the same level of protection | re-establish the connection using v0 and that the server support v0. | |||
against attacks on individual TCP subflows as exists for regular TCP today. Impl | Note that an on-path attacker would have access to the raw data, negating any o | |||
ementations will introduce additional buffers compared to regular TCP, to reasse | ther TCP-level security mechanisms. As also noted in <xref target="app_changelog | |||
mble data at the connection level. The application of window sizing will minimiz | "/>, this document specifies the removal of the AddrID field <xref target="RFC68 | |||
e the risk of denial-of-service attacks consuming resources.</t> | 24"/> in the MP_PRIO option (<xref target="sec_policy" format="default"/>). | |||
This change eliminates the possibility of a theoretical attack where | ||||
<t>As discussed in <xref target="sec_add_address"/>, a host may advertise | a subflow could be placed in "backup" mode by an attacker.</t> | |||
its private addresses, but these might point to different hosts in the receiver' | <t>During normal operation, regular TCP protection mechanisms (such as | |||
s network. The MP_JOIN handshake (<xref target="sec_join"/>) will ensure that th | ensuring that sequence numbers are in-window) will provide the same | |||
is does not succeed in setting up a subflow to the incorrect host. However, it c | level of protection against attacks on individual TCP subflows as the | |||
ould still create unwanted TCP handshake traffic. This feature of MPTCP could be | level of protection that exists for regular TCP today. Implementations wil | |||
a target for denial-of-service exploits, with malicious participants in MPTCP c | l introduce additional buffers compared to regular TCP, to reassemble data at th | |||
onnections encouraging the recipient to target other hosts in the network. There | e connection level. The application of window sizing will minimize the risk of d | |||
fore, implementations should consider heuristics (<xref target="heuristics"/>) a | enial-of-service attacks consuming resources.</t> | |||
t both the sender and receiver to reduce the impact of this.</t> | <t>As discussed in <xref target="sec_add_address" format="default"/>, a ho | |||
st may advertise its private addresses, but these might point to different hosts | ||||
in the receiver's network. The MP_JOIN handshake (<xref target="sec_join" forma | ||||
t="default"/>) will ensure that this does not succeed in setting up a subflow to | ||||
the incorrect host. However, it could still create unwanted TCP handshake traff | ||||
ic. This feature of MPTCP could be a target for denial-of-service exploits, with | ||||
malicious participants in MPTCP connections encouraging the recipient to target | ||||
other hosts in the network. Therefore, implementations should consider heuristi | ||||
cs (<xref target="heuristics" format="default"/>) at both the sender and receive | ||||
r to reduce the impact of this.</t> | ||||
<t>To further protect against malicious ADD_ADDR messages sent by an off-p ath attacker, the ADD_ADDR includes an HMAC using the keys negotiated during the handshake. This effectively prevents an attacker from diverting an MPTCP connec tion through an off-path ADD_ADDR injection into the stream.</t> | <t>To further protect against malicious ADD_ADDR messages sent by an off-p ath attacker, the ADD_ADDR includes an HMAC using the keys negotiated during the handshake. This effectively prevents an attacker from diverting an MPTCP connec tion through an off-path ADD_ADDR injection into the stream.</t> | |||
<t>A small security risk could theoretically exist with key reuse, but in | ||||
<t>A small security risk could theoretically exist with key reuse, but in | order to accomplish a replay attack, both the sender and receiver keys, and the | |||
order to accomplish a replay attack, both the sender and receiver keys, and the | sender and receiver random numbers, in the MP_JOIN handshake (<xref target="sec_ | |||
sender and receiver random numbers, in the MP_JOIN handshake (<xref target="sec_ | join" format="default"/>) would have to match.</t> | |||
join"/>) would have to match.</t> | <t>While this specification defines a "medium" security solution, | |||
meeting the criteria specified at the start of this section and in the | ||||
<t>Whilst this specification defines a "medium" security solution, meeting | threat analysis document <xref target="RFC6181" format="default"/>, since | |||
the criteria specified at the start of this section and the threat analysis (<x | attacks | |||
ref target="RFC6181"/>), since attacks only ever get worse, it is likely that a | only ever get worse, it is likely that a future version of MPTCP would | |||
future version of MPTCP would need to be able to support stronger security. Ther | need to be able to support stronger security. | |||
e are several ways the security of MPTCP could potentially be improved; some of | There are several ways the security of MPTCP could potentially be improved; som | |||
these would be compatible with MPTCP as defined in this document, whilst others | e of these would be compatible with MPTCP as defined in this document, while oth | |||
may not be. For now, the best approach is to get experience with the current app | ers may not be. For now, the best approach is to gain experience with the curren | |||
roach, establish what might work, and check that the threat analysis is still ac | t approach, establish what might work, and check that the threat analysis is sti | |||
curate.</t> | ll accurate.</t> | |||
<t>Possible ways of improving MPTCP security could include:</t> | ||||
<t>Possible ways of improving MPTCP security could include:<list style="symbols" | <ul spacing="normal"> | |||
> | <li>defining a new MPTCP cryptographic algorithm, as negotiated in | |||
<t>defining a new MPTCP cryptographic algorithm, as negotiated in MP_CAPABLE. A | MP_CAPABLE. If an implementation was being deployed in a controlled | |||
sub-case could be to include an additional deployment assumption, such as statef | environment where additional assumptions could be made, such as the | |||
ul servers, in order to allow a more powerful algorithm to be used.</t> | ability for the servers to store state during the TCP handshake, then | |||
<t>defining how to secure data transfer with MPTCP, whilst not changing the sign | it may be possible to use a stronger cryptographic algorithm than | |||
aling part of the protocol.</t> | would otherwise be possible.</li> | |||
<t>defining security that requires more option space, perhaps in conjunction wit | <li>defining how to secure data transfer with MPTCP, while not changing | |||
h a "long options" proposal for extending the TCP options space (such as those s | the signaling part of the protocol.</li> | |||
urveyed in <xref target="TCPLO"/>), or perhaps building on the current approach | <li>defining security that requires more option space, perhaps in | |||
with a second stage of MPTCP-option-based security.</t> | conjunction with a "long options" proposal for extending the TCP | |||
<t>revisiting the working group's decision to exclusively use TCP options for MP | option space (such as those surveyed in <xref | |||
TCP signaling, and instead look at also making use of the TCP payloads.</t> | target="I-D.ananth-tcpm-tcpoptext" format="default"/>), or perhaps | |||
</list></t> | building on the current approach with a second stage of | |||
security based on MPTCP options.</li> | ||||
<t>MPTCP has been designed with several methods available to indicate a new secu | <li>revisiting the working group's decision to exclusively use TCP | |||
rity mechanism, including: | options for MPTCP signaling and instead looking at the | |||
<list style="symbols"> | possibility of using TCP payloads as well.</li> | |||
<t>available flags in MP_CAPABLE (<xref target="tcpm_capable"/>);</t> | </ul> | |||
<t>available subtypes in the MPTCP option (<xref target="fig_option"/>);</t> | <t>MPTCP has been designed with several methods available to indicate a ne | |||
<t>the version field in MP_CAPABLE (<xref target="tcpm_capable"/>);</t> | w security mechanism, including: | |||
</list></t> | </t> | |||
<ul spacing="normal"> | ||||
<li>available flags in MP_CAPABLE (<xref target="tcpm_capable" format="d | ||||
efault"/>).</li> | ||||
<li>available subtypes in the MPTCP option (<xref target="fig_option" fo | ||||
rmat="default"/>).</li> | ||||
<li>the Version field in MP_CAPABLE (<xref target="tcpm_capable" format= | ||||
"default"/>).</li> | ||||
</ul> | ||||
</section> | </section> | |||
<section anchor="sec_middleboxes" numbered="true" toc="default"> | ||||
<section title="Interactions with Middleboxes" anchor="sec_middleboxes"> | <name>Interactions with Middleboxes</name> | |||
<t>Multipath TCP was designed to be deployable in the present world. Its d | ||||
<t>Multipath TCP was designed to be deployable in the present world. Its | esign takes into account "reasonable" | |||
design takes into account "reasonable" | ||||
existing middlebox behavior. In this section, we outline a few representative mi ddlebox-related failure scenarios and | existing middlebox behavior. In this section, we outline a few representative mi ddlebox-related failure scenarios and | |||
show how Multipath TCP handles them. Next, we list the design decisions multipat | show how Multipath TCP handles them. Next, we list the design decisions | |||
h has made to accommodate the different | Multipath TCP has made to accommodate the different | |||
middleboxes.</t> | middleboxes.</t> | |||
<t>A primary concern is our use of a new TCP option. Middleboxes should fo | ||||
<t>A primary concern is our use of a new TCP option. Middleboxes should | rward packets | |||
forward packets | with unknown options unchanged, yet there are some that don't. We expect these | |||
with unknown options unchanged, yet there are some that don't. These we expect w | middleboxes to strip options and pass the data, | |||
ill either strip options and pass the data, | ||||
drop packets with new options, copy the same option into multiple segments (e.g. , when doing segmentation), or drop | drop packets with new options, copy the same option into multiple segments (e.g. , when doing segmentation), or drop | |||
options during segment coalescing.</t> | options during segment coalescing.</t> | |||
<t>MPTCP uses a single new TCP option called "Kind", and all message types | ||||
are defined by "subtype" values (see <xref target="IANA" format="default"/>). T | ||||
his should reduce the chances of only some types of MPTCP options being passed; | ||||
instead, the key differing characteristics are different paths and the presence | ||||
of the SYN flag.</t> | ||||
<t>MPTCP SYN packets on the first subflow of a connection contain the MP_C | ||||
APABLE option (<xref target="sec_init" format="default"/>). If this is dropped, | ||||
MPTCP <bcp14>SHOULD</bcp14> fall back to regular TCP. If packets with the MP_JOI | ||||
N option (<xref target="sec_join" format="default"/>) are dropped, the paths wil | ||||
l simply not be used.</t> | ||||
<t>If a middlebox strips options but otherwise passes the packets | ||||
unchanged, MPTCP will behave safely. If an MP_CAPABLE option is dropped | ||||
on either the outgoing path or the return path, the initiating host can | ||||
fall back to regular TCP, as illustrated in <xref target="fig_syn" | ||||
format="default"/> and discussed in <xref target="sec_init" | ||||
format="default"/>.</t> | ||||
<figure anchor="fig_syn"> | ||||
<name>Connection Setup with Middleboxes That Strip Options from Packets< | ||||
/name> | ||||
<artwork align="left" name="" type="" alt=""><![CDATA[ | ||||
Host A Host B | ||||
| Middlebox M | | ||||
| | | | ||||
| SYN (MP_CAPABLE) | SYN | | ||||
|-------------------|---------------->| | ||||
| SYN/ACK | | ||||
|<------------------------------------| | ||||
a) MP_CAPABLE option stripped on outgoing path | ||||
<t>MPTCP uses a single new TCP option "Kind", and all message types are | Host A Host B | |||
defined by "subtype" values (see <xref target="IANA"/>). This should reduce the | | SYN (MP_CAPABLE) | | |||
chances of only some types of MPTCP options being passed, and instead the key di | |-------------------------------------->| | |||
ffering characteristics are different paths, and the presence of the SYN flag.</ | | Middlebox M | | |||
t> | | | | | |||
| SYN/ACK |SYN/ACK (MP_CAPABLE)| | ||||
<t>MPTCP SYN packets on the first subflow of a connection contain the MP | |<-----------------|--------------------| | |||
_CAPABLE option (<xref target="sec_init"/>). If this is dropped, MPTCP SHOULD fa | b) MP_CAPABLE option stripped on return path ]]></artwork> | |||
ll back to regular TCP. If packets with the MP_JOIN option (<xref target="sec_jo | </figure> | |||
in"/>) are dropped, the paths will simply not be used.</t> | <t>Subflow SYNs contain the MP_JOIN option. If this option is stripped on | |||
the outgoing path, | ||||
<t>If a middlebox strips options but otherwise passes the packets unchan | the SYN will appear to be a regular SYN to Host B. Depending on whether th | |||
ged, MPTCP will behave safely. If an MP_CAPABLE option is dropped on either the | ere is a listening socket on | |||
outgoing or the return path, the initiating host can fall back to regular TCP, a | the target port, Host B will reply with either a SYN/ACK or a RST (subflow conne | |||
s illustrated in <xref target="fig_syn"/> and discussed in <xref target="sec_ini | ction fails). When Host A | |||
t"/>.</t> | receives the SYN/ACK, it sends a RST because the SYN/ACK does not contain the MP | |||
_JOIN option and its token. | ||||
<t>Subflow SYNs contain the MP_JOIN option. If this option is stripped | Either way, the subflow setup fails but otherwise does not affect the MPTCP conn | |||
on the outgoing path, | ection as a whole.</t> | |||
the SYN will appear to be a regular SYN to Host B. Depending on whether th | <t>We now examine data flow with MPTCP, assuming that the flow is | |||
ere is a listening socket on | correctly set up, which implies that the options in the SYN | |||
the target port, Host B will reply either with SYN/ACK or RST (subflow connectio | ||||
n fails). When Host A | ||||
receives the SYN/ACK it sends a RST because the SYN/ACK does not contain the MP_ | ||||
JOIN option and its token. | ||||
Either way, the subflow setup fails, but otherwise does not affect the MPTCP con | ||||
nection as a whole.</t> | ||||
<figure align="center" anchor="fig_syn" title="Connection Setup with Mid | ||||
dleboxes that Strip Options from Packets"> | ||||
<artwork align="left"><![CDATA[ | ||||
Host A Host B | ||||
| Middlebox M | | ||||
| | | | ||||
| SYN(MP_CAPABLE) | SYN | | ||||
|-------------------|---------------->| | ||||
| SYN/ACK | | ||||
|<------------------------------------| | ||||
a) MP_CAPABLE option stripped on outgoing path | ||||
Host A Host B | ||||
| SYN(MP_CAPABLE) | | ||||
|------------------------------------>| | ||||
| Middlebox M | | ||||
| | | | ||||
| SYN/ACK |SYN/ACK(MP_CAPABLE)| | ||||
|<----------------|-------------------| | ||||
b) MP_CAPABLE option stripped on return path | ||||
]]></artwork> | ||||
</figure> | ||||
<t>We now examine data flow with MPTCP, assuming the flow is correctly s | ||||
et up, which implies the options in the SYN | ||||
packets were allowed through by the relevant middleboxes. If options are allowed through and there is no resegmentation or | packets were allowed through by the relevant middleboxes. If options are allowed through and there is no resegmentation or | |||
coalescing to TCP segments, Multipath TCP flows can proceed without problems.</t > | coalescing to TCP segments, Multipath TCP flows can proceed without problems.</t > | |||
<t>The case when options get stripped on data packets is discussed | ||||
<t>The case when options get stripped on data packets has been discussed | in <xref target="sec_fallback" format="default"/>. | |||
in the Fallback section. | If only some MPTCP options are stripped, behavior is not deterministic. | |||
If only some MPTCP options are stripped, behavior is not deterministic. | If some Data Sequence Mappings are lost, the connection can continue so long as | |||
If some data sequence mappings are lost, the connection can continue so long as | mappings exist for the subflow-level data (e.g., if multiple maps have been sent | |||
mappings exist for the subflow-level data (e.g., if multiple maps have been sent | that reinforce each other). If some subflow-level space is left unmapped, howev | |||
that reinforce each other). If some subflow-level space is left unmapped, howev | er, the subflow is treated as broken and is closed, using the process described | |||
er, the subflow is treated as broken and is closed, through the process describe | in <xref target="sec_fallback" format="default"/>. MPTCP should survive with a l | |||
d in <xref target="sec_fallback"/>. MPTCP should survive with a loss of some Dat | oss of some Data ACKs, but performance will degrade as the fraction of stripped | |||
a ACKs, but performance will degrade as the fraction of stripped options increas | options increases. | |||
es. | ||||
We do not expect such cases to appear in practice, though: most | We do not expect such cases to appear in practice, though: most | |||
middleboxes will either strip all options or let them all through.</t> | middleboxes will either strip all options or let them all through.</t> | |||
<t>We end this section with a list of middlebox classes, their behavior, a | ||||
<t>We end this section with a list of middlebox classes, their behavior, | nd the elements in the MPTCP design | |||
and the elements in the MPTCP design | ||||
that allow operation through such middleboxes. Issues surrounding dropping packe ts with options | that allow operation through such middleboxes. Issues surrounding dropping packe ts with options | |||
or stripping options were discussed above, and are not included here: | or stripping options were discussed above and are not included here: | |||
<list style="symbols"> | </t> | |||
<t>NATs <xref target="RFC3022"/> (Network Address (and Port) Translato | <ul spacing="normal"> | |||
rs) change the source address (and often source port) of packets. This means tha | <li>NATs (Network Address (and port) Translators) <xref | |||
t a host will not know its | target="RFC3022" format="default"/> change the source address (and | |||
often the source port) of packets. This means that a host will not know | ||||
its | ||||
public-facing address for signaling in MPTCP. Therefore, MPTCP permits impli cit address addition via the MP_JOIN option, | public-facing address for signaling in MPTCP. Therefore, MPTCP permits impli cit address addition via the MP_JOIN option, | |||
and the handshake mechanism ensures that connection attempts to private addr | and the handshake mechanism ensures that connection attempts to private addr | |||
esses <xref target="RFC1918"/>, since they are authenticated, will only set up s | esses <xref target="RFC1918" format="default"/>, since they are authenticated, w | |||
ubflows to the correct hosts. | ill only set up subflows to the correct hosts. | |||
Explicit address removal is undertaken by an Address ID to allow no knowledg | Explicit address removal is undertaken by an Address ID to allow no knowledg | |||
e of the source address.</t> | e of the source address.</li> | |||
<li>Performance Enhancing Proxies (PEPs) <xref target="RFC3135" format=" | ||||
<t>Performance Enhancing Proxies (PEPs) <xref target="RFC3135"/> might | default"/> might proactively ACK data to increase performance. MPTCP, however, r | |||
proactively ACK data to increase performance. MPTCP, however, relies on accurat | elies on accurate congestion control signals from the end host, and non‑MP | |||
e congestion control signals from the end host, and non-MPTCP-aware PEPs will no | TCP-aware PEPs will not be able to provide such signals. MPTCP will, therefore, | |||
t be able to provide such signals. MPTCP will, therefore, fall back to single-pa | fall back to single-path TCP or close the problematic subflow (see <xref target= | |||
th TCP, or close the problematic subflow (see <xref target="sec_fallback"/>).</t | "sec_fallback" format="default"/>).</li> | |||
> | <li>Traffic normalizers <xref target="norm" format="default"/> may not | |||
allow holes in sequence numbers, and they may cache packets and retransm | ||||
<t>Traffic Normalizers <xref target="norm"/> may not allow holes in se | it the same data. | |||
quence numbers, and may cache packets and retransmit the same data. | MPTCP looks like standard TCP on the wire and will not retransmit different data | |||
MPTCP looks like standard TCP on the wire, and will not retransmit different dat | on the same subflow sequence number. In the event of a retransmission, the same | |||
a on the same subflow sequence number. In the event of a retransmission, the sam | data will be retransmitted on the original TCP subflow even if it is additional | |||
e data will be retransmitted on the original TCP subflow even if it is additiona | ly retransmitted at the connection level on a different subflow.</li> | |||
lly retransmitted at the connection level on a different subflow.</t> | <li>Firewalls <xref target="RFC2979" format="default"/> might perform | |||
Initial Sequence Number (ISN) randomization on TCP connections. MPTCP us | ||||
<t>Firewalls <xref target="RFC2979"/> might perform initial sequence n | es relative | |||
umber randomization on TCP connections. MPTCP uses relative | sequence numbers in Data Sequence Mappings to cope with this. Like NATs, firewal | |||
sequence numbers in data sequence mapping to cope with this. Like NATs, firewall | ls will not permit many incoming connections, so | |||
s will not permit many incoming connections, so | ||||
MPTCP supports address signaling (ADD_ADDR) so that a multiaddressed host can in vite its peer behind the firewall/NAT to connect | MPTCP supports address signaling (ADD_ADDR) so that a multiaddressed host can in vite its peer behind the firewall/NAT to connect | |||
out to its additional interface.</t> | out to its additional interface.</li> | |||
<li>Intrusion Detection Systems / Intrusion Prevention Systems (IDSs&wj; | ||||
<t>Intrusion Detection/Prevention Systems (IDS/IPS) observe packet str | /IPSs) observe packet streams for patterns and content that could threaten a net | |||
eams for patterns and content that could threaten a network. MPTCP may require t | work. MPTCP may require the | |||
he | instrumentation of additional paths, and an MPTCP-aware IDS or IPS would need to | |||
instrumentation of additional paths, and an MPTCP-aware IDS/IPS would need to re | read MPTCP tokens to correlate data from multiple subflows to maintain comparab | |||
ad MPTCP tokens to correlate data from multiple subflows to maintain comparable | le visibility into all of the traffic between devices. Without such changes, an | |||
visibility into all of the traffic between devices. Without such changes, an IDS | IDS would get an incomplete view of the traffic, increasing the risk of missing | |||
would get an incomplete view of the traffic, increasing the risk of missing tra | traffic of interest (false negatives) and increasing the chances of erroneously | |||
ffic of interest (false negatives), and increasing the chances of erroneously id | identifying a subflow as a risk due to only seeing partial data (false positives | |||
entifying a subflow as a risk due to only seeing partial data (false positives). | ).</li> | |||
</t> | <li>Application-level middleboxes such as content-aware firewalls may | |||
alter the payload within a subflow -- for example, rewriting URIs in | ||||
<t>Application-level middleboxes such as content-aware firewalls may a | HTTP traffic. MPTCP will detect such changes using the checksum | |||
lter the payload within a subflow, such as rewriting URIs in HTTP traffic. MPTCP | and close the affected subflow(s), if there are other subflows that can be used. | |||
will detect these using the checksum | If all subflows are affected, MPTCP | |||
and close the affected subflow(s), if there are other subflows that can be used. | will fall back to TCP, allowing such middleboxes to change the payload. MPTCP-aw | |||
If all subflows are affected, multipath | are middleboxes should be able to adjust the payload and MPTCP metadata in order | |||
will fall back to TCP, allowing such middleboxes to change the payload. MPTCP-aw | not to break the connection.</li> | |||
are middleboxes should be able to adjust the payload and MPTCP metadata in order | </ul> | |||
not to break the connection.</t> | <t> | |||
</list> | ||||
In addition, all classes of middleboxes may affect TCP traffic in the fo llowing ways: | In addition, all classes of middleboxes may affect TCP traffic in the fo llowing ways: | |||
<list style="symbols"> | ||||
<t>TCP options may be removed, or packets with unknown options dropped | ||||
, by many classes of middleboxes. It is intended | ||||
that the initial SYN exchange, with a TCP option, will be sufficient to identify | ||||
the path capabilities. If such a packet does | ||||
not get through, MPTCP will end up falling back to regular TCP.</t> | ||||
<t>Segmentation/Coalescing (e.g., TCP segmentation offloading) might c | ||||
opy options between packets and might | ||||
strip some options. MPTCP's data sequence mapping includes the relative subflow | ||||
sequence number instead of using the sequence | ||||
number in the segment. In this way, the mapping is independent of the packets th | ||||
at carry it.</t> | ||||
<t>The receive window may be shrunk by some middleboxes at the subflow | ||||
level. MPTCP will use the maximum window at data level, but will also obey | ||||
subflow-specific windows.</t> | ||||
</list> | ||||
</t> | </t> | |||
<ul spacing="normal"> | ||||
</section> | <li>TCP options may be removed, or packets with unknown options dropped, | |||
by many classes of middleboxes. It is intended | ||||
<section anchor="Acknowledgments" title="Acknowledgments"> | that the initial SYN exchange, with a TCP option, will be sufficient to identify | |||
<!-- <t>The authors were originally supported by Trilogy (http://www.trilo | the path's capabilities. If such a packet does | |||
gy-project.org), a research project (ICT-216372) partially funded by the Europea | not get through, MPTCP will end up falling back to regular TCP.</li> | |||
n Community under its Seventh Framework Program.</t> | <li>Segmentation/coalescing (e.g., TCP segmentation offloading) might co | |||
<t>Alan Ford was originally supported by Roke Manor Research and later Cis | py options between packets and might | |||
co Systems.</t> --> | strip some options. MPTCP's Data Sequence Mapping includes the relative subflow | |||
<t>The authors gratefully acknowledge significant input into this document | sequence number instead of using the sequence | |||
from Sébastien Barré and Andrew McDonald.</t> | number in the segment. In this way, the mapping is independent of the packets th | |||
<t>The authors also wish to acknowledge reviews and contributions from Ilj | at carry it.</li> | |||
itsch van Beijnum, Lars Eggert, Marcelo Bagnulo, Robert Hancock, Pasi Sarolahti, | <li>The receive window may be shrunk by some middleboxes at the | |||
Toby Moncaster, Philip Eardley, Sergio Lembo, Lawrence Conroy, Yoshifumi Nishid | subflow level. MPTCP will use the maximum window at the data level but w | |||
a, Bob Briscoe, Stein Gjessing, Andrew McGregor, Georg Hampel, Anumita Biswas, W | ill also obey | |||
es Eddy, Alexey Melnikov, Francis Dupont, Adrian Farrel, Barry Leiba, Robert Spa | subflow-specific windows.</li> | |||
rks, Sean Turner, Stephen Farrell, Martin Stiemerling, Gregory Detal, Fabien Duc | </ul> | |||
hene, Xavier de Foy, Rahul Jadhav, Klemens Schragel, Mirja Kuehlewind, Sheng Jia | ||||
ng, Alissa Cooper, Ines Robles, Roman Danyliw, Adam Roach, Barry Leiba, Alexey M | ||||
elnikov, Eric Vyncke, and Ben Kaduk.</t> | ||||
</section> | ||||
<section anchor="IANA" title="IANA Considerations"> | ||||
<t>This document obsoletes RFC6824 and as such IANA is requested to update | ||||
the TCP option space registry to point to this document for Multipath TCP, as f | ||||
ollows:</t> | ||||
<texttable anchor="table_tcpo" title="TCP Option Kind Numbers"> | ||||
<ttcol align="center">Kind</ttcol> | ||||
<ttcol align="center">Length</ttcol> | ||||
<ttcol align="center">Meaning</ttcol> | ||||
<ttcol align="center">Reference</ttcol> | ||||
<c>30</c> | ||||
<c>N</c> | ||||
<c>Multipath TCP (MPTCP)</c> | ||||
<c>This document</c> | ||||
</texttable> | ||||
<section anchor="IANA_subtypes" title="MPTCP Option Subtypes"> | ||||
<t>The 4-bit MPTCP subtype sub-registry ("MPTCP Option Subtypes" under the | ||||
"Transmission Control Protocol (TCP) Parameters" registry) was defined in RFC68 | ||||
24. Since RFC6824 was an Experimental, not a Standards Track, RFC, and since no furt | ||||
her entries have occurred beyond those pointing to RFC6824, IANA is requested to | ||||
replace the existing registry with <xref target="table_iana"/> and with the fol | ||||
lowing explanatory note.</t> | ||||
<t>Note: This registry specifies the MPTCP Option Subtypes for MPTCP v1, w | ||||
hich obsoletes the Experimental MPTCP v0. For the MPTCP v0 subtypes, please refe | ||||
r to RFC6824.</t> | ||||
<texttable anchor="table_iana" title="MPTCP Option Subtypes"> | ||||
<ttcol align="center">Value</ttcol> | ||||
<ttcol align="center">Symbol</ttcol> | ||||
<ttcol align="center">Name</ttcol> | ||||
<ttcol align="center">Reference</ttcol> | ||||
<c>0x0</c> | ||||
<c>MP_CAPABLE</c> | ||||
<c>Multipath Capable</c> | ||||
<c>This document, <xref target="sec_init"/></c> | ||||
<c>0x1</c> | ||||
<c>MP_JOIN</c> | ||||
<c>Join Connection</c> | ||||
<c>This document, <xref target="sec_join"/></c> | ||||
<c>0x2</c> | ||||
<c>DSS</c> | ||||
<c>Data Sequence Signal (Data ACK and data sequence mapping)</c> | ||||
<c>This document, <xref target="sec_generalop"/></c> | ||||
<c>0x3</c> | ||||
<c>ADD_ADDR</c> | ||||
<c>Add Address</c> | ||||
<c>This document, <xref target="sec_add_address"/></c> | ||||
<c>0x4</c> | ||||
<c>REMOVE_ADDR</c> | ||||
<c>Remove Address</c> | ||||
<c>This document, <xref target="sec_remove_addr"/></c> | ||||
<c>0x5</c> | ||||
<c>MP_PRIO</c> | ||||
<c>Change Subflow Priority</c> | ||||
<c>This document, <xref target="sec_policy"/></c> | ||||
<c>0x6</c> | ||||
<c>MP_FAIL</c> | ||||
<c>Fallback</c> | ||||
<c>This document, <xref target="sec_fallback"/></c> | ||||
<c>0x7</c> | ||||
<c>MP_FASTCLOSE</c> | ||||
<c>Fast Close</c> | ||||
<c>This document, <xref target="sec_fastclose"/></c> | ||||
<c>0x8</c> | ||||
<c>MP_TCPRST</c> | ||||
<c>Subflow Reset</c> | ||||
<c>This document, <xref target="sec_reset"/></c> | ||||
<c>0xf</c> | ||||
<c>MP_EXPERIMENTAL</c> | ||||
<c>Reserved for private experiments</c> | ||||
<c></c> | ||||
</texttable> | ||||
<t>Values 0x9 through 0xe are currently unassigned. Option 0xf is reserved | ||||
for use by private experiments. Its use may be formalized in a future specifica | ||||
tion. Future assignments in this registry are to be defined by Standards Action | ||||
as defined by <xref target="RFC8126"/>. Assignments consist of the MPTCP subtyp | ||||
e's symbolic name and its associated value, and a reference to its specification | ||||
.</t> | ||||
</section> | </section> | |||
<section anchor="IANA_handshake" title="MPTCP Handshake Algorithms"> | <section anchor="IANA" numbered="true" toc="default"> | |||
<name>IANA Considerations</name> | ||||
<t>The "MPTCP Handshake Algorithms" sub-registry under the "Transmission C | ||||
ontrol Protocol (TCP) Parameters" registry was defined in RFC6824. Since RFC6824 | ||||
was an Experimental, not a Standards Track, RFC, and since no further entries have | ||||
occurred beyond those pointing to RFC6824, IANA is requested to replace the exis | ||||
ting registry with <xref target="table_crypto"/> and with the following explanat | ||||
ory note.</t> | ||||
<t>Note: This registry specifies the MPTCP Handshake Algorithms for MPTCP | ||||
v1, which obsoletes the Experimental MPTCP v0. For the MPTCP v0 handshake algorithms, please | ||||
refer to RFC6824.</t> | ||||
<texttable anchor="table_crypto" title="MPTCP Handshake Algorithms"> | ||||
<ttcol align="center">Flag Bit</ttcol> | ||||
<ttcol align="center">Meaning</ttcol> | ||||
<ttcol align="center">Reference</ttcol> | ||||
<c>A</c> | ||||
<c>Checksum required</c> | ||||
<c>This document, <xref target="sec_init"/></c> | ||||
<c>B</c> | ||||
<c>Extensibility</c> | ||||
<c>This document, <xref target="sec_init"/></c> | ||||
<c>C</c> | ||||
<c>Do not attempt to establish new subflows to the source address.</c> | ||||
<c>This document, <xref target="sec_init"/></c> | ||||
<c>D-G</c> | <t>This document obsoletes <xref target="RFC6824"/>. As such, IANA has upd | |||
<c>Unassigned</c> | ated | |||
<c></c> | several registries to point to this document. In addition, this document | |||
creates one new registry. These topics are described in the following sub | ||||
sections.</t> | ||||
<c>H</c> | <section anchor="IANA-TCP-Option-Kind" numbered="true" toc="default"> | |||
<c>HMAC-SHA256</c> | <name>TCP Option Kind Numbers</name> | |||
<c>This document, <xref target="sec_join"/></c> | <t>IANA has | |||
</texttable> | updated the "TCP Option Kind Numbers" registry to point to this document | |||
for Multipath TCP, as shown in <xref target="table_tcpo"/>:</t> | ||||
<table anchor="table_tcpo" align="center"> | ||||
<name>TCP Option Kind Numbers</name> | ||||
<thead> | ||||
<tr> | ||||
<th align="center">Kind</th> | ||||
<th align="center">Length</th> | ||||
<th align="center">Meaning</th> | ||||
<th align="center">Reference</th> | ||||
</tr> | ||||
</thead> | ||||
<tbody> | ||||
<tr> | ||||
<td align="center">30</td> | ||||
<td align="center">N</td> | ||||
<td align="center">Multipath TCP (MPTCP)</td> | ||||
<td align="center">RFC 8684</td> | ||||
</tr> | ||||
</tbody> | ||||
</table> | ||||
</section> | ||||
<section anchor="IANA_subtypes" numbered="true" toc="default"> | ||||
<name>MPTCP Option Subtypes</name> | ||||
<t>The 4-bit MPTCP subtype in the "MPTCP Option Subtypes" | ||||
subregistry under the "Transmission Control Protocol (TCP) Parameters" | ||||
registry was defined in <xref target="RFC6824"/>. Since <xref target="RF | ||||
C6824"/> is an | ||||
Experimental RFC and not a Standards Track RFC, and since no further | ||||
entries have occurred beyond those pointing to <xref target="RFC6824"/>, | ||||
IANA has | ||||
replaced the existing registry with the contents of | ||||
<xref target="table_iana" format="default"/> and with the following | ||||
explanatory note.</t> | ||||
<t>Note that the meanings of bits D through H can be dependent upon bit B, | <t>Note: This registry specifies the MPTCP Option Subtypes for MPTCP v1, | |||
depending on how Extensibility is defined in future specifications; see | which obsoletes the Experimental MPTCP v0. For the MPTCP v0 subtypes, please re | |||
<xref target="sec_init"/> for more information.</t> | fer to <xref target="RFC6824"/>.</t> | |||
<table anchor="table_iana" align="center"> | ||||
<name>MPTCP Option Subtypes</name> | ||||
<thead> | ||||
<tr> | ||||
<th align="center">Value</th> | ||||
<th align="center">Symbol</th> | ||||
<th align="center">Name</th> | ||||
<th align="center">Reference</th> | ||||
</tr> | ||||
</thead> | ||||
<tbody> | ||||
<tr> | ||||
<td align="center">0x0</td> | ||||
<td align="center">MP_CAPABLE</td> | ||||
<td align="center">Multipath Capable</td> | ||||
<td align="center">RFC 8684, <xref target="sec_init" format="defau | ||||
lt"/></td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">0x1</td> | ||||
<td align="center">MP_JOIN</td> | ||||
<td align="center">Join Connection</td> | ||||
<td align="center">RFC 8684, <xref target="sec_join" format="defau | ||||
lt"/></td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">0x2</td> | ||||
<td align="center">DSS</td> | ||||
<td align="center">Data Sequence Signal (Data ACK and Data Sequenc | ||||
e Mapping)</td> | ||||
<td align="center">RFC 8684, <xref target="sec_generalop" format=" | ||||
default"/></td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">0x3</td> | ||||
<td align="center">ADD_ADDR</td> | ||||
<td align="center">Add Address</td> | ||||
<td align="center">RFC 8684, <xref target="sec_add_address" format | ||||
="default"/></td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">0x4</td> | ||||
<td align="center">REMOVE_ADDR</td> | ||||
<td align="center">Remove Address</td> | ||||
<td align="center">RFC 8684, <xref target="sec_remove_addr" format | ||||
="default"/></td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">0x5</td> | ||||
<td align="center">MP_PRIO</td> | ||||
<td align="center">Change Subflow Priority</td> | ||||
<td align="center">RFC 8684, <xref target="sec_policy" format="def | ||||
ault"/></td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">0x6</td> | ||||
<td align="center">MP_FAIL</td> | ||||
<td align="center">Fallback</td> | ||||
<td align="center">RFC 8684, <xref target="sec_fallback" format="d | ||||
efault"/></td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">0x7</td> | ||||
<td align="center">MP_FASTCLOSE</td> | ||||
<td align="center">Fast Close</td> | ||||
<td align="center">RFC 8684, <xref target="sec_fastclose" format=" | ||||
default"/></td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">0x8</td> | ||||
<td align="center">MP_TCPRST</td> | ||||
<td align="center">Subflow Reset</td> | ||||
<td align="center">RFC 8684, <xref target="sec_reset" format="defa | ||||
ult"/></td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">0xf</td> | ||||
<td align="center">MP_EXPERIMENTAL</td> | ||||
<td align="center">Reserved for Private Use</td> | ||||
<td align="center"/> | ||||
</tr> | ||||
</tbody> | ||||
</table> | ||||
<t>Values 0x9 through 0xe are currently unassigned. Option 0xf is reserv | ||||
ed for use by private experiments. Its use may be formalized in a future specifi | ||||
cation. Future assignments in this registry are to be defined by Standards Actio | ||||
n as defined by <xref target="RFC8126" format="default"/>. Assignments consist | ||||
of the MPTCP subtype's symbolic name, its associated value, and a reference to i | ||||
ts specification.</t> | ||||
</section> | ||||
<section anchor="IANA_handshake" numbered="true" toc="default"> | ||||
<name>MPTCP Handshake Algorithms</name> | ||||
<t>The "MPTCP Handshake Algorithms" subregistry under the | ||||
"Transmission Control Protocol (TCP) Parameters" registry was defined | ||||
in <xref target="RFC6824"/>. Since <xref target="RFC6824"/> is an Experi | ||||
mental RFC and not | ||||
a Standards Track RFC, and since no further entries have occurred | ||||
beyond those pointing to <xref target="RFC6824"/>, IANA has replaced | ||||
the existing registry with the contents of | ||||
<xref target="table_crypto" format="default"/> and with the following explanato | ||||
ry note.</t> | ||||
<t>Note: This registry specifies the MPTCP Handshake Algorithms for MPTC | ||||
P v1, which obsoletes the Experimental MPTCP v0. For the MPTCP v0 handshake algorithms, plea | ||||
se refer to <xref target="RFC6824"/>.</t> | ||||
<table anchor="table_crypto" align="center"> | ||||
<name>MPTCP Handshake Algorithms</name> | ||||
<thead> | ||||
<tr> | ||||
<th align="center">Flag Bit</th> | ||||
<th align="center">Meaning</th> | ||||
<th align="center">Reference</th> | ||||
</tr> | ||||
</thead> | ||||
<tbody> | ||||
<tr> | ||||
<td align="center">A</td> | ||||
<td align="center">Checksum required</td> | ||||
<td align="center">RFC 8684, <xref target="sec_init" format="defau | ||||
lt"/></td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">B</td> | ||||
<td align="center">Extensibility</td> | ||||
<td align="center">RFC 8684, <xref target="sec_init" format="defau | ||||
lt"/></td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">C</td> | ||||
<td align="center">Do not attempt to establish new subflows to the | ||||
source address.</td> | ||||
<td align="center">RFC 8684, <xref target="sec_init" format="defau | ||||
lt"/></td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">D-G</td> | ||||
<td align="center">Unassigned</td> | ||||
<td align="center"/> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">H</td> | ||||
<td align="center">HMAC-SHA256</td> | ||||
<td align="center">RFC 8684, <xref target="sec_join" format="defau | ||||
lt"/></td> | ||||
</tr> | ||||
</tbody> | ||||
</table> | ||||
<t>Future assignments in this registry are also | <t>Note that the meanings of bits "D" through "H" can be dependent upon | |||
to be defined by Standards Action as defined by <xref target="RFC8126"/>. | bit "B", | |||
depending on how the Extensibility parameter is defined in future specific | ||||
ations; see | ||||
<xref target="sec_init" format="default"/> for more information.</t> | ||||
<t>Future assignments in this registry are also | ||||
to be defined by Standards Action as defined by <xref target="RFC8126" for | ||||
mat="default"/>. | ||||
Assignments consist of the value of the flags, a symbolic name for the alg orithm, | Assignments consist of the value of the flags, a symbolic name for the alg orithm, | |||
and a reference to its specification.</t> | and a reference to its specification.</t> | |||
</section> | ||||
<section anchor="IANA_rst" title="MP_TCPRST Reason Codes"> | ||||
<t>IANA is requested to create a further sub-registry, "MPTCP MP_TCPRST Re | ||||
ason Codes" under the "Transmission Control Protocol (TCP) Parameters" registry, | ||||
based on the reason code in the MP_TCPRST (<xref target="sec_reset"/>) message. Ini | ||||
tial values for this registry are given in <xref target="table_rstcodes"/>; futu | ||||
re assignments are to be defined by Specification Required as defined by <xref t | ||||
arget="RFC8126"/>. Assignments consist of the value of the code, a short descrip | ||||
tion of its meaning, and a reference to its specification. The maximum value is | ||||
0xff.</t> | ||||
<t>As guidance to the Designated Expert <xref target="RFC8126"/>, assignme | ||||
nts should not normally be refused unless codepoint space is becoming scarce, pr | ||||
ovided that there is a clear distinction from other, already-existing codes, an | ||||
d also provided that there is sufficient guidance for implementors both sending and | ||||
receiving these codes.</t> | ||||
<texttable anchor="table_rstcodes" title="MPTCP MP_TCPRST Reason Codes"> | ||||
<ttcol align="center">Code</ttcol> | ||||
<ttcol align="center">Meaning</ttcol> | ||||
<ttcol align="center">Reference</ttcol> | ||||
<c>0x00</c> | ||||
<c>Unspecified TCP error</c> | ||||
<c>This document, <xref target="sec_reset"/></c> | ||||
<c>0x01</c> | ||||
<c>MPTCP specific error</c> | ||||
<c>This document, <xref target="sec_reset"/></c> | ||||
<c>0x02</c> | ||||
<c>Lack of resources</c> | ||||
<c>This document, <xref target="sec_reset"/></c> | ||||
<c>0x03</c> | ||||
<c>Administratively prohibited</c> | ||||
<c>This document, <xref target="sec_reset"/></c> | ||||
<c>0x04</c> | ||||
<c>Too much outstanding data</c> | ||||
<c>This document, <xref target="sec_reset"/></c> | ||||
<c>0x05</c> | ||||
<c>Unacceptable performance</c> | ||||
<c>This document, <xref target="sec_reset"/></c> | ||||
<c>0x06</c> | ||||
<c>Middlebox interference</c> | ||||
<c>This document, <xref target="sec_reset"/></c> | ||||
</texttable> | ||||
</section> | </section> | |||
<section anchor="IANA_rst" numbered="true" toc="default"> | ||||
<name>MP_TCPRST Reason Codes</name> | ||||
<t>IANA has created a further subregistry, "MPTCP MP_TCPRST | ||||
Reason Codes" under the "Transmission Control Protocol (TCP) | ||||
Parameters" registry, based on the reason code in the MP_TCPRST (<xref t | ||||
arget="sec_reset" format="default"/>) message. Initial values for this registry | ||||
are given in <xref target="table_rstcodes" format="default"/>; future assignment | ||||
s are to be defined by Specification Required as defined by <xref target="RFC812 | ||||
6" format="default"/>. Assignments consist of the value of the code, a short des | ||||
cription of its meaning, and a reference to its specification. The maximum value | ||||
is 0xff.</t> | ||||
<table anchor="table_rstcodes" align="center"> | ||||
<name>MPTCP MP_TCPRST Reason Codes</name> | ||||
<thead> | ||||
<tr> | ||||
<th align="center">Code</th> | ||||
<th align="center">Meaning</th> | ||||
<th align="center">Reference</th> | ||||
</tr> | ||||
</thead> | ||||
<tbody> | ||||
<tr> | ||||
<td align="center">0x00</td> | ||||
<td align="center">Unspecified error</td> | ||||
<td align="center">RFC 8684, <xref target="sec_reset" format="defa | ||||
ult"/></td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">0x01</td> | ||||
<td align="center">MPTCP-specific error</td> | ||||
<td align="center">RFC 8684, <xref target="sec_reset" format="defa | ||||
ult"/></td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">0x02</td> | ||||
<td align="center">Lack of resources</td> | ||||
<td align="center">RFC 8684, <xref target="sec_reset" format="defa | ||||
ult"/></td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">0x03</td> | ||||
<td align="center">Administratively prohibited</td> | ||||
<td align="center">RFC 8684, <xref target="sec_reset" format="defa | ||||
ult"/></td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">0x04</td> | ||||
<td align="center">Too much outstanding data</td> | ||||
<td align="center">RFC 8684, <xref target="sec_reset" format="defa | ||||
ult"/></td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">0x05</td> | ||||
<td align="center">Unacceptable performance</td> | ||||
<td align="center">RFC 8684, <xref target="sec_reset" format="defa | ||||
ult"/></td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">0x06</td> | ||||
<td align="center">Middlebox interference</td> | ||||
<td align="center">RFC 8684, <xref target="sec_reset" format="defa | ||||
ult"/></td> | ||||
</tr> | ||||
</tbody> | ||||
</table> | ||||
<t>As guidance to the designated expert <xref target="RFC8126" | ||||
format="default"/>, assignments should not normally be refused unless | ||||
codepoint space is becoming scarce, provided that there is a clear | ||||
distinction from other, already-existing codes and also provided that th | ||||
ere is sufficient guidance for implementers both sending and receiving these cod | ||||
es.</t> | ||||
</section> | ||||
</section> | </section> | |||
</middle> | </middle> | |||
<!-- *****BACK MATTER ***** --> | ||||
<back> | <back> | |||
<references title="Normative References"> | <displayreference target="I-D.ananth-tcpm-tcpoptext" to="TCPLO"/> | |||
&RFC0793; | ||||
&RFC2104; | ||||
&RFC2119; | ||||
&RFC5961; | ||||
&RFC6234; | ||||
&RFC8174; | ||||
</references> | ||||
<references title="Informative References"> | ||||
&RFC1122; | ||||
&RFC7323; | ||||
&RFC1918; | ||||
&RFC2018; | ||||
&RFC5681; | ||||
&RFC2979; | ||||
&RFC2992; | ||||
&RFC3022; | ||||
&RFC3135; | ||||
&RFC4086; | ||||
&RFC4987; | ||||
&RFC8126; | ||||
&RFC6181; | ||||
&RFC6356; | ||||
&RFC6897; | ||||
&RFC6182; | ||||
&RFC6528; | ||||
&RFC7413; | ||||
&RFC7430; | ||||
&RFC8041; | ||||
<!-- &TCPLO; draft-ananth-tcpm-tcpoptext-00; Expired--> | ||||
<reference anchor='TCPLO'> | ||||
<front> | ||||
<title>TCP option space extension</title> | ||||
<author initials='A' surname='Ramaiah' fullname='Anantha Ramaiah'> | ||||
<organization /> | ||||
</author> | ||||
<date month='March' day='26' year='2012' /> | ||||
<abstract><t>The document goals are as follows: Firstly, this document summarize | ||||
s the motivations for extending TCP option space. Secondly, it tries to summari | ||||
ze the various known issues that need to be taken into account while extending | ||||
the TCP option space. Thirdly, it briefly provides a short summary of the vario | ||||
us TCP option space proposals that have been proposed so far. Some additional pr | ||||
oposals which include variations to the existing proposals are also presented. | ||||
The goal of this document is to rejuvenate the discussions on this topic and eve | ||||
ntually to converge on a scheme for extending TCP option space.</t></abstract> | ||||
</front> | <references> | |||
<name>References</name> | ||||
<references> | ||||
<name>Normative References</name> | ||||
<seriesInfo name='Work in' value='Progress' /> | <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.0793. | |||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.2104. | ||||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.2119. | ||||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.5961. | ||||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.6234. | ||||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.8174. | ||||
xml"/> | ||||
</references> | ||||
</reference> | <references> | |||
<name>Informative References</name> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.1122. | ||||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.7323. | ||||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.1918. | ||||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.2018. | ||||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.5681. | ||||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.2979. | ||||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.2992. | ||||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.3022. | ||||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.3135. | ||||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.4086. | ||||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.4987. | ||||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.8126. | ||||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.6181. | ||||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.6356. | ||||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.6897. | ||||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.6182. | ||||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.6528. | ||||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.6824. | ||||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.7413. | ||||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.7430. | ||||
xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.8041. | ||||
xml"/> | ||||
<reference anchor='norm' target="http://www.usenix.org/events/sec01/full_papers/ | <!-- draft-ananth-tcpm-tcpoptext (Expired) --> | |||
handley/handley.pdf"><front><title abbrev="Network Intrusion Detection: Evasion, | <xi:include href="https://www.rfc-editor.org/refs/bibxml3/reference.I-D.ananth-t | |||
Traffic Normalization, and End-to-End Protocol Semantics ">Network Intrusion De | cpm-tcpoptext.xml"/> | |||
tection: Evasion, Traffic Normalization, and End-to-End Protocol Semantics</titl | ||||
e><author initials='M.' surname='Handley' fullname='Mark Handley'><organization> | ||||
ACIRI</organization></author><author initials='V.' surname='Paxson' fullname='Ve | ||||
rn Paxson'><organization>ACIRI</organization></author><author initials='C.' surn | ||||
ame='Kreibich' fullname='Christian Kreibich'><organization>Technische Universita | ||||
t Munchen</organization></author><date year="2001"/></front><seriesInfo name="Us | ||||
enix Security" value="2001"/></reference> | ||||
<reference anchor='howhard' target="https://www.usenix.org/conference/nsdi12/how | <reference anchor="norm" target="https://www.usenix.org/legacy/events/se | |||
-hard-can-it-be-designing-and-implementing-deployable-multipath-tcp"> | c01/full_papers/handley/handley.pdf"> | |||
<front><title abbrev="How Hard Can It Be? Designing and Implementing a Deployabl | <front> | |||
e Multipath TCP">How Hard Can It Be? Designing and Implementing a Deployable Mul | <title abbrev="Network Intrusion Detection: Evasion, Traffic Normali | |||
tipath TCP</title> | zation, and End-to-End Protocol Semantics ">Network Intrusion Detection: Evasion | |||
<author initials='C.' surname='Raiciu' fullname='Costin Raiciu'><organization>Un | , Traffic | |||
iversitatea Politehnica Bucuresti</organization></author> | Normalization, and End-to-End Protocol Semantics</title> | |||
<author initials='C.' surname='Paasch' fullname='Christoph Paasch'><organization | <seriesInfo name="Usenix Security Symposium" value="2001"/> | |||
>Universite Catholique de Louvain</organization></author> | <author initials="M." surname="Handley" fullname="Mark Handley"> | |||
<author initials='S.' surname='Barre' fullname='Sebastien Barre'><organization>U | <organization>ACIRI</organization> | |||
niversite Catholique de Louvain</organization></author> | </author> | |||
<author initials='A.' surname='Ford' fullname='Alan Ford'><organization/></autho | <author initials="V." surname="Paxson" fullname="Vern Paxson"> | |||
r> | <organization>ACIRI</organization> | |||
<author initials='M.' surname='Honda' fullname='Michio Honda'><organization>Keio | </author> | |||
University</organization></author> | <author initials="C." surname="Kreibich" fullname="Christian Kreibic | |||
<author initials='F.' surname='Duchene' fullname='Fabien Duchene'><organization> | h"> | |||
Universite Catholique de Louvain</organization></author> | <organization>Technische | |||
<author initials='O.' surname='Bonaventure' fullname='Olivier Bonaventure'><orga | Universitat Munchen</organization> | |||
nization>Universite Catholique de Louvain</organization></author> | </author> | |||
<author initials='M.' surname='Handley' fullname='Mark Handley'><organization>Un | <date month="August" year="2001"/> | |||
iversity College London</organization></author> | </front> | |||
<date year="2012" /> | </reference> | |||
</front> | ||||
<seriesInfo name="Usenix Symposium on Networked Systems Design and Implementatio | ||||
n" value="2012"/> | ||||
</reference> | ||||
<reference anchor='deployments' target="https://www.ietfjournal.org/multipath-tc | <reference anchor="howhard" target="https://www.usenix.org/conference/ns | |||
p-deployments/"><front><title abbrev="MPTCP Deployments">Multipath TCP Deploymen | di12/technical-sessions/presentation/raiciu"> | |||
ts</title><author initials='O.' surname='Bonaventure' fullname='Olivier Bonavent | <front> | |||
ure'><organization>Universite Catholique de Louvain</organization></author><auth | <title abbrev="How Hard Can It Be? Designing and Implementing a Depl | |||
or initials='S.' surname='Seo' fullname='SungHoon Seo'></author><date day="1" mo | oyable Multipath TCP">How Hard Can It Be? Designing and Implementing a Deployabl | |||
nth="November" year="2016"/></front><seriesInfo name="IETF Journal" value="2016" | e Multipath TCP</title> | |||
/></reference> | <seriesInfo name="Usenix Symposium on Networked Systems Design and I | |||
mplementation" value="2012"/> | ||||
<author initials="C." surname="Raiciu" fullname="Costin Raiciu"> | ||||
<organization>Universitatea Politehnica Bucuresti</organization> | ||||
</author> | ||||
<author initials="C." surname="Paasch" fullname="Christoph Paasch"> | ||||
<organization>Universite Catholique de Louvain</organization> | ||||
</author> | ||||
<author initials="S." surname="Barre" fullname="Sebastien Barre"> | ||||
<organization>Universite Catholique de Louvain</organization> | ||||
</author> | ||||
<author initials="A." surname="Ford" fullname="Alan Ford"> | ||||
<organization/> | ||||
</author> | ||||
<author initials="M." surname="Honda" fullname="Michio Honda"> | ||||
<organization>Keio University</organization> | ||||
</author> | ||||
<author initials="F." surname="Duchene" fullname="Fabien Duchene"> | ||||
<organization>Universite Catholique de Louvain</organization> | ||||
</author> | ||||
<author initials="O." surname="Bonaventure" fullname="Olivier Bonave | ||||
nture"> | ||||
<organization>Universite Catholique de Louvain</organization> | ||||
</author> | ||||
<author initials="M." surname="Handley" fullname="Mark Handley"> | ||||
<organization>University College London</organization> | ||||
</author> | ||||
<date month="April" year="2012"/> | ||||
</front> | ||||
</reference> | ||||
</references> | <reference anchor="deployments" target="https://www.ietfjournal.org/mult | |||
ipath-tcp-deployments/"> | ||||
<front> | ||||
<title abbrev="MPTCP Deployments">Multipath TCP Deployments</title> | ||||
<seriesInfo name="IETF Journal" value="2016"/> | ||||
<author initials="O." surname="Bonaventure" fullname="Olivier Bonave | ||||
nture"> | ||||
<organization>Universite Catholique de Louvain</organization> | ||||
</author> | ||||
<author initials="S." surname="Seo" fullname="SungHoon Seo"/> | ||||
<date month="November" year="2016"/> | ||||
</front> | ||||
</reference> | ||||
</references> | ||||
</references> | ||||
<section title="Notes on Use of TCP Options" anchor="app_options"> | <section anchor="app_options" numbered="true" toc="default"> | |||
<name>Notes on Use of TCP Options</name> | ||||
<t>The TCP option space is limited due to the length of the Data Offset fi eld in the TCP header (4 bits), which defines the TCP header length in 32-bit wo rds. With the standard TCP header being 20 bytes, this leaves a maximum of 40 by tes for options, and many of these may already be used by options such as timest amp and SACK.</t> | <t>The TCP option space is limited due to the length of the Data Offset fi eld in the TCP header (4 bits), which defines the TCP header length in 32-bit wo rds. With the standard TCP header being 20 bytes, this leaves a maximum of 40 by tes for options, and many of these may already be used by options such as timest amp and SACK.</t> | |||
<t>We performed a brief study on the commonly used TCP options in SYN, | ||||
data, and pure ACK packets and found that there is enough room | ||||
to fit all the options discussed in this document.</t> | ||||
<t>SYN packets typically include the following options: Maximum Segment Si | ||||
ze (MSS) (4 bytes), | ||||
window scale (3 bytes), SACK permitted (2 bytes), and timestamp | ||||
(10 bytes). The sum of these options is 19 bytes. Some operating | ||||
systems appear to pad each option up to a word boundary, thus using 24 | ||||
bytes (a brief survey suggests that Windows XP and Mac OS X do this, where | ||||
as Linux does not). | ||||
<t>We have performed a brief study on the commonly used TCP options in SYN | Optimistically, therefore, we have 21 bytes available, or 16 if options ha | |||
, data, and pure ACK packets, and found that there is enough room to fit all the | ve to be | |||
options we propose using in this document.</t> | word-aligned. In either case, however, the SYN versions of | |||
MP_CAPABLE (12 bytes) and MP_JOIN (12 or 16 bytes) will fit in t | ||||
<t>SYN packets typically include Maximum Segment Size (MSS) (4 bytes), win | his remaining space.</t> | |||
dow scale (3 bytes), SACK permitted (2 bytes), and timestamp (10 bytes) options. | <t>Note that due to the use of a 64-bit data-level sequence space, it is | |||
Together these sum to 19 bytes. Some operating systems appear to pad each optio | feasible that MPTCP will not require the timestamp option for | |||
n up to a word boundary, thus using 24 bytes (a brief survey suggests Windows XP | protection against wrapped sequence numbers (per the Protection | |||
and Mac OS X do this, whereas Linux does not). | Against Wrapped Sequences (PAWS) mechanism, as described in <xref target=" | |||
RFC7323" | ||||
Optimistically, therefore, we have 21 bytes spare, or 16 if it has to be w | format="default"/>), since the data-level sequence space has far less | |||
ord-aligned. In either case, however, the SYN versions of Multipath Capable (12 | chance of wrapping. Confirmation of the validity of this optimization is | |||
bytes) and Join (12 or 16 bytes) options will fit in this remaining space.</t> | left for further study.</t> | |||
<t>TCP data packets typically carry timestamp options in every packet, | ||||
<t>Note that due to the use of a 64-bit data-level sequence space, it is f | taking 10 bytes (or 12, with padding). That leaves 30 bytes (or 28, if | |||
easible that MPTCP will not require the timestamp option for protection against | word-aligned). The DSS option varies in length, depending on (1) whet | |||
wrapped sequence numbers (PAWS <xref target="RFC7323"/>), since the data-level s | her the | |||
equence space has far less chance of wrapping. Confirmation of the validity of t | Data Sequence Mapping, DATA_ACK, or both are included, (2) whether th | |||
his optimisation is for further study.</t> | e | |||
sequence numbers in use are 4 or 8 octets, and (3) whether the | ||||
<t>TCP data packets typically carry timestamp options in every packet, tak | checksum is present. The maximum size of the DSS option is 28 bytes, so ev | |||
ing 10 bytes (or 12 with padding). That leaves 30 bytes (or 28, if word-aligned) | en that will fit in the available space. But unless a connection is both bidirec | |||
. The Data Sequence Signal (DSS) option varies in length depending on whether th | tional and high-bandwidth, it is unlikely that all that option space will be req | |||
e data sequence mapping and DATA_ACK are included, and whether the sequence numb | uired on each DSS option.</t> | |||
ers in use are 4 or 8 octets. The maximum size of the DSS option is 28 bytes, so | <t>Within the DSS option, it is not necessary to include the Data Sequence | |||
even that will fit in the available space. But unless a connection is both bidi | Mapping and DATA_ACK in each packet, and in many cases it may be possible to al | |||
rectional and high-bandwidth, it is unlikely that all that option space will be | ternate their presence (so long as the mapping covers the data being sent in the | |||
required on each DSS option.</t> | subsequent packet). It would also be possible to alternate between 4-byte and 8 | |||
-byte sequence numbers in each option.</t> | ||||
<t>Within the DSS option, it is not necessary to include the data sequence | <t>On subflow and connection setup, an MPTCP option is also set on the thi | |||
mapping and DATA_ACK in each packet, and in many cases it may be possible to al | rd packet (an ACK). These are 20 bytes (for MP_CAPABLE) and 24 bytes (for M | |||
ternate their presence (so long as the mapping covers the data being sent in the | P_JOIN), both of which will fit in the available option space.</t> | |||
following packet). It would also be possible to alternate between 4- and 8-byte | ||||
sequence numbers in each option.</t> | ||||
<t>On subflow and connection setup, an MPTCP option is also set on the thi | ||||
rd packet (an ACK). These are 20 bytes (for Multipath Capable) and 24 bytes (for | ||||
Join), both of which will fit in the available option space.</t> | ||||
<t>Pure ACKs in TCP typically contain only timestamps (10 bytes). Here, Mu ltipath TCP typically | <t>Pure ACKs in TCP typically contain only timestamps (10 bytes). Here, Mu ltipath TCP typically | |||
needs to encode only the DATA_ACK (maximum of 12 bytes). Occasionally, ACKs will contain SACK information. Depending | needs to encode only the DATA_ACK (maximum of 12 bytes). Occasionally, ACKs will contain SACK information. Depending | |||
on the number of lost packets, SACK may utilize the entire option space. If a DA TA_ACK had to be | on the number of lost packets, SACK may utilize the entire option space. If a DA TA_ACK had to be | |||
included, then it is probably necessary to reduce the number of SACK blocks to a ccommodate the | included, then it is probably necessary to reduce the number of SACK blocks to a ccommodate the | |||
DATA_ACK. However, the presence of the DATA_ACK is unlikely to be necessary in a case where SACK is | DATA_ACK. However, the presence of the DATA_ACK is unlikely to be necessary in a case where SACK is | |||
in use, since until at least some of the SACK blocks have been retransmitted, th e cumulative | in use, since until at least some of the SACK blocks have been retransmitted, th e cumulative | |||
data-level ACK will not be moving forward (or if it does, due to retransmissions on another path, | data-level ACK will not be moving forward (or if it does, due to retransmissions on another path, | |||
then that path can also be used to transmit the new DATA_ACK).</t> | then that path can also be used to transmit the new DATA_ACK).</t> | |||
<t>The ADD_ADDR option can be between 16 and 30 bytes, depending on | ||||
<t>The ADD_ADDR option can be between 16 and 30 bytes, depending on whethe | (1) whether IPv4 or IPv6 is used and (2) whether or not the port | |||
r IPv4 or IPv6 is used, and whether or not the port number is present. It is unl | number is | |||
ikely that such signaling would fit in a data packet (although if there is space | present. It is unlikely that such signaling would fit in a data packet | |||
, it is fine to include it). It is recommended to use duplicate ACKs with no oth | (although if there is space, it is fine to include it). It is | |||
er payload or options in order to transmit these rare signals. Note this is the | recommended that duplicate ACKs not be used with any other payload or opti | |||
reason for mandating that duplicate ACKs with MPTCP options are not taken as a s | ons, in | |||
ignal of congestion.</t> | order to transmit these rare signals. Note that this is the reason for | |||
mandating that duplicate ACKs with MPTCP options not be taken as a signal | ||||
of congestion.</t> | ||||
</section> | </section> | |||
<section anchor="app_tfo" numbered="true" toc="default"> | ||||
<section title="TCP Fast Open and MPTCP" anchor="app_tfo"> | <name>TCP Fast Open and MPTCP</name> | |||
<t>TCP Fast Open (TFO) is an experimental TCP extension, described in | <t>TCP Fast Open (TFO) is an experimental TCP extension, described in | |||
<xref target="RFC7413"/>, which has been introduced to allow sending data | <xref target="RFC7413" format="default"/>, which has been introduced to | |||
allow the sending of data | ||||
one RTT earlier than with regular TCP. This is | one RTT earlier than with regular TCP. This is | |||
considered a valuable gain as very short connections are very common, | considered a valuable gain, as very short connections are very common, | |||
especially for HTTP request/response schemes. It achieves this by sending | especially for HTTP request/response schemes. It achieves this by sending | |||
the SYN-segment together with the application's data and allowing the list | the SYN segment together with the application's data and allowing the list | |||
ener to reply | ener to reply | |||
immediately with data after the SYN/ACK. <xref target="RFC7413"/> secures | immediately with data after the SYN/ACK. <xref target="RFC7413" format="de | |||
this mechanism, by using a new TCP option that includes a cookie which | fault"/> secures | |||
this mechanism by using a new TCP option that includes a cookie that | ||||
is negotiated in a preceding connection.</t> | is negotiated in a preceding connection.</t> | |||
<t>When using TFO in conjunction with MPTCP, there are two key | ||||
points to take into account, as detailed below.</t> | ||||
<section anchor="tfocookie" numbered="true" toc="default"> | ||||
<name>TFO Cookie Request with MPTCP</name> | ||||
<t>When a TFO initiator first connects to a listener, it cannot immediat | ||||
ely | ||||
include data in the SYN for security reasons <xref target="RFC7413" fo | ||||
rmat="default"/>. | ||||
Instead, it requests a cookie that will be used in subsequent | ||||
connections. This is done with the TCP cookie request/response options | ||||
, | ||||
of 2 bytes and 6-18 bytes, respectively (depending on the chosen cooki | ||||
e length).</t> | ||||
<t>TFO and MPTCP can be combined, provided that the total length of all | ||||
the | ||||
options does not exceed the maximum 40 bytes possible in TCP: | ||||
<t>When using TCP Fast Open in conjunction with MPTCP, there are two key | </t> | |||
points to take into account, detailed hereafter.</t> | <ul spacing="normal"> | |||
<li>In the SYN: MPTCP uses a 4-byte MP_CAPABLE option. The sum | ||||
<section title="TFO cookie request with MPTCP" anchor="tfocookie"> | of the MPTCP and TFO options is 6 bytes. With typical TCP options usin | |||
<t>When a TFO initiator first connects to a listener, it cannot immedia | g up | |||
tely | to 19 bytes in the SYN (24 bytes if options are padded at a word bound | |||
include data in the SYN for security reasons <xref target="RFC7413"/>. | ary), | |||
Instead, it requests a cookie that will be used in subsequent | there is enough space to combine the MP_CAPABLE with the TFO cookie re | |||
connections. This is done with the TCP cookie request/response options, | quest.</li> | |||
of respectively 2 bytes and 6-18 bytes (depending on the chosen cookie | <li>In the SYN + ACK: MPTCP uses a 12-byte MP_CAPABLE option, but | |||
length).</t> | now the TFO option can be as long as 18 bytes. Since the maximum optio | |||
n length | ||||
<t>TFO and MPTCP can be combined provided that the total length of all | may be exceeded, it is up to the listener to avoid this problem by usi | |||
the | ng a | |||
options does not exceed the maximum 40 bytes possible in TCP: | shorter cookie. | |||
As an example, if we consider that 19 bytes are used for classical | ||||
<list style="symbols"> | TCP options, the maximum possible cookie length would be | |||
<t>In the SYN: MPTCP uses a 4-bytes long MP_CAPABLE option. The MPTCP | 7 bytes. Note that, for the SYN packet, the same limitation applies to | |||
and TFO options sum up to 6 bytes. With typical TCP-options using up | subsequent | |||
to 19 bytes in the SYN (24 bytes if options are padded at a word bounda | connections (because the initiator then echoes | |||
ry), | the cookie back to the listener). Finally, if the security impact of r | |||
there is enough space to combine the MP_CAPABLE with the TFO Cookie Req | educing | |||
uest.</t> | the cookie size is not deemed acceptable, the listener can reduce the | |||
amount of space used by other TCP options by omitting the TCP timestam | ||||
<t>In the SYN+ACK: MPTCP uses a 12-bytes long MP_CAPABLE option, but | ps (as | |||
now TFO can be as long as 18 bytes. Since the maximum option length | outlined in <xref target="app_options" format="default"/>).</li> | |||
may be exceeded, it is up to the listener to solve this by using a | </ul> | |||
shorter cookie. | </section> | |||
As an example, if we consider that 19 bytes are used for classical | <section anchor="tfodata" numbered="true" toc="default"> | |||
TCP options, the maximum possible cookie length would be | <name>Data Sequence Mapping under TFO</name> | |||
of 7 bytes. Note that the same limitation applies to subsequent | <t>In the TCP establishment phase, MPTCP uses a key exchange that is | |||
connections, for the SYN packet (because the initiator then echoes back | used to generate the Initial Data Sequence Numbers (IDSNs). In particu | |||
the cookie to the listener). Finally, if the security impact of reducin | lar, | |||
g | the SYN with MP_CAPABLE occupies the first octet of data sequence | |||
the cookie size is not deemed acceptable, the listener can reduce the | space. With TFO, one way to handle the data sent together with the SYN | |||
amount of other TCP-options by omitting the TCP timestamps (as | would be to consider an implicit DSS mapping that covers that SYN segm | |||
outlined in <xref target="app_options"/>).</t> | ent | |||
</list></t> | (since there is not enough space in the SYN to include a DSS option). | |||
</section> | The problem with that approach is that if a middlebox modifies the TFO | |||
data, this will not be noticed by MPTCP because of the absence of a | ||||
<section title="Data sequence mapping under TFO" anchor="tfodata"> | DSS checksum. For example, a TCP‑aware (but not MPTCP-aware) mid | |||
<t>MPTCP uses, in the TCP establishment phase, a key exchange that is | dlebox could | |||
used to generate the Initial Data Sequence Numbers (IDSNs). In particul | insert bytes at the beginning of the stream and adapt the TCP checksum | |||
ar, | and sequence numbers accordingly. With an implicit mapping, this infor | |||
the SYN with MP_CAPABLE occupies the first octet of the data sequence | mation would | |||
space. With TFO, one way to handle the data sent together with the SYN | give to the initiator and listener a different view of the DSS | |||
would be to consider an implicit DSS mapping that covers that SYN segme | mapping; there would be no | |||
nt | way to detect this inconsistency, because the DSS checksum is not pres | |||
(since there is not enough space in the SYN to include a DSS option). | ent.</t> | |||
The problem with that approach is that if a middlebox modifies the TFO | <t>To solve this issue, the TFO data must not be considered part of the | |||
data, this will not be noticed by MPTCP because of the absence of a | data sequence number space: the SYN with MP_CAPABLE still occupies | |||
DSS-checksum. For example, a TCP (but not MPTCP)-aware middlebox could | the first octet of data sequence space, but then the first non-TFO | |||
insert bytes at the beginning of the stream and adapt the TCP checksum | data byte occupies the second octet. This guarantees that, if the | |||
and sequence numbers accordingly. With an implicit mapping, this would | use of the DSS checksum is negotiated, all data in the data sequence | |||
give to initiator and listener a different view on the DSS-mapping, wit | number space is checksummed. We also note that this does not entail | |||
h no | a loss of functionality, because TFO data is always only sent on the | |||
way to detect this inconsistency as the DSS checksum is not present.</t | initial subflow, before any attempt to create additional subflows.</t> | |||
> | </section> | |||
<section anchor="tfoexamples" numbered="true" toc="default"> | ||||
<t>To solve this, the TFO data must not be considered part of the | <name>Connection Establishment Examples</name> | |||
Data Sequence Number space: the SYN with MP_CAPABLE still occupies | <t>A few examples of possible "TFO + MPTCP" | |||
the first octet of data sequence space, but then the first non-TFO | establishment scenarios are shown below.</t> | |||
data byte occupies the second octet. This guarantees that, if the | <t>Before an initiator can send data together with the SYN, it must requ | |||
use of DSS-checksum is negotiated, all data in the data sequence | est | |||
number space is checksummed. We also note that this does not entail | a cookie from the listener, as shown in <xref target="fig_tfocookie" | |||
a loss of functionality, because TFO-data is always only sent on the | format="default"/>. (Note: The sequence number | |||
initial subflow before any attempt to create additional subflows.</t> | and length are annotated in <xref target="fig_tfocookie" format="default"/> as | |||
</section> | Seq(Length) (e.g., "S. 0(0)") and used as such in the subsequent figures | |||
(e.g., "S 0(20)" in <xref target="fig_tfodata"/>).) This is done b | ||||
<section title="Connection establishment examples" anchor="tfoexamples"> | y simply combining the TFO and MPTCP options.</t> | |||
<t>The following shows a few examples of possible TFO+MPTCP | <figure anchor="fig_tfocookie"> | |||
establishment scenarios.</t> | <name>Cookie Request</name> | |||
<artwork align="left" name="" type="" alt=""><![CDATA[ | ||||
<t>Before an initiator can send data together with the SYN, it must re | initiator listener | |||
quest | | | | |||
a cookie to the listener, as shown in <xref target="fig_tfocookie"/>. | | S Seq=0(Length=0) <MP_CAPABLE>, <TFO cookie request> | | |||
This is done by simply combining the TFO and MPTCP options.</t> | | --------------------------------------------------------> | | |||
| | | ||||
<figure align="center" anchor="fig_tfocookie" title="Cookie request - | | S. 0(0) ack 1 <MP_CAPABLE>, <TFO cookie> | | |||
sequence number and length are annotated as Seq(Length) and used hereafter in th | | <-------------------------------------------------------- | | |||
e figures."> | | | | |||
<artwork align="left"><![CDATA[ | | . 0(0) ack 1 <MP_CAPABLE> | | |||
initiator listener | | --------------------------------------------------------> | | |||
| | | | | ]]></artwork> | |||
| S Seq=0(Length=0) <MP_CAPABLE>, <TFO cookie request> | | </figure> | |||
| -----------------------------------------------------------> | | <t>Once this is done, the received cookie can be used for TFO, as shown | |||
| | | in <xref target="fig_tfodata" format="default"/>. In this example, the | |||
| S. 0(0) ack 1 <MP_CAPABLE>, <TFO cookie> | | initiator first | |||
| <----------------------------------------------------------- | | sends 20 bytes in the SYN. The listener immediately replies with 100 b | |||
| | | ytes | |||
| . 0(0) ack 1 <MP_CAPABLE> | | following the SYN-ACK, to which the initiator replies with 20 more byt | |||
| -----------------------------------------------------------> | | es. | |||
| | | Note that the last segment in the figure | |||
]]></artwork> | ||||
</figure> | ||||
<t>Once this is done, the received cookie can be used for TFO, as show | ||||
n | ||||
in <xref target="fig_tfodata"/>. In this example, the initiator first | ||||
sends 20 bytes in the SYN. The listener immediately replies with 100 by | ||||
tes | ||||
following the SYN-ACK upon which the initiator replies with 20 more byt | ||||
es. | ||||
Note that the last segment in the figure | ||||
has a TCP sequence number of 21, while the DSS subflow sequence | has a TCP sequence number of 21, while the DSS subflow sequence | |||
number is 1 (because the TFO data is not part of the data sequence | number is 1 (because the TFO data is not part of the data sequence | |||
number space, as explained in Section <xref target="tfodata"/>).</t> | number space, as explained in <xref target="tfodata" format="default"/ | |||
>).</t> | ||||
<figure align="center" anchor="fig_tfodata" title="The listener support | ||||
s TFO"> | ||||
<artwork align="left"><![CDATA[ | ||||
initiator listener | ||||
| | | ||||
| S 0(20) <MP_CAPABLE>, <TFO cookie> | | ||||
| -----------------------------------------------------------> | | ||||
| | | ||||
| S. 0(0) ack 21 <MP_CAPABLE> | | ||||
| <----------------------------------------------------------- | | ||||
| | | ||||
| . 1(100) ack 21 <DSS ack=1 seq=1 ssn=1 dlen=100> | | ||||
| <----------------------------------------------------------- | | ||||
| | | ||||
| . 21(0) ack 1 <MP_CAPABLE> | | ||||
| -----------------------------------------------------------> | | ||||
| | | ||||
| . 21(20) ack 101 <DSS ack=101 seq=1 ssn=1 dlen=20> | | ||||
| -----------------------------------------------------------> | | ||||
| | | ||||
]]></artwork> | ||||
</figure> | ||||
<t>In <xref target="fig_tfofallback"/>, the listener does not support | <figure anchor="fig_tfodata"> | |||
TFO. The initiator detects | <name>The Listener Supports TFO</name> | |||
that no state is created in the listener (as no data is acked), and no | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
w | initiator listener | |||
sends the MP_CAPABLE in the third ack, in order for the listener to | | | | |||
build its MPTCP context at the end of the establishment. Now, the | | S 0(20) <MP_CAPABLE>, <TFO cookie> | | |||
TFO data, retransmitted, becomes part of the data sequence mapping | | --------------------------------------------------------> | | |||
because it is effectively sent (in fact re-sent) after the | | | | |||
| S. 0(0) ack 21 <MP_CAPABLE> | | ||||
| <-------------------------------------------------------- | | ||||
| | | ||||
| . 1(100) ack 21 <DSS ack=1 seq=1 ssn=1 dlen=100> | | ||||
| <-------------------------------------------------------- | | ||||
| | | ||||
| . 21(0) ack 1 <MP_CAPABLE> | | ||||
| --------------------------------------------------------> | | ||||
| | | ||||
| . 21(20) ack 101 <DSS ack=101 seq=1 ssn=1 dlen=20> | | ||||
| --------------------------------------------------------> | | ||||
| | ]]></artwork> | ||||
</figure> | ||||
<t>In <xref target="fig_tfofallback" format="default"/>, the listener do | ||||
es not support TFO. The initiator detects | ||||
that no state is created in the listener (as no data is ACKed) and now | ||||
sends the MP_CAPABLE in the third packet, in order for the listener to | ||||
build its MPTCP context at the end of the establishment. Now, the | ||||
TFO data, when retransmitted, becomes part of the Data Sequence Mappin | ||||
g | ||||
because it is effectively sent (in fact re‑sent) after the | ||||
establishment.</t> | establishment.</t> | |||
<figure anchor="fig_tfofallback"> | ||||
<figure align="center" anchor="fig_tfofallback" title="The listener doe | <name>The Listener Does Not Support TFO</name> | |||
s not support TFO"> | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
<artwork align="left"><![CDATA[ | initiator listener | |||
initiator listener | | | | |||
| | | | S 0(20) <MP_CAPABLE>, <TFO cookie> | | |||
| S 0(20) <MP_CAPABLE>, <TFO cookie> | | | --------------------------------------------------------> | | |||
| -----------------------------------------------------------> | | | | | |||
| | | | S. 0(0) ack 1 <MP_CAPABLE> | | |||
| S. 0(0) ack 1 <MP_CAPABLE> | | | <-------------------------------------------------------- | | |||
| <----------------------------------------------------------- | | | | | |||
| | | | . 1(0) ack 1 <MP_CAPABLE> | | |||
| . 1(0) ack 1 <MP_CAPABLE> | | | --------------------------------------------------------> | | |||
| -----------------------------------------------------------> | | | | | |||
| | | | . 1(20) ack 1 <DSS ack=1 seq=1 ssn=1 dlen=20> | | |||
| . 1(20) ack 1 <DSS ack=1 seq=1 ssn=1 dlen=20> | | | --------------------------------------------------------> | | |||
| -----------------------------------------------------------> | | | | | |||
| | | | . 0(0) ack 21 <DSS ack=21 seq=1 ssn=1 dlen=0> | | |||
| . 0(0) ack 21 <DSS ack=21 seq=1 ssn=1 dlen=0> | | | <-------------------------------------------------------- | | |||
| <----------------------------------------------------------- | | | | ]]></artwork> | |||
| | | </figure> | |||
]]></artwork> | <t>It is also possible that the listener acknowledges only part of the T | |||
</figure> | FO | |||
data, as illustrated in <xref target="fig_tfopartial" format="default" | ||||
<t>It is also possible that the listener acknowledges only part of the | />. The | |||
TFO | initiator will simply retransmit the missing data together with a | |||
data, as illustrated in <xref target="fig_tfopartial"/>. The | DSS mapping.</t> | |||
initiator will simply retransmit the missing data together with a DSS-m | <figure anchor="fig_tfopartial"> | |||
apping.</t> | <name>Partial Data Acknowledgment</name> | |||
<artwork align="left" name="" type="" alt=""><![CDATA[ | ||||
<figure align="center" anchor="fig_tfopartial" title="Partial data ackn | initiator listener | |||
owledgement"> | | | | |||
<artwork align="left"><![CDATA[ | | S 0(1000) <MP_CAPABLE>, <TFO cookie> | | |||
initiator listener | | --------------------------------------------------------> | | |||
| | | | | | |||
| S 0(1000) <MP_CAPABLE>, <TFO cookie> | | | S. 0(0) ack 501 <MP_CAPABLE> | | |||
| -----------------------------------------------------------> | | | <-------------------------------------------------------- | | |||
| | | | | | |||
| S. 0(0) ack 501 <MP_CAPABLE> | | | . 501(0) ack 1 <MP_CAPABLE> | | |||
| <----------------------------------------------------------- | | | --------------------------------------------------------> | | |||
| | | | | | |||
| . 501(0) ack 1 <MP_CAPABLE> | | | . 501(500) ack 1 <DSS ack=1 seq=1 ssn=1 dlen=500> | | |||
| -----------------------------------------------------------> | | | --------------------------------------------------------> | | |||
| | | | | ]]></artwork> | |||
| . 501(500) ack 1 <DSS ack=1 seq=1 ssn=1 dlen=500> | | </figure> | |||
| -----------------------------------------------------------> | | </section> | |||
| | | ||||
]]></artwork> | ||||
</figure> | ||||
</section> | ||||
</section> | </section> | |||
<section anchor="app_tcb" numbered="true" toc="default"> | ||||
<section title="Control Blocks" anchor="app_tcb"> | <name>Control Blocks</name> | |||
<t>Conceptually, an MPTCP connection can be represented as an MPTCP protocol con | <t>Conceptually, an MPTCP connection can be represented as an MPTCP protoc | |||
trol | ol control | |||
block (PCB) that contains several variables that track the progress and the | block (PCB) that contains several variables that track the progress and the | |||
state of the MPTCP connection and a set of linked TCP control blocks | state of the MPTCP connection and a set of linked TCP control blocks | |||
that correspond to the subflows that have been established.</t> | that correspond to the subflows that have been established.</t> | |||
<t>RFC 793 <xref target="RFC0793" format="default"/> specifies several sta | ||||
<t>RFC 793 <xref target="RFC0793"/> specifies several state variables. Whenever | te variables. Whenever possible, we reuse | |||
possible, we reuse | the same terminology as RFC 793 to describe the state variables that are | |||
the same terminology as RFC 793 to describe the state variables that are | ||||
maintained by MPTCP.</t> | maintained by MPTCP.</t> | |||
<section numbered="true" toc="default"> | ||||
<section title="MPTCP Control Block"> | <name>MPTCP Control Block</name> | |||
<t>The MPTCP control block contains the following variables per connection.</t> | <t>The MPTCP control block contains the following variables per connecti | |||
on.</t> | ||||
<section title="Authentication and Metadata"> | <section numbered="true" toc="default"> | |||
<t><list style="hanging"> | <name>Authentication and Metadata</name> | |||
<t hangText="Local.Token (32 bits):"> This is the token chosen by the local host | <dl newline="false" spacing="normal" indent="3"> | |||
on | <dt>Local.Token (32 bits):</dt> | |||
<dd> This is the token chosen by the local host on | ||||
this MPTCP connection. The token must be unique among all established | this MPTCP connection. The token must be unique among all established | |||
MPTCP connections, and is generated from the local key.</t> | MPTCP connections and is generated from the local key.</dd> | |||
<t hangText="Local.Key (64 bits):"> This is the key sent by the local host on th | <dt>Local.Key (64 bits):</dt> | |||
is | <dd> This is the key sent by the local host on this | |||
MPTCP connection.</t> | MPTCP connection.</dd> | |||
<t hangText="Remote.Token (32 bits):"> This is the token chosen by the remote ho | <dt>Remote.Token (32 bits):</dt> | |||
st on | <dd> This is the token chosen by the remote host on | |||
this MPTCP connection, generated from the remote key.</t> | this MPTCP connection, generated from the remote key.</dd> | |||
<t hangText="Remote.Key (64 bits):"> This is the key chosen by the remote host o | <dt>Remote.Key (64 bits):</dt> | |||
n | <dd> This is the key chosen by the remote host on | |||
this MPTCP connection.</t> | this MPTCP connection.</dd> | |||
<t hangText="MPTCP.Checksum (flag):"> This flag is set to true if at least one o | <dt>MPTCP.Checksum (flag):</dt> | |||
f the | <dd> This flag is set to true if at least one of the | |||
hosts has set the A bit in the MP_CAPABLE options exchanged during connection es | hosts has set the "A" bit in the MP_CAPABLE options exchanged during | |||
tablishment, | connection establishment; otherwise, | |||
and is set to false otherwise. If this flag is set, the checksum must be comput | it is set to false. If this flag is set, the checksum must be computed in | |||
ed in | all DSS options.</dd> | |||
all DSS options.</t> | </dl> | |||
</list></t> | </section> | |||
</section> | <section numbered="true" toc="default"> | |||
<name>Sending Side</name> | ||||
<section title="Sending Side"> | <dl newline="false" spacing="normal" indent="3"> | |||
<t><list style="hanging"> | <dt>SND.UNA (64 bits):</dt> | |||
<t hangText="SND.UNA (64 bits):"> This is the data sequence number of the next b | <dd> This is the data sequence number of the next byte to be | |||
yte to be | ||||
acknowledged, at the MPTCP connection level. This variable is updated | acknowledged, at the MPTCP connection level. This variable is updated | |||
upon reception of a DSS option containing a DATA_ACK.</t> | upon reception of a DSS option containing a DATA_ACK.</dd> | |||
<t hangText="SND.NXT (64 bits):"> This is the data sequence number of the next b | <dt>SND.NXT (64 bits):</dt> | |||
yte to be | <dd> This is the data sequence number of the next byte to be | |||
sent. SND.NXT is used to determine the value of the DSN in the DSS option.</t> | sent. SND.NXT is used to determine the value of the DSN in the DSS option.</dd> | |||
<t hangText="SND.WND (32 bits with RFC 7323, 16 bits otherwise):"> This is the s | <dt>SND.WND (32 bits):</dt> | |||
ending window. MPTCP | <dd> This is the send window. 32 bits if the features in RFC | |||
maintains the sending window at the MPTCP connection level and the same | 7323 are used; 16 bits otherwise. MPTCP maintains the send window at | |||
window is shared by all subflows. All subflows use the MPTCP connection | the MPTCP connection level, and the same | |||
level SND.WND to compute the SEQ.WND value that is sent in each | window is shared by all subflows. All subflows use the MPTCP connection-level | |||
transmitted segment.</t> | SND.WND to compute the SEQ.WND value that is sent in each | |||
</list></t> | transmitted segment.</dd> | |||
</section> | </dl> | |||
</section> | ||||
<section title="Receiving Side"> | <section numbered="true" toc="default"> | |||
<t><list style="hanging"> | <name>Receiving Side</name> | |||
<t hangText="RCV.NXT (64 bits):"> This is the data sequence number of the next b | <dl newline="false" spacing="normal" indent="3"> | |||
yte that | <dt>RCV.NXT (64 bits):</dt> | |||
<dd> This is the data sequence number of the next byte that | ||||
is expected on the MPTCP connection. This state variable is modified | is expected on the MPTCP connection. This state variable is modified | |||
upon reception of in-order data. The value of RCV.NXT is used to specify | upon reception of in-order data. The value of RCV.NXT is used to specify | |||
the DATA_ACK that is sent in the DSS option on all subflows.</t> | the DATA_ACK that is sent in the DSS option on all subflows.</dd> | |||
<t hangText="RCV.WND (32 bits with RFC 7323, 16 bits otherwise):"> This is the c | <dt>RCV.WND (32 bits):</dt> | |||
onnection-level | <dd> This is the connection-level receive window, which is the | |||
receive window, which is the maximum of the RCV.WND on all the subflows.</t> | maximum of the RCV.WND on all the subflows. 32 bits if the fea | |||
</list></t> | tures in RFC 7323 are used; 16 bits otherwise.</dd> | |||
</section> | </dl> | |||
</section> | </section> | |||
</section> | ||||
<section title="TCP Control Blocks"> | <section numbered="true" toc="default"> | |||
<t>The MPTCP control block also contains a list of the TCP control blocks | <name>TCP Control Blocks</name> | |||
<t>The MPTCP control block also contains a list of the TCP control block | ||||
s | ||||
that are associated with the MPTCP connection.</t> | that are associated with the MPTCP connection.</t> | |||
<t>Note that the TCP control block on the TCP subflows does not contain | ||||
<t>Note that the TCP control block on the TCP subflows does not contain the | the | |||
RCV.WND and SND.WND state variables as these are maintained at the MPTCP | RCV.WND and SND.WND state variables, as these are maintained at the MPTCP | |||
connection level and not at the subflow level.</t> | connection level and not at the subflow level.</t> | |||
<t>Inside each TCP control block, the following state variables are defi | ||||
<t>Inside each TCP control block, the following state variables are defined.</t> | ned.</t> | |||
<section numbered="true" toc="default"> | ||||
<section title="Sending Side"> | <name>Sending Side</name> | |||
<t><list style="hanging"> | <dl newline="false" spacing="normal" indent="3"> | |||
<t hangText="SND.UNA (32 bits):"> This is the sequence number of the next byte t | <dt>SND.UNA (32 bits):</dt> | |||
o be | <dd> This is the sequence number of the next byte to be | |||
acknowledged on the subflow. This variable is updated upon reception of | acknowledged on the subflow. This variable is updated upon reception of | |||
each TCP acknowledgment on the subflow.</t> | each TCP acknowledgment on the subflow.</dd> | |||
<t hangText="SND.NXT (32 bits):"> This is the sequence number of the next byte t | <dt>SND.NXT (32 bits):</dt> | |||
o be | <dd> This is the sequence number of the next byte to be | |||
sent on the subflow. SND.NXT is used to set the value of SEG.SEQ upon | sent on the subflow. SND.NXT is used to set the value of SEG.SEQ upon | |||
transmission of the next segment.</t> | transmission of the next segment.</dd> | |||
</list></t> | </dl> | |||
</section> | </section> | |||
<section numbered="true" toc="default"> | ||||
<section title="Receiving Side"> | <name>Receiving Side</name> | |||
<t><list style="hanging"> | <dl newline="false" spacing="normal" indent="3"> | |||
<t hangText="RCV.NXT (32 bits):"> This is the sequence number of the next byte t | <dt>RCV.NXT (32 bits):</dt> | |||
hat | <dd> This is the sequence number of the next byte that | |||
is expected on the subflow. This state variable is modified upon | is expected on the subflow. This state variable is modified upon | |||
reception of in-order segments. The value of RCV.NXT is copied to the | reception of in-order segments. The value of RCV.NXT is copied to the | |||
SEG.ACK field of the next segments transmitted on the subflow.</t> | SEG.ACK field of the next segments transmitted on the subflow.</dd> | |||
<t hangText="RCV.WND (32 bits with RFC 7323, 16 bits otherwise):"> This is the | <dt>RCV.WND (32 bits):</dt> | |||
subflow-level receive window that is updated with the window field from the | <dd>This is the subflow-level receive window that is updated with | |||
segments received on this subflow.</t> | the window field from the segments received on this subflow. 3 | |||
</list></t> | 2 | |||
</section> | bits if the features in RFC 7323 are used; 16 bits otherwise.</dd> | |||
</section> | </dl> | |||
</section> | ||||
</section> | </section> | |||
</section> | ||||
<section title="Finite State Machine" anchor="app_fsm"> | <section anchor="app_fsm" numbered="true" toc="default"> | |||
<t>The diagram in <xref target="fig_fsm"/> shows the Finite State Machine | <name>Finite State Machine</name> | |||
for connection-level closure. This illustrates how the DATA_FIN connection-leve | <t>The diagram in <xref target="fig_fsm" format="default"/> shows the | |||
l signal (indicated in the diagram as the DFIN flag on a DATA_ACK) interacts wit | Finite State Machine for connection-level closure. This illustrates how | |||
h subflow-level FINs, and permits "break-before-make" handover between subflows. | the DATA_FIN connection-level signal (indicated in the diagram as the | |||
</t> | DFIN flag on a DATA_ACK) (1) interacts with subflow-level FINs and (2) per | |||
mits break-before-make handover between subflows.</t> | ||||
<figure align="center" anchor="fig_fsm" title="Finite State Machine for Co | <figure anchor="fig_fsm"> | |||
nnection Closure"> | <name>Finite State Machine for Connection Closure</name> | |||
<artwork align="left"><![CDATA[ | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
+---------+ | +---------+ | |||
| M_ESTAB | | | M_ESTAB | | |||
+---------+ | +---------+ | |||
M_CLOSE | | rcv DATA_FIN | M_CLOSE | | rcv DATA_FIN | |||
------- | | ------- | ------- | | ------- | |||
+---------+ snd DATA_FIN / \ snd DATA_ACK[DFIN] +---------+ | +---------+ snd DATA_FIN / \ snd DATA_ACK[DFIN] +-------+ | |||
| M_FIN |<----------------- ------------------->| M_CLOSE | | | M_FIN |<----------------- ------------------->|M_CLOSE| | |||
| WAIT-1 |--------------------------- | WAIT | | | WAIT-1 |--------------------------- | WAIT | | |||
+---------+ rcv DATA_FIN \ +---------+ | +---------+ rcv DATA_FIN \ +-------+ | |||
| rcv DATA_ACK[DFIN] ------- | M_CLOSE | | | rcv DATA_ACK[DFIN] ------- | M_CLOSE | | |||
| -------------- snd DATA_ACK | ------- | | | -------------- snd DATA_ACK | ------- | | |||
| CLOSE all subflows | snd DATA_FIN | | | CLOSE all subflows | snd DATA_FIN | | |||
V V V | V V V | |||
+-----------+ +-----------+ +-----------+ | +-----------+ +-----------+ +----------+ | |||
|M_FINWAIT-2| | M_CLOSING | | M_LAST-ACK| | |M_FINWAIT-2| | M_CLOSING | |M_LAST-ACK| | |||
+-----------+ +-----------+ +-----------+ | +-----------+ +-----------+ +----------+ | |||
| rcv DATA_ACK[DFIN] | rcv DATA_ACK[DFIN] | | | rcv DATA_ACK[DFIN] | rcv DATA_ACK[DFIN] | | |||
| rcv DATA_FIN -------------- | -------------- | | | rcv DATA_FIN -------------- | -------------- | | |||
| ------- CLOSE all subflows | CLOSE all subflows | | | ------- CLOSE all subflows | CLOSE all subflows | | |||
| snd DATA_ACK[DFIN] V delete MPTCP PCB V | | snd DATA_ACK[DFIN] V delete MPTCP PCB V | |||
\ +-----------+ +---------+ | \ +-----------+ +--------+ | |||
------------------------>|M_TIME WAIT|----------------->| M_CLOSED| | ------------------------>|M_TIME WAIT|---------------->|M_CLOSED| | |||
+-----------+ +---------+ | +-----------+ +--------+ | |||
All subflows in CLOSED | All subflows in CLOSED | |||
------------ | ------------ | |||
delete MPTCP PCB | delete MPTCP PCB ]]></artwork> | |||
]]></artwork> | ||||
</figure> | </figure> | |||
</section> | </section> | |||
<section anchor="app_changelog" numbered="true" toc="default"> | ||||
<name>Changes from RFC 6824</name> | ||||
<t>This appendix lists the key technical changes between <xref target="RFC | ||||
6824"/>, | ||||
which specifies MPTCP v0, and this document, which obsoletes <xref target= | ||||
"RFC6824"/> and specifies MPTCP v1. Note that this specification is not backward | ||||
compatible with <xref target="RFC6824"/>. | ||||
<section title="Changes from RFC6824" anchor="app_changelog"> | </t> | |||
<t>This section lists the key technical changes between RFC6824, specifyin | <ul spacing="normal"> | |||
g MPTCP v0, and this document, which obsoletes RFC6824 and specifies MPTCP v1. N | <li>This document incorporates lessons learned from the various implemen | |||
ote that this specification is not backwards compatible with RFC6824. | tations, deployments, and experiments gathered in the documents "Use Cases and O | |||
perational Experience with Multipath TCP" <xref target="RFC8041" format="default | ||||
<list style="symbols"> | "/> and the IETF Journal article "Multipath TCP Deployments" <xref target="deplo | |||
<t>The document incorporates lessons learnt from the various implementat | yments" format="default"/>.</li> | |||
ions, deployments and experiments gathered in the documents "Use Cases and Opera | <li>Connection initiation, through the exchange of the MP_CAPABLE | |||
tional Experience with Multipath TCP" <xref target="RFC8041"/> and the IETF Jour | MPTCP option, is different from <xref target="RFC6824"/>. The SYN no lon | |||
nal article "Multipath TCP Deployments" <xref target="deployments"/>.</t> | ger | |||
<t>Connection initiation, through the exchange of the MP_CAPABLE MPTCP o | includes the initiator's key, to allow the MP_CAPABLE option on the SYN | |||
ption, is different from RFC6824. The SYN no longer includes the initiator's key | to be shorter in length and to avoid duplicating the sending of keying material. | |||
, allowing the MP_CAPABLE option on the SYN to be shorter in length, and to avoi | </li> | |||
d duplicating the sending of keying material.</t> | <li>This also ensures reliable delivery of the key on the MP_CAPABLE | |||
<t>This also ensures reliable delivery of the key on the MP_CAPABLE opti | option by allowing its transmission to be combined with data and thus | |||
on by allowing its transmission to be combined with data and thus using TCP's in | using TCP's built-in reliability mechanism. If the initiator does not | |||
-built reliability mechanism. If the initiator does not immediately have data to | immediately have data to send, the MP_CAPABLE option with the keys | |||
send, the MP_CAPABLE option with the keys will be repeated on the first data pa | will be repeated on the first data packet. If the other end is the first | |||
cket. If the other end is first to send, then the presence of the DSS option imp | to send, then the presence of the DSS option implicitly confirms the receipt of | |||
licitly confirms the receipt of the MP_CAPABLE.</t> | the MP_CAPABLE.</li> | |||
<t>In the Flags field of MP_CAPABLE, C is now assigned to mean that the | <li>In the Flags field of MP_CAPABLE, "C" is now assigned to mean that | |||
sender of this option will not accept additional MPTCP subflows to the source ad | the sender of this option will not accept additional MPTCP subflows to | |||
dress and port. This is an efficiency improvement, for example where the sender | the source address and port. This improves efficiency -- for example, | |||
is behind a strict NAT.</t> | in cases where the sender is behind a strict NAT.</li> | |||
<t>In the Flags field of MP_CAPABLE, H now indicates the use of HMAC-SHA | <li>In the Flags field of MP_CAPABLE, "H" now indicates the use of HMAC- | |||
256 (rather than HMAC-SHA1).</t> | SHA256 (rather than HMAC-SHA1).</li> | |||
<t>Connection initiation also defines the procedure for version negotiat | <li>Connection initiation also defines the procedure for version negotia | |||
ion, for implementations that support both v0 (RFC6824) and v1 (this document).< | tion, for implementations that support both v0 <xref target="RFC6824"/> and v1 ( | |||
/t> | this document).</li> | |||
<t>The HMAC-SHA256 (rather than HMAC-SHA1) algorithm is used, as the alg | <li>The HMAC-SHA256 (rather than HMAC-SHA1) algorithm is used, as it pro | |||
orithm provides better security. It is used to generate the token in the MP_JOIN | vides better security. It is used to generate the token in the MP_JOIN and ADD_A | |||
and ADD_ADDR messages, and to set the initial data sequence number.</t> | DDR messages and to set the IDSN.</li> | |||
<t>A new subflow-level option exists to signal reasons for sending a RST | <li>A new subflow-level option exists to signal reasons for sending a | |||
on a subflow (MP_TCPRST <xref target="sec_reset"/>), which can help an implemen | RST on a subflow (MP_TCPRST (<xref target="sec_reset" | |||
tation decide whether to attempt later re-connection.</t> | format="default"/>)); this can help an implementation decide whether to | |||
<t>The MP_PRIO option (<xref target="sec_policy"/>), which is used to si | attempt later reconnection.</li> | |||
gnal a change of priority for a subflow, no longer includes the AddrID field. It | <li>The MP_PRIO option (<xref target="sec_policy" format="default"/>), | |||
s purpose was to allow the changed priority to be applied on a subflow other tha | which is used to signal a change of priority for a subflow, no longer | |||
n the one it was sent on. However, it has been realised that this could be used | includes the AddrID field. Its purpose was to allow the changed | |||
by a man-in-the-middle to divert all traffic on to its own path, and MP_PRIO doe | priority to be applied on a subflow other than the one it was sent | |||
s not include a token or other security mechanism.</t> | on. However, it was determined that this could be used by a | |||
<t>The ADD_ADDR option (<xref target="sec_add_address"/>), which is used | man-in-the-middle to divert all traffic onto its own path, and MP_PRIO | |||
to inform the other host about another potential address, is different in sever | does not include a token or other type of security mechanism.</li> | |||
al ways. It now includes an HMAC of the added address, for enhanced security. In | <li>The ADD_ADDR option (<xref target="sec_add_address" format="default" | |||
addition, reliability for the ADD_ADDR option has been added: the IPVer field i | />), which is used to inform the other host about another potential address, is | |||
s replaced with a flag field, and one flag is assigned (E) which is used as an ' | different in several ways. It now includes an HMAC of the added address, for enh | |||
Echo' so a host can indicate that it has received the option.</t> | anced security. In addition, reliability for the ADD_ADDR option has been added: | |||
<t>An additional way of performing a Fast Close is described, by sending | the IPVer field is replaced with a flag field, and one flag is assigned ("E") t | |||
a MP_FASTCLOSE option on a RST on all subflows. This allows the host to tear do | hat is used as an "echo" so a host can indicate that it has received the option. | |||
wn the subflows and the connection immediately.</t> | </li> | |||
<t>In the IANA registry a new MPTCP subtype option, MP_EXPERIMENTAL, is | <li>This document describes an additional way of performing a Fast | |||
reserved for private experiments. However, the document doesn't define how to us | Close -- by sending an MP_FASTCLOSE option on a RST on all subflows. Thi | |||
e the subtype option.</t> | s allows the host to tear down the subflows and the connection immediately.</li> | |||
<t>A new Appendix discusses the usage of both the MPTCP and TCP Fast Ope | <li>IANA has reserved the MPTCP option subtype of value 0xf for | |||
n on the same packet (<xref target="app_tfo"/>).</t> | Private Use (<xref target="IANA_subtypes"/>). This document doesn't defi | |||
</list></t> | ne how to use that value.</li> | |||
<li>This document adds a new appendix (<xref target="app_tfo" | ||||
format="default"/>), which discusses the usage of both MPTCP options | ||||
and TFO options on the same packet.</li> | ||||
</ul> | ||||
</section> | ||||
<section anchor="Acknowledgments" numbered="false" toc="default"> | ||||
<name>Acknowledgments</name> | ||||
<t>The authors gratefully acknowledge significant input into this | ||||
document from <contact fullname="Sebastien Barre"/> and <contact fullname= | ||||
"Andrew McDonald"/>.</t> | ||||
<t>The authors also wish to acknowledge reviews and contributions from | ||||
<contact fullname="Iljitsch van Beijnum"/>, <contact fullname="Lars | ||||
Eggert"/>, <contact fullname="Marcelo Bagnulo"/>, <contact | ||||
fullname="Robert Hancock"/>, <contact fullname="Pasi Sarolahti"/>, | ||||
<contact fullname="Toby Moncaster"/>, <contact fullname="Philip | ||||
Eardley"/>, <contact fullname="Sergio Lembo"/>, <contact | ||||
fullname="Lawrence Conroy"/>, <contact fullname="Yoshifumi Nishida"/>, | ||||
<contact fullname="Bob Briscoe"/>, <contact fullname="Stein Gjessing"/>, | ||||
<contact fullname="Andrew McGregor"/>, <contact fullname="Georg | ||||
Hampel"/>, <contact fullname="Anumita Biswas"/>, <contact fullname="Wes | ||||
Eddy"/>, <contact fullname="Alexey Melnikov"/>, <contact | ||||
fullname="Francis Dupont"/>, <contact fullname="Adrian Farrel"/>, | ||||
<contact fullname="Barry Leiba"/>, <contact fullname="Robert Sparks"/>, | ||||
<contact fullname="Sean Turner"/>, <contact fullname="Stephen | ||||
Farrell"/>, <contact fullname="Martin Stiemerling"/>, <contact | ||||
fullname="Gregory Detal"/>, <contact fullname="Fabien Duchene"/>, | ||||
<contact fullname="Xavier de Foy"/>, <contact fullname="Rahul Jadhav"/>, | ||||
<contact fullname="Klemens Schragel"/>, <contact fullname="Mirja | ||||
Kühlewind"/>, <contact fullname="Sheng Jiang"/>, <contact | ||||
fullname="Alissa Cooper"/>, <contact fullname="Ines Robles"/>, <contact | ||||
fullname="Roman Danyliw"/>, <contact fullname="Adam Roach"/>, | ||||
<contact fullname="Eric Vyncke"/>, and <contact fullname="Ben Kaduk"/>.</t | ||||
> | ||||
</section> | </section> | |||
</back> | </back> | |||
</rfc> | </rfc> | |||
End of changes. 343 change blocks. | ||||
3074 lines changed or deleted | 3156 lines changed or added | |||
This html diff was produced by rfcdiff 1.45. The latest version is available from http://tools.ietf.org/tools/rfcdiff/ |