rfc8670xml2.original.xml | rfc8670.xml | |||
---|---|---|---|---|
<?xml version="1.0" encoding="US-ASCII"?> | <?xml version='1.0' encoding='utf-8'?> | |||
<!DOCTYPE rfc SYSTEM "rfc2629.dtd"> | <!DOCTYPE rfc SYSTEM "rfc2629-xhtml.ent"> | |||
<?rfc toc="yes"?> | <rfc xmlns:xi="http://www.w3.org/2001/XInclude" number="8670" | |||
<?rfc tocompact="yes"?> | category="info" consensus="true" submissionType="IETF" | |||
<?rfc tocdepth="3"?> | docName="draft-ietf-spring-segment-routing-msdc-11" ipr="trust200902" obsol | |||
<?rfc tocindent="yes"?> | etes="" updates="" xml:lang="en" tocInclude="true" symRefs="true" sortRefs="true | |||
<?rfc symrefs="yes"?> | " version="3"> | |||
<?rfc sortrefs="yes"?> | ||||
<?rfc comments="yes"?> | ||||
<?rfc inline="yes"?> | ||||
<?rfc compact="yes"?> | ||||
<?rfc subcompact="no"?> | ||||
<rfc category="info" docName="draft-ietf-spring-segment-routing-msdc-11" | ||||
ipr="trust200902"> | ||||
<front> | ||||
<title abbrev="BGP-Prefix SID in large-scale DCs">BGP-Prefix Segment in | ||||
large-scale data centers</title> | ||||
<author fullname="Clarence Filsfils" initials="C." role="editor" | <front> | |||
surname="Filsfils"> | <title abbrev="BGP Prefix-SID in Large-Scale DCs">BGP Prefix Segment in | |||
Large-Scale Data Centers</title> | ||||
<seriesInfo name="RFC" value="8670"/> | ||||
<author fullname="Clarence Filsfils" initials="C." role="editor" surname="Fi | ||||
lsfils"> | ||||
<organization>Cisco Systems, Inc.</organization> | <organization>Cisco Systems, Inc.</organization> | |||
<address> | <address> | |||
<postal> | <postal> | |||
<street/> | <street/> | |||
<city>Brussels</city> | <city>Brussels</city> | |||
<region/> | <region/> | |||
<code/> | <code/> | |||
<country>Belgium</country> | ||||
<country>BE</country> | ||||
</postal> | </postal> | |||
<email>cfilsfil@cisco.com</email> | <email>cfilsfil@cisco.com</email> | |||
</address> | </address> | |||
</author> | </author> | |||
<author fullname="Stefano Previdi" initials="S." surname="Previdi"> | <author fullname="Stefano Previdi" initials="S." surname="Previdi"> | |||
<organization>Cisco Systems, Inc.</organization> | <organization>Cisco Systems, Inc.</organization> | |||
<address> | <address> | |||
<postal> | <postal> | |||
<street/> | <street/> | |||
<city/> | <city/> | |||
<code/> | <code/> | |||
<country>Italy</country> | <country>Italy</country> | |||
</postal> | </postal> | |||
<email>stefano@previdi.net</email> | <email>stefano@previdi.net</email> | |||
</address> | </address> | |||
</author> | </author> | |||
<author fullname="Gaurav Dawra" initials="G." surname="Dawra"> | <author fullname="Gaurav Dawra" initials="G." surname="Dawra"> | |||
<organization>LinkedIn</organization> | <organization>LinkedIn</organization> | |||
<address> | <address> | |||
<postal> | <postal> | |||
<street/> | <street/> | |||
<city/> | <city/> | |||
<code/> | <code/> | |||
<country>United States of America</country> | ||||
<country>USA</country> | ||||
</postal> | </postal> | |||
<email>gdawra.ietf@gmail.com</email> | <email>gdawra.ietf@gmail.com</email> | |||
</address> | </address> | |||
</author> | </author> | |||
<author fullname="Ebben Aries" initials="E." surname="Aries"> | <author fullname="Ebben Aries" initials="E." surname="Aries"> | |||
<organization>Juniper Networks</organization> | <organization>Arrcus, Inc.</organization> | |||
<address> | <address> | |||
<postal> | <postal> | |||
<street>1133 Innovation Way</street> | <street>2077 Gateway Place, Suite #400</street> | |||
<city>San Jose</city> | ||||
<city>Sunnyvale</city> | <code>CA 95119</code> | |||
<country>United States of America</country> | ||||
<code>CA 94089</code> | ||||
<country>US</country> | ||||
</postal> | </postal> | |||
<email>exa@arrcus.com</email> | ||||
<email>exa@juniper.net</email> | ||||
</address> | </address> | |||
</author> | </author> | |||
<author fullname="Petr Lapukhov" initials="P." surname="Lapukhov"> | <author fullname="Petr Lapukhov" initials="P." surname="Lapukhov"> | |||
<organization>Facebook</organization> | <organization>Facebook</organization> | |||
<address> | <address> | |||
<postal> | <postal> | |||
<street/> | <street/> | |||
<city/> | <city/> | |||
<code/> | <code/> | |||
<country>United States of America</country> | ||||
<country>US</country> | ||||
</postal> | </postal> | |||
<email>petr@fb.com</email> | <email>petr@fb.com</email> | |||
</address> | </address> | |||
</author> | </author> | |||
<date month="December" year="2019"/> | ||||
<date year="2018"/> | ||||
<workgroup>Network Working Group</workgroup> | <workgroup>Network Working Group</workgroup> | |||
<keyword>example</keyword> | ||||
<abstract> | <abstract> | |||
<t>This document describes the motivation and benefits for applying | <t>This document describes the motivation for, and benefits of, applying | |||
segment routing in BGP-based large-scale data-centers. It describes the | Segment Routing (SR) in BGP-based large-scale data centers. It describes t | |||
design to deploy segment routing in those data-centers, for both the | he | |||
MPLS and IPv6 dataplanes.</t> | design to deploy SR in those data centers for both the | |||
MPLS and IPv6 data planes.</t> | ||||
</abstract> | </abstract> | |||
</front> | </front> | |||
<middle> | <middle> | |||
<section anchor="INTRO" title="Introduction"> | <section anchor="INTRO" numbered="true" toc="default"> | |||
<t>Segment Routing (SR), as described in <xref | <name>Introduction</name> | |||
target="I-D.ietf-spring-segment-routing"/> leverages the source routing | <t>Segment Routing (SR), as described in <xref target="RFC8402" format="de | |||
fault"/>, leverages the source-routing | ||||
paradigm. A node steers a packet through an ordered list of | paradigm. A node steers a packet through an ordered list of | |||
instructions, called segments. A segment can represent any instruction, | instructions called "segments". A segment can represent any instruction, | |||
topological or service-based. A segment can have a local semantic to an | topological or service based. A segment can have a local semantic to an | |||
SR node or global within an SR domain. SR allows to enforce a flow | SR node or a global semantic within an SR domain. SR allows the enforcemen | |||
through any topological path while maintaining per-flow state only at | t of a flow | |||
the ingress node to the SR domain. Segment Routing can be applied to the | through any topological path while maintaining per-flow state only from | |||
MPLS and IPv6 data-planes.</t> | the ingress node to the SR domain. SR can be applied to the | |||
MPLS and IPv6 data planes.</t> | ||||
<t>The use-cases described in this document should be considered in the | <t>The use cases described in this document should be considered in the | |||
context of the BGP-based large-scale data-center (DC) design described | context of the BGP-based large-scale data-center (DC) design described | |||
in <xref target="RFC7938"/>. This document extends it by applying SR | in <xref target="RFC7938" format="default"/>. This document extends it by | |||
both with IPv6 and MPLS dataplane.</t> | applying SR | |||
both with IPv6 and MPLS data planes.</t> | ||||
</section> | </section> | |||
<section anchor="LARGESCALEDC" numbered="true" toc="default"> | ||||
<section anchor="LARGESCALEDC" | <name>Large-Scale Data-Center Network Design Summary</name> | |||
title="Large Scale Data Center Network Design Summary"> | <t>This section provides a brief summary of the Informational RFC | |||
<t>This section provides a brief summary of the informational document | <xref target="RFC7938" format="default"/>, which outlines a practical netw | |||
<xref target="RFC7938"/> that outlines a practical network design | ork design | |||
suitable for data-centers of various scales:<list style="symbols"> | suitable for data centers of various scales:</t> | |||
<t>Data-center networks have highly symmetric topologies with | <ul spacing="normal"> | |||
multiple parallel paths between two server attachment points. The | <li>Data-center networks have highly symmetric topologies with | |||
multiple parallel paths between two server-attachment points. The | ||||
well-known Clos topology is most popular among the operators (as | well-known Clos topology is most popular among the operators (as | |||
described in <xref target="RFC7938"/>). In a Clos topology, the | described in <xref target="RFC7938" format="default"/>). In a Clos topology, the | |||
minimum number of parallel paths between two elements is determined | minimum number of parallel paths between two elements is determined | |||
by the "width" of the "Tier-1" stage. See <xref target="FIGLARGE"/> | by the "width" of the "Tier-1" stage. See <xref target="FIGLARGE" form | |||
below for an illustration of the concept.</t> | at="default"/> | |||
for an illustration of the concept.</li> | ||||
<t>Large-scale data-centers commonly use a routing protocol, such as | <li>Large-scale data centers commonly use a routing protocol, such as | |||
BGP-4 <xref target="RFC4271"/> in order to provide endpoint | BGP-4 <xref target="RFC4271" format="default"/>, in order to provide e | |||
connectivity. Recovery after a network failure is therefore driven | ndpoint | |||
connectivity. Therefore, recovery after a network failure is driven | ||||
either by local knowledge of directly available backup paths or by | either by local knowledge of directly available backup paths or by | |||
distributed signaling between the network devices.</t> | distributed signaling between the network devices.</li> | |||
<li>Within data-center networks, traffic is load shared using the | ||||
<t>Within data-center networks, traffic is load-shared using the | ||||
Equal Cost Multipath (ECMP) mechanism. With ECMP, every network | Equal Cost Multipath (ECMP) mechanism. With ECMP, every network | |||
device implements a pseudo-random decision, mapping packets to one | device implements a pseudorandom decision, mapping packets to one | |||
of the parallel paths by means of a hash function calculated over | of the parallel paths by means of a hash function calculated over | |||
certain parts of the packet, typically a combination of various | certain parts of the packet, typically a combination of various | |||
packet header fields.</t> | packet header fields.</li> | |||
</list></t> | </ul> | |||
<t>The following is a schematic of a five-stage Clos topology with four | ||||
<t>The following is a schematic of a five-stage Clos topology, with four | devices in the "Tier-1" stage. Notice that the number of paths between Nod | |||
devices in the "Tier-1" stage. Notice that number of paths between Node1 | e1 | |||
and Node12 equals to four: the paths have to cross all of Tier-1 | and Node12 equals four; the paths have to cross all of the Tier-1 | |||
devices. At the same time, the number of paths between Node1 and Node2 | devices. At the same time, the number of paths between Node1 and Node2 | |||
equals two, and the paths only cross Tier-2 devices. Other topologies | equals two, and the paths only cross Tier-2 devices. Other topologies | |||
are possible, but for simplicity only the topologies that have a single | are possible, but for simplicity, only the topologies that have a single | |||
path from Tier-1 to Tier-3 are considered below. The rest could be | path from Tier-1 to Tier-3 are considered below. The rest could be | |||
treated similarly, with a few modifications to the logic.</t> | treated similarly, with a few modifications to the logic.</t> | |||
<section anchor="REFDESIGN" numbered="true" toc="default"> | ||||
<section anchor="REFDESIGN" title="Reference design"> | <name>Reference Design</name> | |||
<figure anchor="FIGLARGE" title="5-stage Clos topology"> | <figure anchor="FIGLARGE"> | |||
<artwork> Tier-1 | <name>5-Stage Clos Topology</name> | |||
<artwork name="" type="" align="left" alt=""><![CDATA[ | ||||
Tier-1 | ||||
+-----+ | +-----+ | |||
|NODE | | |NODE | | |||
+->| 5 |--+ | +->| 5 |--+ | |||
| +-----+ | | | +-----+ | | |||
Tier-2 | | Tier-2 | Tier-2 | | Tier-2 | |||
+-----+ | +-----+ | +-----+ | +-----+ | +-----+ | +-----+ | |||
+------------>|NODE |--+->|NODE |--+--|NODE |-------------+ | +------------>|NODE |--+->|NODE |--+--|NODE |-------------+ | |||
| +-----| 3 |--+ | 6 | +--| 9 |-----+ | | | +-----| 3 |--+ | 6 | +--| 9 |-----+ | | |||
| | +-----+ +-----+ +-----+ | | | | | +-----+ +-----+ +-----+ | | | |||
| | | | | | | | | | |||
| | +-----+ +-----+ +-----+ | | | | | +-----+ +-----+ +-----+ | | | |||
| +-----+---->|NODE |--+ |NODE | +--|NODE |-----+-----+ | | | +-----+---->|NODE |--+ |NODE | +--|NODE |-----+-----+ | | |||
| | | +---| 4 |--+->| 7 |--+--| 10 |---+ | | | | | | | +---| 4 |--+->| 7 |--+--| 10 |---+ | | | | |||
| | | | +-----+ | +-----+ | +-----+ | | | | | | | | | +-----+ | +-----+ | +-----+ | | | | | |||
| | | | | | | | | | | | | | | | | | | | | | |||
+-----+ +-----+ | +-----+ | +-----+ +-----+ | +-----+ +-----+ | +-----+ | +-----+ +-----+ | |||
|NODE | |NODE | Tier-3 +->|NODE |--+ Tier-3 |NODE | |NODE | | |NODE | |NODE | Tier-3 +->|NODE |--+ Tier-3 |NODE | |NODE | | |||
| 1 | | 2 | | 8 | | 11 | | 12 | | | 1 | | 2 | | 8 | | 11 | | 12 | | |||
+-----+ +-----+ +-----+ +-----+ +-----+ | +-----+ +-----+ +-----+ +-----+ +-----+ | |||
| | | | | | | | | | | | | | | | | | |||
A O B O <- Servers -> Z O O O | A O B O <- Servers -> Z O O O]]></artwork> | |||
</artwork> | ||||
</figure> | </figure> | |||
<t>In the reference topology illustrated in <xref target="FIGLARGE" form | ||||
at="default"/>, | ||||
it is assumed:</t> | ||||
<ul spacing="normal"> | ||||
<li> | ||||
<t>Each node is its own autonomous system (AS) (Node X has AS X). 4- | ||||
byte AS numbers | ||||
are recommended (<xref target="RFC6793" format="default"/>).</t> | ||||
<ul spacing="normal"> | ||||
<li>For simple and efficient route propagation filtering, | ||||
Node5, Node6, Node7, and Node8 use the same AS; Node3 and Node4 | ||||
use the same AS; and Node9 and Node10 use the same AS.</li> | ||||
<t>In the reference topology illustrated in <xref target="FIGLARGE"/>, | <li>In the case in which 2-byte autonomous system numbers are used | |||
It is assumed:<list style="symbols"> | ||||
<t>Each node is its own AS (Node X has AS X). 4-byte AS numbers | ||||
are recommended (<xref target="RFC6793"/>).<list> | ||||
<t>For simple and efficient route propagation filtering, | ||||
Node5, Node6, Node7 and Node8 use the same AS, Node3 and Node4 | ||||
use the same AS, Node9 and Node10 use the same AS.</t> | ||||
<t>In case of 2-byte autonomous system numbers are used and | ||||
for efficient usage of the scarce 2-byte Private Use AS pool, | for efficient usage of the scarce 2-byte Private Use AS pool, | |||
different Tier-3 nodes might use the same AS.</t> | different Tier-3 nodes might use the same AS.</li> | |||
<li>Without loss of generality, these details will be | ||||
<t>Without loss of generality, these details will be | simplified in this document. It is to be assumed that each node | |||
simplified in this document and assume that each node has its | has its | |||
own AS.</t> | own AS.</li> | |||
</list></t> | </ul> | |||
</li> | ||||
<t>Each node peers with its neighbors with a BGP session. If not | ||||
specified, eBGP is assumed. In a specific use-case, iBGP will be | ||||
used but this will be called out explicitly in that case.</t> | ||||
<li>Each node peers with its neighbors with a BGP session. If not | ||||
specified, external BGP (EBGP) is assumed. In a specific use case, | ||||
internal BGP (IBGP) will be used, but this will be called out | ||||
explicitly in that case.</li> | ||||
<li> | ||||
<t>Each node originates the IPv4 address of its loopback interface | <t>Each node originates the IPv4 address of its loopback interface | |||
into BGP and announces it to its neighbors. <list> | into BGP and announces it to its neighbors. </t> | |||
<t>The loopback of Node X is 192.0.2.x/32.</t> | <ul spacing="normal"> | |||
</list></t> | <li>The loopback of Node X is 192.0.2.x/32.</li> | |||
</list></t> | </ul> | |||
</li> | ||||
<t>In this document, the Tier-1, Tier-2 and Tier-3 nodes are referred | </ul> | |||
to respectively as Spine, Leaf and ToR (top of rack) nodes. When a ToR | <t>In this document, the Tier-1, Tier-2, and Tier-3 nodes are referred | |||
to as "Spine", "Leaf", and "ToR" (top of rack) nodes, respectively. Whe | ||||
n a ToR | ||||
node acts as a gateway to the "outside world", it is referred to as a | node acts as a gateway to the "outside world", it is referred to as a | |||
border node.</t> | "border node".</t> | |||
</section> | </section> | |||
</section> | </section> | |||
<section anchor="OPENPROBS" numbered="true" toc="default"> | ||||
<section anchor="OPENPROBS" | <name>Some Open Problems in Large Data-Center Networks</name> | |||
title="Some open problems in large data-center networks"> | <t>The data-center-network design summarized above provides means for | |||
<t>The data-center network design summarized above provides means for | ||||
moving traffic between hosts with reasonable efficiency. There are few | moving traffic between hosts with reasonable efficiency. There are few | |||
open performance and reliability problems that arise in such design: | open performance and reliability problems that arise in such a design: | |||
<list style="symbols"> | </t> | |||
<t>ECMP routing is most commonly realized per-flow. This means that | <ul spacing="normal"> | |||
<li>ECMP routing is most commonly realized per flow. This means that | ||||
large, long-lived "elephant" flows may affect performance of | large, long-lived "elephant" flows may affect performance of | |||
smaller, short-lived “mouse” flows and reduce efficiency | smaller, short-lived "mouse" flows and may reduce efficiency | |||
of per-flow load-sharing. In other words, per-flow ECMP does not | of per-flow load sharing. In other words, per-flow ECMP does not | |||
perform efficiently when flow lifetime distribution is heavy-tailed. | perform efficiently when flow-lifetime distribution is heavy tailed. | |||
Furthermore, due to hash-function inefficiencies it is possible to | Furthermore, due to hash-function inefficiencies, it is possible to | |||
have frequent flow collisions, where more flows get placed on one | have frequent flow collisions where more flows get placed on one | |||
path over the others.</t> | path over the others.</li> | |||
<li>Shortest-path routing with ECMP implements an oblivious routing | ||||
<t>Shortest-path routing with ECMP implements an oblivious routing | model that is not aware of the network imbalances. If the network | |||
model, which is not aware of the network imbalances. If the network | symmetry is broken, for example, due to link failures, utilization | |||
symmetry is broken, for example due to link failures, utilization | ||||
hotspots may appear. For example, if a link fails between Tier-1 and | hotspots may appear. For example, if a link fails between Tier-1 and | |||
Tier-2 devices (e.g. Node5 and Node9), Tier-3 devices Node1 and | Tier-2 devices (e.g., Node5 and Node9), Tier-3 devices Node1 and | |||
Node2 will not be aware of that, since there are other paths | Node2 will not be aware of that since there are other paths | |||
available from perspective of Node3. They will continue sending | available from the perspective of Node3. They will continue sending | |||
roughly equal traffic to Node3 and Node4 as if the failure didn't | roughly equal traffic to Node3 and Node4 as if the failure didn't | |||
exist which may cause a traffic hotspot.</t> | exist, which may cause a traffic hotspot.</li> | |||
<li>Isolating faults in the network with multiple parallel paths and | ||||
<t>Isolating faults in the network with multiple parallel paths and | ECMP-based routing is nontrivial due to lack of determinism. | |||
ECMP-based routing is non-trivial due to lack of determinism. | ||||
Specifically, the connections from HostA to HostB may take a | Specifically, the connections from HostA to HostB may take a | |||
different path every time a new connection is formed, thus making | different path every time a new connection is formed, thus making | |||
consistent reproduction of a failure much more difficult. This | consistent reproduction of a failure much more difficult. This | |||
complexity scales linearly with the number of parallel paths in the | complexity scales linearly with the number of parallel paths in the | |||
network, and stems from the random nature of path selection by the | network and stems from the random nature of path selection by the | |||
network devices.</t> | network devices.</li> | |||
</list></t> | </ul> | |||
<t>First, it will be explained how to apply SR in the DC, for MPLS and | ||||
IPv6 data-planes.</t> | ||||
</section> | </section> | |||
<section anchor="APPLYSR" numbered="true" toc="default"> | ||||
<section anchor="APPLYSR" | <name>Applying Segment Routing in the DC with MPLS Data Plane</name> | |||
title="Applying Segment Routing in the DC with MPLS dataplane"> | <section anchor="BGPREFIXSEGMENT" numbered="true" toc="default"> | |||
<section anchor="BGPREFIXSEGMENT" | <name>BGP Prefix Segment (BGP Prefix-SID)</name> | |||
title="BGP Prefix Segment (BGP-Prefix-SID)"> | ||||
<t>A BGP Prefix Segment is a segment associated with a BGP prefix. A | <t>A BGP Prefix Segment is a segment associated with a BGP prefix. A | |||
BGP Prefix Segment is a network-wide instruction to forward the packet | BGP Prefix Segment is a network-wide instruction to forward the packet | |||
along the ECMP-aware best path to the related prefix.</t> | along the ECMP-aware best path to the related prefix.</t> | |||
<t>The BGP Prefix Segment is defined as the BGP Prefix-SID Attribute | ||||
<t>The BGP Prefix Segment is defined as the BGP-Prefix-SID Attribute | in <xref target="RFC8669" format="default"/>, which contains an | |||
in <xref target="I-D.ietf-idr-bgp-prefix-sid"/> which contains an | index. Throughout this document, the BGP Prefix Segment Attribute is | |||
index. Throughout this document the BGP Prefix Segment Attribute is | referred to as the "BGP Prefix-SID" and the encoded index as the | |||
referred as the BGP-Prefix-SID and the encoded index as the | label index.</t> | |||
label-index.</t> | ||||
<t>In this document, the network design decision has been made to | <t>In this document, the network design decision has been made to | |||
assume that all the nodes are allocated the same SRGB (Segment Routing | assume that all the nodes are allocated the same SRGB (Segment Routing | |||
Global Block), e.g. [16000, 23999]. This provides operational | Global Block), e.g., [16000, 23999]. This provides operational | |||
simplification as explained in <xref target="SINGLESRGB"/>, but this | simplification as explained in <xref target="SINGLESRGB" format="default | |||
"/>, but this | ||||
is not a requirement.</t> | is not a requirement.</t> | |||
<t>For illustration purposes, when considering an MPLS data plane, it | ||||
<t>For illustration purpose, when considering an MPLS data-plane, it | is assumed that the label index allocated to prefix 192.0.2.x/32 is X. | |||
is assumed that the label-index allocated to prefix 192.0.2.x/32 is X. | ||||
As a result, a local label (16000+x) is allocated for prefix | As a result, a local label (16000+x) is allocated for prefix | |||
192.0.2.x/32 by each node throughout the DC fabric.</t> | 192.0.2.x/32 by each node throughout the DC fabric.</t> | |||
<t>When the IPv6 data plane is considered, it is assumed that Node X is | ||||
<t>When IPv6 data-plane is considered, it is assumed that Node X is | ||||
allocated IPv6 address (segment) 2001:DB8::X.</t> | allocated IPv6 address (segment) 2001:DB8::X.</t> | |||
</section> | </section> | |||
<section anchor="eBGP8277" numbered="true" toc="default"> | ||||
<section anchor="eBGP8277" title="eBGP Labeled Unicast (RFC8277)"> | <name>EBGP Labeled Unicast (RFC 8277)</name> | |||
<t>Referring to <xref target="FIGLARGE"/> and <xref | <t>Referring to <xref target="FIGLARGE" format="default"/> and | |||
target="RFC7938"/>, the following design modifications are | <xref target="RFC7938" format="default"/>, the following design modificat | |||
introduced:<list style="symbols"> | ions are | |||
<t>Each node peers with its neighbors via a eBGP session with | introduced:</t> | |||
extensions defined in <xref target="RFC8277"/> (named "eBGP8277" | <ul spacing="normal"> | |||
throughout this document) and with the BGP-Prefix-SID attribute | <li>Each node peers with its neighbors via an EBGP session with | |||
extension as defined in <xref | extensions defined in <xref target="RFC8277" format="default"/> (nam | |||
target="I-D.ietf-idr-bgp-prefix-sid"/>.</t> | ed "EBGP8277" | |||
throughout this document) and with the BGP Prefix-SID attribute | ||||
<t>The forwarding plane at Tier-2 and Tier-1 is MPLS.</t> | extension as defined in <xref target="RFC8669" format="default"/>.</ | |||
li> | ||||
<t>The forwarding plane at Tier-3 is either IP2MPLS (if the host | <li>The forwarding plane at Tier-2 and Tier-1 is MPLS.</li> | |||
sends IP traffic) or MPLS2MPLS (if the host sends MPLS- | <li>The forwarding plane at Tier-3 is either IP2MPLS (if the host | |||
encapsulated traffic).</t> | sends IP traffic) or MPLS2MPLS (if the host sends MPLS-encapsulated | |||
</list></t> | traffic).</li> | |||
</ul> | ||||
<t><xref target="FIGSMALL"/> zooms into a path from server A to server | <t><xref target="FIGSMALL" format="default"/> zooms into a path from Ser | |||
Z within the topology of <xref target="FIGLARGE"/>.</t> | verA to ServerZ within the topology of <xref target="FIGLARGE" format="default"/ | |||
>.</t> | ||||
<figure anchor="FIGSMALL" | <figure anchor="FIGSMALL"> | |||
title="Path from A to Z via nodes 1, 4, 7, 10 and 11"> | <name>Path from A to Z via Nodes 1, 4, 7, 10, and 11</name> | |||
<artwork> +-----+ +-----+ +-----+ | <artwork name="" type="" align="left" alt=""><![CDATA[ | |||
+---------->|NODE | |NODE | |NODE | | +-----+ +-----+ +-----+ | |||
| | 4 |--+->| 7 |--+--| 10 |---+ | +---------->|NODE | |NODE | |NODE | | |||
| | 4 |--+->| 7 |--+--| 10 |---+ | ||||
| +-----+ +-----+ +-----+ | | | +-----+ +-----+ +-----+ | | |||
| | | | | | |||
+-----+ +-----+ | +-----+ +-----+ | |||
|NODE | |NODE | | |NODE | |NODE | | |||
| 1 | | 11 | | | 1 | | 11 | | |||
+-----+ +-----+ | +-----+ +-----+ | |||
| | | | | | |||
A <- Servers -> Z | A <- Servers -> Z]]></artwork> | |||
</artwork> | ||||
</figure> | </figure> | |||
<t>Referring to Figures <xref target="FIGLARGE" | ||||
<t>Referring to <xref target="FIGLARGE"/> and <xref | format="counter"/> and <xref target="FIGSMALL" format="counter"/>, and as | |||
target="FIGSMALL"/> and assuming the IP address with the AS and | suming the IP address with the AS and | |||
label-index allocation previously described, the following sections | label-index allocation previously described, the following sections | |||
detail the control plane operation and the data plane states for the | detail the control-plane operation and the data-plane states for the | |||
prefix 192.0.2.11/32 (loopback of Node11)</t> | prefix 192.0.2.11/32 (loopback of Node11).</t> | |||
<section anchor="CONTROLPLANE" numbered="true" toc="default"> | ||||
<section anchor="CONTROLPLANE" title="Control Plane"> | <name>Control Plane</name> | |||
<t>Node11 originates 192.0.2.11/32 in BGP and allocates to it a | <t>Node11 originates 192.0.2.11/32 in BGP and allocates to it a | |||
BGP-Prefix-SID with label-index: index11 <xref | BGP Prefix-SID with label-index: index11 <xref target="RFC8669" format | |||
target="I-D.ietf-idr-bgp-prefix-sid"/>.</t> | ="default"/>.</t> | |||
<t>Node11 sends the following EBGP8277 update to Node10:</t> | ||||
<t>Node11 sends the following eBGP8277 update to Node10:<figure> | <ul empty="true"> | |||
<artwork>. IP Prefix: 192.0.2.11/32 | ||||
. Label: Implicit-Null | <li> | |||
. Next-hop: Node11’s interface address on the link to Node10 | <dl> | |||
. AS Path: {11} | ||||
. BGP-Prefix-SID: Label-Index 11 | <dt>IP Prefix: | |||
</artwork> | </dt> | |||
</figure></t> | <dd>192.0.2.11/32 | |||
</dd> | ||||
<dt>Label: | ||||
</dt> | ||||
<dd>Implicit NULL | ||||
</dd> | ||||
<dt>Next hop: | ||||
</dt> | ||||
<dd>Node11's interface address on the link to Node10 | ||||
</dd> | ||||
<dt>AS Path: | ||||
</dt> | ||||
<dd>{11} | ||||
</dd> | ||||
<dt>BGP Prefix-SID: | ||||
</dt> | ||||
<dd>Label-Index 11 | ||||
</dd> | ||||
</dl> | ||||
</li> | ||||
</ul> | ||||
<t>Node10 receives the above update. As it is SR capable, Node10 is | <t>Node10 receives the above update. As it is SR capable, Node10 is | |||
able to interpret the BGP-Prefix-SID and hence understands that it | able to interpret the BGP Prefix-SID; therefore, it understands that it | |||
should allocate the label from its own SRGB block, offset by the | should allocate the label from its own SRGB block, offset by the | |||
Label-Index received in the BGP-Prefix-SID (16000+11 hence 16011) to | label index received in the BGP Prefix-SID (16000+11, hence, 16011) to | |||
the NLRI instead of allocating a non-deterministic label out of a | the Network Layer Reachability Information (NLRI) instead of | |||
dynamically allocated portion of the local label space. The | allocating a nondeterministic label out of a dynamically allocated | |||
implicit-null label in the NLRI tells Node10 that it is the | portion of the local label space. The implicit NULL label in the | |||
penultimate hop and must pop the top label on the stack before | NLRI tells Node10 that it is the penultimate hop and that it must pop | |||
forwarding traffic for this prefix to Node11.</t> | the | |||
top label on the stack before forwarding traffic for this prefix to | ||||
Node11.</t> | ||||
<t>Then, Node10 sends the following EBGP8277 update to Node7:</t> | ||||
<t>Then, Node10 sends the following eBGP8277 update to Node7:<figure> | <ul empty="true"> | |||
<artwork>. IP Prefix: 192.0.2.11/32 | ||||
. Label: 16011 | <li> | |||
. Next-hop: Node10’s interface address on the link to Node7 | <dl> | |||
. AS Path: {10, 11} | ||||
. BGP-Prefix-SID: Label-Index 11 | <dt>IP Prefix: | |||
</artwork> | </dt> | |||
</figure></t> | <dd>192.0.2.11/32 | |||
</dd> | ||||
<dt>Label: | ||||
</dt> | ||||
<dd>16011 | ||||
</dd> | ||||
<dt>Next hop: | ||||
</dt> | ||||
<dd>Node10's interface address on the link to Node7 | ||||
</dd> | ||||
<dt>AS Path: | ||||
</dt> | ||||
<dd>{10, 11} | ||||
</dd> | ||||
<dt>BGP Prefix-SID: | ||||
</dt> | ||||
<dd>Label-Index 11 | ||||
</dd> | ||||
</dl> | ||||
</li> | ||||
</ul> | ||||
<t>Node7 receives the above update. As it is SR capable, Node7 is | <t>Node7 receives the above update. As it is SR capable, Node7 is | |||
able to interpret the BGP-Prefix-SID and hence allocates the local | able to interpret the BGP Prefix-SID; therefore, it allocates the local | |||
(incoming) label 16011 (16000 + 11) to the NLRI (instead of | (incoming) label 16011 (16000 + 11) to the NLRI (instead of | |||
allocating a “dynamic” local label from its label | allocating a "dynamic" local label from its label | |||
manager). Node7 uses the label in the received eBGP8277 NLRI as the | manager). Node7 uses the label in the received EBGP8277 NLRI as the | |||
outgoing label (the index is only used to derive the local/incoming | outgoing label (the index is only used to derive the local/incoming | |||
label).</t> | label).</t> | |||
<t>Node7 sends the following EBGP8277 update to Node4:</t> | ||||
<t>Node7 sends the following eBGP8277 update to Node4:<figure> | <ul empty="true"> | |||
<artwork>. IP Prefix: 192.0.2.11/32 | ||||
. Label: 16011 | <li> | |||
. Next-hop: Node7’s interface address on the link to Node4 | <dl> | |||
. AS Path: {7, 10, 11} | ||||
. BGP-Prefix-SID: Label-Index 11 | <dt>IP Prefix: | |||
</artwork> | </dt> | |||
</figure></t> | <dd>192.0.2.11/32 | |||
</dd> | ||||
<dt>Label: | ||||
</dt> | ||||
<dd>16011 | ||||
</dd> | ||||
<dt>Next hop: | ||||
</dt> | ||||
<dd>Node7's interface address on the link to Node4 | ||||
</dd> | ||||
<dt>AS Path: | ||||
</dt> | ||||
<dd>{7, 10, 11} | ||||
</dd> | ||||
<dt>BGP Prefix-SID: | ||||
</dt> | ||||
<dd>Label-Index 11 | ||||
</dd> | ||||
</dl> | ||||
</li> | ||||
</ul> | ||||
<t>Node4 receives the above update. As it is SR capable, Node4 is | <t>Node4 receives the above update. As it is SR capable, Node4 is | |||
            able to interpret the BGP-Prefix-SID and hence allocates the local | able to interpret the BGP Prefix-SID; therefore, it allocates the local | |||
(incoming) label 16011 to the NLRI (instead of allocating a | (incoming) label 16011 to the NLRI (instead of allocating a | |||
“dynamic” local label from its label manager). Node4 | "dynamic" local label from its label manager). Node4 | |||
uses the label in the received eBGP8277 NLRI as outgoing label (the | uses the label in the received EBGP8277 NLRI as an outgoing label (the | |||
index is only used to derive the local/incoming label).</t> | index is only used to derive the local/incoming label).</t> | |||
<t>Node4 sends the following EBGP8277 update to Node1:</t> | ||||
<t>Node4 sends the following eBGP8277 update to Node1:<figure> | <ul empty="true"> | |||
<artwork>. IP Prefix: 192.0.2.11/32 | ||||
. Label: 16011 | <li> | |||
. Next-hop: Node4’s interface address on the link to Node1 | <dl> | |||
. AS Path: {4, 7, 10, 11} | ||||
. BGP-Prefix-SID: Label-Index 11 | <dt>IP Prefix: | |||
</artwork> | </dt> | |||
</figure></t> | <dd>192.0.2.11/32 | |||
</dd> | ||||
<dt>Label: | ||||
</dt> | ||||
<dd>16011 | ||||
</dd> | ||||
<dt>Next hop: | ||||
</dt> | ||||
<dd>Node4's interface address on the link to Node1 | ||||
</dd> | ||||
<dt>AS Path: | ||||
</dt> | ||||
<dd>{4, 7, 10, 11} | ||||
</dd> | ||||
<dt>BGP Prefix-SID: | ||||
</dt> | ||||
<dd>Label-Index 11 | ||||
</dd> | ||||
</dl> | ||||
</li> | ||||
</ul> | ||||
<t>Node1 receives the above update. As it is SR capable, Node1 is | <t>Node1 receives the above update. As it is SR capable, Node1 is | |||
            able to interpret the BGP-Prefix-SID and hence allocates the local | able to interpret the BGP Prefix-SID; therefore, it allocates the local | |||
(incoming) label 16011 to the NLRI (instead of allocating a | (incoming) label 16011 to the NLRI (instead of allocating a | |||
“dynamic” local label from its label manager). Node1 | "dynamic" local label from its label manager). Node1 | |||
uses the label in the received eBGP8277 NLRI as outgoing label (the | uses the label in the received EBGP8277 NLRI as an outgoing label (the | |||
index is only used to derive the local/incoming label).</t> | index is only used to derive the local/incoming label).</t> | |||
</section> | </section> | |||
<section anchor="DATAPLANE" numbered="true" toc="default"> | ||||
<section anchor="DATAPLANE" title="Data Plane"> | <name>Data Plane</name> | |||
<t>Referring to <xref target="FIGLARGE"/>, and assuming all nodes | <t>Referring to <xref target="FIGLARGE" format="default"/>, and assumi | |||
ng all nodes | ||||
apply the same advertisement rules described above and all nodes | apply the same advertisement rules described above and all nodes | |||
have the same SRGB (16000-23999), here are the IP/MPLS forwarding | have the same SRGB (16000-23999), here are the IP/MPLS forwarding | |||
tables for prefix 192.0.2.11/32 at Node1, Node4, Node7 and | tables for prefix 192.0.2.11/32 at Node1, Node4, Node7, and | |||
Node10.</t> | Node10.</t> | |||
<figure align="left" anchor="NODE1FIB" | <table anchor="NODE1FIB"> | |||
title="Node1 Forwarding Table"> | ||||
<artwork align="center">-------------------------------------------- | ||||
--- | ||||
Incoming label | outgoing label | Outgoing | ||||
or IP destination | | Interface | ||||
16011 | 16011 | ECMP{3, 4} | ||||
192.0.2.11/32 | 16011 | ECMP{3, 4} | ||||
</figure> | ||||
<figure anchor="NODE4FIB" suppress-title="false" | <name>Node1 Forwarding Table | |||
title="Node4 Forwarding Table"> | </name> | |||
<artwork align="center"> | ||||
Incoming label | outgoing label | Outgoing | ||||
or IP destination | | Interface | ||||
16011 | 16011 | ECMP{7, 8} | ||||
192.0.2.11/32 | 16011 | ECMP{7, 8} | ||||
</figure> | ||||
<figure anchor="NODE7FIB" suppress-title="false" | <tbody> | |||
title="Node7 Forwarding Table"> | ||||
<artwork align="center"> | ||||
Incoming label | outgoing label | Outgoing | ||||
or IP destination | | Interface | ||||
16011 | 16011 | 10 | ||||
192.0.2.11/32 | 16011 | 10 | ||||
</figure> | ||||
<figure anchor="NODE10FIB" suppress-title="true" | <tr> | |||
title="Node10 Forwarding Table"> | <td align="center">Incoming Label or IP Destination | |||
<artwork align="center"> | </td> | |||
Incoming label | outgoing label | Outgoing | <td align="center">Outgoing Label | |||
or IP destination | | Interface | </td> | |||
16011 | POP | 11 | <td align="center">Outgoing Interface | |||
192.0.2.11/32 | N/A | 11 | </td> | |||
</figure> | </tr> | |||
</section> | ||||
<section anchor="VARIATIONS" title="Network Design Variation"> | <tr> | |||
<td align="center">16011 | ||||
</td> | ||||
<td align="center">16011 | ||||
</td> | ||||
<td align="center">ECMP{3, 4} | ||||
</td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">192.0.2.11/32 | ||||
</td> | ||||
<td align="center">16011 | ||||
</td> | ||||
<td align="center">ECMP{3, 4} | ||||
</td> | ||||
</tr> | ||||
</tbody> | ||||
</table> | ||||
<table anchor="NODE4FIB"> | ||||
<name>Node4 Forwarding Table | ||||
</name> | ||||
<tbody > | ||||
<tr> | ||||
<td align="center">Incoming Label or IP Destination | ||||
</td> | ||||
<td align="center">Outgoing Label | ||||
</td> | ||||
<td align="center">Outgoing Interface | ||||
</td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">16011 | ||||
</td> | ||||
<td align="center">16011 | ||||
</td> | ||||
<td align="center">ECMP{7, 8} | ||||
</td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">192.0.2.11/32 | ||||
</td> | ||||
<td align="center">16011 | ||||
</td> | ||||
<td align="center">ECMP{7, 8} | ||||
</td> | ||||
</tr> | ||||
</tbody> | ||||
</table> | ||||
<table anchor="NODE7FIB"> | ||||
<name>Node7 Forwarding Table | ||||
</name> | ||||
<tbody > | ||||
<tr > | ||||
<td align="center">Incoming Label or IP Destination | ||||
</td> | ||||
<td align="center">Outgoing Label | ||||
</td> | ||||
<td align="center">Outgoing Interface | ||||
</td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">16011 | ||||
</td> | ||||
<td align="center">16011 | ||||
</td> | ||||
<td align="center">10 | ||||
</td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">192.0.2.11/32 | ||||
</td> | ||||
<td align="center">16011 | ||||
</td> | ||||
<td align="center">10 | ||||
</td> | ||||
</tr> | ||||
</tbody> | ||||
</table> | ||||
<table anchor="NODE10FIB"> | ||||
<name>Node10 Forwarding Table | ||||
</name> | ||||
<tbody > | ||||
<tr > | ||||
<td align="center">Incoming Label or IP Destination | ||||
</td> | ||||
<td align="center">Outgoing Label | ||||
</td> | ||||
<td align="center">Outgoing Interface | ||||
</td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">16011 | ||||
</td> | ||||
<td align="center">POP | ||||
</td> | ||||
<td align="center">11 | ||||
</td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">192.0.2.11/32 | ||||
</td> | ||||
<td align="center">N/A | ||||
</td> | ||||
<td align="center">11 | ||||
</td> | ||||
</tr> | ||||
</tbody> | ||||
</table> | ||||
</section> | ||||
<section anchor="VARIATIONS" numbered="true" toc="default"> | ||||
<name>Network Design Variation</name> | ||||
<t>A network design choice could consist of switching all the | <t>A network design choice could consist of switching all the | |||
traffic through Tier-1 and Tier-2 as MPLS traffic. In this case, one | traffic through Tier-1 and Tier-2 as MPLS traffic. In this case, one | |||
could filter away the IP entries at Node4, Node7 and Node10. This | could filter away the IP entries at Node4, Node7, and Node10. This | |||
might be beneficial in order to optimize the forwarding table | might be beneficial in order to optimize the forwarding table | |||
size.</t> | size.</t> | |||
<t>A network design choice could consist in allowing the hosts to | <t>A network design choice could consist of allowing the hosts to | |||
send MPLS-encapsulated traffic based on the Egress Peer Engineering | send MPLS-encapsulated traffic based on the Egress Peer Engineering | |||
(EPE) use-case as defined in <xref | (EPE) use case as defined in <xref target="I-D.ietf-spring-segment-rou | |||
target="I-D.ietf-spring-segment-routing-central-epe"/>. For example, | ting-central-epe" format="default"/>. For example, | |||
applications at HostA would send their Z-destined traffic to Node1 | applications at HostA would send their Z-destined traffic to Node1 | |||
with an MPLS label stack where the top label is 16011 and the next | with an MPLS label stack where the top label is 16011 and the next | |||
label is an EPE peer segment (<xref | label is an EPE peer segment (<xref target="I-D.ietf-spring-segment-ro | |||
target="I-D.ietf-spring-segment-routing-central-epe"/>) at Node11 | uting-central-epe" format="default"/>) at Node11 | |||
directing the traffic to Z.</t> | directing the traffic to Z.</t> | |||
</section> | </section> | |||
<section anchor="FABRIC" numbered="true" toc="default"> | ||||
<section anchor="FABRIC" | <name>Global BGP Prefix Segment through the Fabric</name> | |||
title="Global BGP Prefix Segment through the fabric"> | ||||
<t>When the previous design is deployed, the operator enjoys global | <t>When the previous design is deployed, the operator enjoys global | |||
BGP-Prefix-SID and label allocation throughout the DC fabric.</t> | BGP Prefix-SID and label allocation throughout the DC fabric.</t> | |||
<t>A few examples follow:</t> | ||||
<t>A few examples follow:<list style="symbols"> | <ul spacing="normal"> | |||
<t>Normal forwarding to Node11: a packet with top label 16011 | <li>Normal forwarding to Node11: A packet with top label 16011 | |||
received by any node in the fabric will be forwarded along the | received by any node in the fabric will be forwarded along the | |||
ECMP-aware BGP best-path towards Node11 and the label 16011 is | ECMP-aware BGP best path towards Node11, and the label 16011 is | |||
              penultimate-popped at Node10 (or at Node9).</t> | penultimate popped at Node10 (or at Node9).</li> | |||
<li>Traffic-engineered path to Node11: An application on a host | ||||
<t>Traffic-engineered path to Node11: an application on a host | ||||
behind Node1 might want to restrict its traffic to paths via the | behind Node1 might want to restrict its traffic to paths via the | |||
Spine node Node5. The application achieves this by sending its | Spine node Node5. The application achieves this by sending its | |||
packets with a label stack of {16005, 16011}. BGP Prefix SID | packets with a label stack of {16005, 16011}. BGP Prefix-SID | |||
16005 directs the packet up to Node5 along the path (Node1, | 16005 directs the packet up to Node5 along the path (Node1, | |||
Node3, Node5). BGP-Prefix-SID 16011 then directs the packet down | Node3, Node5). BGP Prefix-SID 16011 then directs the packet down | |||
to Node11 along the path (Node5, Node9, Node11).</t> | to Node11 along the path (Node5, Node9, Node11).</li> | |||
</list></t> | </ul> | |||
</section> | </section> | |||
<section anchor="INCRDEP" numbered="true" toc="default"> | ||||
<section anchor="INCRDEP" title="Incremental Deployments"> | <name>Incremental Deployments</name> | |||
<t>The design previously described can be deployed incrementally. | <t>The design previously described can be deployed incrementally. | |||
Let us assume that Node7 does not support the BGP-Prefix-SID and let | Let us assume that Node7 does not support the BGP Prefix-SID, and let | |||
us show how the fabric connectivity is preserved.</t> | us show how the fabric connectivity is preserved.</t> | |||
<t>From a signaling viewpoint, nothing would change; even though | ||||
<t>From a signaling viewpoint, nothing would change: even though | Node7 does not support the BGP Prefix-SID, it does propagate the | |||
Node7 does not support the BGP-Prefix-SID, it does propagate the | ||||
attribute unmodified to its neighbors.</t> | attribute unmodified to its neighbors.</t> | |||
<t>From a label-allocation viewpoint, the only difference is that | ||||
<t>From a label allocation viewpoint, the only difference is that | ||||
Node7 would allocate a dynamic (random) label to the prefix | Node7 would allocate a dynamic (random) label to the prefix | |||
            192.0.2.11/32 (e.g. 12345) instead of the "hinted" label as | 192.0.2.11/32 (e.g., 12345) instead of the "hinted" label as | |||
instructed by the BGP-Prefix-SID. The neighbors of Node7 adapt | instructed by the BGP Prefix-SID. The neighbors of Node7 adapt | |||
automatically as they always use the label in the BGP8277 NLRI as | automatically as they always use the label in the BGP8277 NLRI as | |||
outgoing label.</t> | an outgoing label.</t> | |||
<t>Node4 does understand the BGP Prefix-SID; therefore, it allocates t | ||||
<t>Node4 does understand the BGP-Prefix-SID and hence allocates the | he | |||
indexed label in the SRGB (16011) for 192.0.2.11/32.</t> | indexed label in the SRGB (16011) for 192.0.2.11/32.</t> | |||
<t>As a result, all the data-plane entries across the network would | <t>As a result, all the data-plane entries across the network would | |||
be unchanged except the entries at Node7 and its neighbor Node4 as | be unchanged except the entries at Node7 and its neighbor Node4 as | |||
          shown in the figures below.</t> | shown in the tables below.</t> | |||
<t>The key point is that the end-to-end Label Switched Path (LSP) is | <t>The key point is that the end-to-end Label Switched Path (LSP) is | |||
preserved because the outgoing label is always derived from the | preserved because the outgoing label is always derived from the | |||
received label within the BGP8277 NLRI. The index in the | received label within the BGP8277 NLRI. The index in the | |||
BGP-Prefix-SID is only used as a hint on how to allocate the local | BGP Prefix-SID is only used as a hint on how to allocate the local | |||
label (the incoming label) but never for the outgoing label.</t> | label (the incoming label) but never for the outgoing label.</t> | |||
<figure anchor="NODE7FIBINC" title="Node7 Forwarding Table"> | <table anchor="NODE7FIBINC"> | |||
<artwork align="center">------------------------------------------ | ||||
Incoming label | outgoing | Outgoing | ||||
or IP destination | label | Interface | ||||
12345 | 16011 | 10 | ||||
</artwork> | ||||
</figure> | ||||
<figure anchor="NODE4FIBINC" title="Node4 Forwarding Table"> | <name>Node7 Forwarding Table | |||
<artwork align="center">------------------------------------------ | </name> | |||
Incoming label | outgoing | Outgoing | ||||
or IP destination | label | Interface | ||||
16011 | 12345 | 7 | ||||
</artwork> | ||||
</figure> | ||||
<t>The BGP-Prefix-SID can thus be deployed incrementally one node at | <tbody > | |||
a time.</t> | ||||
<t>When deployed together with a homogeneous SRGB (same SRGB across | <tr > | |||
<td align="center">Incoming Label or IP Destination | ||||
</td> | ||||
<td align="center">Outgoing Label | ||||
</td> | ||||
<td align="center">Outgoing Interface | ||||
</td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">12345 | ||||
</td> | ||||
<td align="center">16011 | ||||
</td> | ||||
<td align="center">10 | ||||
</td> | ||||
</tr> | ||||
</tbody> | ||||
</table> | ||||
<table anchor="NODE4FIBINC"> | ||||
<name>Node4 Forwarding Table | ||||
</name> | ||||
<tbody > | ||||
<tr > | ||||
<td align="center">Incoming Label or IP Destination | ||||
</td> | ||||
<td align="center">Outgoing Label | ||||
</td> | ||||
<td align="center">Outgoing Interface | ||||
</td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">16011 | ||||
</td> | ||||
<td align="center">12345 | ||||
</td> | ||||
<td align="center">7 | ||||
</td> | ||||
</tr> | ||||
</tbody> | ||||
</table> | ||||
<t>The BGP Prefix-SID can thus be deployed incrementally, i.e., one no | ||||
de at | ||||
a time.</t> | ||||
<t>When deployed together with a homogeneous SRGB (the same SRGB acros | ||||
s | ||||
the fabric), the operator incrementally enjoys the global prefix | the fabric), the operator incrementally enjoys the global prefix | |||
segment benefits as the deployment progresses through the | segment benefits as the deployment progresses through the | |||
fabric.</t> | fabric.</t> | |||
</section> | </section> | |||
</section> | </section> | |||
<section anchor="iBGP3107" numbered="true" toc="default"> | ||||
<name>IBGP Labeled Unicast (RFC 8277)</name> | ||||
<t>The same exact design as EBGP8277 is used with the following | ||||
modifications:</t> | ||||
<ul spacing="normal"> | ||||
<li>All nodes use the same AS number.</li> | ||||
<li>Each node peers with its neighbors via an internal BGP session | ||||
(IBGP) with extensions defined in <xref target="RFC8277" format="def | ||||
ault"/> (named | ||||
"IBGP8277" throughout this document).</li> | ||||
<li>Each node acts as a route reflector for each of its neighbors | ||||
and with the next-hop-self option. Next-hop-self is a well-known | ||||
operational feature that consists of rewriting the next hop of a | ||||
BGP update prior to sending it to the neighbor. Usually, | ||||
it's a common practice to apply next-hop-self behavior | ||||
towards IBGP peers for EBGP-learned routes. In the case outlined | ||||
in this section, it is proposed to use the next-hop-self mechanism | ||||
also to IBGP-learned routes.</li></ul> | ||||
<section anchor="iBGP3107" title="iBGP Labeled Unicast (RFC8277)"> | <figure anchor="IBGPFIG"> | |||
<t>The same exact design as eBGP8277 is used with the following | <name>IBGP Sessions with Reflection and Next-Hop-Self</name> | |||
modifications:<list> | <artwork name="" type="" align="left" alt=""><![CDATA[ | |||
<t>All nodes use the same AS number.</t> | ||||
<t>Each node peers with its neighbors via an internal BGP session | ||||
(iBGP) with extensions defined in <xref target="RFC8277"/> (named | ||||
"iBGP8277" throughout this document).</t> | ||||
<t>Each node acts as a route-reflector for each of its neighbors | ||||
and with the next-hop-self option. Next-hop-self is a well known | ||||
operational feature which consists of rewriting the next-hop of a | ||||
BGP update prior to send it to the neighbor. Usually, it’s a | ||||
common practice to apply next-hop-self behavior towards iBGP peers | ||||
for eBGP learned routes. In the case outlined in this section it | ||||
is proposed to use the next-hop-self mechanism also to iBGP | ||||
learned routes.</t> | ||||
<t><figure anchor="IBGPFIG" | ||||
title="iBGP Sessions with Reflection and Next-Hop-Self"> | ||||
<artwork> | ||||
Cluster-1 | Cluster-1 | |||
+-----------+ | +-----------+ | |||
| Tier-1 | | | Tier-1 | | |||
| +-----+ | | | +-----+ | | |||
| |NODE | | | | |NODE | | | |||
| | 5 | | | | | 5 | | | |||
Cluster-2 | +-----+ | Cluster-3 | Cluster-2 | +-----+ | Cluster-3 | |||
+---------+ | | +---------+ | +---------+ | | +---------+ | |||
| Tier-2 | | | | Tier-2 | | | Tier-2 | | | | Tier-2 | | |||
| +-----+ | | +-----+ | | +-----+ | | | +-----+ | | +-----+ | | +-----+ | | |||
skipping to change at line 622 ¶ | skipping to change at line 798 ¶ | |||
| | 4 | | | | 7 | | | | 10 | | | | | 4 | | | | 7 | | | | 10 | | | |||
| +-----+ | | +-----+ | | +-----+ | | | +-----+ | | +-----+ | | +-----+ | | |||
+---------+ | | +---------+ | +---------+ | | +---------+ | |||
| | | | | | |||
| +-----+ | | | +-----+ | | |||
| |NODE | | | | |NODE | | | |||
Tier-3 | | 8 | | Tier-3 | Tier-3 | | 8 | | Tier-3 | |||
+-----+ +-----+ | +-----+ | +-----+ +-----+ | +-----+ +-----+ | +-----+ | +-----+ +-----+ | |||
|NODE | |NODE | +-----------+ |NODE | |NODE | | |NODE | |NODE | +-----------+ |NODE | |NODE | | |||
| 1 | | 2 | | 11 | | 12 | | | 1 | | 2 | | 11 | | 12 | | |||
+-----+ +-----+ +-----+ +-----+ | +-----+ +-----+ +-----+ +-----+]]></artwork> | |||
</artwork> | </figure> | |||
</figure></t> | <ul spacing="normal"> | |||
<li> | ||||
<t>For simple and efficient route propagation filtering and as | <t>For simple and efficient route propagation filtering and as | |||
illustrated in <xref target="IBGPFIG"/>: <list> | illustrated in <xref target="IBGPFIG" format="default"/>: </t> | |||
<t>Node5, Node6, Node7 and Node8 use the same Cluster ID | <ul spacing="normal"> | |||
(Cluster-1)</t> | <li>Node5, Node6, Node7, and Node8 use the same Cluster ID | |||
(Cluster-1).</li> | ||||
<t>Node3 and Node4 use the same Cluster ID (Cluster-2)</t> | <li>Node3 and Node4 use the same Cluster ID (Cluster-2).</li> | |||
<li>Node9 and Node10 use the same Cluster ID (Cluster-3).</li> | ||||
<t>Node9 and Node10 use the same Cluster ID (Cluster-3)</t> | </ul> | |||
</list></t> | </li> | |||
<li>The control-plane behavior is mostly the same as described in | ||||
<t>The control-plane behavior is mostly the same as described in | the previous section; the only difference is that the EBGP8277 | |||
the previous section: the only difference is that the eBGP8277 | path propagation is simply replaced by an IBGP8277 path reflection | |||
path propagation is simply replaced by an iBGP8277 path reflection | with next hop changed to self.</li> | |||
with next-hop changed to self.</t> | <li>The data-plane tables are exactly the same.</li> | |||
</ul> | ||||
<t>The data-plane tables are exactly the same.</t> | ||||
</list></t> | ||||
</section> | </section> | |||
</section> | </section> | |||
<section anchor="IPV6" numbered="true" toc="default"> | ||||
<section anchor="IPV6" | <name>Applying Segment Routing in the DC with IPv6 Data Plane</name> | |||
title="Applying Segment Routing in the DC with IPv6 dataplane"> | <t>The design described in <xref target="RFC7938" format="default"/> is re | |||
<t>The design described in <xref target="RFC7938"/> is reused with one | used with one | |||
single modification. It is highlighted using the example of the | single modification. It is highlighted using the example of the | |||
reachability to Node11 via spine node Node5.</t> | reachability to Node11 via Spine node Node5.</t> | |||
<t>Node5 originates 2001:DB8::5/128 with the attached BGP Prefix-SID for | ||||
<t>Node5 originates 2001:DB8::5/128 with the attached BGP-Prefix-SID for | IPv6 packets destined to segment 2001:DB8::5 (<xref target="RFC8402" forma | |||
IPv6 packets destined to segment 2001:DB8::5 (<xref | t="default"/>).</t> | |||
target="I-D.ietf-idr-bgp-prefix-sid"/>).</t> | <t>Node11 originates 2001:DB8::11/128 with the attached BGP Prefix-SID | |||
advertising the support of the Segment Routing Header (SRH) for IPv6 packe | ||||
<t>Node11 originates 2001:DB8::11/128 with the attached BGP-Prefix-SID | ts destined to segment | |||
advertising the support of the SRH for IPv6 packets destined to segment | ||||
2001:DB8::11.</t> | 2001:DB8::11.</t> | |||
<t>The control-plane and data-plane processing of all the other nodes in | <t>The control-plane and data-plane processing of all the other nodes in | |||
the fabric is unchanged. Specifically, the routes to 2001:DB8::5 and | the fabric is unchanged. Specifically, the routes to 2001:DB8::5 and | |||
2001:DB8::11 are installed in the FIB along the eBGP best-path to Node5 | 2001:DB8::11 are installed in the FIB along the EBGP best path to Node5 | |||
(spine node) and Node11 (ToR node) respectively.</t> | (Spine node) and Node11 (ToR node) respectively.</t> | |||
<t>An application on HostA that needs to send traffic to HostZ via only | ||||
<t>An application on HostA which needs to send traffic to HostZ via only | Node5 (Spine node) can do so by sending IPv6 packets with a Segment | |||
Node5 (spine node) can do so by sending IPv6 packets with a Segment | Routing Header (SRH, <xref target="I-D.ietf-6man-segment-routing-header" f | |||
Routing header (SRH, <xref | ormat="default"/>). The destination | |||
target="I-D.ietf-6man-segment-routing-header"/>). The destination | ||||
address and active segment is set to 2001:DB8::5. The next and last | address and active segment is set to 2001:DB8::5. The next and last | |||
segment is set to 2001:DB8::11.</t> | segment is set to 2001:DB8::11.</t> | |||
<t>The application must only use IPv6 addresses that have been | <t>The application must only use IPv6 addresses that have been | |||
advertised as capable for SRv6 segment processing (e.g. for which the | advertised as capable for SRv6 segment processing (e.g., for which the | |||
BGP prefix segment capability has been advertised). How applications | BGP Prefix Segment capability has been advertised). How applications | |||
learn this (e.g.: centralized controller and orchestration) is outside | learn this (e.g., centralized controller and orchestration) is outside | |||
the scope of this document.</t> | the scope of this document.</t> | |||
</section> | </section> | |||
<section anchor="COMMHOSTS" numbered="true" toc="default"> | ||||
<section anchor="COMMHOSTS" | <name>Communicating Path Information to the Host</name> | |||
title="Communicating path information to the host"> | ||||
<t>There are two general methods for communicating path information to | <t>There are two general methods for communicating path information to | |||
the end-hosts: "proactive" and "reactive", aka "push" and "pull" models. | the end-hosts: "proactive" and "reactive", aka "push" and "pull" models. | |||
There are multiple ways to implement either of these methods. Here, it | There are multiple ways to implement either of these methods. Here, it | |||
is noted that one way could be using a centralized controller: the | is noted that one way could be using a centralized controller: the | |||
controller either tells the hosts of the prefix-to-path mappings | controller either tells the hosts of the prefix-to-path mappings | |||
beforehand and updates them as needed (network event driven push), or | beforehand and updates them as needed (network event driven push) or | |||
responds to the hosts making request for a path to specific destination | responds to the hosts making requests for a path to a specific destination | |||
(host event driven pull). It is also possible to use a hybrid model, | (host event driven pull). It is also possible to use a hybrid model, | |||
i.e., pushing some state from the controller in response to particular | i.e., pushing some state from the controller in response to particular | |||
network events, while the host pulls other state on demand.</t> | network events, while the host pulls other state on demand.</t> | |||
<t>Note also that when disseminating network-related data to the | ||||
<t>It is also noted, that when disseminating network-related data to the | end-hosts, a trade-off is made to balance the amount of information | |||
end-hosts a trade-off is made to balance the amount of information Vs. | vs. the level of visibility in the network state. This applies | |||
the level of visibility in the network state. This applies both to push | to both push and pull models. In the extreme case, the host would request | |||
and pull models. In the extreme case, the host would request path | path information on every flow and keep no local state at all. On the | |||
information on every flow, and keep no local state at all. On the other | other end of the spectrum, information for every prefix in the network | |||
end of the spectrum, information for every prefix in the network along | along with available paths could be pushed and continuously updated on | |||
with available paths could be pushed and continuously updated on all | all hosts.</t> | |||
hosts.</t> | ||||
</section> | </section> | |||
<section anchor="BENEFITS" numbered="true" toc="default"> | ||||
<section anchor="BENEFITS" title="Additional Benefits"> | <name>Additional Benefits</name> | |||
<section anchor="MPLSIMPLE" | <section anchor="MPLSIMPLE" numbered="true" toc="default"> | |||
title="MPLS Dataplane with operational simplicity"> | <name>MPLS Data Plane with Operational Simplicity</name> | |||
<t>As required by <xref target="RFC7938"/>, no new signaling protocol | <t>As required by <xref target="RFC7938" format="default"/>, no new sign | |||
is introduced. The BGP-Prefix-SID is a lightweight extension to BGP | aling protocol | |||
Labeled Unicast <xref target="RFC8277"/>. It applies either to eBGP or | is introduced. The BGP Prefix-SID is a lightweight extension to BGP | |||
iBGP based designs.</t> | Labeled Unicast <xref target="RFC8277" format="default"/>. It applies ei | |||
ther to EBGP- or | ||||
IBGP-based designs.</t> | ||||
<t>Specifically, LDP and RSVP-TE are not used. These protocols would | <t>Specifically, LDP and RSVP-TE are not used. These protocols would | |||
drastically impact the operational complexity of the Data Center and | drastically impact the operational complexity of the data center and | |||
would not scale. This is in line with the requirements expressed in | would not scale. This is in line with the requirements expressed in | |||
<xref target="RFC7938"/>.</t> | <xref target="RFC7938" format="default"/>.</t> | |||
<t>Provided the same SRGB is configured on all nodes, all nodes use | <t>Provided the same SRGB is configured on all nodes, all nodes use | |||
the same MPLS label for a given IP prefix. This is simpler from an | the same MPLS label for a given IP prefix. This is simpler from an | |||
          operation standpoint, as discussed in <xref target="SINGLESRGB"/></t> | operation standpoint, as discussed in <xref target="SINGLESRGB" format="default"/>.</t> | |||
</section> | </section> | |||
<section anchor="MINFIB" numbered="true" toc="default"> | ||||
<section anchor="MINFIB" title="Minimizing the FIB table"> | <name>Minimizing the FIB Table</name> | |||
<t>The designer may decide to switch all the traffic at Tier-1 and | <t>The designer may decide to switch all the traffic at Tier-1 and | |||
Tier-2's based on MPLS, hence drastically decreasing the IP table size | Tier-2 based on MPLS, thereby drastically decreasing the IP table size | |||
at these nodes.</t> | at these nodes.</t> | |||
<t>This is easily accomplished by encapsulating the traffic either | <t>This is easily accomplished by encapsulating the traffic either | |||
directly at the host or the source ToR node by pushing the | directly at the host or at the source ToR node. The encapsulation is | |||
BGP-Prefix-SID of the destination ToR for intra-DC traffic, or the | done by pushing the BGP Prefix-SID of the destination ToR for intra-DC | |||
BGP-Prefix-SID for the the border node for inter-DC or | traffic, or by pushing the BGP Prefix-SID for the border node for | |||
DC-to-outside-world traffic.</t> | inter-DC or DC-to-outside-world traffic.</t> | |||
</section> | </section> | |||
<section anchor="EPE" numbered="true" toc="default"> | ||||
<section anchor="EPE" title="Egress Peer Engineering"> | <name>Egress Peer Engineering</name> | |||
<t>It is straightforward to combine the design illustrated in this | <t>It is straightforward to combine the design illustrated in this | |||
document with the Egress Peer Engineering (EPE) use-case described in | document with the Egress Peer Engineering (EPE) use case described in | |||
<xref target="I-D.ietf-spring-segment-routing-central-epe"/>.</t> | <xref target="I-D.ietf-spring-segment-routing-central-epe" format="defau | |||
lt"/>.</t> | ||||
<t>In such case, the operator is able to engineer its outbound traffic | <t>In such a case, the operator is able to engineer its outbound traffic | |||
on a per host-flow basis, without incurring any additional state at | on a per-host-flow basis, without incurring any additional state at | |||
intermediate points in the DC fabric.</t> | intermediate points in the DC fabric.</t> | |||
<t>For example, the controller only needs to inject a per-flow state | <t>For example, the controller only needs to inject a per-flow state | |||
on the HostA to force it to send its traffic destined to a specific | on the HostA to force it to send its traffic destined to a specific | |||
Internet destination D via a selected border node (say Node12 in <xref | Internet destination D via a selected border node (say Node12 in <xref t | |||
target="FIGLARGE"/> instead of another border node, Node11) and a | arget="FIGLARGE" format="default"/> instead of another border node, Node11) and | |||
a | ||||
specific egress peer of Node12 (say peer AS 9999 of local PeerNode | specific egress peer of Node12 (say peer AS 9999 of local PeerNode | |||
segment 9999 at Node12 instead of any other peer which provides a path | segment 9999 at Node12 instead of any other peer that provides a path | |||
to the destination D). Any packet matching this state at host A would | to the destination D). Any packet matching this state at HostA would | |||
be encapsulated with SR segment list (label stack) {16012, 9999}. | be encapsulated with SR segment list (label stack) {16012, 9999}. | |||
16012 would steer the flow through the DC fabric, leveraging any ECMP, | 16012 would steer the flow through the DC fabric, leveraging any ECMP, | |||
along the best path to border node Node12. Once the flow gets to | along the best path to border node Node12. Once the flow gets to | |||
border node Node12, the active segment is 9999 (because of PHP on the | border node Node12, the active segment is 9999 (because of Penultimate | |||
upstream neighbor of Node12). This EPE PeerNode segment forces border | Hop Popping (PHP) on the upstream neighbor of Node12). This EPE | |||
node Node12 to forward the packet to peer AS 9999, without any IP | PeerNode segment forces border node Node12 to forward the packet to | |||
lookup at the border node. There is no per-flow state for this | peer AS 9999 without any IP lookup at the border node. There is no | |||
engineered flow in the DC fabric. A benefit of segment routing is the | per-flow state for this engineered flow in the DC fabric. A benefit of | |||
per-flow state is only required at the source.</t> | SR is that the per-flow state is only required at the | |||
source.</t> | ||||
<t>As well as allowing full traffic engineering control such a design | <t>As well as allowing full traffic-engineering control, such a design | |||
also offers FIB table minimization benefits as the Internet-scale FIB | also offers FIB table-minimization benefits as the Internet-scale FIB | |||
at border node Node12 is not required if all FIB lookups are avoided | at border node Node12 is not required if all FIB lookups are avoided | |||
there by using EPE.</t> | there by using EPE.</t> | |||
</section> | </section> | |||
<section anchor="ANYCAST" numbered="true" toc="default"> | ||||
<section anchor="ANYCAST" title="Anycast"> | <name>Anycast</name> | |||
<t>The design presented in this document preserves the availability | <t>The design presented in this document preserves the availability | |||
and load-balancing properties of the base design presented in <xref | and load-balancing properties of the base design presented in <xref targ | |||
target="I-D.ietf-spring-segment-routing"/>.</t> | et="RFC8402" format="default"/>.</t> | |||
<t>For example, one could assign an anycast loopback 192.0.2.20/32 and | <t>For example, one could assign an anycast loopback 192.0.2.20/32 and | |||
associate segment index 20 to it on the border Node11 and Node12 (in | associate segment index 20 to it on the border nodes Node11 and Node12 (in | |||
addition to their node-specific loopbacks). Doing so, the EPE | addition to their node-specific loopbacks). Doing so, the EPE | |||
controller could express a default "go-to-the-Internet via any border | controller could express a default "go-to-the-Internet via any border | |||
node" policy as segment list {16020}. Indeed, from any host in the DC | node" policy as segment list {16020}. Indeed, from any host in the DC | |||
fabric or from any ToR node, 16020 steers the packet towards the | fabric or from any ToR node, 16020 steers the packet towards the | |||
border Node11 or Node12 leveraging ECMP where available along the best | border nodes Node11 or Node12 leveraging ECMP where available along the best | |||
paths to these nodes.</t> | paths to these nodes.</t> | |||
</section> | </section> | |||
</section> | </section> | |||
<section anchor="SINGLESRGB" numbered="true" toc="default"> | ||||
<section anchor="SINGLESRGB" title="Preferred SRGB Allocation"> | <name>Preferred SRGB Allocation</name> | |||
<t>In the MPLS case, it is recommend to use same SRGBs at each node.</t> | <t>In the MPLS case, it is recommended to use the same SRGBs at each node. | |||
</t> | ||||
<t>Different SRGBs in each node likely increase the complexity of the | <t>Different SRGBs in each node likely increase the complexity of the | |||
solution both from an operational viewpoint and from a controller | solution both from an operational viewpoint and from a controller | |||
viewpoint.</t> | viewpoint.</t> | |||
<t>From an operational viewpoint, it is much simpler to have the same | ||||
<t>From an operation viewpoint, it is much simpler to have the same | ||||
global label at every node for the same destination (the MPLS | global label at every node for the same destination (the MPLS | |||
troubleshooting is then similar to the IPv6 troubleshooting where this | troubleshooting is then similar to the IPv6 troubleshooting where this | |||
global property is a given).</t> | global property is a given).</t> | |||
<t>From a controller viewpoint, this allows us to construct simple | <t>From a controller viewpoint, this allows us to construct simple | |||
policies applicable across the fabric.</t> | policies applicable across the fabric.</t> | |||
<t>Let us consider two applications, A and B, respectively connected to | ||||
<t>Let us consider two applications A and B respectively connected to | Node1 and Node2 (ToR nodes). Application A has two flows, FA1 and FA2, des | |||
Node1 and Node2 (ToR nodes). A has two flows FA1 and FA2 destined to Z. | tined to Z. | |||
B has two flows FB1 and FB2 destined to Z. The controller wants FA1 and | B has two flows, FB1 and FB2, destined to Z. The controller wants FA1 and | |||
FB1 to be load-shared across the fabric while FA2 and FB2 must be | FB1 to be load shared across the fabric while FA2 and FB2 must be | |||
respectively steered via Node5 and Node8.</t> | respectively steered via Node5 and Node8.</t> | |||
<t>Assuming a consistent unique SRGB across the fabric as described in | <t>Assuming a consistent unique SRGB across the fabric as described in | |||
the document, the controller can simply do it by instructing A and B to | this document, the controller can simply do it by instructing A and B to | |||
use {16011} respectively for FA1 and FB1 and by instructing A and B to | use {16011} respectively for FA1 and FB1 and by instructing A and B to | |||
use {16005 16011} and {16008 16011} respectively for FA2 and FB2.</t> | use {16005 16011} and {16008 16011} respectively for FA2 and FB2.</t> | |||
<t>Let us assume a design where the SRGB is different at every node and | <t>Let us assume a design where the SRGB is different at every node and | |||
where the SRGB of each node is advertised using the Originator SRGB TLV | where the SRGB of each node is advertised using the Originator SRGB TLV | |||
of the BGP-Prefix-SID as defined in <xref | of the BGP Prefix-SID as defined in <xref target="RFC8669" format="default | |||
target="I-D.ietf-idr-bgp-prefix-sid"/>: SRGB of Node K starts at value | "/>: SRGB of Node K starts at value | |||
K*1000 and the SRGB length is 1000 (e.g. Node1’s SRGB is [1000, | K*1000, and the SRGB length is 1000 (e.g., Node1's SRGB is [1000, | |||
1999], Node2’s SRGB is [2000, 2999], …).</t> | 1999], Node2's SRGB is [2000, 2999], ...).</t> | |||
<t>In this case, not only the controller would need to collect and store | ||||
all of these different SRGB’s (e.g., through the Originator SRGB | ||||
TLV of the BGP-Prefix-SID), furthermore it would need to adapt the | ||||
policy for each host. Indeed, the controller would instruct A to use | ||||
{1011} for FA1 while it would have to instruct B to use {2011} for FB1 | ||||
(while with the same SRGB, both policies are the same {16011}).</t> | ||||
<t>In this case, the controller would need to collect and store all of | ||||
these different SRGBs (e.g., through the Originator SRGB TLV of the | ||||
BGP Prefix-SID); furthermore, it would also need to adapt the policy for | ||||
each host. Indeed, the controller would instruct A to use {1011} for FA1 | ||||
while it would have to instruct B to use {2011} for FB1 (while with the | ||||
same SRGB, both policies are the same {16011}).</t> | ||||
<t>Even worse, the controller would instruct A to use {1005, 5011} for | <t>Even worse, the controller would instruct A to use {1005, 5011} for | |||
FA2 while it would instruct B to use {2008, 8011} for FB2 (while with | FA2 while it would instruct B to use {2008, 8011} for FB2 (while with | |||
the same SRGB, the second segment is the same across both policies: | the same SRGB, the second segment is the same across both policies: | |||
16011). When combining segments to create a policy, one need to | 16011). When combining segments to create a policy, one needs to | |||
carefully update the label of each segment. This is obviously more | carefully update the label of each segment. This is obviously more error | |||
error-prone, more complex and more difficult to troubleshoot.</t> | prone, more complex, and more difficult to troubleshoot.</t> | |||
</section> | </section> | |||
<section anchor="IANA" numbered="true" toc="default"> | ||||
<section anchor="IANA" title="IANA Considerations"> | <name>IANA Considerations</name> | |||
<t>This document does not make any IANA request.</t> | <t>This document has no IANA actions.</t> | |||
</section> | </section> | |||
<section anchor="MANAGE" numbered="true" toc="default"> | ||||
<section anchor="MANAGE" title="Manageability Considerations"> | <name>Manageability Considerations</name> | |||
<t>The design and deployment guidelines described in this document are | <t>The design and deployment guidelines described in this document are | |||
based on the network design described in <xref target="RFC7938"/>.</t> | based on the network design described in <xref target="RFC7938" format="de | |||
fault"/>.</t> | ||||
<t>The deployment model assumed in this document is based on a single | <t>The deployment model assumed in this document is based on a single | |||
domain where the interconnected DCs are part of the same administrative | domain where the interconnected DCs are part of the same administrative | |||
domain (which, of course, is split into different autonomous systems). | domain (which, of course, is split into different autonomous systems). | |||
The operator has full control of the whole domain and the usual | The operator has full control of the whole domain, and the usual | |||
operational and management mechanisms and procedures are used in order | operational and management mechanisms and procedures are used in order | |||
to prevent any information related to internal prefixes and topology from | to prevent any information related to internal prefixes and topology from | |||
being leaked outside the domain.</t> | being leaked outside the domain.</t> | |||
<t>As recommended in <xref target="RFC8402" format="default"/>, | ||||
<t>As recommended in <xref target="I-D.ietf-spring-segment-routing"/>, | ||||
the same SRGB should be allocated in all nodes in order to facilitate | the same SRGB should be allocated in all nodes in order to facilitate | |||
the design, deployment and operations of the domain.</t> | the design, deployment, and operations of the domain.</t> | |||
<t>When EPE (<xref target="I-D.ietf-spring-segment-routing-central-epe" fo | ||||
<t>When EPE (<xref | rmat="default"/>) is used (as | |||
target="I-D.ietf-spring-segment-routing-central-epe"/>) is used (as | explained in <xref target="EPE" format="default"/>), the same operational | |||
explained in <xref target="EPE"/>, the same operational model is | model is | |||
assumed. EPE information is originated and propagated throughout the | assumed. EPE information is originated and propagated throughout the | |||
domain towards an internal server and unless explicitly configured by | domain towards an internal server, and unless explicitly configured by | |||
the operator, no EPE information is leaked outside the domain | the operator, no EPE information is leaked outside the domain | |||
boundaries.</t> | boundaries.</t> | |||
</section> | </section> | |||
<section anchor="SEC" numbered="true" toc="default"> | ||||
<section anchor="SEC" title="Security Considerations"> | <name>Security Considerations</name> | |||
<t>This document proposes to apply Segment Routing to a well known | <t>This document proposes to apply SR to a well-known | |||
scalability requirement expressed in <xref target="RFC7938"/> using the | scalability requirement expressed in <xref target="RFC7938" format="defaul | |||
BGP-Prefix-SID as defined in <xref | t"/> using the | |||
target="I-D.ietf-idr-bgp-prefix-sid"/>.</t> | BGP Prefix-SID as defined in <xref target="RFC8669" format="default"/>.</t | |||
> | ||||
<t>It has to be noted, as described in <xref target="MANAGE"/> that the | <t>It has to be noted, as described in <xref target="MANAGE" format="defau | |||
design illustrated in <xref target="RFC7938"/> and in this document, | lt"/>, that the | |||
design illustrated in <xref target="RFC7938" format="default"/> and in thi | ||||
s document | ||||
refer to a deployment model where all nodes are under the same | refer to a deployment model where all nodes are under the same | |||
administration. In this context, it is assumed that the operator doesn't | administration. In this context, it is assumed that the operator doesn't | |||
want to leak outside of the domain any information related to internal | want to leak outside of the domain any information related to internal | |||
prefixes and topology. The internal information includes prefix-sid and | prefixes and topology. The internal information includes Prefix-SID and | |||
EPE information. In order to prevent such leaking, the standard BGP | EPE information. In order to prevent such leaking, the standard BGP | |||
mechanisms (filters) are applied on the boundary of the domain.</t> | mechanisms (filters) are applied on the boundary of the domain.</t> | |||
<t>Therefore, the solution proposed in this document does not introduce | <t>Therefore, the solution proposed in this document does not introduce | |||
any additional security concerns from what expressed in <xref | any additional security concerns from what is expressed in <xref target="R | |||
target="RFC7938"/> and <xref target="I-D.ietf-idr-bgp-prefix-sid"/>. It | FC7938" format="default"/> and <xref target="RFC8669" format="default"/>. It | |||
is assumed that the security and confidentiality of the prefix and | is assumed that the security and confidentiality of the prefix and | |||
topology information is preserved by outbound filters at each peering | topology information is preserved by outbound filters at each peering | |||
point of the domain as described in <xref target="MANAGE"/>.</t> | point of the domain as described in <xref target="MANAGE" format="default" | |||
</section> | />.</t> | |||
<section anchor="Acknowledgements" title="Acknowledgements"> | ||||
<t>The authors would like to thank Benjamin Black, Arjun Sreekantiah, | ||||
Keyur Patel, Acee Lindem and Anoop Ghanwani for their comments and | ||||
review of this document.</t> | ||||
</section> | </section> | |||
</middle> | ||||
<back> | ||||
<displayreference | ||||
target="I-D.ietf-spring-segment-routing-central-epe" | ||||
to="SR-CENTRAL-EPE"/> | ||||
<section anchor="Contributors" title="Contributors"> | <displayreference target="I-D.ietf-6man-segment-routing-header" | |||
<figure> | to="IPv6-SRH"/> | |||
<artwork>Gaya Nagarajan | ||||
US | ||||
Email: gaya@fb.com</artwork> | <references> | |||
</figure> | <name>References</name> | |||
<references> | ||||
<name>Normative References</name> | ||||
<figure> | <xi:include href="https://xml2rfc.tools.ietf.org/public/rfc/bibxml/refer | |||
<artwork>Gaurav Dawra | ence.RFC.8277.xml"/> | |||
Cisco Systems | <xi:include href="https://xml2rfc.tools.ietf.org/public/rfc/bibxml/refer | |||
US | ence.RFC.4271.xml"/> | |||
<xi:include href="https://xml2rfc.tools.ietf.org/public/rfc/bibxml/refer | ||||
ence.RFC.7938.xml"/> | ||||
<!--I-D.ietf-spring-segment-routing became RFC 8402 --> | ||||
<xi:include href="https://xml2rfc.tools.ietf.org/public/rfc/bibxml/refer | ||||
ence.RFC.8402.xml"/> | ||||
Email: gdawra.ietf@gmail.com</artwork> | <!-- I-D.ietf-idr-bgp-prefix-sid-27: companion document--> | |||
</figure> | <reference anchor='RFC8669' target='https://www.rfc-editor.org/info/rfc8669'> | |||
<front> | ||||
<title>Segment Routing Prefix Segment Identifier Extensions for BGP</title> | ||||
<figure> | <author initials='S' surname='Previdi' fullname='Stefano Previdi'> | |||
<artwork>Dmitry Afanasiev | <organization /> | |||
Yandex | </author> | |||
RU | ||||
Email: fl0w@yandex-team.ru</artwork> | <author initials='C' surname='Filsfils' fullname='Clarence Filsfils'> | |||
</figure> | <organization /> | |||
</author> | ||||
<figure> | <author initials='A' surname='Lindem' fullname='Acee Lindem' role="editor"> | |||
<artwork>Tim Laberge | <organization /> | |||
Cisco | </author> | |||
US | ||||
Email: tlaberge@cisco.com</artwork> | <author initials='A' surname='Sreekantiah' fullname='Arjun Sreekantiah'> | |||
</figure> | <organization /> | |||
</author> | ||||
<figure> | <author initials='H' surname='Gredler' fullname='Hannes Gredler'> | |||
<artwork>Edet Nkposong | <organization /> | |||
Salesforce.com Inc. | </author> | |||
US | ||||
Email: enkposong@salesforce.com</artwork> | <date month='December' year='2019' /> | |||
</figure> | ||||
<figure> | </front> | |||
<artwork>Mohan Nanduri | ||||
Microsoft | ||||
US | ||||
Email: mnanduri@microsoft.com</artwork> | <seriesInfo name='RFC' value='8669' /> | |||
</figure> | <seriesInfo name="DOI" value="10.17487/RFC8669"/> | |||
</reference> | ||||
<figure> | </references> | |||
<artwork>James Uttaro | <references> | |||
ATT | <name>Informative References</name> | |||
US | ||||
Email: ju1738@att.com</artwork> | <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml3/reference.I-D.ietf | |||
</figure> | -spring-segment-routing-central-epe.xml"/> | |||
<figure> | <xi:include href="https://xml2rfc.tools.ietf.org/public/rfc/bibxml/reference.RFC | |||
<artwork>Saikat Ray | .6793.xml"/> | |||
Unaffiliated | ||||
US | ||||
Email: raysaikat@gmail.com</artwork> | <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml3/reference.I-D.ietf | |||
</figure> | -6man-segment-routing-header.xml"/> | |||
<figure> | <!-- I-D.ietf-6man-segment-routing-header: I-D exists --> | |||
<artwork>Jon Mitchell | ||||
Unaffiliated | ||||
US | ||||
Email: jrmitche@puck.nether.net</artwork> | </references> | |||
</figure> | </references> | |||
<section anchor="Acknowledgements" numbered="false" toc="default"> | ||||
<name>Acknowledgements</name> | ||||
<t>The authors would like to thank Benjamin Black, Arjun Sreekantiah, | ||||
Keyur Patel, Acee Lindem, and Anoop Ghanwani for their comments and | ||||
review of this document.</t> | ||||
</section> | </section> | |||
</middle> | <section anchor="Contributors" numbered="false" toc="default"> | |||
<name>Contributors</name> | ||||
<artwork name="" type="" align="left" alt=""><![CDATA[Gaya Nagarajan | ||||
United States of America | ||||
<back> | Email: gaya@fb.com]]></artwork> | |||
<references title="Normative References"> | <artwork name="" type="" align="left" alt=""><![CDATA[Gaurav Dawra | |||
<?rfc include="http://xml.resource.org/public/rfc/bibxml/reference.RFC.211 | Cisco Systems | |||
9.xml"?> | United States of America | |||
<?rfc include="http://xml.resource.org/public/rfc/bibxml/reference.RFC.827 | Email: gdawra.ietf@gmail.com]]></artwork> | |||
7.xml"?> | <artwork name="" type="" align="left" alt=""><![CDATA[Dmitry Afanasiev | |||
Yandex | ||||
Russian Federation | ||||
<?rfc include="http://xml.resource.org/public/rfc/bibxml/reference.RFC.427 | Email: fl0w@yandex-team.ru]]></artwork> | |||
1.xml"?> | <artwork name="" type="" align="left" alt=""><![CDATA[Tim Laberge | |||
Cisco | ||||
United States of America | ||||
<?rfc include="http://xml.resource.org/public/rfc/bibxml/reference.RFC.793 | Email: tlaberge@cisco.com]]></artwork> | |||
8.xml"?> | <artwork name="" type="" align="left" alt=""><![CDATA[Edet Nkposong | |||
Salesforce.com Inc. | ||||
United States of America | ||||
<?rfc include="reference.I-D.ietf-spring-segment-routing.xml"?> | Email: enkposong@salesforce.com]]></artwork> | |||
<artwork name="" type="" align="left" alt=""><![CDATA[Mohan Nanduri | ||||
Microsoft | ||||
United States of America | ||||
<?rfc include="reference.I-D.ietf-idr-bgp-prefix-sid.xml"?> | Email: mohan.nanduri@oracle.com]]></artwork> | |||
<artwork name="" type="" align="left" alt=""><![CDATA[James Uttaro | ||||
ATT | ||||
United States of America | ||||
<?rfc include="reference.I-D.ietf-spring-segment-routing-central-epe.xml"? | Email: ju1738@att.com]]></artwork> | |||
> | <artwork name="" type="" align="left" alt=""><![CDATA[Saikat Ray | |||
</references> | Unaffiliated | |||
United States of America | ||||
<references title="Informative References"> | Email: raysaikat@gmail.com]]></artwork> | |||
<?rfc include="http://xml.resource.org/public/rfc/bibxml/reference.RFC.679 | <artwork name="" type="" align="left" alt=""><![CDATA[Jon Mitchell | |||
3.xml"?> | Unaffiliated | |||
United States of America | ||||
<?rfc include="reference.I-D.ietf-6man-segment-routing-header.xml"?> | Email: jrmitche@puck.nether.net]]></artwork> | |||
</references> | </section> | |||
</back> | </back> | |||
</rfc> | </rfc> | |||
End of changes. 195 change blocks. | ||||
634 lines changed or deleted | 863 lines changed or added | |||
This html diff was produced by rfcdiff 1.45. The latest version is available from http://tools.ietf.org/tools/rfcdiff/ |